summaryrefslogtreecommitdiffstats
path: root/fs
diff options
context:
space:
mode:
Diffstat (limited to 'fs')
-rw-r--r--fs/9p/vfs_inode.c4
-rw-r--r--fs/affs/namei.c5
-rw-r--r--fs/afs/dir.c5
-rw-r--r--fs/attr.c7
-rw-r--r--fs/autofs4/root.c2
-rw-r--r--fs/bfs/dir.c3
-rw-r--r--fs/btrfs/ctree.h2
-rw-r--r--fs/btrfs/inode.c2
-rw-r--r--fs/buffer.c1
-rw-r--r--fs/cifs/cifsacl.c3
-rw-r--r--fs/coda/dir.c5
-rw-r--r--fs/configfs/dir.c2
-rw-r--r--fs/ecryptfs/crypto.c74
-rw-r--r--fs/ecryptfs/ecryptfs_kernel.h26
-rw-r--r--fs/ecryptfs/file.c2
-rw-r--r--fs/ecryptfs/inode.c286
-rw-r--r--fs/ecryptfs/main.c84
-rw-r--r--fs/ecryptfs/super.c16
-rw-r--r--fs/ext3/inode.c2
-rw-r--r--fs/ext4/ext4.h2
-rw-r--r--fs/ext4/inode.c2
-rw-r--r--fs/fat/namei_msdos.c5
-rw-r--r--fs/fat/namei_vfat.c5
-rw-r--r--fs/fs-writeback.c5
-rw-r--r--fs/fuse/dir.c5
-rw-r--r--fs/hfs/dir.c6
-rw-r--r--fs/hfsplus/dir.c8
-rw-r--r--fs/hostfs/hostfs_kern.c5
-rw-r--r--fs/hpfs/namei.c5
-rw-r--r--fs/inode.c54
-rw-r--r--fs/jffs2/dir.c5
-rw-r--r--fs/jffs2/fs.c2
-rw-r--r--fs/jffs2/os-linux.h2
-rw-r--r--fs/jfs/inode.c2
-rw-r--r--fs/jfs/jfs_inode.h2
-rw-r--r--fs/jfs/namei.c5
-rw-r--r--fs/logfs/dir.c5
-rw-r--r--fs/minix/namei.c5
-rw-r--r--fs/namei.c44
-rw-r--r--fs/ncpfs/dir.c15
-rw-r--r--fs/nfs/Kconfig10
-rw-r--r--fs/nfs/Makefile4
-rw-r--r--fs/nfs/callback.h17
-rw-r--r--fs/nfs/callback_proc.c51
-rw-r--r--fs/nfs/callback_xdr.c96
-rw-r--r--fs/nfs/client.c2
-rw-r--r--fs/nfs/delegation.c14
-rw-r--r--fs/nfs/dir.c9
-rw-r--r--fs/nfs/inode.c11
-rw-r--r--fs/nfs/internal.h1
-rw-r--r--fs/nfs/nfs4filelayout.c38
-rw-r--r--fs/nfs/nfs4filelayout.h8
-rw-r--r--fs/nfs/nfs4filelayoutdev.c119
-rw-r--r--fs/nfs/nfs4proc.c107
-rw-r--r--fs/nfs/nfs4state.c6
-rw-r--r--fs/nfs/nfs4xdr.c132
-rw-r--r--fs/nfs/nfsroot.c2
-rw-r--r--fs/nfs/objlayout/Kbuild5
-rw-r--r--fs/nfs/objlayout/objio_osd.c1057
-rw-r--r--fs/nfs/objlayout/objlayout.c712
-rw-r--r--fs/nfs/objlayout/objlayout.h187
-rw-r--r--fs/nfs/objlayout/pnfs_osd_xdr_cli.c412
-rw-r--r--fs/nfs/pagelist.c62
-rw-r--r--fs/nfs/pnfs.c342
-rw-r--r--fs/nfs/pnfs.h117
-rw-r--r--fs/nfs/pnfs_dev.c270
-rw-r--r--fs/nfs/read.c9
-rw-r--r--fs/nfs/super.c25
-rw-r--r--fs/nfs/write.c10
-rw-r--r--fs/nfsd/export.c6
-rw-r--r--fs/nfsd/nfs3proc.c2
-rw-r--r--fs/nfsd/nfs3xdr.c2
-rw-r--r--fs/nfsd/nfs4proc.c73
-rw-r--r--fs/nfsd/nfs4state.c42
-rw-r--r--fs/nfsd/nfs4xdr.c11
-rw-r--r--fs/nfsd/nfsfh.c2
-rw-r--r--fs/nfsd/vfs.c33
-rw-r--r--fs/nfsd/vfs.h6
-rw-r--r--fs/nilfs2/inode.c2
-rw-r--r--fs/nilfs2/namei.c5
-rw-r--r--fs/nilfs2/nilfs.h2
-rw-r--r--fs/omfs/dir.c11
-rw-r--r--fs/proc/base.c9
-rw-r--r--fs/reiserfs/namei.c5
-rw-r--r--fs/reiserfs/super.c2
-rw-r--r--fs/reiserfs/xattr.c1
-rw-r--r--fs/squashfs/export.c2
-rw-r--r--fs/squashfs/fragment.c2
-rw-r--r--fs/squashfs/id.c2
-rw-r--r--fs/squashfs/super.c6
-rw-r--r--fs/sysv/namei.c5
-rw-r--r--fs/ubifs/dir.c5
-rw-r--r--fs/ubifs/io.c2
-rw-r--r--fs/ubifs/journal.c1
-rw-r--r--fs/ubifs/orphan.c2
-rw-r--r--fs/ubifs/recovery.c164
-rw-r--r--fs/ubifs/replay.c3
-rw-r--r--fs/ubifs/shrinker.c9
-rw-r--r--fs/ubifs/super.c44
-rw-r--r--fs/ubifs/tnc.c9
-rw-r--r--fs/ubifs/ubifs.h6
-rw-r--r--fs/udf/namei.c5
-rw-r--r--fs/ufs/namei.c5
-rw-r--r--fs/xattr.c23
-rw-r--r--fs/xfs/linux-2.6/xfs_super.c3
105 files changed, 4131 insertions, 936 deletions
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index 8d7f3e69ae2..7f6c6770319 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -814,7 +814,6 @@ int v9fs_vfs_unlink(struct inode *i, struct dentry *d)
int v9fs_vfs_rmdir(struct inode *i, struct dentry *d)
{
- dentry_unhash(d);
return v9fs_remove(i, d, 1);
}
@@ -840,9 +839,6 @@ v9fs_vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
struct p9_fid *newdirfid;
struct p9_wstat wstat;
- if (new_dentry->d_inode && S_ISDIR(new_dentry->d_inode->i_mode))
- dentry_unhash(new_dentry);
-
P9_DPRINTK(P9_DEBUG_VFS, "\n");
retval = 0;
old_inode = old_dentry->d_inode;
diff --git a/fs/affs/namei.c b/fs/affs/namei.c
index 03330e2e390..e3e9efc1fdd 100644
--- a/fs/affs/namei.c
+++ b/fs/affs/namei.c
@@ -320,8 +320,6 @@ affs_rmdir(struct inode *dir, struct dentry *dentry)
dentry->d_inode->i_ino,
(int)dentry->d_name.len, dentry->d_name.name);
- dentry_unhash(dentry);
-
return affs_remove_header(dentry);
}
@@ -419,9 +417,6 @@ affs_rename(struct inode *old_dir, struct dentry *old_dentry,
struct buffer_head *bh = NULL;
int retval;
- if (new_dentry->d_inode && S_ISDIR(new_dentry->d_inode->i_mode))
- dentry_unhash(new_dentry);
-
pr_debug("AFFS: rename(old=%u,\"%*s\" to new=%u,\"%*s\")\n",
(u32)old_dir->i_ino, (int)old_dentry->d_name.len, old_dentry->d_name.name,
(u32)new_dir->i_ino, (int)new_dentry->d_name.len, new_dentry->d_name.name);
diff --git a/fs/afs/dir.c b/fs/afs/dir.c
index 2c4e0516004..20c106f2492 100644
--- a/fs/afs/dir.c
+++ b/fs/afs/dir.c
@@ -845,8 +845,6 @@ static int afs_rmdir(struct inode *dir, struct dentry *dentry)
_enter("{%x:%u},{%s}",
dvnode->fid.vid, dvnode->fid.vnode, dentry->d_name.name);
- dentry_unhash(dentry);
-
ret = -ENAMETOOLONG;
if (dentry->d_name.len >= AFSNAMEMAX)
goto error;
@@ -1148,9 +1146,6 @@ static int afs_rename(struct inode *old_dir, struct dentry *old_dentry,
struct key *key;
int ret;
- if (new_dentry->d_inode && S_ISDIR(new_dentry->d_inode->i_mode))
- dentry_unhash(new_dentry);
-
vnode = AFS_FS_I(old_dentry->d_inode);
orig_dvnode = AFS_FS_I(old_dir);
new_dvnode = AFS_FS_I(new_dir);
diff --git a/fs/attr.c b/fs/attr.c
index 91dbe2a107f..caf2aa521e2 100644
--- a/fs/attr.c
+++ b/fs/attr.c
@@ -175,6 +175,13 @@ int notify_change(struct dentry * dentry, struct iattr * attr)
return -EPERM;
}
+ if ((ia_valid & ATTR_MODE)) {
+ mode_t amode = attr->ia_mode;
+ /* Flag setting protected by i_mutex */
+ if (is_sxid(amode))
+ inode->i_flags &= ~S_NOSEC;
+ }
+
now = current_fs_time(inode->i_sb);
attr->ia_ctime = now;
diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c
index 87d95a8cddb..f55ae23b137 100644
--- a/fs/autofs4/root.c
+++ b/fs/autofs4/root.c
@@ -583,8 +583,6 @@ static int autofs4_dir_unlink(struct inode *dir, struct dentry *dentry)
if (!autofs4_oz_mode(sbi) && !capable(CAP_SYS_ADMIN))
return -EACCES;
- dentry_unhash(dentry);
-
if (atomic_dec_and_test(&ino->count)) {
p_ino = autofs4_dentry_ino(dentry->d_parent);
if (p_ino && dentry->d_parent != dentry)
diff --git a/fs/bfs/dir.c b/fs/bfs/dir.c
index c7d1d06b048..b14cebfd904 100644
--- a/fs/bfs/dir.c
+++ b/fs/bfs/dir.c
@@ -224,9 +224,6 @@ static int bfs_rename(struct inode *old_dir, struct dentry *old_dentry,
struct bfs_sb_info *info;
int error = -ENOENT;
- if (new_dentry->d_inode && S_ISDIR(new_dentry->d_inode->i_mode))
- dentry_unhash(new_dentry);
-
old_bh = new_bh = NULL;
old_inode = old_dentry->d_inode;
if (S_ISDIR(old_inode->i_mode))
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 332323e19dd..6c093fa98f6 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -2524,7 +2524,7 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf);
int btrfs_readpage(struct file *file, struct page *page);
void btrfs_evict_inode(struct inode *inode);
int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc);
-void btrfs_dirty_inode(struct inode *inode);
+void btrfs_dirty_inode(struct inode *inode, int flags);
struct inode *btrfs_alloc_inode(struct super_block *sb);
void btrfs_destroy_inode(struct inode *inode);
int btrfs_drop_inode(struct inode *inode);
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index bb51bb1fa44..39a9d5750ef 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -4294,7 +4294,7 @@ int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc)
* FIXME, needs more benchmarking...there are no reasons other than performance
* to keep or drop this code.
*/
-void btrfs_dirty_inode(struct inode *inode)
+void btrfs_dirty_inode(struct inode *inode, int flags)
{
struct btrfs_root *root = BTRFS_I(inode)->root;
struct btrfs_trans_handle *trans;
diff --git a/fs/buffer.c b/fs/buffer.c
index 698c6b2cc46..49c9aada037 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -2382,6 +2382,7 @@ int __block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
ret = -EAGAIN;
goto out_unlock;
}
+ wait_on_page_writeback(page);
return 0;
out_unlock:
unlock_page(page);
diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c
index 8f1700623b4..21de1d6d584 100644
--- a/fs/cifs/cifsacl.c
+++ b/fs/cifs/cifsacl.c
@@ -74,8 +74,9 @@ shrink_idmap_tree(struct rb_root *root, int nr_to_scan, int *nr_rem,
* Run idmap cache shrinker.
*/
static int
-cifs_idmap_shrinker(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask)
+cifs_idmap_shrinker(struct shrinker *shrink, struct shrink_control *sc)
{
+ int nr_to_scan = sc->nr_to_scan;
int nr_del = 0;
int nr_rem = 0;
struct rb_root *root;
diff --git a/fs/coda/dir.c b/fs/coda/dir.c
index a46126fd573..2b8dae4d121 100644
--- a/fs/coda/dir.c
+++ b/fs/coda/dir.c
@@ -336,8 +336,6 @@ static int coda_rmdir(struct inode *dir, struct dentry *de)
int len = de->d_name.len;
int error;
- dentry_unhash(de);
-
error = venus_rmdir(dir->i_sb, coda_i2f(dir), name, len);
if (!error) {
/* VFS may delete the child */
@@ -361,9 +359,6 @@ static int coda_rename(struct inode *old_dir, struct dentry *old_dentry,
int new_length = new_dentry->d_name.len;
int error;
- if (new_dentry->d_inode && S_ISDIR(new_dentry->d_inode->i_mode))
- dentry_unhash(new_dentry);
-
error = venus_rename(old_dir->i_sb, coda_i2f(old_dir),
coda_i2f(new_dir), old_length, new_length,
(const char *) old_name, (const char *)new_name);
diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c
index 9d17d350abc..9a37a9b6de3 100644
--- a/fs/configfs/dir.c
+++ b/fs/configfs/dir.c
@@ -1359,8 +1359,6 @@ static int configfs_rmdir(struct inode *dir, struct dentry *dentry)
struct module *subsys_owner = NULL, *dead_item_owner = NULL;
int ret;
- dentry_unhash(dentry);
-
if (dentry->d_parent == configfs_sb->s_root)
return -EPERM;
diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c
index b8d5c809102..58609bde3b9 100644
--- a/fs/ecryptfs/crypto.c
+++ b/fs/ecryptfs/crypto.c
@@ -1024,25 +1024,25 @@ out:
}
/**
- * contains_ecryptfs_marker - check for the ecryptfs marker
+ * ecryptfs_validate_marker - check for the ecryptfs marker
* @data: The data block in which to check
*
- * Returns one if marker found; zero if not found
+ * Returns zero if marker found; -EINVAL if not found
*/
-static int contains_ecryptfs_marker(char *data)
+static int ecryptfs_validate_marker(char *data)
{
u32 m_1, m_2;
m_1 = get_unaligned_be32(data);
m_2 = get_unaligned_be32(data + 4);
if ((m_1 ^ MAGIC_ECRYPTFS_MARKER) == m_2)
- return 1;
+ return 0;
ecryptfs_printk(KERN_DEBUG, "m_1 = [0x%.8x]; m_2 = [0x%.8x]; "
"MAGIC_ECRYPTFS_MARKER = [0x%.8x]\n", m_1, m_2,
MAGIC_ECRYPTFS_MARKER);
ecryptfs_printk(KERN_DEBUG, "(m_1 ^ MAGIC_ECRYPTFS_MARKER) = "
"[0x%.8x]\n", (m_1 ^ MAGIC_ECRYPTFS_MARKER));
- return 0;
+ return -EINVAL;
}
struct ecryptfs_flag_map_elem {
@@ -1201,27 +1201,19 @@ int ecryptfs_cipher_code_to_string(char *str, u8 cipher_code)
return rc;
}
-int ecryptfs_read_and_validate_header_region(char *data,
- struct inode *ecryptfs_inode)
+int ecryptfs_read_and_validate_header_region(struct inode *inode)
{
- struct ecryptfs_crypt_stat *crypt_stat =
- &(ecryptfs_inode_to_private(ecryptfs_inode)->crypt_stat);
+ u8 file_size[ECRYPTFS_SIZE_AND_MARKER_BYTES];
+ u8 *marker = file_size + ECRYPTFS_FILE_SIZE_BYTES;
int rc;
- if (crypt_stat->extent_size == 0)
- crypt_stat->extent_size = ECRYPTFS_DEFAULT_EXTENT_SIZE;
- rc = ecryptfs_read_lower(data, 0, crypt_stat->extent_size,
- ecryptfs_inode);
- if (rc < 0) {
- printk(KERN_ERR "%s: Error reading header region; rc = [%d]\n",
- __func__, rc);
- goto out;
- }
- if (!contains_ecryptfs_marker(data + ECRYPTFS_FILE_SIZE_BYTES)) {
- rc = -EINVAL;
- } else
- rc = 0;
-out:
+ rc = ecryptfs_read_lower(file_size, 0, ECRYPTFS_SIZE_AND_MARKER_BYTES,
+ inode);
+ if (rc < ECRYPTFS_SIZE_AND_MARKER_BYTES)
+ return rc >= 0 ? -EINVAL : rc;
+ rc = ecryptfs_validate_marker(marker);
+ if (!rc)
+ ecryptfs_i_size_init(file_size, inode);
return rc;
}
@@ -1242,8 +1234,7 @@ ecryptfs_write_header_metadata(char *virt,
(*written) = 6;
}
-struct kmem_cache *ecryptfs_header_cache_1;
-struct kmem_cache *ecryptfs_header_cache_2;
+struct kmem_cache *ecryptfs_header_cache;
/**
* ecryptfs_write_headers_virt
@@ -1496,11 +1487,9 @@ static int ecryptfs_read_headers_virt(char *page_virt,
crypt_stat->mount_crypt_stat = &ecryptfs_superblock_to_private(
ecryptfs_dentry->d_sb)->mount_crypt_stat;
offset = ECRYPTFS_FILE_SIZE_BYTES;
- rc = contains_ecryptfs_marker(page_virt + offset);
- if (rc == 0) {
- rc = -EINVAL;
+ rc = ecryptfs_validate_marker(page_virt + offset);
+ if (rc)
goto out;
- }
if (!(crypt_stat->flags & ECRYPTFS_I_SIZE_INITIALIZED))
ecryptfs_i_size_init(page_virt, ecryptfs_dentry->d_inode);
offset += MAGIC_ECRYPTFS_MARKER_SIZE_BYTES;
@@ -1567,20 +1556,21 @@ out:
return rc;
}
-int ecryptfs_read_and_validate_xattr_region(char *page_virt,
- struct dentry *ecryptfs_dentry)
+int ecryptfs_read_and_validate_xattr_region(struct dentry *dentry,
+ struct inode *inode)
{
+ u8 file_size[ECRYPTFS_SIZE_AND_MARKER_BYTES];
+ u8 *marker = file_size + ECRYPTFS_FILE_SIZE_BYTES;
int rc;
- rc = ecryptfs_read_xattr_region(page_virt, ecryptfs_dentry->d_inode);
- if (rc)
- goto out;
- if (!contains_ecryptfs_marker(page_virt + ECRYPTFS_FILE_SIZE_BYTES)) {
- printk(KERN_WARNING "Valid data found in [%s] xattr, but "
- "the marker is invalid\n", ECRYPTFS_XATTR_NAME);
- rc = -EINVAL;
- }
-out:
+ rc = ecryptfs_getxattr_lower(ecryptfs_dentry_to_lower(dentry),
+ ECRYPTFS_XATTR_NAME, file_size,
+ ECRYPTFS_SIZE_AND_MARKER_BYTES);
+ if (rc < ECRYPTFS_SIZE_AND_MARKER_BYTES)
+ return rc >= 0 ? -EINVAL : rc;
+ rc = ecryptfs_validate_marker(marker);
+ if (!rc)
+ ecryptfs_i_size_init(file_size, inode);
return rc;
}
@@ -1610,7 +1600,7 @@ int ecryptfs_read_metadata(struct dentry *ecryptfs_dentry)
ecryptfs_copy_mount_wide_flags_to_inode_flags(crypt_stat,
mount_crypt_stat);
/* Read the first page from the underlying file */
- page_virt = kmem_cache_alloc(ecryptfs_header_cache_1, GFP_USER);
+ page_virt = kmem_cache_alloc(ecryptfs_header_cache, GFP_USER);
if (!page_virt) {
rc = -ENOMEM;
printk(KERN_ERR "%s: Unable to allocate page_virt\n",
@@ -1655,7 +1645,7 @@ int ecryptfs_read_metadata(struct dentry *ecryptfs_dentry)
out:
if (page_virt) {
memset(page_virt, 0, PAGE_CACHE_SIZE);
- kmem_cache_free(ecryptfs_header_cache_1, page_virt);
+ kmem_cache_free(ecryptfs_header_cache, page_virt);
}
return rc;
}
diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h
index e70282775e2..43c7c43b06f 100644
--- a/fs/ecryptfs/ecryptfs_kernel.h
+++ b/fs/ecryptfs/ecryptfs_kernel.h
@@ -200,6 +200,8 @@ ecryptfs_get_key_payload_data(struct key *key)
#define MAGIC_ECRYPTFS_MARKER 0x3c81b7f5
#define MAGIC_ECRYPTFS_MARKER_SIZE_BYTES 8 /* 4*2 */
#define ECRYPTFS_FILE_SIZE_BYTES (sizeof(u64))
+#define ECRYPTFS_SIZE_AND_MARKER_BYTES (ECRYPTFS_FILE_SIZE_BYTES \
+ + MAGIC_ECRYPTFS_MARKER_SIZE_BYTES)
#define ECRYPTFS_DEFAULT_CIPHER "aes"
#define ECRYPTFS_DEFAULT_KEY_BYTES 16
#define ECRYPTFS_DEFAULT_HASH "md5"
@@ -603,8 +605,7 @@ extern struct kmem_cache *ecryptfs_file_info_cache;
extern struct kmem_cache *ecryptfs_dentry_info_cache;
extern struct kmem_cache *ecryptfs_inode_info_cache;
extern struct kmem_cache *ecryptfs_sb_info_cache;
-extern struct kmem_cache *ecryptfs_header_cache_1;
-extern struct kmem_cache *ecryptfs_header_cache_2;
+extern struct kmem_cache *ecryptfs_header_cache;
extern struct kmem_cache *ecryptfs_xattr_cache;
extern struct kmem_cache *ecryptfs_key_record_cache;
extern struct kmem_cache *ecryptfs_key_sig_cache;
@@ -625,14 +626,9 @@ struct ecryptfs_open_req {
struct list_head kthread_ctl_list;
};
-#define ECRYPTFS_INTERPOSE_FLAG_D_ADD 0x00000001
-int ecryptfs_interpose(struct dentry *hidden_dentry,
- struct dentry *this_dentry, struct super_block *sb,
- u32 flags);
+struct inode *ecryptfs_get_inode(struct inode *lower_inode,
+ struct super_block *sb);
void ecryptfs_i_size_init(const char *page_virt, struct inode *inode);
-int ecryptfs_lookup_and_interpose_lower(struct dentry *ecryptfs_dentry,
- struct dentry *lower_dentry,
- struct inode *ecryptfs_dir_inode);
int ecryptfs_decode_and_decrypt_filename(char **decrypted_name,
size_t *decrypted_name_size,
struct dentry *ecryptfs_dentry,
@@ -664,10 +660,9 @@ int ecryptfs_new_file_context(struct dentry *ecryptfs_dentry);
void ecryptfs_write_crypt_stat_flags(char *page_virt,
struct ecryptfs_crypt_stat *crypt_stat,
size_t *written);
-int ecryptfs_read_and_validate_header_region(char *data,
- struct inode *ecryptfs_inode);
-int ecryptfs_read_and_validate_xattr_region(char *page_virt,
- struct dentry *ecryptfs_dentry);
+int ecryptfs_read_and_validate_header_region(struct inode *inode);
+int ecryptfs_read_and_validate_xattr_region(struct dentry *dentry,
+ struct inode *inode);
u8 ecryptfs_code_for_cipher_string(char *cipher_name, size_t key_bytes);
int ecryptfs_cipher_code_to_string(char *str, u8 cipher_code);
void ecryptfs_set_default_sizes(struct ecryptfs_crypt_stat *crypt_stat);
@@ -679,9 +674,6 @@ int
ecryptfs_parse_packet_set(struct ecryptfs_crypt_stat *crypt_stat,
unsigned char *src, struct dentry *ecryptfs_dentry);
int ecryptfs_truncate(struct dentry *dentry, loff_t new_length);
-int ecryptfs_inode_test(struct inode *inode, void *candidate_lower_inode);
-int ecryptfs_inode_set(struct inode *inode, void *lower_inode);
-void ecryptfs_init_inode(struct inode *inode, struct inode *lower_inode);
ssize_t
ecryptfs_getxattr_lower(struct dentry *lower_dentry, const char *name,
void *value, size_t size);
@@ -761,7 +753,7 @@ int ecryptfs_privileged_open(struct file **lower_file,
struct dentry *lower_dentry,
struct vfsmount *lower_mnt,
const struct cred *cred);
-int ecryptfs_get_lower_file(struct dentry *ecryptfs_dentry);
+int ecryptfs_get_lower_file(struct dentry *dentry, struct inode *inode);
void ecryptfs_put_lower_file(struct inode *inode);
int
ecryptfs_write_tag_70_packet(char *dest, size_t *remaining_bytes,
diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c
index 566e5472f78..4ec9eb00a24 100644
--- a/fs/ecryptfs/file.c
+++ b/fs/ecryptfs/file.c
@@ -191,7 +191,7 @@ static int ecryptfs_open(struct inode *inode, struct file *file)
| ECRYPTFS_ENCRYPTED);
}
mutex_unlock(&crypt_stat->cs_mutex);
- rc = ecryptfs_get_lower_file(ecryptfs_dentry);
+ rc = ecryptfs_get_lower_file(ecryptfs_dentry, inode);
if (rc) {
printk(KERN_ERR "%s: Error attempting to initialize "
"the lower file for the dentry with name "
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index bc116b9ffcf..7349ade17de 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -51,6 +51,97 @@ static void unlock_dir(struct dentry *dir)
dput(dir);
}
+static int ecryptfs_inode_test(struct inode *inode, void *lower_inode)
+{
+ if (ecryptfs_inode_to_lower(inode) == (struct inode *)lower_inode)
+ return 1;
+ return 0;
+}
+
+static int ecryptfs_inode_set(struct inode *inode, void *opaque)
+{
+ struct inode *lower_inode = opaque;
+
+ ecryptfs_set_inode_lower(inode, lower_inode);
+ fsstack_copy_attr_all(inode, lower_inode);
+ /* i_size will be overwritten for encrypted regular files */
+ fsstack_copy_inode_size(inode, lower_inode);
+ inode->i_ino = lower_inode->i_ino;
+ inode->i_version++;
+ inode->i_mapping->a_ops = &ecryptfs_aops;
+
+ if (S_ISLNK(inode->i_mode))
+ inode->i_op = &ecryptfs_symlink_iops;
+ else if (S_ISDIR(inode->i_mode))
+ inode->i_op = &ecryptfs_dir_iops;
+ else
+ inode->i_op = &ecryptfs_main_iops;
+
+ if (S_ISDIR(inode->i_mode))
+ inode->i_fop = &ecryptfs_dir_fops;
+ else if (special_file(inode->i_mode))
+ init_special_inode(inode, inode->i_mode, inode->i_rdev);
+ else
+ inode->i_fop = &ecryptfs_main_fops;
+
+ return 0;
+}
+
+static struct inode *__ecryptfs_get_inode(struct inode *lower_inode,
+ struct super_block *sb)
+{
+ struct inode *inode;
+
+ if (lower_inode->i_sb != ecryptfs_superblock_to_lower(sb))
+ return ERR_PTR(-EXDEV);
+ if (!igrab(lower_inode))
+ return ERR_PTR(-ESTALE);
+ inode = iget5_locked(sb, (unsigned long)lower_inode,
+ ecryptfs_inode_test, ecryptfs_inode_set,
+ lower_inode);
+ if (!inode) {
+ iput(lower_inode);
+ return ERR_PTR(-EACCES);
+ }
+ if (!(inode->i_state & I_NEW))
+ iput(lower_inode);
+
+ return inode;
+}
+
+struct inode *ecryptfs_get_inode(struct inode *lower_inode,
+ struct super_block *sb)
+{
+ struct inode *inode = __ecryptfs_get_inode(lower_inode, sb);
+
+ if (!IS_ERR(inode) && (inode->i_state & I_NEW))
+ unlock_new_inode(inode);
+
+ return inode;
+}
+
+/**
+ * ecryptfs_interpose
+ * @lower_dentry: Existing dentry in the lower filesystem
+ * @dentry: ecryptfs' dentry
+ * @sb: ecryptfs's super_block
+ *
+ * Interposes upper and lower dentries.
+ *
+ * Returns zero on success; non-zero otherwise
+ */
+static int ecryptfs_interpose(struct dentry *lower_dentry,
+ struct dentry *dentry, struct super_block *sb)
+{
+ struct inode *inode = ecryptfs_get_inode(lower_dentry->d_inode, sb);
+
+ if (IS_ERR(inode))
+ return PTR_ERR(inode);
+ d_instantiate(dentry, inode);
+
+ return 0;
+}
+
/**
* ecryptfs_create_underlying_file
* @lower_dir_inode: inode of the parent in the lower fs of the new file
@@ -129,7 +220,7 @@ ecryptfs_do_create(struct inode *directory_inode,
goto out_lock;
}
rc = ecryptfs_interpose(lower_dentry, ecryptfs_dentry,
- directory_inode->i_sb, 0);
+ directory_inode->i_sb);
if (rc) {
ecryptfs_printk(KERN_ERR, "Failure in ecryptfs_interpose\n");
goto out_lock;
@@ -168,7 +259,8 @@ static int ecryptfs_initialize_file(struct dentry *ecryptfs_dentry)
"context; rc = [%d]\n", rc);
goto out;
}
- rc = ecryptfs_get_lower_file(ecryptfs_dentry);
+ rc = ecryptfs_get_lower_file(ecryptfs_dentry,
+ ecryptfs_dentry->d_inode);
if (rc) {
printk(KERN_ERR "%s: Error attempting to initialize "
"the lower file for the dentry with name "
@@ -215,102 +307,90 @@ out:
return rc;
}
+static int ecryptfs_i_size_read(struct dentry *dentry, struct inode *inode)
+{
+ struct ecryptfs_crypt_stat *crypt_stat;
+ int rc;
+
+ rc = ecryptfs_get_lower_file(dentry, inode);
+ if (rc) {
+ printk(KERN_ERR "%s: Error attempting to initialize "
+ "the lower file for the dentry with name "
+ "[%s]; rc = [%d]\n", __func__,
+ dentry->d_name.name, rc);
+ return rc;
+ }
+
+ crypt_stat = &ecryptfs_inode_to_private(inode)->crypt_stat;
+ /* TODO: lock for crypt_stat comparison */
+ if (!(crypt_stat->flags & ECRYPTFS_POLICY_APPLIED))
+ ecryptfs_set_default_sizes(crypt_stat);
+
+ rc = ecryptfs_read_and_validate_header_region(inode);
+ ecryptfs_put_lower_file(inode);
+ if (rc) {
+ rc = ecryptfs_read_and_validate_xattr_region(dentry, inode);
+ if (!rc)
+ crypt_stat->flags |= ECRYPTFS_METADATA_IN_XATTR;
+ }
+
+ /* Must return 0 to allow non-eCryptfs files to be looked up, too */
+ return 0;
+}
+
/**
- * ecryptfs_lookup_and_interpose_lower - Perform a lookup
+ * ecryptfs_lookup_interpose - Dentry interposition for a lookup
*/
-int ecryptfs_lookup_and_interpose_lower(struct dentry *ecryptfs_dentry,
- struct dentry *lower_dentry,
- struct inode *ecryptfs_dir_inode)
+static int ecryptfs_lookup_interpose(struct dentry *dentry,
+ struct dentry *lower_dentry,
+ struct inode *dir_inode)
{
- struct dentry *lower_dir_dentry;
+ struct inode *inode, *lower_inode = lower_dentry->d_inode;
+ struct ecryptfs_dentry_info *dentry_info;
struct vfsmount *lower_mnt;
- struct inode *lower_inode;
- struct ecryptfs_crypt_stat *crypt_stat;
- char *page_virt = NULL;
- int put_lower = 0, rc = 0;
-
- lower_dir_dentry = lower_dentry->d_parent;
- lower_mnt = mntget(ecryptfs_dentry_to_lower_mnt(
- ecryptfs_dentry->d_parent));
- lower_inode = lower_dentry->d_inode;
- fsstack_copy_attr_atime(ecryptfs_dir_inode, lower_dir_dentry->d_inode);
+ int rc = 0;
+
+ lower_mnt = mntget(ecryptfs_dentry_to_lower_mnt(dentry->d_parent));
+ fsstack_copy_attr_atime(dir_inode, lower_dentry->d_parent->d_inode);
BUG_ON(!lower_dentry->d_count);
- ecryptfs_set_dentry_private(ecryptfs_dentry,
- kmem_cache_alloc(ecryptfs_dentry_info_cache,
- GFP_KERNEL));
- if (!ecryptfs_dentry_to_private(ecryptfs_dentry)) {
- rc = -ENOMEM;
+
+ dentry_info = kmem_cache_alloc(ecryptfs_dentry_info_cache, GFP_KERNEL);
+ ecryptfs_set_dentry_private(dentry, dentry_info);
+ if (!dentry_info) {
printk(KERN_ERR "%s: Out of memory whilst attempting "
"to allocate ecryptfs_dentry_info struct\n",
__func__);
- goto out_put;
+ dput(lower_dentry);
+ mntput(lower_mnt);
+ d_drop(dentry);
+ return -ENOMEM;
}
- ecryptfs_set_dentry_lower(ecryptfs_dentry, lower_dentry);
- ecryptfs_set_dentry_lower_mnt(ecryptfs_dentry, lower_mnt);
+ ecryptfs_set_dentry_lower(dentry, lower_dentry);
+ ecryptfs_set_dentry_lower_mnt(dentry, lower_mnt);
+
if (!lower_dentry->d_inode) {
/* We want to add because we couldn't find in lower */
- d_add(ecryptfs_dentry, NULL);
- goto out;
- }
- rc = ecryptfs_interpose(lower_dentry, ecryptfs_dentry,
- ecryptfs_dir_inode->i_sb,
- ECRYPTFS_INTERPOSE_FLAG_D_ADD);
- if (rc) {
- printk(KERN_ERR "%s: Error interposing; rc = [%d]\n",
- __func__, rc);
- goto out;
- }
- if (S_ISDIR(lower_inode->i_mode))
- goto out;
- if (S_ISLNK(lower_inode->i_mode))
- goto out;
- if (special_file(lower_inode->i_mode))
- goto out;
- /* Released in this function */
- page_virt = kmem_cache_zalloc(ecryptfs_header_cache_2, GFP_USER);
- if (!page_virt) {
- printk(KERN_ERR "%s: Cannot kmem_cache_zalloc() a page\n",
- __func__);
- rc = -ENOMEM;
- goto out;
+ d_add(dentry, NULL);
+ return 0;
}
- rc = ecryptfs_get_lower_file(ecryptfs_dentry);
- if (rc) {
- printk(KERN_ERR "%s: Error attempting to initialize "
- "the lower file for the dentry with name "
- "[%s]; rc = [%d]\n", __func__,
- ecryptfs_dentry->d_name.name, rc);
- goto out_free_kmem;
+ inode = __ecryptfs_get_inode(lower_inode, dir_inode->i_sb);
+ if (IS_ERR(inode)) {
+ printk(KERN_ERR "%s: Error interposing; rc = [%ld]\n",
+ __func__, PTR_ERR(inode));
+ return PTR_ERR(inode);
}
- put_lower = 1;
- crypt_stat = &ecryptfs_inode_to_private(
- ecryptfs_dentry->d_inode)->crypt_stat;
- /* TODO: lock for crypt_stat comparison */
- if (!(crypt_stat->flags & ECRYPTFS_POLICY_APPLIED))
- ecryptfs_set_default_sizes(crypt_stat);
- rc = ecryptfs_read_and_validate_header_region(page_virt,
- ecryptfs_dentry->d_inode);
- if (rc) {
- memset(page_virt, 0, PAGE_CACHE_SIZE);
- rc = ecryptfs_read_and_validate_xattr_region(page_virt,
- ecryptfs_dentry);
+ if (S_ISREG(inode->i_mode)) {
+ rc = ecryptfs_i_size_read(dentry, inode);
if (rc) {
- rc = 0;
- goto out_free_kmem;
+ make_bad_inode(inode);
+ return rc;
}
- crypt_stat->flags |= ECRYPTFS_METADATA_IN_XATTR;
}
- ecryptfs_i_size_init(page_virt, ecryptfs_dentry->d_inode);
-out_free_kmem:
- kmem_cache_free(ecryptfs_header_cache_2, page_virt);
- goto out;
-out_put:
- dput(lower_dentry);
- mntput(lower_mnt);
- d_drop(ecryptfs_dentry);
-out:
- if (put_lower)
- ecryptfs_put_lower_file(ecryptfs_dentry->d_inode);
+
+ if (inode->i_state & I_NEW)
+ unlock_new_inode(inode);
+ d_add(dentry, inode);
+
return rc;
}
@@ -353,12 +433,12 @@ static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode,
goto out_d_drop;
}
if (lower_dentry->d_inode)
- goto lookup_and_interpose;
+ goto interpose;
mount_crypt_stat = &ecryptfs_superblock_to_private(
ecryptfs_dentry->d_sb)->mount_crypt_stat;
if (!(mount_crypt_stat
&& (mount_crypt_stat->flags & ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES)))
- goto lookup_and_interpose;
+ goto interpose;
dput(lower_dentry);
rc = ecryptfs_encrypt_and_encode_filename(
&encrypted_and_encoded_name, &encrypted_and_encoded_name_size,
@@ -381,9 +461,9 @@ static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode,
encrypted_and_encoded_name);
goto out_d_drop;
}
-lookup_and_interpose:
- rc = ecryptfs_lookup_and_interpose_lower(ecryptfs_dentry, lower_dentry,
- ecryptfs_dir_inode);
+interpose:
+ rc = ecryptfs_lookup_interpose(ecryptfs_dentry, lower_dentry,
+ ecryptfs_dir_inode);
goto out;
out_d_drop:
d_drop(ecryptfs_dentry);
@@ -411,7 +491,7 @@ static int ecryptfs_link(struct dentry *old_dentry, struct inode *dir,
lower_new_dentry);
if (rc || !lower_new_dentry->d_inode)
goto out_lock;
- rc = ecryptfs_interpose(lower_new_dentry, new_dentry, dir->i_sb, 0);
+ rc = ecryptfs_interpose(lower_new_dentry, new_dentry, dir->i_sb);
if (rc)
goto out_lock;
fsstack_copy_attr_times(dir, lower_dir_dentry->d_inode);
@@ -478,7 +558,7 @@ static int ecryptfs_symlink(struct inode *dir, struct dentry *dentry,
kfree(encoded_symname);
if (rc || !lower_dentry->d_inode)
goto out_lock;
- rc = ecryptfs_interpose(lower_dentry, dentry, dir->i_sb, 0);
+ rc = ecryptfs_interpose(lower_dentry, dentry, dir->i_sb);
if (rc)
goto out_lock;
fsstack_copy_attr_times(dir, lower_dir_dentry->d_inode);
@@ -502,7 +582,7 @@ static int ecryptfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
rc = vfs_mkdir(lower_dir_dentry->d_inode, lower_dentry, mode);
if (rc || !lower_dentry->d_inode)
goto out;
- rc = ecryptfs_interpose(lower_dentry, dentry, dir->i_sb, 0);
+ rc = ecryptfs_interpose(lower_dentry, dentry, dir->i_sb);
if (rc)
goto out;
fsstack_copy_attr_times(dir, lower_dir_dentry->d_inode);
@@ -521,8 +601,6 @@ static int ecryptfs_rmdir(struct inode *dir, struct dentry *dentry)
struct dentry *lower_dir_dentry;
int rc;
- dentry_unhash(dentry);
-
lower_dentry = ecryptfs_dentry_to_lower(dentry);
dget(dentry);
lower_dir_dentry = lock_parent(lower_dentry);
@@ -552,7 +630,7 @@ ecryptfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev)
rc = vfs_mknod(lower_dir_dentry->d_inode, lower_dentry, mode, dev);
if (rc || !lower_dentry->d_inode)
goto out;
- rc = ecryptfs_interpose(lower_dentry, dentry, dir->i_sb, 0);
+ rc = ecryptfs_interpose(lower_dentry, dentry, dir->i_sb);
if (rc)
goto out;
fsstack_copy_attr_times(dir, lower_dir_dentry->d_inode);
@@ -575,9 +653,6 @@ ecryptfs_rename(struct inode *old_dir, struct dentry *old_dentry,
struct dentry *lower_new_dir_dentry;
struct dentry *trap = NULL;
- if (new_dentry->d_inode && S_ISDIR(new_dentry->d_inode->i_mode))
- dentry_unhash(new_dentry);
-
lower_old_dentry = ecryptfs_dentry_to_lower(old_dentry);
lower_new_dentry = ecryptfs_dentry_to_lower(new_dentry);
dget(lower_old_dentry);
@@ -755,7 +830,7 @@ static int truncate_upper(struct dentry *dentry, struct iattr *ia,
lower_ia->ia_valid &= ~ATTR_SIZE;
return 0;
}
- rc = ecryptfs_get_lower_file(dentry);
+ rc = ecryptfs_get_lower_file(dentry, inode);
if (rc)
return rc;
crypt_stat = &ecryptfs_inode_to_private(dentry->d_inode)->crypt_stat;
@@ -911,7 +986,7 @@ static int ecryptfs_setattr(struct dentry *dentry, struct iattr *ia)
mount_crypt_stat = &ecryptfs_superblock_to_private(
dentry->d_sb)->mount_crypt_stat;
- rc = ecryptfs_get_lower_file(dentry);
+ rc = ecryptfs_get_lower_file(dentry, inode);
if (rc) {
mutex_unlock(&crypt_stat->cs_mutex);
goto out;
@@ -1084,21 +1159,6 @@ out:
return rc;
}
-int ecryptfs_inode_test(struct inode *inode, void *candidate_lower_inode)
-{
- if ((ecryptfs_inode_to_lower(inode)
- == (struct inode *)candidate_lower_inode))
- return 1;
- else
- return 0;
-}
-
-int ecryptfs_inode_set(struct inode *inode, void *lower_inode)
-{
- ecryptfs_init_inode(inode, (struct inode *)lower_inode);
- return 0;
-}
-
const struct inode_operations ecryptfs_symlink_iops = {
.readlink = ecryptfs_readlink,
.follow_link = ecryptfs_follow_link,
diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c
index 89b93389af8..9f1bb747d77 100644
--- a/fs/ecryptfs/main.c
+++ b/fs/ecryptfs/main.c
@@ -135,12 +135,12 @@ static int ecryptfs_init_lower_file(struct dentry *dentry,
return rc;
}
-int ecryptfs_get_lower_file(struct dentry *dentry)
+int ecryptfs_get_lower_file(struct dentry *dentry, struct inode *inode)
{
- struct ecryptfs_inode_info *inode_info =
- ecryptfs_inode_to_private(dentry->d_inode);
+ struct ecryptfs_inode_info *inode_info;
int count, rc = 0;
+ inode_info = ecryptfs_inode_to_private(inode);
mutex_lock(&inode_info->lower_file_mutex);
count = atomic_inc_return(&inode_info->lower_file_count);
if (WARN_ON_ONCE(count < 1))
@@ -168,75 +168,6 @@ void ecryptfs_put_lower_file(struct inode *inode)
}
}
-static struct inode *ecryptfs_get_inode(struct inode *lower_inode,
- struct super_block *sb)
-{
- struct inode *inode;
- int rc = 0;
-
- if (lower_inode->i_sb != ecryptfs_superblock_to_lower(sb)) {
- rc = -EXDEV;
- goto out;
- }
- if (!igrab(lower_inode)) {
- rc = -ESTALE;
- goto out;
- }
- inode = iget5_locked(sb, (unsigned long)lower_inode,
- ecryptfs_inode_test, ecryptfs_inode_set,
- lower_inode);
- if (!inode) {
- rc = -EACCES;
- iput(lower_inode);
- goto out;
- }
- if (inode->i_state & I_NEW)
- unlock_new_inode(inode);
- else
- iput(lower_inode);
- if (S_ISLNK(lower_inode->i_mode))
- inode->i_op = &ecryptfs_symlink_iops;
- else if (S_ISDIR(lower_inode->i_mode))
- inode->i_op = &ecryptfs_dir_iops;
- if (S_ISDIR(lower_inode->i_mode))
- inode->i_fop = &ecryptfs_dir_fops;
- if (special_file(lower_inode->i_mode))
- init_special_inode(inode, lower_inode->i_mode,
- lower_inode->i_rdev);
- fsstack_copy_attr_all(inode, lower_inode);
- /* This size will be overwritten for real files w/ headers and
- * other metadata */
- fsstack_copy_inode_size(inode, lower_inode);
- return inode;
-out:
- return ERR_PTR(rc);
-}
-
-/**
- * ecryptfs_interpose
- * @lower_dentry: Existing dentry in the lower filesystem
- * @dentry: ecryptfs' dentry
- * @sb: ecryptfs's super_block
- * @flags: flags to govern behavior of interpose procedure
- *
- * Interposes upper and lower dentries.
- *
- * Returns zero on success; non-zero otherwise
- */
-int ecryptfs_interpose(struct dentry *lower_dentry, struct dentry *dentry,
- struct super_block *sb, u32 flags)
-{
- struct inode *lower_inode = lower_dentry->d_inode;
- struct inode *inode = ecryptfs_get_inode(lower_inode, sb);
- if (IS_ERR(inode))
- return PTR_ERR(inode);
- if (flags & ECRYPTFS_INTERPOSE_FLAG_D_ADD)
- d_add(dentry, inode);
- else
- d_instantiate(dentry, inode);
- return 0;
-}
-
enum { ecryptfs_opt_sig, ecryptfs_opt_ecryptfs_sig,
ecryptfs_opt_cipher, ecryptfs_opt_ecryptfs_cipher,
ecryptfs_opt_ecryptfs_key_bytes,
@@ -704,13 +635,8 @@ static struct ecryptfs_cache_info {
.size = sizeof(struct ecryptfs_sb_info),
},
{
- .cache = &ecryptfs_header_cache_1,
- .name = "ecryptfs_headers_1",
- .size = PAGE_CACHE_SIZE,
- },
- {
- .cache = &ecryptfs_header_cache_2,
- .name = "ecryptfs_headers_2",
+ .cache = &ecryptfs_header_cache,
+ .name = "ecryptfs_headers",
.size = PAGE_CACHE_SIZE,
},
{
diff --git a/fs/ecryptfs/super.c b/fs/ecryptfs/super.c
index 245b517bf1b..dbd52d40df4 100644
--- a/fs/ecryptfs/super.c
+++ b/fs/ecryptfs/super.c
@@ -93,22 +93,6 @@ static void ecryptfs_destroy_inode(struct inode *inode)
}
/**
- * ecryptfs_init_inode
- * @inode: The ecryptfs inode
- *
- * Set up the ecryptfs inode.
- */
-void ecryptfs_init_inode(struct inode *inode, struct inode *lower_inode)
-{
- ecryptfs_set_inode_lower(inode, lower_inode);
- inode->i_ino = lower_inode->i_ino;
- inode->i_version++;
- inode->i_op = &ecryptfs_main_iops;
- inode->i_fop = &ecryptfs_main_fops;
- inode->i_mapping->a_ops = &ecryptfs_aops;
-}
-
-/**
* ecryptfs_statfs
* @sb: The ecryptfs super block
* @buf: The struct kstatfs to fill in with stats
diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
index 68b2e43d7c3..3451d23c3ba 100644
--- a/fs/ext3/inode.c
+++ b/fs/ext3/inode.c
@@ -3392,7 +3392,7 @@ int ext3_mark_inode_dirty(handle_t *handle, struct inode *inode)
* so would cause a commit on atime updates, which we don't bother doing.
* We handle synchronous inodes at the highest possible level.
*/
-void ext3_dirty_inode(struct inode *inode)
+void ext3_dirty_inode(struct inode *inode, int flags)
{
handle_t *current_handle = ext3_journal_current_handle();
handle_t *handle;
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index a74b89c09f9..1921392cd70 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1813,7 +1813,7 @@ extern int ext4_getattr(struct vfsmount *mnt, struct dentry *dentry,
extern void ext4_evict_inode(struct inode *);
extern void ext4_clear_inode(struct inode *);
extern int ext4_sync_inode(handle_t *, struct inode *);
-extern void ext4_dirty_inode(struct inode *);
+extern void ext4_dirty_inode(struct inode *, int);
extern int ext4_change_inode_journal_flag(struct inode *, int);
extern int ext4_get_inode_loc(struct inode *, struct ext4_iloc *);
extern int ext4_can_truncate(struct inode *inode);
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 50d0e9c6458..a5763e3505b 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -5733,7 +5733,7 @@ int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode)
* so would cause a commit on atime updates, which we don't bother doing.
* We handle synchronous inodes at the highest possible level.
*/
-void ext4_dirty_inode(struct inode *inode)
+void ext4_dirty_inode(struct inode *inode, int flags)
{
handle_t *handle;
diff --git a/fs/fat/namei_msdos.c b/fs/fat/namei_msdos.c
index be15437c272..3b222dafd15 100644
--- a/fs/fat/namei_msdos.c
+++ b/fs/fat/namei_msdos.c
@@ -326,8 +326,6 @@ static int msdos_rmdir(struct inode *dir, struct dentry *dentry)
struct fat_slot_info sinfo;
int err;
- dentry_unhash(dentry);
-
lock_super(sb);
/*
* Check whether the directory is not in use, then check
@@ -459,9 +457,6 @@ static int do_msdos_rename(struct inode *old_dir, unsigned char *old_name,
old_inode = old_dentry->d_inode;
new_inode = new_dentry->d_inode;
- if (new_inode && S_ISDIR(new_inode->i_mode))
- dentry_unhash(new_dentry);
-
err = fat_scan(old_dir, old_name, &old_sinfo);
if (err) {
err = -EIO;
diff --git a/fs/fat/namei_vfat.c b/fs/fat/namei_vfat.c
index c61a6789f36..20b4ea53fdc 100644
--- a/fs/fat/namei_vfat.c
+++ b/fs/fat/namei_vfat.c
@@ -824,8 +824,6 @@ static int vfat_rmdir(struct inode *dir, struct dentry *dentry)
struct fat_slot_info sinfo;
int err;
- dentry_unhash(dentry);
-
lock_super(sb);
err = fat_dir_empty(inode);
@@ -933,9 +931,6 @@ static int vfat_rename(struct inode *old_dir, struct dentry *old_dentry,
int err, is_dir, update_dotdot, corrupt = 0;
struct super_block *sb = old_dir->i_sb;
- if (new_dentry->d_inode && S_ISDIR(new_dentry->d_inode->i_mode))
- dentry_unhash(new_dentry);
-
old_sinfo.bh = sinfo.bh = dotdot_bh = NULL;
old_inode = old_dentry->d_inode;
new_inode = new_dentry->d_inode;
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 34591ee804b..0f015a0468d 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -1007,9 +1007,6 @@ static noinline void block_dump___mark_inode_dirty(struct inode *inode)
* In short, make sure you hash any inodes _before_ you start marking
* them dirty.
*
- * This function *must* be atomic for the I_DIRTY_PAGES case -
- * set_page_dirty() is called under spinlock in several places.
- *
* Note that for blockdevs, inode->dirtied_when represents the dirtying time of
* the block-special inode (/dev/hda1) itself. And the ->dirtied_when field of
* the kernel-internal blockdev inode represents the dirtying time of the
@@ -1028,7 +1025,7 @@ void __mark_inode_dirty(struct inode *inode, int flags)
*/
if (flags & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) {
if (sb->s_op->dirty_inode)
- sb->s_op->dirty_inode(inode);
+ sb->s_op->dirty_inode(inode, flags);
}
/*
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index 0d0e3faddcf..d5016071459 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -667,8 +667,6 @@ static int fuse_rmdir(struct inode *dir, struct dentry *entry)
if (IS_ERR(req))
return PTR_ERR(req);
- dentry_unhash(entry);
-
req->in.h.opcode = FUSE_RMDIR;
req->in.h.nodeid = get_node_id(dir);
req->in.numargs = 1;
@@ -694,9 +692,6 @@ static int fuse_rename(struct inode *olddir, struct dentry *oldent,
struct fuse_conn *fc = get_fuse_conn(olddir);
struct fuse_req *req = fuse_get_req(fc);
- if (newent->d_inode && S_ISDIR(newent->d_inode->i_mode))
- dentry_unhash(newent);
-
if (IS_ERR(req))
return PTR_ERR(req);
diff --git a/fs/hfs/dir.c b/fs/hfs/dir.c
index 1cb70cdba2c..b4d70b13be9 100644
--- a/fs/hfs/dir.c
+++ b/fs/hfs/dir.c
@@ -253,9 +253,6 @@ static int hfs_remove(struct inode *dir, struct dentry *dentry)
struct inode *inode = dentry->d_inode;
int res;
- if (S_ISDIR(inode->i_mode))
- dentry_unhash(dentry);
-
if (S_ISDIR(inode->i_mode) && inode->i_size != 2)
return -ENOTEMPTY;
res = hfs_cat_delete(inode->i_ino, dir, &dentry->d_name);
@@ -286,9 +283,6 @@ static int hfs_rename(struct inode *old_dir, struct dentry *old_dentry,
/* Unlink destination if it already exists */
if (new_dentry->d_inode) {
- if (S_ISDIR(new_dentry->d_inode->i_mode))
- dentry_unhash(new_dentry);
-
res = hfs_remove(new_dir, new_dentry);
if (res)
return res;
diff --git a/fs/hfsplus/dir.c b/fs/hfsplus/dir.c
index b28835091dd..4df5059c25d 100644
--- a/fs/hfsplus/dir.c
+++ b/fs/hfsplus/dir.c
@@ -370,8 +370,6 @@ static int hfsplus_rmdir(struct inode *dir, struct dentry *dentry)
struct inode *inode = dentry->d_inode;
int res;
- dentry_unhash(dentry);
-
if (inode->i_size != 2)
return -ENOTEMPTY;
@@ -469,12 +467,10 @@ static int hfsplus_rename(struct inode *old_dir, struct dentry *old_dentry,
/* Unlink destination if it already exists */
if (new_dentry->d_inode) {
- if (S_ISDIR(new_dentry->d_inode->i_mode)) {
- dentry_unhash(new_dentry);
+ if (S_ISDIR(new_dentry->d_inode->i_mode))
res = hfsplus_rmdir(new_dir, new_dentry);
- } else {
+ else
res = hfsplus_unlink(new_dir, new_dentry);
- }
if (res)
return res;
}
diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c
index e6816b9e690..2638c834ed2 100644
--- a/fs/hostfs/hostfs_kern.c
+++ b/fs/hostfs/hostfs_kern.c
@@ -683,8 +683,6 @@ int hostfs_rmdir(struct inode *ino, struct dentry *dentry)
char *file;
int err;
- dentry_unhash(dentry);
-
if ((file = dentry_name(dentry)) == NULL)
return -ENOMEM;
err = do_rmdir(file);
@@ -738,9 +736,6 @@ int hostfs_rename(struct inode *from_ino, struct dentry *from,
char *from_name, *to_name;
int err;
- if (to->d_inode && S_ISDIR(to->d_inode->i_mode))
- dentry_unhash(to);
-
if ((from_name = dentry_name(from)) == NULL)
return -ENOMEM;
if ((to_name = dentry_name(to)) == NULL) {
diff --git a/fs/hpfs/namei.c b/fs/hpfs/namei.c
index ff0ce21c086..acf95dab2aa 100644
--- a/fs/hpfs/namei.c
+++ b/fs/hpfs/namei.c
@@ -439,8 +439,6 @@ static int hpfs_rmdir(struct inode *dir, struct dentry *dentry)
int err;
int r;
- dentry_unhash(dentry);
-
hpfs_adjust_length(name, &len);
hpfs_lock(dir->i_sb);
err = -ENOENT;
@@ -535,9 +533,6 @@ static int hpfs_rename(struct inode *old_dir, struct dentry *old_dentry,
struct fnode *fnode;
int err;
- if (new_inode && S_ISDIR(new_inode->i_mode))
- dentry_unhash(new_dentry);
-
if ((err = hpfs_chk_name(new_name, &new_len))) return err;
err = 0;
hpfs_adjust_length(old_name, &old_len);
diff --git a/fs/inode.c b/fs/inode.c
index 990d284877a..0f7e88a7803 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -1,9 +1,7 @@
/*
- * linux/fs/inode.c
- *
* (C) 1997 Linus Torvalds
+ * (C) 1999 Andrea Arcangeli <andrea@suse.de> (dynamic inode allocation)
*/
-
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/dcache.h>
@@ -27,10 +25,11 @@
#include <linux/prefetch.h>
#include <linux/ima.h>
#include <linux/cred.h>
+#include <linux/buffer_head.h> /* for inode_has_buffers */
#include "internal.h"
/*
- * inode locking rules.
+ * Inode locking rules:
*
* inode->i_lock protects:
* inode->i_state, inode->i_hash, __iget()
@@ -60,54 +59,11 @@
* inode_hash_lock
*/
-/*
- * This is needed for the following functions:
- * - inode_has_buffers
- * - invalidate_bdev
- *
- * FIXME: remove all knowledge of the buffer layer from this file
- */
-#include <linux/buffer_head.h>
-
-/*
- * New inode.c implementation.
- *
- * This implementation has the basic premise of trying
- * to be extremely low-overhead and SMP-safe, yet be
- * simple enough to be "obviously correct".
- *
- * Famous last words.
- */
-
-/* inode dynamic allocation 1999, Andrea Arcangeli <andrea@suse.de> */
-
-/* #define INODE_PARANOIA 1 */
-/* #define INODE_DEBUG 1 */
-
-/*
- * Inode lookup is no longer as critical as it used to be:
- * most of the lookups are going to be through the dcache.
- */
-#define I_HASHBITS i_hash_shift
-#define I_HASHMASK i_hash_mask
-
static unsigned int i_hash_mask __read_mostly;
static unsigned int i_hash_shift __read_mostly;
static struct hlist_head *inode_hashtable __read_mostly;
static __cacheline_aligned_in_smp DEFINE_SPINLOCK(inode_hash_lock);
-/*
- * Each inode can be on two separate lists. One is
- * the hash list of the inode, used for lookups. The
- * other linked list is the "type" list:
- * "in_use" - valid inode, i_count > 0, i_nlink > 0
- * "dirty" - as "in_use" but also dirty
- * "unused" - valid inode, i_count = 0
- *
- * A "dirty" list is maintained for each super block,
- * allowing for low-overhead inode sync() operations.
- */
-
static LIST_HEAD(inode_lru);
static DEFINE_SPINLOCK(inode_lru_lock);
@@ -424,8 +380,8 @@ static unsigned long hash(struct super_block *sb, unsigned long hashval)
tmp = (hashval * (unsigned long)sb) ^ (GOLDEN_RATIO_PRIME + hashval) /
L1_CACHE_BYTES;
- tmp = tmp ^ ((tmp ^ GOLDEN_RATIO_PRIME) >> I_HASHBITS);
- return tmp & I_HASHMASK;
+ tmp = tmp ^ ((tmp ^ GOLDEN_RATIO_PRIME) >> i_hash_shift);
+ return tmp & i_hash_mask;
}
/**
diff --git a/fs/jffs2/dir.c b/fs/jffs2/dir.c
index 9a1e86fc136..4bca6a2e5c0 100644
--- a/fs/jffs2/dir.c
+++ b/fs/jffs2/dir.c
@@ -605,8 +605,6 @@ static int jffs2_rmdir (struct inode *dir_i, struct dentry *dentry)
int ret;
uint32_t now = get_seconds();
- dentry_unhash(dentry);
-
for (fd = f->dents ; fd; fd = fd->next) {
if (fd->ino)
return -ENOTEMPTY;
@@ -782,9 +780,6 @@ static int jffs2_rename (struct inode *old_dir_i, struct dentry *old_dentry,
uint8_t type;
uint32_t now;
- if (new_dentry->d_inode && S_ISDIR(new_dentry->d_inode->i_mode))
- dentry_unhash(new_dentry);
-
/* The VFS will check for us and prevent trying to rename a
* file over a directory and vice versa, but if it's a directory,
* the VFS can't check whether the victim is empty. The filesystem
diff --git a/fs/jffs2/fs.c b/fs/jffs2/fs.c
index e896e67767e..46ad619b612 100644
--- a/fs/jffs2/fs.c
+++ b/fs/jffs2/fs.c
@@ -357,7 +357,7 @@ error:
return ERR_PTR(ret);
}
-void jffs2_dirty_inode(struct inode *inode)
+void jffs2_dirty_inode(struct inode *inode, int flags)
{
struct iattr iattr;
diff --git a/fs/jffs2/os-linux.h b/fs/jffs2/os-linux.h
index 00bae7cc2e4..65c6c43ca48 100644
--- a/fs/jffs2/os-linux.h
+++ b/fs/jffs2/os-linux.h
@@ -172,7 +172,7 @@ int jffs2_setattr (struct dentry *, struct iattr *);
int jffs2_do_setattr (struct inode *, struct iattr *);
struct inode *jffs2_iget(struct super_block *, unsigned long);
void jffs2_evict_inode (struct inode *);
-void jffs2_dirty_inode(struct inode *inode);
+void jffs2_dirty_inode(struct inode *inode, int flags);
struct inode *jffs2_new_inode (struct inode *dir_i, int mode,
struct jffs2_raw_inode *ri);
int jffs2_statfs (struct dentry *, struct kstatfs *);
diff --git a/fs/jfs/inode.c b/fs/jfs/inode.c
index eddbb373209..109655904bb 100644
--- a/fs/jfs/inode.c
+++ b/fs/jfs/inode.c
@@ -173,7 +173,7 @@ void jfs_evict_inode(struct inode *inode)
dquot_drop(inode);
}
-void jfs_dirty_inode(struct inode *inode)
+void jfs_dirty_inode(struct inode *inode, int flags)
{
static int noisy = 5;
diff --git a/fs/jfs/jfs_inode.h b/fs/jfs/jfs_inode.h
index 155e91eff07..ec2fb8b945f 100644
--- a/fs/jfs/jfs_inode.h
+++ b/fs/jfs/jfs_inode.h
@@ -28,7 +28,7 @@ extern struct inode *jfs_iget(struct super_block *, unsigned long);
extern int jfs_commit_inode(struct inode *, int);
extern int jfs_write_inode(struct inode *, struct writeback_control *);
extern void jfs_evict_inode(struct inode *);
-extern void jfs_dirty_inode(struct inode *);
+extern void jfs_dirty_inode(struct inode *, int);
extern void jfs_truncate(struct inode *);
extern void jfs_truncate_nolock(struct inode *, loff_t);
extern void jfs_free_zero_link(struct inode *);
diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c
index 865df16a6cf..eaaf2b511e8 100644
--- a/fs/jfs/namei.c
+++ b/fs/jfs/namei.c
@@ -360,8 +360,6 @@ static int jfs_rmdir(struct inode *dip, struct dentry *dentry)
jfs_info("jfs_rmdir: dip:0x%p name:%s", dip, dentry->d_name.name);
- dentry_unhash(dentry);
-
/* Init inode for quota operations. */
dquot_initialize(dip);
dquot_initialize(ip);
@@ -1097,9 +1095,6 @@ static int jfs_rename(struct inode *old_dir, struct dentry *old_dentry,
jfs_info("jfs_rename: %s %s", old_dentry->d_name.name,
new_dentry->d_name.name);
- if (new_dentry->d_inode && S_ISDIR(new_dentry->d_inode->i_mode))
- dentry_unhash(new_dentry);
-
dquot_initialize(old_dir);
dquot_initialize(new_dir);
diff --git a/fs/logfs/dir.c b/fs/logfs/dir.c
index f34c9cde9e9..9ed89d1663f 100644
--- a/fs/logfs/dir.c
+++ b/fs/logfs/dir.c
@@ -273,8 +273,6 @@ static int logfs_rmdir(struct inode *dir, struct dentry *dentry)
{
struct inode *inode = dentry->d_inode;
- dentry_unhash(dentry);
-
if (!logfs_empty_dir(inode))
return -ENOTEMPTY;
@@ -624,9 +622,6 @@ static int logfs_rename_cross(struct inode *old_dir, struct dentry *old_dentry,
loff_t pos;
int err;
- if (new_dentry->d_inode && S_ISDIR(new_dentry->d_inode->i_mode))
- dentry_unhash(new_dentry);
-
/* 1. locate source dd */
err = logfs_get_dd(old_dir, old_dentry, &dd, &pos);
if (err)
diff --git a/fs/minix/namei.c b/fs/minix/namei.c
index f60aed8db9c..6e6777f1b4b 100644
--- a/fs/minix/namei.c
+++ b/fs/minix/namei.c
@@ -168,8 +168,6 @@ static int minix_rmdir(struct inode * dir, struct dentry *dentry)
struct inode * inode = dentry->d_inode;
int err = -ENOTEMPTY;
- dentry_unhash(dentry);
-
if (minix_empty_dir(inode)) {
err = minix_unlink(dir, dentry);
if (!err) {
@@ -192,9 +190,6 @@ static int minix_rename(struct inode * old_dir, struct dentry *old_dentry,
struct minix_dir_entry * old_de;
int err = -ENOENT;
- if (new_inode && S_ISDIR(new_inode->i_mode))
- dentry_unhash(new_dentry);
-
old_de = minix_find_entry(old_dentry, &old_page);
if (!old_de)
goto out;
diff --git a/fs/namei.c b/fs/namei.c
index 2358b326b22..e2e4e8d032e 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -919,12 +919,11 @@ static inline bool managed_dentry_might_block(struct dentry *dentry)
}
/*
- * Skip to top of mountpoint pile in rcuwalk mode. We abort the rcu-walk if we
- * meet a managed dentry and we're not walking to "..". True is returned to
- * continue, false to abort.
+ * Try to skip to top of mountpoint pile in rcuwalk mode. Fail if
+ * we meet a managed dentry that would need blocking.
*/
static bool __follow_mount_rcu(struct nameidata *nd, struct path *path,
- struct inode **inode, bool reverse_transit)
+ struct inode **inode)
{
for (;;) {
struct vfsmount *mounted;
@@ -933,8 +932,7 @@ static bool __follow_mount_rcu(struct nameidata *nd, struct path *path,
* that wants to block transit.
*/
*inode = path->dentry->d_inode;
- if (!reverse_transit &&
- unlikely(managed_dentry_might_block(path->dentry)))
+ if (unlikely(managed_dentry_might_block(path->dentry)))
return false;
if (!d_mountpoint(path->dentry))
@@ -947,16 +945,24 @@ static bool __follow_mount_rcu(struct nameidata *nd, struct path *path,
path->dentry = mounted->mnt_root;
nd->seq = read_seqcount_begin(&path->dentry->d_seq);
}
-
- if (unlikely(path->dentry->d_flags & DCACHE_NEED_AUTOMOUNT))
- return reverse_transit;
return true;
}
-static int follow_dotdot_rcu(struct nameidata *nd)
+static void follow_mount_rcu(struct nameidata *nd)
{
- struct inode *inode = nd->inode;
+ while (d_mountpoint(nd->path.dentry)) {
+ struct vfsmount *mounted;
+ mounted = __lookup_mnt(nd->path.mnt, nd->path.dentry, 1);
+ if (!mounted)
+ break;
+ nd->path.mnt = mounted;
+ nd->path.dentry = mounted->mnt_root;
+ nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq);
+ }
+}
+static int follow_dotdot_rcu(struct nameidata *nd)
+{
set_root_rcu(nd);
while (1) {
@@ -972,7 +978,6 @@ static int follow_dotdot_rcu(struct nameidata *nd)
seq = read_seqcount_begin(&parent->d_seq);
if (read_seqcount_retry(&old->d_seq, nd->seq))
goto failed;
- inode = parent->d_inode;
nd->path.dentry = parent;
nd->seq = seq;
break;
@@ -980,10 +985,9 @@ static int follow_dotdot_rcu(struct nameidata *nd)
if (!follow_up_rcu(&nd->path))
break;
nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq);
- inode = nd->path.dentry->d_inode;
}
- __follow_mount_rcu(nd, &nd->path, &inode, true);
- nd->inode = inode;
+ follow_mount_rcu(nd);
+ nd->inode = nd->path.dentry->d_inode;
return 0;
failed:
@@ -1157,8 +1161,11 @@ static int do_lookup(struct nameidata *nd, struct qstr *name,
}
path->mnt = mnt;
path->dentry = dentry;
- if (likely(__follow_mount_rcu(nd, path, inode, false)))
- return 0;
+ if (unlikely(!__follow_mount_rcu(nd, path, inode)))
+ goto unlazy;
+ if (unlikely(path->dentry->d_flags & DCACHE_NEED_AUTOMOUNT))
+ goto unlazy;
+ return 0;
unlazy:
if (unlazy_walk(nd, dentry))
return -ECHILD;
@@ -2572,6 +2579,7 @@ int vfs_rmdir(struct inode *dir, struct dentry *dentry)
if (error)
goto out;
+ shrink_dcache_parent(dentry);
error = dir->i_op->rmdir(dir, dentry);
if (error)
goto out;
@@ -2986,6 +2994,8 @@ static int vfs_rename_dir(struct inode *old_dir, struct dentry *old_dentry,
if (d_mountpoint(old_dentry) || d_mountpoint(new_dentry))
goto out;
+ if (target)
+ shrink_dcache_parent(new_dentry);
error = old_dir->i_op->rename(old_dir, old_dentry, new_dir, new_dentry);
if (error)
goto out;
diff --git a/fs/ncpfs/dir.c b/fs/ncpfs/dir.c
index e3e646b0640..9c51f621e90 100644
--- a/fs/ncpfs/dir.c
+++ b/fs/ncpfs/dir.c
@@ -1033,8 +1033,11 @@ static int ncp_rmdir(struct inode *dir, struct dentry *dentry)
DPRINTK("ncp_rmdir: removing %s/%s\n",
dentry->d_parent->d_name.name, dentry->d_name.name);
+ /*
+ * fail with EBUSY if there are still references to this
+ * directory.
+ */
dentry_unhash(dentry);
-
error = -EBUSY;
if (!d_unhashed(dentry))
goto out;
@@ -1141,8 +1144,16 @@ static int ncp_rename(struct inode *old_dir, struct dentry *old_dentry,
old_dentry->d_parent->d_name.name, old_dentry->d_name.name,
new_dentry->d_parent->d_name.name, new_dentry->d_name.name);
- if (new_dentry->d_inode && S_ISDIR(new_dentry->d_inode->i_mode))
+ if (new_dentry->d_inode && S_ISDIR(new_dentry->d_inode->i_mode)) {
+ /*
+ * fail with EBUSY if there are still references to this
+ * directory.
+ */
dentry_unhash(new_dentry);
+ error = -EBUSY;
+ if (!d_unhashed(new_dentry))
+ goto out;
+ }
ncp_age_dentry(server, old_dentry);
ncp_age_dentry(server, new_dentry);
diff --git a/fs/nfs/Kconfig b/fs/nfs/Kconfig
index ba306658a6d..81515545ba7 100644
--- a/fs/nfs/Kconfig
+++ b/fs/nfs/Kconfig
@@ -87,6 +87,16 @@ config NFS_V4_1
config PNFS_FILE_LAYOUT
tristate
+config PNFS_OBJLAYOUT
+ tristate "Provide support for the pNFS Objects Layout Driver for NFSv4.1 pNFS (EXPERIMENTAL)"
+ depends on NFS_FS && NFS_V4_1 && SCSI_OSD_ULD
+ help
+ Say M here if you want your pNFS client to support the Objects Layout Driver.
+ Requires the SCSI osd initiator library (SCSI_OSD_INITIATOR) and
+ upper level driver (SCSI_OSD_ULD).
+
+ If unsure, say N.
+
config ROOT_NFS
bool "Root file system on NFS"
depends on NFS_FS=y && IP_PNP
diff --git a/fs/nfs/Makefile b/fs/nfs/Makefile
index 4776ff9e381..6a34f7dd0e6 100644
--- a/fs/nfs/Makefile
+++ b/fs/nfs/Makefile
@@ -15,9 +15,11 @@ nfs-$(CONFIG_NFS_V4) += nfs4proc.o nfs4xdr.o nfs4state.o nfs4renewd.o \
delegation.o idmap.o \
callback.o callback_xdr.o callback_proc.o \
nfs4namespace.o
-nfs-$(CONFIG_NFS_V4_1) += pnfs.o
+nfs-$(CONFIG_NFS_V4_1) += pnfs.o pnfs_dev.o
nfs-$(CONFIG_SYSCTL) += sysctl.o
nfs-$(CONFIG_NFS_FSCACHE) += fscache.o fscache-index.o
obj-$(CONFIG_PNFS_FILE_LAYOUT) += nfs_layout_nfsv41_files.o
nfs_layout_nfsv41_files-y := nfs4filelayout.o nfs4filelayoutdev.o
+
+obj-$(CONFIG_PNFS_OBJLAYOUT) += objlayout/
diff --git a/fs/nfs/callback.h b/fs/nfs/callback.h
index 46d93ce7311..b257383bb56 100644
--- a/fs/nfs/callback.h
+++ b/fs/nfs/callback.h
@@ -167,6 +167,23 @@ extern unsigned nfs4_callback_layoutrecall(
extern void nfs4_check_drain_bc_complete(struct nfs4_session *ses);
extern void nfs4_cb_take_slot(struct nfs_client *clp);
+
+struct cb_devicenotifyitem { /* one decoded CB_NOTIFY_DEVICEID notification */
+ uint32_t cbd_notify_type; /* NOTIFY_DEVICEID4_CHANGE or NOTIFY_DEVICEID4_DELETE */
+ uint32_t cbd_layout_type; /* compared against a layout driver's ->id */
+ struct nfs4_deviceid cbd_dev_id; /* device id handed to nfs4_delete_deviceid() */
+ uint32_t cbd_immediate; /* "immediate" word from the wire when the decoder reads one, else 0 */
+};
+
+struct cb_devicenotifyargs { /* argument block shared by the decoder and nfs4_callback_devicenotify() */
+ int ndevs; /* number of entries decoded into devs */
+ struct cb_devicenotifyitem *devs; /* array allocated by decode_devicenotify_args(), kfree()d by nfs4_callback_devicenotify() */
+};
+
+extern __be32 nfs4_callback_devicenotify(
+ struct cb_devicenotifyargs *args,
+ void *dummy, struct cb_process_state *cps);
+
#endif /* CONFIG_NFS_V4_1 */
extern int check_gss_callback_principal(struct nfs_client *, struct svc_rqst *);
extern __be32 nfs4_callback_getattr(struct cb_getattrargs *args,
diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c
index 2f41dccea18..d4d1954e9bb 100644
--- a/fs/nfs/callback_proc.c
+++ b/fs/nfs/callback_proc.c
@@ -139,7 +139,7 @@ static u32 initiate_file_draining(struct nfs_client *clp,
spin_lock(&ino->i_lock);
if (test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags) ||
mark_matching_lsegs_invalid(lo, &free_me_list,
- args->cbl_range.iomode))
+ &args->cbl_range))
rv = NFS4ERR_DELAY;
else
rv = NFS4ERR_NOMATCHING_LAYOUT;
@@ -184,7 +184,7 @@ static u32 initiate_bulk_draining(struct nfs_client *clp,
ino = lo->plh_inode;
spin_lock(&ino->i_lock);
set_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags);
- if (mark_matching_lsegs_invalid(lo, &free_me_list, range.iomode))
+ if (mark_matching_lsegs_invalid(lo, &free_me_list, &range))
rv = NFS4ERR_DELAY;
list_del_init(&lo->plh_bulk_recall);
spin_unlock(&ino->i_lock);
@@ -241,6 +241,53 @@ static void pnfs_recall_all_layouts(struct nfs_client *clp)
do_callback_layoutrecall(clp, &args);
}
+/*
+ * Process a CB_NOTIFY_DEVICEID callback.
+ *
+ * @args:  decoded notifications; args->devs is always freed here
+ * @dummy: unused result slot
+ * @cps:   callback processing state; cps->clp is the client the
+ *         callback was addressed to
+ *
+ * For every notification, look for a server on clp->cl_superblocks whose
+ * current layout driver id equals the notification's layout type and pass
+ * the device id to nfs4_delete_deviceid().  Notifications whose layout
+ * type matches no server are skipped.  Change notifications are handled
+ * as deletions.
+ *
+ * Returns 0, or NFS4ERR_OP_NOT_IN_SESSION (big endian) when cps->clp is
+ * NULL.
+ */
+__be32 nfs4_callback_devicenotify(struct cb_devicenotifyargs *args,
+				  void *dummy, struct cb_process_state *cps)
+{
+	int i;
+	__be32 res = 0;
+	struct nfs_client *clp = cps->clp;
+	struct nfs_server *server = NULL;
+
+	dprintk("%s: -->\n", __func__);
+
+	if (!clp) {
+		res = cpu_to_be32(NFS4ERR_OP_NOT_IN_SESSION);
+		goto out;
+	}
+
+	for (i = 0; i < args->ndevs; i++) {
+		struct cb_devicenotifyitem *dev = &args->devs[i];
+
+		/* Reuse the previous match while the layout type repeats. */
+		if (!server ||
+		    server->pnfs_curr_ld->id != dev->cbd_layout_type) {
+			struct nfs_server *pos;
+
+			/*
+			 * Walk with a separate cursor.  After an unsuccessful
+			 * walk the cursor is not NULL but aliases the list
+			 * head, so it must never be kept as the cached match.
+			 */
+			server = NULL;
+			rcu_read_lock();
+			list_for_each_entry_rcu(pos, &clp->cl_superblocks,
+						client_link) {
+				if (pos->pnfs_curr_ld &&
+				    pos->pnfs_curr_ld->id ==
+				    dev->cbd_layout_type) {
+					server = pos;
+					break;
+				}
+			}
+			rcu_read_unlock();
+
+			if (!server) {
+				dprintk("%s: layout type %u not found\n",
+					__func__, dev->cbd_layout_type);
+				continue;
+			}
+		}
+
+		if (dev->cbd_notify_type == NOTIFY_DEVICEID4_CHANGE)
+			dprintk("%s: NOTIFY_DEVICEID4_CHANGE not supported, "
+				"deleting instead\n", __func__);
+		nfs4_delete_deviceid(server->pnfs_curr_ld, clp, &dev->cbd_dev_id);
+	}
+
+out:
+	kfree(args->devs);
+	dprintk("%s: exit with status = %u\n",
+		__func__, be32_to_cpu(res));
+	return res;
+}
+
int nfs41_validate_delegation_stateid(struct nfs_delegation *delegation, const nfs4_stateid *stateid)
{
if (delegation == NULL)
diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c
index 00ecf62ce7c..c6c86a77e04 100644
--- a/fs/nfs/callback_xdr.c
+++ b/fs/nfs/callback_xdr.c
@@ -25,6 +25,7 @@
#if defined(CONFIG_NFS_V4_1)
#define CB_OP_LAYOUTRECALL_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ)
+#define CB_OP_DEVICENOTIFY_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ)
#define CB_OP_SEQUENCE_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ + \
4 + 1 + 3)
#define CB_OP_RECALLANY_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ)
@@ -284,6 +285,93 @@ out:
return status;
}
+/*
+ * Decode the arguments of a CB_NOTIFY_DEVICEID callback.
+ *
+ * @rqstp: the callback request (unused here)
+ * @xdr:   stream positioned at the start of the operation's arguments
+ * @args:  filled in with the decoded notifications
+ *
+ * On success returns 0 with args->devs pointing at an array of
+ * args->ndevs entries (NULL/0 when the count on the wire is not
+ * positive); nfs4_callback_devicenotify() frees the array.
+ * On failure returns an NFS4ERR_* status in network byte order and
+ * leaves args->devs NULL and args->ndevs 0.
+ */
+static
+__be32 decode_devicenotify_args(struct svc_rqst *rqstp,
+				struct xdr_stream *xdr,
+				struct cb_devicenotifyargs *args)
+{
+	__be32 *p;
+	__be32 status = 0;
+	u32 tmp;
+	int n, i;
+
+	args->ndevs = 0;
+	/* The processing routine kfree()s this unconditionally. */
+	args->devs = NULL;
+
+	/* Num of device notifications */
+	p = read_buf(xdr, sizeof(uint32_t));
+	if (unlikely(p == NULL)) {
+		status = htonl(NFS4ERR_BADXDR);
+		goto out;
+	}
+	n = ntohl(*p++);
+	if (n <= 0)
+		goto out;
+
+	/* kcalloc() refuses a count whose total size would overflow. */
+	args->devs = kcalloc(n, sizeof(*args->devs), GFP_KERNEL);
+	if (!args->devs) {
+		status = htonl(NFS4ERR_DELAY);
+		goto out;
+	}
+
+	/* Decode each dev notification */
+	for (i = 0; i < n; i++) {
+		struct cb_devicenotifyitem *dev = &args->devs[i];
+
+		p = read_buf(xdr, (4 * sizeof(uint32_t)) + NFS4_DEVICEID4_SIZE);
+		if (unlikely(p == NULL)) {
+			status = htonl(NFS4ERR_BADXDR);
+			goto err;
+		}
+
+		tmp = ntohl(*p++);	/* bitmap size */
+		if (tmp != 1) {
+			status = htonl(NFS4ERR_INVAL);
+			goto err;
+		}
+		dev->cbd_notify_type = ntohl(*p++);
+		if (dev->cbd_notify_type != NOTIFY_DEVICEID4_CHANGE &&
+		    dev->cbd_notify_type != NOTIFY_DEVICEID4_DELETE) {
+			status = htonl(NFS4ERR_INVAL);
+			goto err;
+		}
+
+		tmp = ntohl(*p++);	/* opaque size */
+		if (((dev->cbd_notify_type == NOTIFY_DEVICEID4_CHANGE) &&
+		     (tmp != NFS4_DEVICEID4_SIZE + 8)) ||
+		    ((dev->cbd_notify_type == NOTIFY_DEVICEID4_DELETE) &&
+		     (tmp != NFS4_DEVICEID4_SIZE + 4))) {
+			status = htonl(NFS4ERR_INVAL);
+			goto err;
+		}
+		dev->cbd_layout_type = ntohl(*p++);
+		memcpy(dev->cbd_dev_id.data, p, NFS4_DEVICEID4_SIZE);
+		p += XDR_QUADLEN(NFS4_DEVICEID4_SIZE);
+
+		/*
+		 * Only a change notification carries the trailing
+		 * "immediate" word (the 4 extra opaque bytes checked
+		 * above), so key on the notify type, not the layout type.
+		 */
+		if (dev->cbd_notify_type == NOTIFY_DEVICEID4_CHANGE) {
+			p = read_buf(xdr, sizeof(uint32_t));
+			if (unlikely(p == NULL)) {
+				status = htonl(NFS4ERR_BADXDR);
+				goto err;
+			}
+			dev->cbd_immediate = ntohl(*p++);
+		} else {
+			dev->cbd_immediate = 0;
+		}
+
+		args->ndevs++;
+
+		dprintk("%s: type %d layout 0x%x immediate %d\n",
+			__func__, dev->cbd_notify_type, dev->cbd_layout_type,
+			dev->cbd_immediate);
+	}
+out:
+	dprintk("%s: status %d ndevs %d\n",
+		__func__, ntohl(status), args->ndevs);
+	return status;
+err:
+	/* Drop the partial array and leave no dangling pointer behind. */
+	kfree(args->devs);
+	args->devs = NULL;
+	args->ndevs = 0;
+	goto out;
+}
+
static __be32 decode_sessionid(struct xdr_stream *xdr,
struct nfs4_sessionid *sid)
{
@@ -639,10 +727,10 @@ preprocess_nfs41_op(int nop, unsigned int op_nr, struct callback_op **op)
case OP_CB_RECALL_ANY:
case OP_CB_RECALL_SLOT:
case OP_CB_LAYOUTRECALL:
+ case OP_CB_NOTIFY_DEVICEID:
*op = &callback_ops[op_nr];
break;
- case OP_CB_NOTIFY_DEVICEID:
case OP_CB_NOTIFY:
case OP_CB_PUSH_DELEG:
case OP_CB_RECALLABLE_OBJ_AVAIL:
@@ -849,6 +937,12 @@ static struct callback_op callback_ops[] = {
(callback_decode_arg_t)decode_layoutrecall_args,
.res_maxsize = CB_OP_LAYOUTRECALL_RES_MAXSZ,
},
+ [OP_CB_NOTIFY_DEVICEID] = {
+ .process_op = (callback_process_op_t)nfs4_callback_devicenotify,
+ .decode_args =
+ (callback_decode_arg_t)decode_devicenotify_args,
+ .res_maxsize = CB_OP_DEVICENOTIFY_RES_MAXSZ,
+ },
[OP_CB_SEQUENCE] = {
.process_op = (callback_process_op_t)nfs4_callback_sequence,
.decode_args = (callback_decode_arg_t)decode_cb_sequence_args,
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index 139be9647d8..b3dc2b88b65 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -290,6 +290,8 @@ static void nfs_free_client(struct nfs_client *clp)
if (clp->cl_machine_cred != NULL)
put_rpccred(clp->cl_machine_cred);
+ nfs4_deviceid_purge_client(clp);
+
kfree(clp->cl_hostname);
kfree(clp);
diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c
index bbbc6bf5cb2..dd25c2aec37 100644
--- a/fs/nfs/delegation.c
+++ b/fs/nfs/delegation.c
@@ -21,25 +21,13 @@
#include "delegation.h"
#include "internal.h"
-static void nfs_do_free_delegation(struct nfs_delegation *delegation)
-{
- kfree(delegation);
-}
-
-static void nfs_free_delegation_callback(struct rcu_head *head)
-{
- struct nfs_delegation *delegation = container_of(head, struct nfs_delegation, rcu);
-
- nfs_do_free_delegation(delegation);
-}
-
static void nfs_free_delegation(struct nfs_delegation *delegation)
{
if (delegation->cred) {
put_rpccred(delegation->cred);
delegation->cred = NULL;
}
- call_rcu(&delegation->rcu, nfs_free_delegation_callback);
+ kfree_rcu(delegation, rcu);
}
/**
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 424e47773a8..ededdbd0db3 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -512,12 +512,7 @@ int nfs_readdir_page_filler(nfs_readdir_descriptor_t *desc, struct nfs_entry *en
struct page **xdr_pages, struct page *page, unsigned int buflen)
{
struct xdr_stream stream;
- struct xdr_buf buf = {
- .pages = xdr_pages,
- .page_len = buflen,
- .buflen = buflen,
- .len = buflen,
- };
+ struct xdr_buf buf;
struct page *scratch;
struct nfs_cache_array *array;
unsigned int count = 0;
@@ -527,7 +522,7 @@ int nfs_readdir_page_filler(nfs_readdir_descriptor_t *desc, struct nfs_entry *en
if (scratch == NULL)
return -ENOMEM;
- xdr_init_decode(&stream, &buf, NULL);
+ xdr_init_decode_pages(&stream, &buf, xdr_pages, buflen);
xdr_set_scratch_buffer(&stream, page_address(scratch), PAGE_SIZE);
do {
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 57bb31ad7a5..144f2a3c718 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -1298,8 +1298,12 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
i_size_write(inode, new_isize);
invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA;
}
- dprintk("NFS: isize change on server for file %s/%ld\n",
- inode->i_sb->s_id, inode->i_ino);
+ dprintk("NFS: isize change on server for file %s/%ld "
+ "(%Ld to %Ld)\n",
+ inode->i_sb->s_id,
+ inode->i_ino,
+ (long long)cur_isize,
+ (long long)new_isize);
}
} else
invalid |= save_cache_validity & (NFS_INO_INVALID_ATTR
@@ -1424,9 +1428,10 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
*/
void nfs4_evict_inode(struct inode *inode)
{
- pnfs_destroy_layout(NFS_I(inode));
truncate_inode_pages(&inode->i_data, 0);
end_writeback(inode);
+ pnfs_return_layout(inode);
+ pnfs_destroy_layout(NFS_I(inode));
/* If we are holding a delegation, return it! */
nfs_inode_return_delegation_noreclaim(inode);
/* First call standard NFS clear_inode() code */
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 2df6ca7b589..b9056cbe68d 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -310,6 +310,7 @@ extern int nfs_migrate_page(struct address_space *,
#endif
/* nfs4proc.c */
+extern void __nfs4_read_done_cb(struct nfs_read_data *);
extern void nfs4_reset_read(struct rpc_task *task, struct nfs_read_data *data);
extern int nfs4_init_client(struct nfs_client *clp,
const struct rpc_timeout *timeparms,
diff --git a/fs/nfs/nfs4filelayout.c b/fs/nfs/nfs4filelayout.c
index be79dc9f386..426908809c9 100644
--- a/fs/nfs/nfs4filelayout.c
+++ b/fs/nfs/nfs4filelayout.c
@@ -421,6 +421,7 @@ filelayout_check_layout(struct pnfs_layout_hdr *lo,
struct nfs4_deviceid *id,
gfp_t gfp_flags)
{
+ struct nfs4_deviceid_node *d;
struct nfs4_file_layout_dsaddr *dsaddr;
int status = -EINVAL;
struct nfs_server *nfss = NFS_SERVER(lo->plh_inode);
@@ -428,7 +429,7 @@ filelayout_check_layout(struct pnfs_layout_hdr *lo,
dprintk("--> %s\n", __func__);
if (fl->pattern_offset > lgr->range.offset) {
- dprintk("%s pattern_offset %lld to large\n",
+ dprintk("%s pattern_offset %lld too large\n",
__func__, fl->pattern_offset);
goto out;
}
@@ -440,12 +441,14 @@ filelayout_check_layout(struct pnfs_layout_hdr *lo,
}
/* find and reference the deviceid */
- dsaddr = nfs4_fl_find_get_deviceid(id);
- if (dsaddr == NULL) {
+ d = nfs4_find_get_deviceid(NFS_SERVER(lo->plh_inode)->pnfs_curr_ld,
+ NFS_SERVER(lo->plh_inode)->nfs_client, id);
+ if (d == NULL) {
dsaddr = get_device_info(lo->plh_inode, id, gfp_flags);
if (dsaddr == NULL)
goto out;
- }
+ } else
+ dsaddr = container_of(d, struct nfs4_file_layout_dsaddr, id_node);
fl->dsaddr = dsaddr;
if (fl->first_stripe_index < 0 ||
@@ -507,12 +510,7 @@ filelayout_decode_layout(struct pnfs_layout_hdr *flo,
gfp_t gfp_flags)
{
struct xdr_stream stream;
- struct xdr_buf buf = {
- .pages = lgr->layoutp->pages,
- .page_len = lgr->layoutp->len,
- .buflen = lgr->layoutp->len,
- .len = lgr->layoutp->len,
- };
+ struct xdr_buf buf;
struct page *scratch;
__be32 *p;
uint32_t nfl_util;
@@ -524,7 +522,7 @@ filelayout_decode_layout(struct pnfs_layout_hdr *flo,
if (!scratch)
return -ENOMEM;
- xdr_init_decode(&stream, &buf, NULL);
+ xdr_init_decode_pages(&stream, &buf, lgr->layoutp->pages, lgr->layoutp->len);
xdr_set_scratch_buffer(&stream, page_address(scratch), PAGE_SIZE);
/* 20 = ufl_util (4), first_stripe_index (4), pattern_offset (8),
@@ -535,7 +533,7 @@ filelayout_decode_layout(struct pnfs_layout_hdr *flo,
memcpy(id, p, sizeof(*id));
p += XDR_QUADLEN(NFS4_DEVICEID4_SIZE);
- print_deviceid(id);
+ nfs4_print_deviceid(id);
nfl_util = be32_to_cpup(p++);
if (nfl_util & NFL4_UFLG_COMMIT_THRU_MDS)
@@ -653,16 +651,19 @@ filelayout_alloc_lseg(struct pnfs_layout_hdr *layoutid,
/*
* filelayout_pg_test(). Called by nfs_can_coalesce_requests()
*
- * return 1 : coalesce page
- * return 0 : don't coalesce page
+ * return true : coalesce page
+ * return false : don't coalesce page
*/
-int
+bool
filelayout_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev,
struct nfs_page *req)
{
u64 p_stripe, r_stripe;
u32 stripe_unit;
+ if (!pnfs_generic_pg_test(pgio, prev, req))
+ return 0;
+
if (!pgio->pg_lseg)
return 1;
p_stripe = (u64)prev->wb_index << PAGE_CACHE_SHIFT;
@@ -860,6 +861,12 @@ filelayout_commit_pagelist(struct inode *inode, struct list_head *mds_pages,
return -ENOMEM;
}
+static void
+filelayout_free_deveiceid_node(struct nfs4_deviceid_node *d)
+{
+ nfs4_fl_free_deviceid(container_of(d, struct nfs4_file_layout_dsaddr, id_node));
+}
+
static struct pnfs_layoutdriver_type filelayout_type = {
.id = LAYOUT_NFSV4_1_FILES,
.name = "LAYOUT_NFSV4_1_FILES",
@@ -872,6 +879,7 @@ static struct pnfs_layoutdriver_type filelayout_type = {
.commit_pagelist = filelayout_commit_pagelist,
.read_pagelist = filelayout_read_pagelist,
.write_pagelist = filelayout_write_pagelist,
+ .free_deviceid_node = filelayout_free_deveiceid_node,
};
static int __init nfs4filelayout_init(void)
diff --git a/fs/nfs/nfs4filelayout.h b/fs/nfs/nfs4filelayout.h
index 2b461d77b43..cebe01e3795 100644
--- a/fs/nfs/nfs4filelayout.h
+++ b/fs/nfs/nfs4filelayout.h
@@ -59,9 +59,7 @@ struct nfs4_pnfs_ds {
#define NFS4_DEVICE_ID_NEG_ENTRY 0x00000001
struct nfs4_file_layout_dsaddr {
- struct hlist_node node;
- struct nfs4_deviceid deviceid;
- atomic_t ref;
+ struct nfs4_deviceid_node id_node;
unsigned long flags;
u32 stripe_count;
u8 *stripe_indices;
@@ -95,14 +93,12 @@ extern struct nfs_fh *
nfs4_fl_select_ds_fh(struct pnfs_layout_segment *lseg, u32 j);
extern void print_ds(struct nfs4_pnfs_ds *ds);
-extern void print_deviceid(struct nfs4_deviceid *dev_id);
u32 nfs4_fl_calc_j_index(struct pnfs_layout_segment *lseg, loff_t offset);
u32 nfs4_fl_calc_ds_index(struct pnfs_layout_segment *lseg, u32 j);
struct nfs4_pnfs_ds *nfs4_fl_prepare_ds(struct pnfs_layout_segment *lseg,
u32 ds_idx);
-extern struct nfs4_file_layout_dsaddr *
-nfs4_fl_find_get_deviceid(struct nfs4_deviceid *dev_id);
extern void nfs4_fl_put_deviceid(struct nfs4_file_layout_dsaddr *dsaddr);
+extern void nfs4_fl_free_deviceid(struct nfs4_file_layout_dsaddr *dsaddr);
struct nfs4_file_layout_dsaddr *
get_device_info(struct inode *inode, struct nfs4_deviceid *dev_id, gfp_t gfp_flags);
diff --git a/fs/nfs/nfs4filelayoutdev.c b/fs/nfs/nfs4filelayoutdev.c
index db07c7af139..3b7bf137726 100644
--- a/fs/nfs/nfs4filelayoutdev.c
+++ b/fs/nfs/nfs4filelayoutdev.c
@@ -37,30 +37,6 @@
#define NFSDBG_FACILITY NFSDBG_PNFS_LD
/*
- * Device ID RCU cache. A device ID is unique per client ID and layout type.
- */
-#define NFS4_FL_DEVICE_ID_HASH_BITS 5
-#define NFS4_FL_DEVICE_ID_HASH_SIZE (1 << NFS4_FL_DEVICE_ID_HASH_BITS)
-#define NFS4_FL_DEVICE_ID_HASH_MASK (NFS4_FL_DEVICE_ID_HASH_SIZE - 1)
-
-static inline u32
-nfs4_fl_deviceid_hash(struct nfs4_deviceid *id)
-{
- unsigned char *cptr = (unsigned char *)id->data;
- unsigned int nbytes = NFS4_DEVICEID4_SIZE;
- u32 x = 0;
-
- while (nbytes--) {
- x *= 37;
- x += *cptr++;
- }
- return x & NFS4_FL_DEVICE_ID_HASH_MASK;
-}
-
-static struct hlist_head filelayout_deviceid_cache[NFS4_FL_DEVICE_ID_HASH_SIZE];
-static DEFINE_SPINLOCK(filelayout_deviceid_lock);
-
-/*
* Data server cache
*
* Data servers can be mapped to different device ids.
@@ -89,27 +65,6 @@ print_ds(struct nfs4_pnfs_ds *ds)
ds->ds_clp ? ds->ds_clp->cl_exchange_flags : 0);
}
-void
-print_ds_list(struct nfs4_file_layout_dsaddr *dsaddr)
-{
- int i;
-
- ifdebug(FACILITY) {
- printk("%s dsaddr->ds_num %d\n", __func__,
- dsaddr->ds_num);
- for (i = 0; i < dsaddr->ds_num; i++)
- print_ds(dsaddr->ds_list[i]);
- }
-}
-
-void print_deviceid(struct nfs4_deviceid *id)
-{
- u32 *p = (u32 *)id;
-
- dprintk("%s: device id= [%x%x%x%x]\n", __func__,
- p[0], p[1], p[2], p[3]);
-}
-
/* nfs4_ds_cache_lock is held */
static struct nfs4_pnfs_ds *
_data_server_lookup_locked(u32 ip_addr, u32 port)
@@ -201,13 +156,13 @@ destroy_ds(struct nfs4_pnfs_ds *ds)
kfree(ds);
}
-static void
+void
nfs4_fl_free_deviceid(struct nfs4_file_layout_dsaddr *dsaddr)
{
struct nfs4_pnfs_ds *ds;
int i;
- print_deviceid(&dsaddr->deviceid);
+ nfs4_print_deviceid(&dsaddr->id_node.deviceid);
for (i = 0; i < dsaddr->ds_num; i++) {
ds = dsaddr->ds_list[i];
@@ -353,12 +308,7 @@ decode_device(struct inode *ino, struct pnfs_device *pdev, gfp_t gfp_flags)
u8 max_stripe_index;
struct nfs4_file_layout_dsaddr *dsaddr = NULL;
struct xdr_stream stream;
- struct xdr_buf buf = {
- .pages = pdev->pages,
- .page_len = pdev->pglen,
- .buflen = pdev->pglen,
- .len = pdev->pglen,
- };
+ struct xdr_buf buf;
struct page *scratch;
/* set up xdr stream */
@@ -366,7 +316,7 @@ decode_device(struct inode *ino, struct pnfs_device *pdev, gfp_t gfp_flags)
if (!scratch)
goto out_err;
- xdr_init_decode(&stream, &buf, NULL);
+ xdr_init_decode_pages(&stream, &buf, pdev->pages, pdev->pglen);
xdr_set_scratch_buffer(&stream, page_address(scratch), PAGE_SIZE);
/* Get the stripe count (number of stripe index) */
@@ -431,8 +381,10 @@ decode_device(struct inode *ino, struct pnfs_device *pdev, gfp_t gfp_flags)
dsaddr->stripe_indices = stripe_indices;
stripe_indices = NULL;
dsaddr->ds_num = num;
-
- memcpy(&dsaddr->deviceid, &pdev->dev_id, sizeof(pdev->dev_id));
+ nfs4_init_deviceid_node(&dsaddr->id_node,
+ NFS_SERVER(ino)->pnfs_curr_ld,
+ NFS_SERVER(ino)->nfs_client,
+ &pdev->dev_id);
for (i = 0; i < dsaddr->ds_num; i++) {
int j;
@@ -505,8 +457,8 @@ out_err:
static struct nfs4_file_layout_dsaddr *
decode_and_add_device(struct inode *inode, struct pnfs_device *dev, gfp_t gfp_flags)
{
- struct nfs4_file_layout_dsaddr *d, *new;
- long hash;
+ struct nfs4_deviceid_node *d;
+ struct nfs4_file_layout_dsaddr *n, *new;
new = decode_device(inode, dev, gfp_flags);
if (!new) {
@@ -515,20 +467,13 @@ decode_and_add_device(struct inode *inode, struct pnfs_device *dev, gfp_t gfp_fl
return NULL;
}
- spin_lock(&filelayout_deviceid_lock);
- d = nfs4_fl_find_get_deviceid(&new->deviceid);
- if (d) {
- spin_unlock(&filelayout_deviceid_lock);
+ d = nfs4_insert_deviceid_node(&new->id_node);
+ n = container_of(d, struct nfs4_file_layout_dsaddr, id_node);
+ if (n != new) {
nfs4_fl_free_deviceid(new);
- return d;
+ return n;
}
- INIT_HLIST_NODE(&new->node);
- atomic_set(&new->ref, 1);
- hash = nfs4_fl_deviceid_hash(&new->deviceid);
- hlist_add_head_rcu(&new->node, &filelayout_deviceid_cache[hash]);
- spin_unlock(&filelayout_deviceid_lock);
-
return new;
}
@@ -600,35 +545,7 @@ out_free:
void
nfs4_fl_put_deviceid(struct nfs4_file_layout_dsaddr *dsaddr)
{
- if (atomic_dec_and_lock(&dsaddr->ref, &filelayout_deviceid_lock)) {
- hlist_del_rcu(&dsaddr->node);
- spin_unlock(&filelayout_deviceid_lock);
-
- synchronize_rcu();
- nfs4_fl_free_deviceid(dsaddr);
- }
-}
-
-struct nfs4_file_layout_dsaddr *
-nfs4_fl_find_get_deviceid(struct nfs4_deviceid *id)
-{
- struct nfs4_file_layout_dsaddr *d;
- struct hlist_node *n;
- long hash = nfs4_fl_deviceid_hash(id);
-
-
- rcu_read_lock();
- hlist_for_each_entry_rcu(d, n, &filelayout_deviceid_cache[hash], node) {
- if (!memcmp(&d->deviceid, id, sizeof(*id))) {
- if (!atomic_inc_not_zero(&d->ref))
- goto fail;
- rcu_read_unlock();
- return d;
- }
- }
-fail:
- rcu_read_unlock();
- return NULL;
+ nfs4_put_deviceid_node(&dsaddr->id_node);
}
/*
@@ -676,15 +593,15 @@ static void
filelayout_mark_devid_negative(struct nfs4_file_layout_dsaddr *dsaddr,
int err, u32 ds_addr)
{
- u32 *p = (u32 *)&dsaddr->deviceid;
+ u32 *p = (u32 *)&dsaddr->id_node.deviceid;
printk(KERN_ERR "NFS: data server %x connection error %d."
" Deviceid [%x%x%x%x] marked out of use.\n",
ds_addr, err, p[0], p[1], p[2], p[3]);
- spin_lock(&filelayout_deviceid_lock);
+ spin_lock(&nfs4_ds_cache_lock);
dsaddr->flags |= NFS4_DEVICE_ID_NEG_ENTRY;
- spin_unlock(&filelayout_deviceid_lock);
+ spin_unlock(&nfs4_ds_cache_lock);
}
struct nfs4_pnfs_ds *
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index cf1b339c393..d2c4b59c896 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -267,9 +267,11 @@ static int nfs4_handle_exception(struct nfs_server *server, int errorcode, struc
break;
nfs4_schedule_stateid_recovery(server, state);
goto wait_on_recovery;
+ case -NFS4ERR_EXPIRED:
+ if (state != NULL)
+ nfs4_schedule_stateid_recovery(server, state);
case -NFS4ERR_STALE_STATEID:
case -NFS4ERR_STALE_CLIENTID:
- case -NFS4ERR_EXPIRED:
nfs4_schedule_lease_recovery(clp);
goto wait_on_recovery;
#if defined(CONFIG_NFS_V4_1)
@@ -2361,6 +2363,9 @@ nfs4_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr,
struct nfs4_state *state = NULL;
int status;
+ if (pnfs_ld_layoutret_on_setattr(inode))
+ pnfs_return_layout(inode);
+
nfs_fattr_init(fattr);
/* Search for an existing open(O_WRITE) file */
@@ -3175,6 +3180,11 @@ static int nfs4_proc_pathconf(struct nfs_server *server, struct nfs_fh *fhandle,
return err;
}
+void __nfs4_read_done_cb(struct nfs_read_data *data)
+{
+ nfs_invalidate_atime(data->inode);
+}
+
static int nfs4_read_done_cb(struct rpc_task *task, struct nfs_read_data *data)
{
struct nfs_server *server = NFS_SERVER(data->inode);
@@ -3184,7 +3194,7 @@ static int nfs4_read_done_cb(struct rpc_task *task, struct nfs_read_data *data)
return -EAGAIN;
}
- nfs_invalidate_atime(data->inode);
+ __nfs4_read_done_cb(data);
if (task->tk_status > 0)
renew_lease(server, data->timestamp);
return 0;
@@ -3198,7 +3208,8 @@ static int nfs4_read_done(struct rpc_task *task, struct nfs_read_data *data)
if (!nfs4_sequence_done(task, &data->res.seq_res))
return -EAGAIN;
- return data->read_done_cb(task, data);
+ return data->read_done_cb ? data->read_done_cb(task, data) :
+ nfs4_read_done_cb(task, data);
}
static void nfs4_proc_read_setup(struct nfs_read_data *data, struct rpc_message *msg)
@@ -3243,7 +3254,8 @@ static int nfs4_write_done(struct rpc_task *task, struct nfs_write_data *data)
{
if (!nfs4_sequence_done(task, &data->res.seq_res))
return -EAGAIN;
- return data->write_done_cb(task, data);
+ return data->write_done_cb ? data->write_done_cb(task, data) :
+ nfs4_write_done_cb(task, data);
}
/* Reset the the nfs_write_data to send the write to the MDS. */
@@ -3670,9 +3682,11 @@ nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server,
break;
nfs4_schedule_stateid_recovery(server, state);
goto wait_on_recovery;
+ case -NFS4ERR_EXPIRED:
+ if (state != NULL)
+ nfs4_schedule_stateid_recovery(server, state);
case -NFS4ERR_STALE_STATEID:
case -NFS4ERR_STALE_CLIENTID:
- case -NFS4ERR_EXPIRED:
nfs4_schedule_lease_recovery(clp);
goto wait_on_recovery;
#if defined(CONFIG_NFS_V4_1)
@@ -4543,6 +4557,7 @@ int nfs4_lock_delegation_recall(struct nfs4_state *state, struct file_lock *fl)
case -ESTALE:
goto out;
case -NFS4ERR_EXPIRED:
+ nfs4_schedule_stateid_recovery(server, state);
case -NFS4ERR_STALE_CLIENTID:
case -NFS4ERR_STALE_STATEID:
nfs4_schedule_lease_recovery(server->nfs_client);
@@ -5666,6 +5681,88 @@ int nfs4_proc_layoutget(struct nfs4_layoutget *lgp)
return status;
}
+static void
+nfs4_layoutreturn_prepare(struct rpc_task *task, void *calldata)
+{
+ struct nfs4_layoutreturn *lrp = calldata;
+
+ dprintk("--> %s\n", __func__);
+ if (nfs41_setup_sequence(lrp->clp->cl_session, &lrp->args.seq_args,
+ &lrp->res.seq_res, 0, task))
+ return;
+ rpc_call_start(task);
+}
+
+static void nfs4_layoutreturn_done(struct rpc_task *task, void *calldata)
+{
+ struct nfs4_layoutreturn *lrp = calldata;
+ struct nfs_server *server;
+
+ dprintk("--> %s\n", __func__);
+
+ if (!nfs4_sequence_done(task, &lrp->res.seq_res))
+ return;
+
+ server = NFS_SERVER(lrp->args.inode);
+ if (nfs4_async_handle_error(task, server, NULL) == -EAGAIN) {
+ nfs_restart_rpc(task, lrp->clp);
+ return;
+ }
+ if (task->tk_status == 0) {
+ struct pnfs_layout_hdr *lo = NFS_I(lrp->args.inode)->layout;
+
+ if (lrp->res.lrs_present) {
+ spin_lock(&lo->plh_inode->i_lock);
+ pnfs_set_layout_stateid(lo, &lrp->res.stateid, true);
+ spin_unlock(&lo->plh_inode->i_lock);
+ } else
+ BUG_ON(!list_empty(&lo->plh_segs));
+ }
+ dprintk("<-- %s\n", __func__);
+}
+
+static void nfs4_layoutreturn_release(void *calldata)
+{
+ struct nfs4_layoutreturn *lrp = calldata;
+
+ dprintk("--> %s\n", __func__);
+ put_layout_hdr(NFS_I(lrp->args.inode)->layout);
+ kfree(calldata);
+ dprintk("<-- %s\n", __func__);
+}
+
+static const struct rpc_call_ops nfs4_layoutreturn_call_ops = {
+ .rpc_call_prepare = nfs4_layoutreturn_prepare,
+ .rpc_call_done = nfs4_layoutreturn_done,
+ .rpc_release = nfs4_layoutreturn_release,
+};
+
+int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp)
+{
+ struct rpc_task *task;
+ struct rpc_message msg = {
+ .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_LAYOUTRETURN],
+ .rpc_argp = &lrp->args,
+ .rpc_resp = &lrp->res,
+ };
+ struct rpc_task_setup task_setup_data = {
+ .rpc_client = lrp->clp->cl_rpcclient,
+ .rpc_message = &msg,
+ .callback_ops = &nfs4_layoutreturn_call_ops,
+ .callback_data = lrp,
+ };
+ int status;
+
+ dprintk("--> %s\n", __func__);
+ task = rpc_run_task(&task_setup_data);
+ if (IS_ERR(task))
+ return PTR_ERR(task);
+ status = task->tk_status;
+ dprintk("<-- %s status=%d\n", __func__, status);
+ rpc_put_task(task);
+ return status;
+}
+
static int
_nfs4_proc_getdeviceinfo(struct nfs_server *server, struct pnfs_device *pdev)
{
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index 036f5adc9e1..e97dd219f84 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -1466,7 +1466,10 @@ static int nfs4_reclaim_lease(struct nfs_client *clp)
#ifdef CONFIG_NFS_V4_1
void nfs4_schedule_session_recovery(struct nfs4_session *session)
{
- nfs4_schedule_lease_recovery(session->clp);
+ struct nfs_client *clp = session->clp;
+
+ set_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state);
+ nfs4_schedule_lease_recovery(clp);
}
EXPORT_SYMBOL_GPL(nfs4_schedule_session_recovery);
@@ -1549,6 +1552,7 @@ static int nfs4_reset_session(struct nfs_client *clp)
status = nfs4_recovery_handle_error(clp, status);
goto out;
}
+ clear_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state);
/* create_session negotiated new slot table */
clear_bit(NFS4CLNT_RECALL_SLOT, &clp->cl_state);
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index c3ccd2c4683..d869a5e5464 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -338,7 +338,11 @@ static int nfs4_stat_to_errno(int);
1 /* layoutupdate4 layout type */ + \
1 /* NULL filelayout layoutupdate4 payload */)
#define decode_layoutcommit_maxsz (op_decode_hdr_maxsz + 3)
-
+#define encode_layoutreturn_maxsz (8 + op_encode_hdr_maxsz + \
+ encode_stateid_maxsz + \
+ 1 /* FIXME: opaque lrf_body always empty at the moment */)
+#define decode_layoutreturn_maxsz (op_decode_hdr_maxsz + \
+ 1 + decode_stateid_maxsz)
#else /* CONFIG_NFS_V4_1 */
#define encode_sequence_maxsz 0
#define decode_sequence_maxsz 0
@@ -760,7 +764,14 @@ static int nfs4_stat_to_errno(int);
decode_putfh_maxsz + \
decode_layoutcommit_maxsz + \
decode_getattr_maxsz)
-
+#define NFS4_enc_layoutreturn_sz (compound_encode_hdr_maxsz + \
+ encode_sequence_maxsz + \
+ encode_putfh_maxsz + \
+ encode_layoutreturn_maxsz)
+#define NFS4_dec_layoutreturn_sz (compound_decode_hdr_maxsz + \
+ decode_sequence_maxsz + \
+ decode_putfh_maxsz + \
+ decode_layoutreturn_maxsz)
const u32 nfs41_maxwrite_overhead = ((RPC_MAX_HEADER_WITH_AUTH +
compound_encode_hdr_maxsz +
@@ -1864,6 +1875,7 @@ encode_layoutget(struct xdr_stream *xdr,
static int
encode_layoutcommit(struct xdr_stream *xdr,
+ struct inode *inode,
const struct nfs4_layoutcommit_args *args,
struct compound_hdr *hdr)
{
@@ -1872,7 +1884,7 @@ encode_layoutcommit(struct xdr_stream *xdr,
dprintk("%s: lbw: %llu type: %d\n", __func__, args->lastbytewritten,
NFS_SERVER(args->inode)->pnfs_curr_ld->id);
- p = reserve_space(xdr, 48 + NFS4_STATEID_SIZE);
+ p = reserve_space(xdr, 44 + NFS4_STATEID_SIZE);
*p++ = cpu_to_be32(OP_LAYOUTCOMMIT);
/* Only whole file layouts */
p = xdr_encode_hyper(p, 0); /* offset */
@@ -1883,12 +1895,49 @@ encode_layoutcommit(struct xdr_stream *xdr,
p = xdr_encode_hyper(p, args->lastbytewritten);
*p++ = cpu_to_be32(0); /* Never send time_modify_changed */
*p++ = cpu_to_be32(NFS_SERVER(args->inode)->pnfs_curr_ld->id);/* type */
- *p++ = cpu_to_be32(0); /* no file layout payload */
+
+ if (NFS_SERVER(inode)->pnfs_curr_ld->encode_layoutcommit)
+ NFS_SERVER(inode)->pnfs_curr_ld->encode_layoutcommit(
+ NFS_I(inode)->layout, xdr, args);
+ else {
+ p = reserve_space(xdr, 4);
+ *p = cpu_to_be32(0); /* no layout-type payload */
+ }
hdr->nops++;
hdr->replen += decode_layoutcommit_maxsz;
return 0;
}
+
+static void
+encode_layoutreturn(struct xdr_stream *xdr,
+ const struct nfs4_layoutreturn_args *args,
+ struct compound_hdr *hdr)
+{
+ __be32 *p;
+
+ p = reserve_space(xdr, 20);
+ *p++ = cpu_to_be32(OP_LAYOUTRETURN);
+ *p++ = cpu_to_be32(0); /* reclaim. always 0 for now */
+ *p++ = cpu_to_be32(args->layout_type);
+ *p++ = cpu_to_be32(IOMODE_ANY);
+ *p = cpu_to_be32(RETURN_FILE);
+ p = reserve_space(xdr, 16 + NFS4_STATEID_SIZE);
+ p = xdr_encode_hyper(p, 0);
+ p = xdr_encode_hyper(p, NFS4_MAX_UINT64);
+ spin_lock(&args->inode->i_lock);
+ xdr_encode_opaque_fixed(p, &args->stateid.data, NFS4_STATEID_SIZE);
+ spin_unlock(&args->inode->i_lock);
+ if (NFS_SERVER(args->inode)->pnfs_curr_ld->encode_layoutreturn) {
+ NFS_SERVER(args->inode)->pnfs_curr_ld->encode_layoutreturn(
+ NFS_I(args->inode)->layout, xdr, args);
+ } else {
+ p = reserve_space(xdr, 4);
+ *p = cpu_to_be32(0);
+ }
+ hdr->nops++;
+ hdr->replen += decode_layoutreturn_maxsz;
+}
#endif /* CONFIG_NFS_V4_1 */
/*
@@ -2706,10 +2755,12 @@ static void nfs4_xdr_enc_layoutget(struct rpc_rqst *req,
/*
* Encode LAYOUTCOMMIT request
*/
-static int nfs4_xdr_enc_layoutcommit(struct rpc_rqst *req,
- struct xdr_stream *xdr,
- struct nfs4_layoutcommit_args *args)
+static void nfs4_xdr_enc_layoutcommit(struct rpc_rqst *req,
+ struct xdr_stream *xdr,
+ struct nfs4_layoutcommit_args *args)
{
+ struct nfs4_layoutcommit_data *data =
+ container_of(args, struct nfs4_layoutcommit_data, args);
struct compound_hdr hdr = {
.minorversion = nfs4_xdr_minorversion(&args->seq_args),
};
@@ -2717,10 +2768,27 @@ static int nfs4_xdr_enc_layoutcommit(struct rpc_rqst *req,
encode_compound_hdr(xdr, req, &hdr);
encode_sequence(xdr, &args->seq_args, &hdr);
encode_putfh(xdr, NFS_FH(args->inode), &hdr);
- encode_layoutcommit(xdr, args, &hdr);
+ encode_layoutcommit(xdr, data->args.inode, args, &hdr);
encode_getfattr(xdr, args->bitmask, &hdr);
encode_nops(&hdr);
- return 0;
+}
+
+/*
+ * Encode LAYOUTRETURN request
+ */
+static void nfs4_xdr_enc_layoutreturn(struct rpc_rqst *req,
+ struct xdr_stream *xdr,
+ struct nfs4_layoutreturn_args *args)
+{
+ struct compound_hdr hdr = {
+ .minorversion = nfs4_xdr_minorversion(&args->seq_args),
+ };
+
+ encode_compound_hdr(xdr, req, &hdr);
+ encode_sequence(xdr, &args->seq_args, &hdr);
+ encode_putfh(xdr, NFS_FH(args->inode), &hdr);
+ encode_layoutreturn(xdr, args, &hdr);
+ encode_nops(&hdr);
}
#endif /* CONFIG_NFS_V4_1 */
@@ -5203,6 +5271,27 @@ out_overflow:
return -EIO;
}
+static int decode_layoutreturn(struct xdr_stream *xdr,
+ struct nfs4_layoutreturn_res *res)
+{
+ __be32 *p;
+ int status;
+
+ status = decode_op_hdr(xdr, OP_LAYOUTRETURN);
+ if (status)
+ return status;
+ p = xdr_inline_decode(xdr, 4);
+ if (unlikely(!p))
+ goto out_overflow;
+ res->lrs_present = be32_to_cpup(p);
+ if (res->lrs_present)
+ status = decode_stateid(xdr, &res->stateid);
+ return status;
+out_overflow:
+ print_overflow_msg(__func__, xdr);
+ return -EIO;
+}
+
static int decode_layoutcommit(struct xdr_stream *xdr,
struct rpc_rqst *req,
struct nfs4_layoutcommit_res *res)
@@ -6320,6 +6409,30 @@ out:
}
/*
+ * Decode LAYOUTRETURN response
+ */
+static int nfs4_xdr_dec_layoutreturn(struct rpc_rqst *rqstp,
+ struct xdr_stream *xdr,
+ struct nfs4_layoutreturn_res *res)
+{
+ struct compound_hdr hdr;
+ int status;
+
+ status = decode_compound_hdr(xdr, &hdr);
+ if (status)
+ goto out;
+ status = decode_sequence(xdr, &res->seq_res, rqstp);
+ if (status)
+ goto out;
+ status = decode_putfh(xdr);
+ if (status)
+ goto out;
+ status = decode_layoutreturn(xdr, res);
+out:
+ return status;
+}
+
+/*
* Decode LAYOUTCOMMIT response
*/
static int nfs4_xdr_dec_layoutcommit(struct rpc_rqst *rqstp,
@@ -6547,6 +6660,7 @@ struct rpc_procinfo nfs4_procedures[] = {
PROC(GETDEVICEINFO, enc_getdeviceinfo, dec_getdeviceinfo),
PROC(LAYOUTGET, enc_layoutget, dec_layoutget),
PROC(LAYOUTCOMMIT, enc_layoutcommit, dec_layoutcommit),
+ PROC(LAYOUTRETURN, enc_layoutreturn, dec_layoutreturn),
#endif /* CONFIG_NFS_V4_1 */
};
diff --git a/fs/nfs/nfsroot.c b/fs/nfs/nfsroot.c
index c541093a5bf..c4744e1d513 100644
--- a/fs/nfs/nfsroot.c
+++ b/fs/nfs/nfsroot.c
@@ -87,7 +87,7 @@
#define NFS_ROOT "/tftpboot/%s"
/* Default NFSROOT mount options. */
-#define NFS_DEF_OPTIONS "udp"
+#define NFS_DEF_OPTIONS "vers=2,udp,rsize=4096,wsize=4096"
/* Parameters passed from the kernel command line */
static char nfs_root_parms[256] __initdata = "";
diff --git a/fs/nfs/objlayout/Kbuild b/fs/nfs/objlayout/Kbuild
new file mode 100644
index 00000000000..ed30ea072bb
--- /dev/null
+++ b/fs/nfs/objlayout/Kbuild
@@ -0,0 +1,5 @@
+#
+# Makefile for the pNFS Objects Layout Driver kernel module
+#
+objlayoutdriver-y := objio_osd.o pnfs_osd_xdr_cli.o objlayout.o
+obj-$(CONFIG_PNFS_OBJLAYOUT) += objlayoutdriver.o
diff --git a/fs/nfs/objlayout/objio_osd.c b/fs/nfs/objlayout/objio_osd.c
new file mode 100644
index 00000000000..9cf208df1f2
--- /dev/null
+++ b/fs/nfs/objlayout/objio_osd.c
@@ -0,0 +1,1057 @@
+/*
+ * pNFS Objects layout implementation over open-osd initiator library
+ *
+ * Copyright (C) 2009 Panasas Inc. [year of first publication]
+ * All rights reserved.
+ *
+ * Benny Halevy <bhalevy@panasas.com>
+ * Boaz Harrosh <bharrosh@panasas.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2
+ * See the file COPYING included with this distribution for more details.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the Panasas company nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <linux/module.h>
+#include <scsi/osd_initiator.h>
+
+#include "objlayout.h"
+
+#define NFSDBG_FACILITY NFSDBG_PNFS_LD
+
+#define _LLU(x) ((unsigned long long)x)
+
+enum { BIO_MAX_PAGES_KMALLOC =
+ (PAGE_SIZE - sizeof(struct bio)) / sizeof(struct bio_vec),
+};
+
+struct objio_dev_ent {
+ struct nfs4_deviceid_node id_node;
+ struct osd_dev *od;
+};
+
+static void
+objio_free_deviceid_node(struct nfs4_deviceid_node *d)
+{
+ struct objio_dev_ent *de = container_of(d, struct objio_dev_ent, id_node);
+
+ dprintk("%s: free od=%p\n", __func__, de->od);
+ osduld_put_device(de->od);
+ kfree(de);
+}
+
+static struct objio_dev_ent *_dev_list_find(const struct nfs_server *nfss,
+ const struct nfs4_deviceid *d_id)
+{
+ struct nfs4_deviceid_node *d;
+ struct objio_dev_ent *de;
+
+ d = nfs4_find_get_deviceid(nfss->pnfs_curr_ld, nfss->nfs_client, d_id);
+ if (!d)
+ return NULL;
+
+ de = container_of(d, struct objio_dev_ent, id_node);
+ return de;
+}
+
+static struct objio_dev_ent *
+_dev_list_add(const struct nfs_server *nfss,
+ const struct nfs4_deviceid *d_id, struct osd_dev *od,
+ gfp_t gfp_flags)
+{
+ struct nfs4_deviceid_node *d;
+ struct objio_dev_ent *de = kzalloc(sizeof(*de), gfp_flags);
+ struct objio_dev_ent *n;
+
+ if (!de) {
+ dprintk("%s: -ENOMEM od=%p\n", __func__, od);
+ return NULL;
+ }
+
+ dprintk("%s: Adding od=%p\n", __func__, od);
+ nfs4_init_deviceid_node(&de->id_node,
+ nfss->pnfs_curr_ld,
+ nfss->nfs_client,
+ d_id);
+ de->od = od;
+
+ d = nfs4_insert_deviceid_node(&de->id_node);
+ n = container_of(d, struct objio_dev_ent, id_node);
+ if (n != de) {
+ dprintk("%s: Race with other n->od=%p\n", __func__, n->od);
+ objio_free_deviceid_node(&de->id_node);
+ de = n;
+ }
+
+ atomic_inc(&de->id_node.ref);
+ return de;
+}
+
+struct caps_buffers {
+ u8 caps_key[OSD_CRYPTO_KEYID_SIZE];
+ u8 creds[OSD_CAP_LEN];
+};
+
+struct objio_segment {
+ struct pnfs_layout_segment lseg;
+
+ struct pnfs_osd_object_cred *comps;
+
+ unsigned mirrors_p1;
+ unsigned stripe_unit;
+ unsigned group_width; /* Data stripe_units without integrity comps */
+ u64 group_depth;
+ unsigned group_count;
+
+ unsigned max_io_size;
+
+ unsigned comps_index;
+ unsigned num_comps;
+ /* variable length */
+ struct objio_dev_ent *ods[];
+};
+
+static inline struct objio_segment *
+OBJIO_LSEG(struct pnfs_layout_segment *lseg)
+{
+ return container_of(lseg, struct objio_segment, lseg);
+}
+
+struct objio_state;
+typedef ssize_t (*objio_done_fn)(struct objio_state *ios);
+
+struct objio_state {
+ /* Generic layer */
+ struct objlayout_io_state ol_state;
+
+ struct objio_segment *layout;
+
+ struct kref kref;
+ objio_done_fn done;
+ void *private;
+
+ unsigned long length;
+ unsigned numdevs; /* Actually used devs in this IO */
+ /* A per-device variable array of size numdevs */
+ struct _objio_per_comp {
+ struct bio *bio;
+ struct osd_request *or;
+ unsigned long length;
+ u64 offset;
+ unsigned dev;
+ } per_dev[];
+};
+
+/* Send a get_device_info for one device in the layout and wait for the reply,
+ then look that device up with the osd_initiator library */
+static struct objio_dev_ent *_device_lookup(struct pnfs_layout_hdr *pnfslay,
+ struct objio_segment *objio_seg, unsigned comp,
+ gfp_t gfp_flags)
+{
+ struct pnfs_osd_deviceaddr *deviceaddr;
+ struct nfs4_deviceid *d_id;
+ struct objio_dev_ent *ode;
+ struct osd_dev *od;
+ struct osd_dev_info odi;
+ int err;
+
+ d_id = &objio_seg->comps[comp].oc_object_id.oid_device_id;
+
+ ode = _dev_list_find(NFS_SERVER(pnfslay->plh_inode), d_id);
+ if (ode)
+ return ode;
+
+ err = objlayout_get_deviceinfo(pnfslay, d_id, &deviceaddr, gfp_flags);
+ if (unlikely(err)) {
+ dprintk("%s: objlayout_get_deviceinfo dev(%llx:%llx) =>%d\n",
+ __func__, _DEVID_LO(d_id), _DEVID_HI(d_id), err);
+ return ERR_PTR(err);
+ }
+
+ odi.systemid_len = deviceaddr->oda_systemid.len;
+ if (odi.systemid_len > sizeof(odi.systemid)) {
+ err = -EINVAL;
+ goto out;
+ } else if (odi.systemid_len)
+ memcpy(odi.systemid, deviceaddr->oda_systemid.data,
+ odi.systemid_len);
+ odi.osdname_len = deviceaddr->oda_osdname.len;
+ odi.osdname = (u8 *)deviceaddr->oda_osdname.data;
+
+ if (!odi.osdname_len && !odi.systemid_len) {
+ dprintk("%s: !odi.osdname_len && !odi.systemid_len\n",
+ __func__);
+ err = -ENODEV;
+ goto out;
+ }
+
+ od = osduld_info_lookup(&odi);
+ if (unlikely(IS_ERR(od))) {
+ err = PTR_ERR(od);
+ dprintk("%s: osduld_info_lookup => %d\n", __func__, err);
+ goto out;
+ }
+
+ ode = _dev_list_add(NFS_SERVER(pnfslay->plh_inode), d_id, od,
+ gfp_flags);
+
+out:
+ dprintk("%s: return=%d\n", __func__, err);
+ objlayout_put_deviceinfo(deviceaddr);
+ return err ? ERR_PTR(err) : ode;
+}
+
+/*
+ * Resolve every component of @objio_seg to a device entry and record it in
+ * objio_seg->ods[]. The walk stops at the first component whose lookup
+ * fails; entries stored before that point are left in place.
+ *
+ * Returns 0 when all components were resolved, otherwise the error of the
+ * failing _device_lookup().
+ */
+static int objio_devices_lookup(struct pnfs_layout_hdr *pnfslay,
+	struct objio_segment *objio_seg,
+	gfp_t gfp_flags)
+{
+	unsigned comp = 0;
+	int err = 0;
+
+	while (comp < objio_seg->num_comps) {
+		struct objio_dev_ent *ode =
+			_device_lookup(pnfslay, objio_seg, comp, gfp_flags);
+
+		if (unlikely(IS_ERR(ode))) {
+			err = PTR_ERR(ode);
+			break;
+		}
+		objio_seg->ods[comp++] = ode;
+	}
+
+	dprintk("%s: return=%d\n", __func__, err);
+	return err;
+}
+
+/*
+ * _verify_data_map - validate the striping parameters of a decoded layout.
+ *
+ * The data map comes from the server and its fields are later used as
+ * divisors when file offsets are translated to object offsets, so anything
+ * that would evaluate to zero there is rejected here.
+ *
+ * Returns 0 when the map can be used, otherwise:
+ *   -ENOTSUPP  the RAID algorithm is not RAID_0, the total stripe length
+ *              does not fit in 32 bits, or the stripe unit is not a
+ *              multiple of PAGE_SIZE;
+ *   -EINVAL    the component count is not a multiple of (mirrors + 1), or
+ *              the stripe unit / group width / group depth describe an
+ *              empty stripe or group.
+ */
+static int _verify_data_map(struct pnfs_osd_layout *layout)
+{
+	struct pnfs_osd_data_map *data_map = &layout->olo_map;
+	u64 stripe_length;
+	u32 group_width;
+	u32 mirrors_p1 = data_map->odm_mirror_cnt + 1;
+
+	/* FIXME: Only raid0 for now. if not go through MDS */
+	if (data_map->odm_raid_algorithm != PNFS_OSD_RAID_0) {
+		printk(KERN_ERR "Only RAID_0 for now\n");
+		return -ENOTSUPP;
+	}
+	/* !mirrors_p1 means odm_mirror_cnt + 1 wrapped: refuse to divide by 0 */
+	if (!mirrors_p1 || 0 != (data_map->odm_num_comps % mirrors_p1)) {
+		printk(KERN_ERR "Data Map wrong, num_comps=%u mirrors=%u\n",
+		       data_map->odm_num_comps, data_map->odm_mirror_cnt);
+		return -EINVAL;
+	}
+
+	if (data_map->odm_group_width) {
+		group_width = data_map->odm_group_width;
+		/*
+		 * With grouping enabled the group depth and the resulting
+		 * group count (num_comps / mirrors_p1 / group_width) are both
+		 * used as factors of a divisor; neither may be zero.
+		 */
+		if (!data_map->odm_group_depth ||
+		    group_width > data_map->odm_num_comps / mirrors_p1) {
+			printk(KERN_ERR "Data Map wrong, group_width=%u "
+			       "group_depth=%u num_comps=%u mirrors=%u\n",
+			       data_map->odm_group_width,
+			       data_map->odm_group_depth,
+			       data_map->odm_num_comps,
+			       data_map->odm_mirror_cnt);
+			return -EINVAL;
+		}
+	} else {
+		group_width = data_map->odm_num_comps / mirrors_p1;
+	}
+
+	/* An empty stripe (no devices, or a zero stripe unit) cannot be mapped */
+	if (!group_width || !data_map->odm_stripe_unit) {
+		printk(KERN_ERR "Data Map wrong, group_width=%u "
+		       "stripe_unit=0x%llx\n",
+		       group_width, _LLU(data_map->odm_stripe_unit));
+		return -EINVAL;
+	}
+
+	stripe_length = (u64)data_map->odm_stripe_unit * group_width;
+	if (stripe_length >= (1ULL << 32)) {
+		printk(KERN_ERR "Total Stripe length(0x%llx)"
+		       " >= 32bit is not supported\n", _LLU(stripe_length));
+		return -ENOTSUPP;
+	}
+
+	if (0 != (data_map->odm_stripe_unit & ~PAGE_MASK)) {
+		printk(KERN_ERR "Stripe Unit(0x%llx)"
+		       " must be Multples of PAGE_SIZE(0x%lx)\n",
+		       _LLU(data_map->odm_stripe_unit), PAGE_SIZE);
+		return -ENOTSUPP;
+	}
+
+	return 0;
+}
+
+/*
+ * Duplicate one decoded component credential.
+ *
+ * @cur_comp receives a structure copy of @src_comp, after which its two
+ * credential pointers are redirected to private copies kept in @caps_p.
+ * Each copy is always the full size of the destination buffer; a source
+ * cred_len larger than that buffer only triggers a WARN_ON.
+ */
+static void copy_single_comp(struct pnfs_osd_object_cred *cur_comp,
+	struct pnfs_osd_object_cred *src_comp,
+	struct caps_buffers *caps_p)
+{
+	const size_t key_bytes = sizeof(caps_p->caps_key);
+	const size_t cred_bytes = sizeof(caps_p->creds);
+
+	WARN_ON(src_comp->oc_cap_key.cred_len > key_bytes);
+	WARN_ON(src_comp->oc_cap.cred_len > cred_bytes);
+
+	*cur_comp = *src_comp;
+
+	/* capability key */
+	memcpy(caps_p->caps_key, src_comp->oc_cap_key.cred, key_bytes);
+	cur_comp->oc_cap_key.cred = caps_p->caps_key;
+
+	/* capability */
+	memcpy(caps_p->creds, src_comp->oc_cap.cred, cred_bytes);
+	cur_comp->oc_cap.cred = caps_p->creds;
+}
+
+/*
+ * objio_alloc_lseg - decode a layout from @xdr and build an objio_segment.
+ *
+ * @outp:      receives the embedded pnfs_layout_segment on success, NULL on
+ *             failures that happen after the segment was allocated.
+ * @pnfslay:   layout header, used for the device lookups.
+ * @range:     not used by this function.
+ * @xdr:       stream positioned at the encoded layout.
+ * @gfp_flags: allocation flags for the segment and the device lookups.
+ *
+ * The segment, the ods[] pointer array, the component array and the
+ * per-component credential buffers live in one allocation, laid out in that
+ * order.
+ *
+ * Returns 0 on success or a negative errno (decode or verification error,
+ * -ENOMEM, or a device lookup error).
+ */
+int objio_alloc_lseg(struct pnfs_layout_segment **outp,
+	struct pnfs_layout_hdr *pnfslay,
+	struct pnfs_layout_range *range,
+	struct xdr_stream *xdr,
+	gfp_t gfp_flags)
+{
+	struct objio_segment *objio_seg;
+	struct pnfs_osd_xdr_decode_layout_iter iter;
+	struct pnfs_osd_layout layout;
+	struct pnfs_osd_object_cred *cur_comp, src_comp;
+	struct caps_buffers *caps_p;
+	unsigned i;
+	int err;
+
+	err = pnfs_osd_xdr_decode_layout_map(&layout, &iter, xdr);
+	if (unlikely(err))
+		return err;
+
+	err = _verify_data_map(&layout);
+	if (unlikely(err))
+		return err;
+
+	objio_seg = kzalloc(sizeof(*objio_seg) +
+			sizeof(objio_seg->ods[0]) * layout.olo_num_comps +
+			sizeof(*objio_seg->comps) * layout.olo_num_comps +
+			sizeof(struct caps_buffers) * layout.olo_num_comps,
+			gfp_flags);
+	if (!objio_seg)
+		return -ENOMEM;
+
+	/* Carve the trailing arrays out of the single allocation */
+	objio_seg->comps = (void *)(objio_seg->ods + layout.olo_num_comps);
+	cur_comp = objio_seg->comps;
+	caps_p = (void *)(cur_comp + layout.olo_num_comps);
+	while (pnfs_osd_xdr_decode_layout_comp(&src_comp, &iter, xdr, &err))
+		copy_single_comp(cur_comp++, &src_comp, caps_p++);
+	if (unlikely(err))
+		goto err;
+
+	objio_seg->num_comps = layout.olo_num_comps;
+	objio_seg->comps_index = layout.olo_comps_index;
+	err = objio_devices_lookup(pnfslay, objio_seg, gfp_flags);
+	if (err)
+		goto err;
+
+	objio_seg->mirrors_p1 = layout.olo_map.odm_mirror_cnt + 1;
+	objio_seg->stripe_unit = layout.olo_map.odm_stripe_unit;
+	if (layout.olo_map.odm_group_width) {
+		objio_seg->group_width = layout.olo_map.odm_group_width;
+		objio_seg->group_depth = layout.olo_map.odm_group_depth;
+		objio_seg->group_count = layout.olo_map.odm_num_comps /
+						objio_seg->mirrors_p1 /
+						objio_seg->group_width;
+	} else {
+		/* No grouping: one group spanning all stripes, unbounded depth */
+		objio_seg->group_width = layout.olo_map.odm_num_comps /
+						objio_seg->mirrors_p1;
+		objio_seg->group_depth = -1;
+		objio_seg->group_count = 1;
+	}
+
+	/* Cache this calculation it will hit for every page */
+	objio_seg->max_io_size = (BIO_MAX_PAGES_KMALLOC * PAGE_SIZE -
+				  objio_seg->stripe_unit) *
+				 objio_seg->group_width;
+
+	*outp = &objio_seg->lseg;
+	return 0;
+
+err:
+	/*
+	 * objio_devices_lookup() may have failed part way through. Drop the
+	 * device references already stored in ods[], exactly as
+	 * objio_free_lseg() does; the array is zero-initialised, so the first
+	 * NULL marks the end. num_comps is still 0 if decoding failed.
+	 */
+	for (i = 0; i < objio_seg->num_comps && objio_seg->ods[i]; i++)
+		nfs4_put_deviceid_node(&objio_seg->ods[i]->id_node);
+	kfree(objio_seg);
+	dprintk("%s: Error: return %d\n", __func__, err);
+	*outp = NULL;
+	return err;
+}
+
+/*
+ * Release a segment built by objio_alloc_lseg(): put every device node
+ * referenced from ods[] (stopping at the first empty slot) and free the
+ * segment itself.
+ */
+void objio_free_lseg(struct pnfs_layout_segment *lseg)
+{
+	struct objio_segment *objio_seg = OBJIO_LSEG(lseg);
+	int comp = 0;
+
+	while (comp < objio_seg->num_comps && objio_seg->ods[comp]) {
+		nfs4_put_deviceid_node(&objio_seg->ods[comp]->id_node);
+		comp++;
+	}
+	kfree(objio_seg);
+}
+
+/*
+ * Allocate a zeroed objio_state for @lseg.
+ *
+ * One allocation holds the state with one per_dev[] slot per component,
+ * followed by one ioerr record per component; ol_state.ioerrs is pointed at
+ * that second area.
+ *
+ * Returns 0 and stores the embedded objlayout_io_state in @outp, or -ENOMEM.
+ */
+int objio_alloc_io_state(struct pnfs_layout_segment *lseg,
+	struct objlayout_io_state **outp,
+	gfp_t gfp_flags)
+{
+	struct objio_segment *objio_seg = OBJIO_LSEG(lseg);
+	struct objio_state *ios;
+	/* the state itself plus its per-component slots */
+	const unsigned state_bytes = sizeof(*ios) +
+			objio_seg->num_comps * sizeof(ios->per_dev[0]);
+	/* the error records that follow it */
+	const unsigned ioerr_bytes = objio_seg->num_comps *
+			sizeof(ios->ol_state.ioerrs[0]);
+
+	ios = kzalloc(state_bytes + ioerr_bytes, gfp_flags);
+	if (unlikely(!ios))
+		return -ENOMEM;
+
+	ios->layout = objio_seg;
+	ios->ol_state.num_comps = objio_seg->num_comps;
+	ios->ol_state.ioerrs = ((void *)ios) + state_bytes;
+
+	*outp = &ios->ol_state;
+	return 0;
+}
+
+/*
+ * Free the objio_state that embeds @ol_state.
+ */
+void objio_free_io_state(struct objlayout_io_state *ol_state)
+{
+	kfree(container_of(ol_state, struct objio_state, ol_state));
+}
+
+/*
+ * Translate an osd_err_priority into the matching pnfs_osd_errno.
+ *
+ * OSD_ERR_PRI_NO_ERROR maps to 0. OSD_ERR_PRI_CLEAR_PAGES must never reach
+ * this function (BUG). Any priority without an explicit mapping warns and is
+ * reported as PNFS_OSD_ERR_EIO.
+ */
+enum pnfs_osd_errno osd_pri_2_pnfs_err(enum osd_err_priority oep)
+{
+	enum pnfs_osd_errno pnfs_err = PNFS_OSD_ERR_EIO;
+
+	switch (oep) {
+	case OSD_ERR_PRI_NO_ERROR:
+		pnfs_err = (enum pnfs_osd_errno)0;
+		break;
+	case OSD_ERR_PRI_CLEAR_PAGES:
+		BUG_ON(1);
+		pnfs_err = 0;
+		break;
+	case OSD_ERR_PRI_RESOURCE:
+		pnfs_err = PNFS_OSD_ERR_RESOURCE;
+		break;
+	case OSD_ERR_PRI_BAD_CRED:
+		pnfs_err = PNFS_OSD_ERR_BAD_CRED;
+		break;
+	case OSD_ERR_PRI_NO_ACCESS:
+		pnfs_err = PNFS_OSD_ERR_NO_ACCESS;
+		break;
+	case OSD_ERR_PRI_UNREACHABLE:
+		pnfs_err = PNFS_OSD_ERR_UNREACHABLE;
+		break;
+	case OSD_ERR_PRI_NOT_FOUND:
+		pnfs_err = PNFS_OSD_ERR_NOT_FOUND;
+		break;
+	case OSD_ERR_PRI_NO_SPACE:
+		pnfs_err = PNFS_OSD_ERR_NO_SPACE;
+		break;
+	case OSD_ERR_PRI_EIO:
+		/* already PNFS_OSD_ERR_EIO */
+		break;
+	default:
+		WARN_ON(1);
+		break;
+	}
+
+	return pnfs_err;
+}
+
+/*
+ * Zero the data area described by every segment of @bio. Segments that
+ * cover a whole page are cleared with clear_highpage(), partial ones with
+ * zero_user() on their exact offset and length.
+ */
+static void _clear_bio(struct bio *bio)
+{
+	struct bio_vec *bv;
+	unsigned seg;
+
+	__bio_for_each_segment(bv, bio, seg, 0) {
+		if (unlikely(bv->bv_len != PAGE_SIZE)) {
+			zero_user(bv->bv_page, bv->bv_offset, bv->bv_len);
+			continue;
+		}
+		clear_highpage(bv->bv_page);
+	}
+}
+
+/*
+ * _io_check - collect the results of all requests issued for @ios.
+ *
+ * Decodes the sense data of every per_dev[] slot that carries a request.
+ * A slot reporting OSD_ERR_PRI_CLEAR_PAGES is treated as recovered: its bio
+ * is zero-filled (this must be a read, BUG otherwise). Every other failure
+ * is recorded with objlayout_io_set_result().
+ *
+ * Returns 0 when nothing failed, otherwise the osd_req_decode_sense() return
+ * value of the failing slot with the highest osd_err_pri (the later slot
+ * wins on a tie).
+ */
+static int _io_check(struct objio_state *ios, bool is_write)
+{
+ enum osd_err_priority oep = OSD_ERR_PRI_NO_ERROR;
+ int lin_ret = 0;
+ int i;
+
+ for (i = 0; i < ios->numdevs; i++) {
+ struct osd_sense_info osi;
+ struct osd_request *or = ios->per_dev[i].or;
+ unsigned dev;
+ int ret;
+
+ /* slot was never used for this I/O */
+ if (!or)
+ continue;
+
+ ret = osd_req_decode_sense(or, &osi);
+ if (likely(!ret))
+ continue;
+
+ if (OSD_ERR_PRI_CLEAR_PAGES == osi.osd_err_pri) {
+ /* the read started past the end of the object */
+ BUG_ON(is_write);
+ _clear_bio(ios->per_dev[i].bio);
+ dprintk("%s: start read offset passed end of file "
+ "offset=0x%llx, length=0x%lx\n", __func__,
+ _LLU(ios->per_dev[i].offset),
+ ios->per_dev[i].length);
+
+ continue; /* we recovered */
+ }
+ dev = ios->per_dev[i].dev;
+ objlayout_io_set_result(&ios->ol_state, dev,
+ &ios->layout->comps[dev].oc_object_id,
+ osd_pri_2_pnfs_err(osi.osd_err_pri),
+ ios->per_dev[i].offset,
+ ios->per_dev[i].length,
+ is_write);
+
+ /* remember the return code of the most severe error so far */
+ if (osi.osd_err_pri >= oep) {
+ oep = osi.osd_err_pri;
+ lin_ret = ret;
+ }
+ }
+
+ return lin_ret;
+}
+
+/*
+ * Common IO state helpers.
+ */
+/*
+ * Release the request and the bio of every per_dev[] slot in use and reset
+ * both pointers to NULL. The objio_state itself is not freed.
+ */
+static void _io_free(struct objio_state *ios)
+{
+	unsigned slot;
+
+	for (slot = 0; slot < ios->numdevs; slot++) {
+		struct _objio_per_comp *pc = &ios->per_dev[slot];
+
+		if (pc->or) {
+			osd_end_request(pc->or);
+			pc->or = NULL;
+		}
+
+		if (pc->bio) {
+			bio_put(pc->bio);
+			pc->bio = NULL;
+		}
+	}
+}
+
+/*
+ * Return the osd_dev serving component @dev of the layout behind @ios.
+ * @dev must lie in [comps_index, comps_index + num_comps), anything else is
+ * a BUG.
+ */
+struct osd_dev *_io_od(struct objio_state *ios, unsigned dev)
+{
+	unsigned first = ios->layout->comps_index;
+	unsigned past_last = first + ios->layout->num_comps;
+
+	BUG_ON(dev < first || past_last <= dev);
+
+	return ios->layout->ods[dev - first]->od;
+}
+
+/* Result of mapping one file offset onto the striped layout,
+ * filled in by _calc_stripe_info() */
+struct _striping_info {
+ u64 obj_offset; /* matching offset inside the component object */
+ u64 group_length; /* bytes from the file offset to the end of its group */
+ unsigned dev; /* per_dev[] index of the first device holding the offset */
+ unsigned unit_off; /* offset inside the stripe unit */
+};
+
+/*
+ * _calc_stripe_info - translate @file_offset into striping coordinates.
+ *
+ * Working values (L is @file_offset):
+ *   U = stripe_unit * group_width     bytes in one stripe
+ *   T = U * group_depth               bytes in one group
+ *   S = T * group_count               bytes in one pass over all groups
+ *   M = L / S                         number of whole passes before L
+ *   G = (L - M * S) / T               group index inside the pass
+ *   H = (L - M * S) % T               byte offset inside the group
+ *   N = H / U                         stripe index inside the group
+ *
+ * S, T, U and stripe_unit are all used as divisors, so the layout must
+ * guarantee they are non-zero.
+ */
+static void _calc_stripe_info(struct objio_state *ios, u64 file_offset,
+ struct _striping_info *si)
+{
+ u32 stripe_unit = ios->layout->stripe_unit;
+ u32 group_width = ios->layout->group_width;
+ u64 group_depth = ios->layout->group_depth;
+ u32 U = stripe_unit * group_width;
+
+ u64 T = U * group_depth;
+ u64 S = T * ios->layout->group_count;
+ u64 M = div64_u64(file_offset, S);
+
+ /*
+ G = (L - (M * S)) / T
+ H = (L - (M * S)) % T
+ */
+ u64 LmodU = file_offset - M * S;
+ u32 G = div64_u64(LmodU, T);
+ u64 H = LmodU - G * T;
+
+ u32 N = div_u64(H, U);
+
+ /* unit_off = L % stripe_unit; the quotient is not needed */
+ div_u64_rem(file_offset, stripe_unit, &si->unit_off);
+ si->obj_offset = si->unit_off + (N * stripe_unit) +
+ (M * group_depth * stripe_unit);
+
+ /* "H - (N * U)" is just "H % U" so it's bound to u32 */
+ si->dev = (u32)(H - (N * U)) / stripe_unit + G * group_width;
+ /* each stripe device is followed by its mirrors in per_dev[] */
+ si->dev *= ios->layout->mirrors_p1;
+
+ si->group_length = T - H;
+}
+
+/*
+ * _add_stripe_unit - append @cur_len bytes of the page list to @per_dev.
+ *
+ * Pages are taken from ios->ol_state.pages[] starting at index *@cur_pg;
+ * @pgbase is the byte offset inside the first page only. The device's bio
+ * is allocated on first use, sized from the I/O's page count and capped at
+ * BIO_MAX_PAGES_KMALLOC.
+ *
+ * Returns 0 and advances *@cur_pg past the consumed pages, or -ENOMEM when
+ * the bio cannot be allocated or a page cannot be added completely. On
+ * failure *@cur_pg is left untouched, but per_dev->length has already been
+ * increased by @cur_len.
+ */
+static int _add_stripe_unit(struct objio_state *ios, unsigned *cur_pg,
+ unsigned pgbase, struct _objio_per_comp *per_dev, int cur_len,
+ gfp_t gfp_flags)
+{
+ unsigned pg = *cur_pg;
+ struct request_queue *q =
+ osd_request_queue(_io_od(ios, per_dev->dev));
+
+ per_dev->length += cur_len;
+
+ if (per_dev->bio == NULL) {
+ unsigned stripes = ios->layout->num_comps /
+ ios->layout->mirrors_p1;
+ unsigned pages_in_stripe = stripes *
+ (ios->layout->stripe_unit / PAGE_SIZE);
+ /* this device's share of the pages, plus one stripe unit
+ * worth of pages as slack */
+ unsigned bio_size = (ios->ol_state.nr_pages + pages_in_stripe) /
+ stripes;
+
+ if (BIO_MAX_PAGES_KMALLOC < bio_size)
+ bio_size = BIO_MAX_PAGES_KMALLOC;
+
+ per_dev->bio = bio_kmalloc(gfp_flags, bio_size);
+ if (unlikely(!per_dev->bio)) {
+ dprintk("Faild to allocate BIO size=%u\n", bio_size);
+ return -ENOMEM;
+ }
+ }
+
+ while (cur_len > 0) {
+ unsigned pglen = min_t(unsigned, PAGE_SIZE - pgbase, cur_len);
+ unsigned added_len;
+
+ BUG_ON(ios->ol_state.nr_pages <= pg);
+ cur_len -= pglen;
+
+ added_len = bio_add_pc_page(q, per_dev->bio,
+ ios->ol_state.pages[pg], pglen, pgbase);
+ if (unlikely(pglen != added_len))
+ return -ENOMEM;
+ /* only the first page of the run starts at an offset */
+ pgbase = 0;
+ ++pg;
+ }
+ BUG_ON(cur_len);
+
+ *cur_pg = pg;
+ return 0;
+}
+
+/*
+ * _prepare_one_group - spread @length bytes over the devices of one group.
+ *
+ * Starting at device si->dev, hands out stripe-unit sized pieces round-robin
+ * to the first-mirror slots of the group and adds the matching pages to each
+ * slot's bio through _add_stripe_unit(). The first time a slot is touched
+ * its component index and starting object offset are set up relative to
+ * @si.
+ *
+ * On return ios->numdevs covers the highest slot used so far including its
+ * mirrors, ios->length has grown by the bytes prepared and *@last_pg is the
+ * next unused page index. Returns 0, or the _add_stripe_unit() error.
+ */
+static int _prepare_one_group(struct objio_state *ios, u64 length,
+ struct _striping_info *si, unsigned *last_pg,
+ gfp_t gfp_flags)
+{
+ unsigned stripe_unit = ios->layout->stripe_unit;
+ unsigned mirrors_p1 = ios->layout->mirrors_p1;
+ unsigned devs_in_group = ios->layout->group_width * mirrors_p1;
+ unsigned dev = si->dev;
+ /* first slot of the group that contains dev */
+ unsigned first_dev = dev - (dev % devs_in_group);
+ /* highest first-mirror slot used by earlier groups, if any */
+ unsigned max_comp = ios->numdevs ? ios->numdevs - mirrors_p1 : 0;
+ unsigned cur_pg = *last_pg;
+ int ret = 0;
+
+ while (length) {
+ struct _objio_per_comp *per_dev = &ios->per_dev[dev];
+ unsigned cur_len, page_off = 0;
+
+ if (!per_dev->length) {
+ /* first piece for this device: fix its object offset */
+ per_dev->dev = dev;
+ if (dev < si->dev) {
+ /* reached after wrapping around: starts at the
+ * beginning of the next stripe unit */
+ per_dev->offset = si->obj_offset + stripe_unit -
+ si->unit_off;
+ cur_len = stripe_unit;
+ } else if (dev == si->dev) {
+ /* the device holding the start offset: only the
+ * rest of its stripe unit is used */
+ per_dev->offset = si->obj_offset;
+ cur_len = stripe_unit - si->unit_off;
+ page_off = si->unit_off & ~PAGE_MASK;
+ BUG_ON(page_off &&
+ (page_off != ios->ol_state.pgbase));
+ } else { /* dev > si->dev */
+ /* starts at the beginning of the current unit */
+ per_dev->offset = si->obj_offset - si->unit_off;
+ cur_len = stripe_unit;
+ }
+
+ if (max_comp < dev)
+ max_comp = dev;
+ } else {
+ cur_len = stripe_unit;
+ }
+ if (cur_len >= length)
+ cur_len = length;
+
+ ret = _add_stripe_unit(ios, &cur_pg, page_off , per_dev,
+ cur_len, gfp_flags);
+ if (unlikely(ret))
+ goto out;
+
+ /* next first-mirror slot, wrapping inside the group */
+ dev += mirrors_p1;
+ dev = (dev % devs_in_group) + first_dev;
+
+ length -= cur_len;
+ ios->length += cur_len;
+ }
+out:
+ ios->numdevs = max_comp + mirrors_p1;
+ *last_pg = cur_pg;
+ return ret;
+}
+
+/*
+ * Map the byte range [ol_state.offset, +ol_state.count) of @ios onto the
+ * layout, one group at a time, filling the per-device bios.
+ *
+ * A failure is reported only when no bytes at all were prepared; if some
+ * were (ios->length != 0) the function returns 0 so that the partial I/O
+ * can go ahead.
+ */
+static int _io_rw_pagelist(struct objio_state *ios, gfp_t gfp_flags)
+{
+	u64 remaining = ios->ol_state.count;
+	u64 pos = ios->ol_state.offset;
+	struct _striping_info si;
+	unsigned last_pg = 0;
+	int ret = 0;
+
+	for (; remaining;
+	     pos += si.group_length, remaining -= si.group_length) {
+		_calc_stripe_info(ios, pos, &si);
+
+		/* never go beyond what was asked for */
+		if (remaining < si.group_length)
+			si.group_length = remaining;
+
+		ret = _prepare_one_group(ios, si.group_length, &si, &last_pg, gfp_flags);
+		if (unlikely(ret))
+			break;
+	}
+
+	return ios->length ? 0 : ret;
+}
+
+/*
+ * Done callback used for synchronous I/O: signal the completion stored in
+ * ios->private. Always returns 0.
+ */
+static ssize_t _sync_done(struct objio_state *ios)
+{
+	struct completion *sync_wait = ios->private;
+
+	complete(sync_wait);
+
+	return 0;
+}
+
+/*
+ * kref release function of an objio_state: invoke the state's done
+ * callback. Its return value is not used.
+ */
+static void _last_io(struct kref *kref)
+{
+	struct objio_state *state =
+		container_of(kref, struct objio_state, kref);
+
+	state->done(state);
+}
+
+/*
+ * Per-request completion callback: @p is the objio_state the request
+ * belongs to; drop the reference taken for that request.
+ */
+static void _done_io(struct osd_request *or, void *p)
+{
+	struct objio_state *state = p;
+
+	kref_put(&state->kref, _last_io);
+}
+
+/*
+ * _io_exec - submit every prepared request of @ios.
+ *
+ * Each submitted request holds one reference on ios->kref, and the
+ * submitter holds one of its own until all requests are queued. When the
+ * last reference is dropped _last_io() runs ios->done.
+ *
+ * Asynchronous mode: returns 0 right after submission; ios->done runs from
+ * whichever context drops the last reference.
+ * Synchronous mode (ios->ol_state.sync): ios->done is temporarily replaced
+ * by _sync_done(), this function waits for it, then calls the original done
+ * function itself and returns its result.
+ */
+static ssize_t _io_exec(struct objio_state *ios)
+{
+ DECLARE_COMPLETION_ONSTACK(wait);
+ ssize_t status = 0; /* sync status */
+ unsigned i;
+ objio_done_fn saved_done_fn = ios->done;
+ bool sync = ios->ol_state.sync;
+
+ if (sync) {
+ ios->done = _sync_done;
+ ios->private = &wait;
+ }
+
+ /* the submitter's own reference, dropped after the loop */
+ kref_init(&ios->kref);
+
+ for (i = 0; i < ios->numdevs; i++) {
+ struct osd_request *or = ios->per_dev[i].or;
+
+ if (!or)
+ continue;
+
+ kref_get(&ios->kref);
+ osd_execute_request_async(or, _done_io, ios);
+ }
+
+ kref_put(&ios->kref, _last_io);
+
+ if (sync) {
+ wait_for_completion(&wait);
+ status = saved_done_fn(ios);
+ }
+
+ return status;
+}
+
+/*
+ * read
+ */
+/*
+ * Done callback for reads: gather the per-device results, release the
+ * requests and bios, and report to the generic layer through
+ * objlayout_read_done(). The status is the number of bytes prepared
+ * (ios->length) on success, or the error returned by _io_check().
+ */
+static ssize_t _read_done(struct objio_state *ios)
+{
+	int check = _io_check(ios, false);
+	ssize_t status = check;
+
+	_io_free(ios);
+
+	if (likely(!check))
+		status = ios->length;
+
+	objlayout_read_done(&ios->ol_state, status, ios->ol_state.sync);
+	return status;
+}
+
+/*
+ * Build the read request for per_dev[@cur_comp]: start an OSD request on
+ * the slot's device, describe the read of per_dev->length bytes at
+ * per_dev->offset into the slot's bio, and finalize it with the component's
+ * capability.
+ *
+ * Returns 0, -ENOMEM when the request cannot be started, or the
+ * osd_finalize_request() error. A started request stays attached to the
+ * slot even on failure.
+ */
+static int _read_mirrors(struct objio_state *ios, unsigned cur_comp)
+{
+	struct _objio_per_comp *per_dev = &ios->per_dev[cur_comp];
+	unsigned dev = per_dev->dev;
+	struct pnfs_osd_object_cred *cred = &ios->layout->comps[dev];
+	struct osd_obj_id obj = {
+		.partition = cred->oc_object_id.oid_partition_id,
+		.id = cred->oc_object_id.oid_object_id,
+	};
+	struct osd_request *or;
+	int ret;
+
+	or = osd_start_request(_io_od(ios, dev), GFP_KERNEL);
+	if (unlikely(!or))
+		return -ENOMEM;
+	per_dev->or = or;
+
+	osd_req_read(or, &obj, per_dev->offset, per_dev->bio, per_dev->length);
+
+	ret = osd_finalize_request(or, 0, cred->oc_cap.cred, NULL);
+	if (ret) {
+		dprintk("%s: Faild to osd_finalize_request() => %d\n",
+			__func__, ret);
+		return ret;
+	}
+
+	dprintk("%s:[%d] dev=%d obj=0x%llx start=0x%llx length=0x%lx\n",
+		__func__, cur_comp, dev, obj.id, _LLU(per_dev->offset),
+		per_dev->length);
+
+	return 0;
+}
+
+/*
+ * Build a read request for the first mirror of every stripe device that has
+ * data, then execute them. If building any request fails, everything set up
+ * so far is released and the error is returned.
+ */
+static ssize_t _read_exec(struct objio_state *ios)
+{
+	unsigned slot;
+
+	for (slot = 0; slot < ios->numdevs; slot += ios->layout->mirrors_p1) {
+		int ret;
+
+		if (!ios->per_dev[slot].length)
+			continue;
+
+		ret = _read_mirrors(ios, slot);
+		if (unlikely(ret)) {
+			_io_free(ios);
+			return ret;
+		}
+	}
+
+	ios->done = _read_done;
+	return _io_exec(ios); /* In sync mode exec returns the io status */
+}
+
+/*
+ * Read entry point: lay the request described by @ol_state out over the
+ * devices (GFP_KERNEL allocations) and start the reads. Returns the
+ * preparation error, or whatever _read_exec() returns.
+ */
+ssize_t objio_read_pagelist(struct objlayout_io_state *ol_state)
+{
+	struct objio_state *ios =
+		container_of(ol_state, struct objio_state, ol_state);
+	int ret = _io_rw_pagelist(ios, GFP_KERNEL);
+
+	if (likely(!ret))
+		return _read_exec(ios);
+
+	return ret;
+}
+
+/*
+ * write
+ */
+/*
+ * Done callback for writes: gather the per-device results, release the
+ * requests and bios, and report to the generic layer through
+ * objlayout_write_done(). On success the data is declared NFS_FILE_SYNC and
+ * the status is the number of bytes prepared; otherwise the status is the
+ * error returned by _io_check().
+ */
+static ssize_t _write_done(struct objio_state *ios)
+{
+	int check = _io_check(ios, true);
+	ssize_t status = check;
+
+	_io_free(ios);
+
+	if (likely(!check)) {
+		/* FIXME: should be based on the OSD's persistence model
+		 * See OSD2r05 Section 4.13 Data persistence model */
+		ios->ol_state.committed = NFS_FILE_SYNC;
+		status = ios->length;
+	}
+
+	objlayout_write_done(&ios->ol_state, status, ios->ol_state.sync);
+	return status;
+}
+
+/*
+ * _write_mirrors - build the write requests for one stripe device and all
+ * of its mirrors.
+ *
+ * @cur_comp is the per_dev[] slot of the first mirror (the "master"), whose
+ * bio, offset and length were prepared earlier. The following
+ * mirrors_p1 - 1 slots get a clone of the master bio and copies of its
+ * offset and length, with consecutive component indexes.
+ *
+ * Returns 0, -ENOMEM when a request or a bio clone cannot be allocated, or
+ * the osd_finalize_request() error. Whatever was set up before a failure
+ * stays attached to the slots; nothing is released here.
+ */
+static int _write_mirrors(struct objio_state *ios, unsigned cur_comp)
+{
+ struct _objio_per_comp *master_dev = &ios->per_dev[cur_comp];
+ unsigned dev = ios->per_dev[cur_comp].dev;
+ unsigned last_comp = cur_comp + ios->layout->mirrors_p1;
+ int ret;
+
+ for (; cur_comp < last_comp; ++cur_comp, ++dev) {
+ struct osd_request *or = NULL;
+ struct pnfs_osd_object_cred *cred =
+ &ios->layout->comps[dev];
+ struct osd_obj_id obj = {
+ .partition = cred->oc_object_id.oid_partition_id,
+ .id = cred->oc_object_id.oid_object_id,
+ };
+ struct _objio_per_comp *per_dev = &ios->per_dev[cur_comp];
+ struct bio *bio;
+
+ or = osd_start_request(_io_od(ios, dev), GFP_NOFS);
+ if (unlikely(!or)) {
+ ret = -ENOMEM;
+ goto err;
+ }
+ per_dev->or = or;
+
+ if (per_dev != master_dev) {
+ /* a mirror: give it its own bio over the same pages */
+ bio = bio_kmalloc(GFP_NOFS,
+ master_dev->bio->bi_max_vecs);
+ if (unlikely(!bio)) {
+ dprintk("Faild to allocate BIO size=%u\n",
+ master_dev->bio->bi_max_vecs);
+ ret = -ENOMEM;
+ goto err;
+ }
+
+ __bio_clone(bio, master_dev->bio);
+ bio->bi_bdev = NULL;
+ bio->bi_next = NULL;
+ per_dev->bio = bio;
+ per_dev->dev = dev;
+ per_dev->length = master_dev->length;
+ per_dev->offset = master_dev->offset;
+ } else {
+ /* the master: mark its bio as a write before any
+ * mirror clones it */
+ bio = master_dev->bio;
+ bio->bi_rw |= REQ_WRITE;
+ }
+
+ osd_req_write(or, &obj, per_dev->offset, bio, per_dev->length);
+
+ ret = osd_finalize_request(or, 0, cred->oc_cap.cred, NULL);
+ if (ret) {
+ dprintk("%s: Faild to osd_finalize_request() => %d\n",
+ __func__, ret);
+ goto err;
+ }
+
+ dprintk("%s:[%d] dev=%d obj=0x%llx start=0x%llx length=0x%lx\n",
+ __func__, cur_comp, dev, obj.id, _LLU(per_dev->offset),
+ per_dev->length);
+ }
+
+err:
+ return ret;
+}
+
+/*
+ * Build the write requests for every stripe device that has data (and for
+ * its mirrors), then execute them. If building any request fails,
+ * everything set up so far is released and the error is returned.
+ */
+static ssize_t _write_exec(struct objio_state *ios)
+{
+	unsigned slot;
+
+	for (slot = 0; slot < ios->numdevs; slot += ios->layout->mirrors_p1) {
+		int ret;
+
+		if (!ios->per_dev[slot].length)
+			continue;
+
+		ret = _write_mirrors(ios, slot);
+		if (unlikely(ret)) {
+			_io_free(ios);
+			return ret;
+		}
+	}
+
+	ios->done = _write_done;
+	return _io_exec(ios); /* In sync mode exec returns the io->status */
+}
+
+/*
+ * Write entry point: lay the request described by @ol_state out over the
+ * devices (GFP_NOFS allocations) and start the writes. @stable is currently
+ * ignored. Returns the preparation error, or whatever _write_exec() returns.
+ */
+ssize_t objio_write_pagelist(struct objlayout_io_state *ol_state, bool stable)
+{
+	struct objio_state *ios =
+		container_of(ol_state, struct objio_state, ol_state);
+	int ret;
+
+	/* TODO: ios->stable = stable; */
+	ret = _io_rw_pagelist(ios, GFP_NOFS);
+	if (likely(!ret))
+		return _write_exec(ios);
+
+	return ret;
+}
+
+/*
+ * Page-coalescing test: @req may join the I/O being assembled in @pgio only
+ * if the generic pNFS test accepts it and the resulting byte count stays
+ * within the segment's cached max_io_size.
+ */
+static bool objio_pg_test(struct nfs_pageio_descriptor *pgio,
+	struct nfs_page *prev, struct nfs_page *req)
+{
+	return pnfs_generic_pg_test(pgio, prev, req) &&
+	       pgio->pg_count + req->wb_bytes <=
+			OBJIO_LSEG(pgio->pg_lseg)->max_io_size;
+}
+
+/*
+ * Layout driver description registered with the pNFS core by
+ * objlayout_init(). Apart from the page test and the device-id release,
+ * which live in this file, the operations are the generic objlayout_*
+ * entry points.
+ */
+static struct pnfs_layoutdriver_type objlayout_type = {
+ .id = LAYOUT_OSD2_OBJECTS,
+ .name = "LAYOUT_OSD2_OBJECTS",
+ .flags = PNFS_LAYOUTRET_ON_SETATTR,
+
+ /* layout header life cycle */
+ .alloc_layout_hdr = objlayout_alloc_layout_hdr,
+ .free_layout_hdr = objlayout_free_layout_hdr,
+
+ /* layout segment life cycle */
+ .alloc_lseg = objlayout_alloc_lseg,
+ .free_lseg = objlayout_free_lseg,
+
+ /* I/O */
+ .read_pagelist = objlayout_read_pagelist,
+ .write_pagelist = objlayout_write_pagelist,
+ .pg_test = objio_pg_test,
+
+ .free_deviceid_node = objio_free_deviceid_node,
+
+ /* XDR encoders */
+ .encode_layoutcommit = objlayout_encode_layoutcommit,
+ .encode_layoutreturn = objlayout_encode_layoutreturn,
+};
+
+MODULE_DESCRIPTION("pNFS Layout Driver for OSD2 objects");
+MODULE_AUTHOR("Benny Halevy <bhalevy@panasas.com>");
+MODULE_LICENSE("GPL");
+
+/*
+ * Module init: register the OSD layout driver with the pNFS core and log
+ * the outcome. Returns the registration result.
+ */
+static int __init
+objlayout_init(void)
+{
+	int ret = pnfs_register_layoutdriver(&objlayout_type);
+
+	if (!ret) {
+		printk(KERN_INFO "%s: Registered OSD pNFS Layout Driver\n",
+			__func__);
+		return 0;
+	}
+
+	printk(KERN_INFO
+		"%s: Registering OSD pNFS Layout Driver failed: error=%d\n",
+		__func__, ret);
+	return ret;
+}
+
+/*
+ * Module exit: unregister the OSD layout driver and log it.
+ */
+static void __exit
+objlayout_exit(void)
+{
+ pnfs_unregister_layoutdriver(&objlayout_type);
+ printk(KERN_INFO "%s: Unregistered OSD pNFS Layout Driver\n",
+ __func__);
+}
+
+module_init(objlayout_init);
+module_exit(objlayout_exit);
diff --git a/fs/nfs/objlayout/objlayout.c b/fs/nfs/objlayout/objlayout.c
new file mode 100644
index 00000000000..dc3956c0de8
--- /dev/null
+++ b/fs/nfs/objlayout/objlayout.c
@@ -0,0 +1,712 @@
+/*
+ * pNFS Objects layout driver high level definitions
+ *
+ * Copyright (C) 2007 Panasas Inc. [year of first publication]
+ * All rights reserved.
+ *
+ * Benny Halevy <bhalevy@panasas.com>
+ * Boaz Harrosh <bharrosh@panasas.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2
+ * See the file COPYING included with this distribution for more details.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the Panasas company nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <scsi/osd_initiator.h>
+#include "objlayout.h"
+
+#define NFSDBG_FACILITY NFSDBG_PNFS_LD
+/*
+ * Create an objlayout layout structure for the given inode and return its
+ * embedded pnfs_layout_hdr.
+ *
+ * @inode is not used here. The lock and the error list of the new layout
+ * are initialised; everything else is zeroed.
+ *
+ * Returns NULL when the allocation fails. NULL is returned explicitly so
+ * that the result does not depend on where pnfs_layout sits inside
+ * struct objlayout.
+ */
+struct pnfs_layout_hdr *
+objlayout_alloc_layout_hdr(struct inode *inode, gfp_t gfp_flags)
+{
+	struct objlayout *objlay;
+
+	objlay = kzalloc(sizeof(struct objlayout), gfp_flags);
+	dprintk("%s: Return %p\n", __func__, objlay);
+	if (!objlay)
+		return NULL;
+
+	spin_lock_init(&objlay->lock);
+	INIT_LIST_HEAD(&objlay->err_list);
+
+	return &objlay->pnfs_layout;
+}
+
+/*
+ * Free an objlayout layout structure.
+ *
+ * Warns if I/O error states are still queued on the layout's err_list at
+ * this point; the list entries themselves are not touched.
+ */
+void
+objlayout_free_layout_hdr(struct pnfs_layout_hdr *lo)
+{
+ struct objlayout *objlay = OBJLAYOUT(lo);
+
+ dprintk("%s: objlay %p\n", __func__, objlay);
+
+ WARN_ON(!list_empty(&objlay->err_list));
+ kfree(objlay);
+}
+
+/*
+ * Unmarshal the layout carried in @lgr and return the resulting layout
+ * segment.
+ *
+ * An xdr stream is set up over the layout pages, with one temporary page
+ * as scratch buffer, and handed to objio_alloc_lseg(). The scratch page is
+ * released before returning on every path.
+ *
+ * Returns the new segment, or ERR_PTR(-ENOMEM) when the scratch page cannot
+ * be allocated, or ERR_PTR() of the objio_alloc_lseg() error.
+ */
+struct pnfs_layout_segment *
+objlayout_alloc_lseg(struct pnfs_layout_hdr *pnfslay,
+	struct nfs4_layoutget_res *lgr,
+	gfp_t gfp_flags)
+{
+	struct xdr_buf buf = {
+		.pages = lgr->layoutp->pages,
+		.page_len = lgr->layoutp->len,
+		.buflen = lgr->layoutp->len,
+		.len = lgr->layoutp->len,
+	};
+	struct xdr_stream stream;
+	struct pnfs_layout_segment *lseg;
+	struct page *scratch;
+	int status = -ENOMEM;
+
+	dprintk("%s: Begin pnfslay %p\n", __func__, pnfslay);
+
+	scratch = alloc_page(gfp_flags);
+	if (scratch) {
+		xdr_init_decode(&stream, &buf, NULL);
+		xdr_set_scratch_buffer(&stream, page_address(scratch),
+				       PAGE_SIZE);
+
+		status = objio_alloc_lseg(&lseg, pnfslay, &lgr->range,
+					  &stream, gfp_flags);
+		if (likely(!status)) {
+			__free_page(scratch);
+
+			dprintk("%s: Return %p\n", __func__, lseg);
+			return lseg;
+		}
+
+		dprintk("%s: objio_alloc_lseg Return err %d\n", __func__,
+			status);
+		__free_page(scratch);
+	}
+
+	dprintk("%s: Err Return=>%d\n", __func__, status);
+	return ERR_PTR(status);
+}
+
+/*
+ * Free a layout segment. A NULL @lseg is accepted and ignored.
+ */
+void
+objlayout_free_lseg(struct pnfs_layout_segment *lseg)
+{
+	dprintk("%s: freeing layout segment %p\n", __func__, lseg);
+
+	if (likely(lseg))
+		objio_free_lseg(lseg);
+}
+
+/*
+ * I/O Operations
+ */
+/* first octet past a range; saturates at NFS4_MAX_UINT64 if start + len
+ * wraps around */
+static inline u64
+end_offset(u64 start, u64 len)
+{
+	u64 end = start + len;
+
+	if (end < start)
+		return NFS4_MAX_UINT64;
+	return end;
+}
+
+/* last octet in a range; @len must not be 0. Saturates at NFS4_MAX_UINT64
+ * if start + len wraps around */
+static inline u64
+last_byte_offset(u64 start, u64 len)
+{
+	u64 past_end;
+
+	BUG_ON(!len);
+
+	past_end = start + len;
+	if (past_end <= start)
+		return NFS4_MAX_UINT64;
+	return past_end - 1;
+}
+
+/*
+ * objlayout_alloc_io_state - allocate and fill the state of one I/O.
+ *
+ * @pnfs_layout_type: not used by this function.
+ * @pages:     page array holding the data.
+ * @pgbase:    byte offset of the data relative to the start of pages[0];
+ *             it may be a page or more, in which case whole pages are
+ *             skipped.
+ * @offset:    file offset of the I/O; must lie inside @lseg (BUG otherwise).
+ * @count:     byte count; truncated to the end of @lseg if it runs past it.
+ * @lseg:      layout segment the I/O goes through.
+ * @rpcdata:   opaque pointer stored in the state for the completion side.
+ * @gfp_flags: allocation flags.
+ *
+ * Returns the new state (with sync cleared), or NULL if the allocation
+ * failed.
+ */
+static struct objlayout_io_state *
+objlayout_alloc_io_state(struct pnfs_layout_hdr *pnfs_layout_type,
+	struct page **pages,
+	unsigned pgbase,
+	loff_t offset,
+	size_t count,
+	struct pnfs_layout_segment *lseg,
+	void *rpcdata,
+	gfp_t gfp_flags)
+{
+	struct objlayout_io_state *state;
+	u64 lseg_end_offset;
+
+	dprintk("%s: allocating io_state\n", __func__);
+	if (objio_alloc_io_state(lseg, &state, gfp_flags))
+		return NULL;
+
+	BUG_ON(offset < lseg->pls_range.offset);
+	lseg_end_offset = end_offset(lseg->pls_range.offset,
+				     lseg->pls_range.length);
+	BUG_ON(offset >= lseg_end_offset);
+	if (offset + count > lseg_end_offset) {
+		count = lseg->pls_range.length -
+				(offset - lseg->pls_range.offset);
+		dprintk("%s: truncated count %Zd\n", __func__, count);
+	}
+
+	/*
+	 * Normalise pgbase to an in-page offset. This has to include
+	 * pgbase == PAGE_SIZE: the data then starts at byte 0 of pages[1],
+	 * and leaving it alone would make the I/O start on a page that holds
+	 * no payload.
+	 */
+	if (pgbase >= PAGE_SIZE) {
+		pages += pgbase >> PAGE_SHIFT;
+		pgbase &= ~PAGE_MASK;
+	}
+
+	INIT_LIST_HEAD(&state->err_list);
+	state->lseg = lseg;
+	state->rpcdata = rpcdata;
+	state->pages = pages;
+	state->pgbase = pgbase;
+	state->nr_pages = (pgbase + count + PAGE_SIZE - 1) >> PAGE_SHIFT;
+	state->offset = offset;
+	state->count = count;
+	state->sync = 0;
+
+	return state;
+}
+
+/*
+ * Free an I/O state. A NULL @state is accepted and ignored.
+ */
+static void
+objlayout_free_io_state(struct objlayout_io_state *state)
+{
+	dprintk("%s: freeing io_state\n", __func__);
+
+	if (likely(state))
+		objio_free_io_state(state);
+}
+
+/*
+ * I/O done common code
+ *
+ * A state that completed successfully (status >= 0) is freed right away.
+ * A failed one is kept: it is queued on the layout's err_list so that its
+ * recorded errors can be reported later, and the layout's delta-space
+ * accounting is marked invalid. Either way the caller must not use @state
+ * afterwards.
+ */
+static void
+objlayout_iodone(struct objlayout_io_state *state)
+{
+	dprintk("%s: state %p status\n", __func__, state);
+
+	if (likely(state->status >= 0)) {
+		objlayout_free_io_state(state);
+	} else {
+		struct objlayout *objlay = OBJLAYOUT(state->lseg->pls_layout);
+
+		spin_lock(&objlay->lock);
+		objlay->delta_space_valid = OBJ_DSU_INVALID;
+		/*
+		 * list_add(new, head): the state is the new entry and the
+		 * layout's err_list is the head. With the arguments the other
+		 * way round, states that were already queued would be dropped
+		 * from the list as soon as a second I/O failed.
+		 */
+		list_add(&state->err_list, &objlay->err_list);
+		spin_unlock(&objlay->lock);
+	}
+}
+
+/*
+ * objlayout_io_set_result - Set an osd_error code on a specific osd comp.
+ *
+ * The I/O on component @index failed with @osd_error (an error returned
+ * from the target). The error is stored in the state's ioerrs[@index] slot
+ * together with the object id, range and direction, for later reporting at
+ * layout-return. Calling with @osd_error == 0 just clears the slot's errno;
+ * callers need not call at all when there is no error.
+ *
+ * @index must be below state->num_comps (BUG otherwise).
+ */
+void
+objlayout_io_set_result(struct objlayout_io_state *state, unsigned index,
+	struct pnfs_osd_objid *pooid, int osd_error,
+	u64 offset, u64 length, bool is_write)
+{
+	struct pnfs_osd_ioerr *ioerr;
+
+	BUG_ON(index >= state->num_comps);
+	ioerr = &state->ioerrs[index];
+
+	if (!osd_error) {
+		/* User need not call if no error is reported */
+		ioerr->oer_errno = 0;
+		return;
+	}
+
+	ioerr->oer_component = *pooid;
+	ioerr->oer_comp_offset = offset;
+	ioerr->oer_comp_length = length;
+	ioerr->oer_iswrite = is_write;
+	ioerr->oer_errno = osd_error;
+
+	dprintk("%s: err[%d]: errno=%d is_write=%d dev(%llx:%llx) "
+		"par=0x%llx obj=0x%llx offset=0x%llx length=0x%llx\n",
+		__func__, index, ioerr->oer_errno,
+		ioerr->oer_iswrite,
+		_DEVID_LO(&ioerr->oer_component.oid_device_id),
+		_DEVID_HI(&ioerr->oer_component.oid_device_id),
+		ioerr->oer_component.oid_partition_id,
+		ioerr->oer_component.oid_object_id,
+		ioerr->oer_comp_offset,
+		ioerr->oer_comp_length);
+}
+
+/*
+ * Work handler that completes an asynchronous read.
+ *
+ * The osd completion is called with ints-off from the block layer, so the
+ * call to pnfs_ld_read_done() is deferred to this work item. @work is the
+ * tk_work member of the rpc_task embedded in the nfs_read_data.
+ */
+static void _rpc_read_complete(struct work_struct *work)
+{
+	struct rpc_task *task = container_of(work, struct rpc_task, u.tk_work);
+
+	dprintk("%s enter\n", __func__);
+
+	pnfs_ld_read_done(container_of(task, struct nfs_read_data, task));
+}
+
+/*
+ * objlayout_read_done - report the outcome of a read to the NFS client.
+ *
+ * @state:  the I/O state; consumed by this call (see objlayout_iodone()).
+ * @status: byte count on success, negative error otherwise.
+ * @sync:   when true pnfs_ld_read_done() is called directly, otherwise it is
+ *          deferred to a work item (_rpc_read_complete()).
+ *
+ * The status is copied to the state and to the rpc task; on success the
+ * read result's count and eof flag are filled in as well.
+ */
+void
+objlayout_read_done(struct objlayout_io_state *state, ssize_t status, bool sync)
+{
+ /* taken now: state may be gone after objlayout_iodone() */
+ int eof = state->eof;
+ struct nfs_read_data *rdata;
+
+ state->status = status;
+ dprintk("%s: Begin status=%ld eof=%d\n", __func__, status, eof);
+ rdata = state->rpcdata;
+ rdata->task.tk_status = status;
+ if (status >= 0) {
+ rdata->res.count = status;
+ rdata->res.eof = eof;
+ }
+ objlayout_iodone(state);
+ /* must not use state after this point */
+
+ if (sync)
+ pnfs_ld_read_done(rdata);
+ else {
+ INIT_WORK(&rdata->task.u.tk_work, _rpc_read_complete);
+ schedule_work(&rdata->task.u.tk_work);
+ }
+}
+
+/*
+ * Perform sync or async reads.
+ *
+ * The request is first clamped to the inode size: a read that starts at or
+ * beyond it is answered here with count 0 and eof set, one that crosses it
+ * is shortened. Otherwise an I/O state is allocated and handed to
+ * objio_read_pagelist().
+ *
+ * Always returns PNFS_ATTEMPTED; the outcome is stored in rdata->pnfs_error.
+ * NOTE(review): objio_read_pagelist() may return a positive value in sync
+ * mode, which is then stored in pnfs_error as is - confirm callers only
+ * test for negative values.
+ * NOTE(review): when objio_read_pagelist() fails the state is not released
+ * in this function - confirm who owns it on that path.
+ */
+enum pnfs_try_status
+objlayout_read_pagelist(struct nfs_read_data *rdata)
+{
+ loff_t offset = rdata->args.offset;
+ size_t count = rdata->args.count;
+ struct objlayout_io_state *state;
+ ssize_t status = 0;
+ loff_t eof;
+
+ dprintk("%s: Begin inode %p offset %llu count %d\n",
+ __func__, rdata->inode, offset, (int)count);
+
+ eof = i_size_read(rdata->inode);
+ if (unlikely(offset + count > eof)) {
+ if (offset >= eof) {
+ /* nothing to read at all */
+ status = 0;
+ rdata->res.count = 0;
+ rdata->res.eof = 1;
+ goto out;
+ }
+ count = eof - offset;
+ }
+
+ state = objlayout_alloc_io_state(NFS_I(rdata->inode)->layout,
+ rdata->args.pages, rdata->args.pgbase,
+ offset, count,
+ rdata->lseg, rdata,
+ GFP_KERNEL);
+ if (unlikely(!state)) {
+ status = -ENOMEM;
+ goto out;
+ }
+
+ /* uses the state's offset/count: the allocation may have truncated
+ * the count to the end of the layout segment */
+ state->eof = state->offset + state->count >= eof;
+
+ status = objio_read_pagelist(state);
+ out:
+ dprintk("%s: Return status %Zd\n", __func__, status);
+ rdata->pnfs_error = status;
+ return PNFS_ATTEMPTED;
+}
+
+/*
+ * Work handler that completes an asynchronous write.
+ *
+ * The osd completion is called with ints-off from the block layer, so the
+ * call to pnfs_ld_write_done() is deferred to this work item. @work is the
+ * tk_work member of the rpc_task embedded in the nfs_write_data.
+ */
+static void _rpc_write_complete(struct work_struct *work)
+{
+	struct rpc_task *task = container_of(work, struct rpc_task, u.tk_work);
+
+	dprintk("%s enter\n", __func__);
+
+	pnfs_ld_write_done(container_of(task, struct nfs_write_data, task));
+}
+
+/*
+ * objlayout_write_done - report the outcome of a write to the NFS client.
+ *
+ * @state:  the I/O state; consumed by this call (see objlayout_iodone()).
+ * @status: byte count on success, negative error otherwise.
+ * @sync:   when true pnfs_ld_write_done() is called directly, otherwise it
+ *          is deferred to a work item (_rpc_write_complete()).
+ *
+ * The status is copied to the state and to the rpc task; on success the
+ * write result's count and the verifier's committed level are filled in
+ * from the state.
+ */
+void
+objlayout_write_done(struct objlayout_io_state *state, ssize_t status,
+ bool sync)
+{
+ struct nfs_write_data *wdata;
+
+ dprintk("%s: Begin\n", __func__);
+ wdata = state->rpcdata;
+ state->status = status;
+ wdata->task.tk_status = status;
+ if (status >= 0) {
+ wdata->res.count = status;
+ wdata->verf.committed = state->committed;
+ dprintk("%s: Return status %d committed %d\n",
+ __func__, wdata->task.tk_status,
+ wdata->verf.committed);
+ } else
+ dprintk("%s: Return status %d\n",
+ __func__, wdata->task.tk_status);
+ objlayout_iodone(state);
+ /* must not use state after this point */
+
+ if (sync)
+ pnfs_ld_write_done(wdata);
+ else {
+ INIT_WORK(&wdata->task.u.tk_work, _rpc_write_complete);
+ schedule_work(&wdata->task.u.tk_work);
+ }
+}
+
+/*
+ * Perform sync or async writes.
+ *
+ * Allocates an I/O state for the request (GFP_NOFS), marks it synchronous
+ * when @how carries FLUSH_SYNC and hands it to objio_write_pagelist(),
+ * passing on whether FLUSH_STABLE was requested.
+ *
+ * Always returns PNFS_ATTEMPTED; the outcome is stored in wdata->pnfs_error.
+ * NOTE(review): objio_write_pagelist() may return a positive value in sync
+ * mode, which is then stored in pnfs_error as is - confirm callers only
+ * test for negative values.
+ * NOTE(review): when objio_write_pagelist() fails the state is not released
+ * in this function - confirm who owns it on that path.
+ */
+enum pnfs_try_status
+objlayout_write_pagelist(struct nfs_write_data *wdata,
+ int how)
+{
+ struct objlayout_io_state *state;
+ ssize_t status;
+
+ dprintk("%s: Begin inode %p offset %llu count %u\n",
+ __func__, wdata->inode, wdata->args.offset, wdata->args.count);
+
+ state = objlayout_alloc_io_state(NFS_I(wdata->inode)->layout,
+ wdata->args.pages,
+ wdata->args.pgbase,
+ wdata->args.offset,
+ wdata->args.count,
+ wdata->lseg, wdata,
+ GFP_NOFS);
+ if (unlikely(!state)) {
+ status = -ENOMEM;
+ goto out;
+ }
+
+ state->sync = how & FLUSH_SYNC;
+
+ status = objio_write_pagelist(state, how & FLUSH_STABLE);
+ out:
+ dprintk("%s: Return status %Zd\n", __func__, status);
+ wdata->pnfs_error = status;
+ return PNFS_ATTEMPTED;
+}
+
+/*
+ * objlayout_encode_layoutcommit - encode the layout-update body of a
+ * LAYOUTCOMMIT.
+ *
+ * Under the layout lock the accumulated delta-space value is taken over
+ * into the update and reset (used = 0, valid = OBJ_DSU_INIT), and the
+ * I/O-error flag is set when error states are queued on the layout.
+ * The update is written to @xdr behind a 4-byte length word that is filled
+ * in afterwards. @args is not used.
+ *
+ * A failure to encode the update is a BUG.
+ * NOTE(review): the result of xdr_reserve_space() is not checked before it
+ * is written through - confirm the caller guarantees the space.
+ */
+void
+objlayout_encode_layoutcommit(struct pnfs_layout_hdr *pnfslay,
+ struct xdr_stream *xdr,
+ const struct nfs4_layoutcommit_args *args)
+{
+ struct objlayout *objlay = OBJLAYOUT(pnfslay);
+ struct pnfs_osd_layoutupdate lou;
+ __be32 *start;
+
+ dprintk("%s: Begin\n", __func__);
+
+ spin_lock(&objlay->lock);
+ lou.dsu_valid = (objlay->delta_space_valid == OBJ_DSU_VALID);
+ lou.dsu_delta = objlay->delta_space_used;
+ objlay->delta_space_used = 0;
+ objlay->delta_space_valid = OBJ_DSU_INIT;
+ lou.olu_ioerr_flag = !list_empty(&objlay->err_list);
+ spin_unlock(&objlay->lock);
+
+ /* placeholder for the length of what follows */
+ start = xdr_reserve_space(xdr, 4);
+
+ BUG_ON(pnfs_osd_xdr_encode_layoutupdate(xdr, &lou));
+
+ /* bytes encoded after the length word itself */
+ *start = cpu_to_be32((xdr->p - start - 1) * 4);
+
+ dprintk("%s: Return delta_space_used %lld err %d\n", __func__,
+ lou.dsu_delta, lou.olu_ioerr_flag);
+}
+
+/*
+ * Map a pNFS-OSD error number to its merge priority.
+ * Zero (no error) has priority 0. An unrecognized value triggers a warning
+ * and is ranked like PNFS_OSD_ERR_EIO.
+ */
+static int
+err_prio(u32 oer_errno)
+{
+	switch (oer_errno) {
+	case 0:
+		return 0;
+	case PNFS_OSD_ERR_RESOURCE:
+		return OSD_ERR_PRI_RESOURCE;
+	case PNFS_OSD_ERR_BAD_CRED:
+		return OSD_ERR_PRI_BAD_CRED;
+	case PNFS_OSD_ERR_NO_ACCESS:
+		return OSD_ERR_PRI_NO_ACCESS;
+	case PNFS_OSD_ERR_UNREACHABLE:
+		return OSD_ERR_PRI_UNREACHABLE;
+	case PNFS_OSD_ERR_NOT_FOUND:
+		return OSD_ERR_PRI_NOT_FOUND;
+	case PNFS_OSD_ERR_NO_SPACE:
+		return OSD_ERR_PRI_NO_SPACE;
+	case PNFS_OSD_ERR_EIO:
+		return OSD_ERR_PRI_EIO;
+	default:
+		break;
+	}
+
+	/* unknown error number: complain, then rank it as a plain I/O error */
+	WARN_ON(1);
+	return OSD_ERR_PRI_EIO;
+}
+
+/*
+ * Fold @src_err into the accumulated descriptor @dest_err.
+ *
+ * - An empty accumulator (oer_errno == 0) simply takes a copy of @src_err
+ *   with the device id blanked.
+ * - Partition and object ids are zeroed as soon as two merged errors
+ *   disagree on them.
+ * - The byte range becomes the union of both ranges.
+ * - A write error replaces a read error; between errors of the same kind
+ *   the one with the higher err_prio() is kept.
+ */
+static void
+merge_ioerr(struct pnfs_osd_ioerr *dest_err,
+	    const struct pnfs_osd_ioerr *src_err)
+{
+	u64 dest_end, src_end;
+
+	if (!dest_err->oer_errno) {
+		*dest_err = *src_err;
+		/* accumulated device must be blank */
+		memset(&dest_err->oer_component.oid_device_id, 0,
+		       sizeof(dest_err->oer_component.oid_device_id));
+
+		return;
+	}
+
+	if (dest_err->oer_component.oid_partition_id !=
+	    src_err->oer_component.oid_partition_id)
+		dest_err->oer_component.oid_partition_id = 0;
+
+	if (dest_err->oer_component.oid_object_id !=
+	    src_err->oer_component.oid_object_id)
+		dest_err->oer_component.oid_object_id = 0;
+
+	/*
+	 * Both ends must be computed before the start offset is moved:
+	 * pairing the old length with a lowered offset would shrink the end
+	 * of the accumulated range.
+	 */
+	dest_end = end_offset(dest_err->oer_comp_offset,
+			      dest_err->oer_comp_length);
+	src_end = end_offset(src_err->oer_comp_offset,
+			     src_err->oer_comp_length);
+	if (dest_end < src_end)
+		dest_end = src_end;
+
+	if (dest_err->oer_comp_offset > src_err->oer_comp_offset)
+		dest_err->oer_comp_offset = src_err->oer_comp_offset;
+
+	dest_err->oer_comp_length = dest_end - dest_err->oer_comp_offset;
+
+	if (src_err->oer_iswrite == dest_err->oer_iswrite) {
+		/* same kind of I/O: only a higher priority error wins */
+		if (err_prio(src_err->oer_errno) >
+		    err_prio(dest_err->oer_errno))
+			dest_err->oer_errno = src_err->oer_errno;
+	} else if (src_err->oer_iswrite) {
+		/* a write error always overrides an accumulated read error */
+		dest_err->oer_iswrite = true;
+		dest_err->oer_errno = src_err->oer_errno;
+	}
+}
+
+/*
+ * Collapse every error still queued on objlay->err_list into a single
+ * descriptor and encode it at @p.
+ *
+ * Each non-zero ioerr of each queued io_state is logged and merged with
+ * merge_ioerr(); the io_state is then unlinked and freed. The only caller
+ * in this file, objlayout_encode_layoutreturn(), invokes this with
+ * objlay->lock held.
+ */
+static void
+encode_accumulated_error(struct objlayout *objlay, __be32 *p)
+{
+ struct objlayout_io_state *state, *tmp;
+ struct pnfs_osd_ioerr accumulated_err = {.oer_errno = 0};
+
+ list_for_each_entry_safe(state, tmp, &objlay->err_list, err_list) {
+ unsigned i;
+
+ for (i = 0; i < state->num_comps; i++) {
+ struct pnfs_osd_ioerr *ioerr = &state->ioerrs[i];
+
+ /* slots without an error carry nothing to report */
+ if (!ioerr->oer_errno)
+ continue;
+
+ printk(KERN_ERR "%s: err[%d]: errno=%d is_write=%d "
+ "dev(%llx:%llx) par=0x%llx obj=0x%llx "
+ "offset=0x%llx length=0x%llx\n",
+ __func__, i, ioerr->oer_errno,
+ ioerr->oer_iswrite,
+ _DEVID_LO(&ioerr->oer_component.oid_device_id),
+ _DEVID_HI(&ioerr->oer_component.oid_device_id),
+ ioerr->oer_component.oid_partition_id,
+ ioerr->oer_component.oid_object_id,
+ ioerr->oer_comp_offset,
+ ioerr->oer_comp_length);
+
+ merge_ioerr(&accumulated_err, ioerr);
+ }
+ list_del(&state->err_list);
+ objlayout_free_io_state(state);
+ }
+
+ pnfs_osd_xdr_encode_ioerr(p, &accumulated_err);
+}
+
+/*
+ * Encode the objects-layout specific part of LAYOUTRETURN: one
+ * pnfs_osd_ioerr descriptor per recorded I/O error, preceded by a 4-byte
+ * length word that is patched in at the end.
+ *
+ * Under objlay->lock, every io_state on objlay->err_list is encoded and then
+ * unlinked and freed. If the xdr stream runs out of room, the last slot
+ * that was reserved is overwritten with a single descriptor summarizing all
+ * errors still on the list (see encode_accumulated_error()).
+ * @args is not used here.
+ */
+void
+objlayout_encode_layoutreturn(struct pnfs_layout_hdr *pnfslay,
+ struct xdr_stream *xdr,
+ const struct nfs4_layoutreturn_args *args)
+{
+ struct objlayout *objlay = OBJLAYOUT(pnfslay);
+ struct objlayout_io_state *state, *tmp;
+ __be32 *start;
+
+ dprintk("%s: Begin\n", __func__);
+ /* placeholder for the body length, filled in after the loop */
+ start = xdr_reserve_space(xdr, 4);
+ BUG_ON(!start);
+
+ spin_lock(&objlay->lock);
+
+ list_for_each_entry_safe(state, tmp, &objlay->err_list, err_list) {
+ /*
+ * NOTE(review): last_xdr starts out NULL for every io_state, so
+ * running out of space on the first descriptor of a later
+ * io_state hits the BUG_ON below even though earlier slots were
+ * reserved -- confirm this is intended.
+ */
+ __be32 *last_xdr = NULL, *p;
+ unsigned i;
+ int res = 0;
+
+ for (i = 0; i < state->num_comps; i++) {
+ struct pnfs_osd_ioerr *ioerr = &state->ioerrs[i];
+
+ if (!ioerr->oer_errno)
+ continue;
+
+ dprintk("%s: err[%d]: errno=%d is_write=%d "
+ "dev(%llx:%llx) par=0x%llx obj=0x%llx "
+ "offset=0x%llx length=0x%llx\n",
+ __func__, i, ioerr->oer_errno,
+ ioerr->oer_iswrite,
+ _DEVID_LO(&ioerr->oer_component.oid_device_id),
+ _DEVID_HI(&ioerr->oer_component.oid_device_id),
+ ioerr->oer_component.oid_partition_id,
+ ioerr->oer_component.oid_object_id,
+ ioerr->oer_comp_offset,
+ ioerr->oer_comp_length);
+
+ p = pnfs_osd_xdr_ioerr_reserve_space(xdr);
+ if (unlikely(!p)) {
+ res = -E2BIG;
+ break; /* accumulated_error */
+ }
+
+ last_xdr = p;
+ pnfs_osd_xdr_encode_ioerr(p, &state->ioerrs[i]);
+ }
+
+ /* TODO: use xdr_write_pages */
+ if (unlikely(res)) {
+ /* no space for even one error descriptor */
+ BUG_ON(!last_xdr);
+
+ /* we've encountered a situation with lots and lots of
+ * errors and no space to encode them all. Use the last
+ * available slot to report the union of all the
+ * remaining errors.
+ */
+ encode_accumulated_error(objlay, last_xdr);
+ goto loop_done;
+ }
+ /* fully encoded: this io_state is no longer needed */
+ list_del(&state->err_list);
+ objlayout_free_io_state(state);
+ }
+loop_done:
+ spin_unlock(&objlay->lock);
+
+ /* body length in bytes, not counting the length word itself */
+ *start = cpu_to_be32((xdr->p - start - 1) * 4);
+ dprintk("%s: Return\n", __func__);
+}
+
+
+/*
+ * Get Device Info API for io engines
+ */
+/*
+ * Wrapper around a decoded device address.
+ * objlayout_get_deviceinfo() hands out a pointer to @da;
+ * objlayout_put_deviceinfo() maps it back to this wrapper with
+ * container_of() and frees @page together with the wrapper.
+ */
+struct objlayout_deviceinfo {
+ struct page *page;
+ struct pnfs_osd_deviceaddr da; /* This must be last */
+};
+
+/*
+ * Fetch and decode the device address for @d_id.
+ *
+ * A single page is allocated as the GETDEVICEINFO reply buffer and
+ * nfs4_proc_getdeviceinfo() is called on the server of @pnfslay's inode.
+ * On success the reply is decoded into a newly allocated
+ * objlayout_deviceinfo and *@deviceaddr is set to its embedded
+ * pnfs_osd_deviceaddr. The decoded strings point into the reply page, so
+ * the page is kept until objlayout_put_deviceinfo() is called.
+ *
+ * Returns 0 on success, -ENOMEM on allocation failure, or the error from
+ * nfs4_proc_getdeviceinfo(). On failure *@deviceaddr is left untouched and
+ * the page is freed.
+ */
+int objlayout_get_deviceinfo(struct pnfs_layout_hdr *pnfslay,
+	struct nfs4_deviceid *d_id, struct pnfs_osd_deviceaddr **deviceaddr,
+	gfp_t gfp_flags)
+{
+	struct objlayout_deviceinfo *odi;
+	struct pnfs_device pd;
+	struct page *page;
+	int err;
+
+	page = alloc_page(gfp_flags);
+	if (!page)
+		return -ENOMEM;
+
+	memcpy(&pd.dev_id, d_id, sizeof(*d_id));
+	pd.layout_type = LAYOUT_OSD2_OBJECTS;
+	pd.pages = &page;
+	pd.pgbase = 0;
+	pd.pglen = PAGE_SIZE;
+	pd.mincount = 0;
+
+	err = nfs4_proc_getdeviceinfo(NFS_SERVER(pnfslay->plh_inode), &pd);
+	dprintk("%s nfs_getdeviceinfo returned %d\n", __func__, err);
+	if (err)
+		goto err_out;
+
+	odi = kzalloc(sizeof(*odi), gfp_flags);
+	if (!odi) {
+		err = -ENOMEM;
+		goto err_out;
+	}
+	/* the reply page holds big-endian XDR words */
+	pnfs_osd_xdr_decode_deviceaddr(&odi->da, page_address(page));
+	odi->page = page;
+	*deviceaddr = &odi->da;
+	return 0;
+
+err_out:
+	__free_page(page);
+	return err;
+}
+
+/*
+ * Release a device address obtained from objlayout_get_deviceinfo():
+ * frees the reply page and the wrapper that embeds @deviceaddr.
+ */
+void objlayout_put_deviceinfo(struct pnfs_osd_deviceaddr *deviceaddr)
+{
+	struct objlayout_deviceinfo *wrapper;
+
+	wrapper = container_of(deviceaddr, struct objlayout_deviceinfo, da);
+
+	__free_page(wrapper->page);
+	kfree(wrapper);
+}
diff --git a/fs/nfs/objlayout/objlayout.h b/fs/nfs/objlayout/objlayout.h
new file mode 100644
index 00000000000..a8244c8e042
--- /dev/null
+++ b/fs/nfs/objlayout/objlayout.h
@@ -0,0 +1,187 @@
+/*
+ * Data types and function declarations for interfacing with the
+ * pNFS standard object layout driver.
+ *
+ * Copyright (C) 2007 Panasas Inc. [year of first publication]
+ * All rights reserved.
+ *
+ * Benny Halevy <bhalevy@panasas.com>
+ * Boaz Harrosh <bharrosh@panasas.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2
+ * See the file COPYING included with this distribution for more details.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the Panasas company nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _OBJLAYOUT_H
+#define _OBJLAYOUT_H
+
+#include <linux/nfs_fs.h>
+#include <linux/pnfs_osd_xdr.h>
+#include "../pnfs.h"
+
+/*
+ * per-inode layout
+ *
+ * Embeds the generic pnfs_layout_hdr; OBJLAYOUT() converts a pointer to
+ * that header back to this structure.
+ */
+struct objlayout {
+ struct pnfs_layout_hdr pnfs_layout;
+
+ /* for layout_commit */
+ enum osd_delta_space_valid_enum {
+ OBJ_DSU_INIT = 0,
+ OBJ_DSU_VALID,
+ OBJ_DSU_INVALID,
+ } delta_space_valid;
+ s64 delta_space_used; /* consumed by write ops */
+
+ /* for layout_return */
+ /* lock is taken around updates of delta_space_* and walks of err_list */
+ spinlock_t lock;
+ struct list_head err_list;
+};
+
+/* Convert a generic layout header into the objlayout that embeds it. */
+static inline struct objlayout *
+OBJLAYOUT(struct pnfs_layout_hdr *lo)
+{
+	struct objlayout *objlay;
+
+	objlay = container_of(lo, struct objlayout, pnfs_layout);
+	return objlay;
+}
+
+/*
+ * per-I/O operation state
+ * embedded in objects provider io_state data structure
+ */
+struct objlayout_io_state {
+ struct pnfs_layout_segment *lseg;
+
+ /* description of the I/O request */
+ struct page **pages;
+ unsigned pgbase;
+ unsigned nr_pages;
+ unsigned long count;
+ loff_t offset;
+ bool sync;
+
+ /* the nfs read/write data this I/O belongs to */
+ void *rpcdata;
+ int status; /* res */
+ int eof; /* res */
+ int committed; /* res */
+
+ /* Error reporting (layout_return) */
+ struct list_head err_list;
+ unsigned num_comps;
+ /* Pointer to array of error descriptors of size num_comps.
+ * It should contain as many entries as devices in the osd_layout
+ * that participate in the I/O. It is up to the io_engine to allocate
+ * needed space and set num_comps.
+ */
+ struct pnfs_osd_ioerr *ioerrs;
+};
+
+/*
+ * Raid engine I/O API
+ */
+extern int objio_alloc_lseg(struct pnfs_layout_segment **outp,
+ struct pnfs_layout_hdr *pnfslay,
+ struct pnfs_layout_range *range,
+ struct xdr_stream *xdr,
+ gfp_t gfp_flags);
+extern void objio_free_lseg(struct pnfs_layout_segment *lseg);
+
+extern int objio_alloc_io_state(
+ struct pnfs_layout_segment *lseg,
+ struct objlayout_io_state **outp,
+ gfp_t gfp_flags);
+extern void objio_free_io_state(struct objlayout_io_state *state);
+
+extern ssize_t objio_read_pagelist(struct objlayout_io_state *ol_state);
+extern ssize_t objio_write_pagelist(struct objlayout_io_state *ol_state,
+ bool stable);
+
+/*
+ * callback API
+ */
+extern void objlayout_io_set_result(struct objlayout_io_state *state,
+ unsigned index, struct pnfs_osd_objid *pooid,
+ int osd_error, u64 offset, u64 length, bool is_write);
+
+/*
+ * Account @space_used bytes consumed by a write towards the next
+ * LAYOUTCOMMIT.
+ *
+ * Once the per-layout report has been marked OBJ_DSU_INVALID it stays
+ * invalid: the protocol mandates that the delta-space-used value is either
+ * accurate or not reported at all, so nothing is added in that state.
+ */
+static inline void
+objlayout_add_delta_space_used(struct objlayout_io_state *state, s64 space_used)
+{
+	struct objlayout *layout = OBJLAYOUT(state->lseg->pls_layout);
+
+	spin_lock(&layout->lock);
+	if (layout->delta_space_valid != OBJ_DSU_INVALID) {
+		layout->delta_space_used += space_used;
+		layout->delta_space_valid = OBJ_DSU_VALID;
+	}
+	spin_unlock(&layout->lock);
+}
+
+extern void objlayout_read_done(struct objlayout_io_state *state,
+ ssize_t status, bool sync);
+extern void objlayout_write_done(struct objlayout_io_state *state,
+ ssize_t status, bool sync);
+
+extern int objlayout_get_deviceinfo(struct pnfs_layout_hdr *pnfslay,
+ struct nfs4_deviceid *d_id, struct pnfs_osd_deviceaddr **deviceaddr,
+ gfp_t gfp_flags);
+extern void objlayout_put_deviceinfo(struct pnfs_osd_deviceaddr *deviceaddr);
+
+/*
+ * exported generic objects function vectors
+ */
+
+extern struct pnfs_layout_hdr *objlayout_alloc_layout_hdr(struct inode *, gfp_t gfp_flags);
+extern void objlayout_free_layout_hdr(struct pnfs_layout_hdr *);
+
+extern struct pnfs_layout_segment *objlayout_alloc_lseg(
+ struct pnfs_layout_hdr *,
+ struct nfs4_layoutget_res *,
+ gfp_t gfp_flags);
+extern void objlayout_free_lseg(struct pnfs_layout_segment *);
+
+extern enum pnfs_try_status objlayout_read_pagelist(
+ struct nfs_read_data *);
+
+extern enum pnfs_try_status objlayout_write_pagelist(
+ struct nfs_write_data *,
+ int how);
+
+extern void objlayout_encode_layoutcommit(
+ struct pnfs_layout_hdr *,
+ struct xdr_stream *,
+ const struct nfs4_layoutcommit_args *);
+
+extern void objlayout_encode_layoutreturn(
+ struct pnfs_layout_hdr *,
+ struct xdr_stream *,
+ const struct nfs4_layoutreturn_args *);
+
+#endif /* _OBJLAYOUT_H */
diff --git a/fs/nfs/objlayout/pnfs_osd_xdr_cli.c b/fs/nfs/objlayout/pnfs_osd_xdr_cli.c
new file mode 100644
index 00000000000..16fc758e912
--- /dev/null
+++ b/fs/nfs/objlayout/pnfs_osd_xdr_cli.c
@@ -0,0 +1,412 @@
+/*
+ * Object-Based pNFS Layout XDR layer
+ *
+ * Copyright (C) 2007 Panasas Inc. [year of first publication]
+ * All rights reserved.
+ *
+ * Benny Halevy <bhalevy@panasas.com>
+ * Boaz Harrosh <bharrosh@panasas.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2
+ * See the file COPYING included with this distribution for more details.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the Panasas company nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <linux/pnfs_osd_xdr.h>
+
+#define NFSDBG_FACILITY NFSDBG_PNFS_LD
+
+/*
+ * The following implementation is based on RFC5664
+ */
+
+/*
+ * Decode a pnfs_osd_objid (xdr size 32 bytes):
+ *	struct nfs4_deviceid	oid_device_id;
+ *	u64			oid_partition_id;
+ *	u64			oid_object_id;
+ *
+ * Returns the position just past the decoded object id.
+ */
+static __be32 *
+_osd_xdr_decode_objid(__be32 *p, struct pnfs_osd_objid *objid)
+{
+	struct nfs4_deviceid *devid = &objid->oid_device_id;
+
+	p = xdr_decode_opaque_fixed(p, devid->data, sizeof(devid->data));
+	p = xdr_decode_hyper(p, &objid->oid_partition_id);
+
+	return xdr_decode_hyper(p, &objid->oid_object_id);
+}
+/*
+ * struct pnfs_osd_opaque_cred {
+ *	u32	cred_len;
+ *	void	*cred;
+ * }; // xdr size [variable]
+ *
+ * Decode one opaque credential from @xdr. The returned cred pointer refers
+ * to data inside the xdr buffer; nothing is copied.
+ *
+ * Returns 0 on success or -EINVAL when the stream does not hold the length
+ * word or the announced number of credential bytes.
+ */
+static int
+_osd_xdr_decode_opaque_cred(struct pnfs_osd_opaque_cred *opaque_cred,
+			    struct xdr_stream *xdr)
+{
+	/* the length is a full 32-bit XDR word, so ask for all 4 bytes */
+	__be32 *p = xdr_inline_decode(xdr, 4);
+
+	if (!p)
+		return -EINVAL;
+
+	opaque_cred->cred_len = be32_to_cpup(p);
+
+	p = xdr_inline_decode(xdr, opaque_cred->cred_len);
+	if (!p)
+		return -EINVAL;
+
+	opaque_cred->cred = p;
+	return 0;
+}
+
+/*
+ * struct pnfs_osd_object_cred {
+ *	struct pnfs_osd_objid		oc_object_id;
+ *	u32				oc_osd_version;
+ *	u32				oc_cap_key_sec;
+ *	struct pnfs_osd_opaque_cred	oc_cap_key
+ *	struct pnfs_osd_opaque_cred	oc_cap;
+ * }; // xdr size 32 + 4 + 4 + [variable] + [variable]
+ *
+ * Decode one object credential from @xdr. Returns 0 on success, -EIO when
+ * the fixed-size part is missing, or the error of the opaque credential
+ * decoder.
+ */
+static int
+_osd_xdr_decode_object_cred(struct pnfs_osd_object_cred *comp,
+			    struct xdr_stream *xdr)
+{
+	__be32 *fixed;
+	int err;
+
+	/* object id (32) + osd version (4) + cap key sec (4) */
+	fixed = xdr_inline_decode(xdr, 32 + 4 + 4);
+	if (!fixed)
+		return -EIO;
+
+	fixed = _osd_xdr_decode_objid(fixed, &comp->oc_object_id);
+	comp->oc_osd_version = be32_to_cpup(fixed);
+	comp->oc_cap_key_sec = be32_to_cpup(fixed + 1);
+
+	err = _osd_xdr_decode_opaque_cred(&comp->oc_cap_key, xdr);
+	if (unlikely(err))
+		return err;
+
+	return _osd_xdr_decode_opaque_cred(&comp->oc_cap, xdr);
+}
+
+/*
+ * struct pnfs_osd_data_map {
+ *	u32	odm_num_comps;
+ *	u64	odm_stripe_unit;
+ *	u32	odm_group_width;
+ *	u32	odm_group_depth;
+ *	u32	odm_mirror_cnt;
+ *	u32	odm_raid_algorithm;
+ * }; // xdr size 4 + 8 + 4 + 4 + 4 + 4
+ */
+static inline int
+_osd_data_map_xdr_sz(void)
+{
+	/* five 32-bit fields plus the 64-bit odm_stripe_unit */
+	return 5 * 4 + 8;
+}
+
+/*
+ * Decode a pnfs_osd_data_map starting at @p, in wire order, and log the
+ * result. Returns the position just past the map.
+ */
+static __be32 *
+_osd_xdr_decode_data_map(__be32 *p, struct pnfs_osd_data_map *data_map)
+{
+	struct pnfs_osd_data_map *map = data_map;
+
+	map->odm_num_comps = be32_to_cpup(p);
+	p = xdr_decode_hyper(p + 1, &map->odm_stripe_unit);
+	map->odm_group_width = be32_to_cpup(p);
+	map->odm_group_depth = be32_to_cpup(p + 1);
+	map->odm_mirror_cnt = be32_to_cpup(p + 2);
+	map->odm_raid_algorithm = be32_to_cpup(p + 3);
+	p += 4;
+
+	dprintk("%s: odm_num_comps=%u odm_stripe_unit=%llu odm_group_width=%u "
+		"odm_group_depth=%u odm_mirror_cnt=%u odm_raid_algorithm=%u\n",
+		__func__,
+		map->odm_num_comps,
+		(unsigned long long)map->odm_stripe_unit,
+		map->odm_group_width,
+		map->odm_group_depth,
+		map->odm_mirror_cnt,
+		map->odm_raid_algorithm);
+	return p;
+}
+
+/*
+ * Decode the fixed head of an objects layout: the data map followed by
+ * olo_comps_index and olo_num_comps. @iter is reset and primed with the
+ * number of components that pnfs_osd_xdr_decode_layout_comp() will yield.
+ *
+ * Returns 0 on success or -EINVAL if the stream is too short.
+ */
+int pnfs_osd_xdr_decode_layout_map(struct pnfs_osd_layout *layout,
+	struct pnfs_osd_xdr_decode_layout_iter *iter, struct xdr_stream *xdr)
+{
+	__be32 *cur;
+
+	memset(iter, 0, sizeof(*iter));
+
+	/* data map + comps index (4) + number of comps (4) */
+	cur = xdr_inline_decode(xdr, _osd_data_map_xdr_sz() + 4 + 4);
+	if (unlikely(!cur))
+		return -EINVAL;
+
+	cur = _osd_xdr_decode_data_map(cur, &layout->olo_map);
+	layout->olo_comps_index = be32_to_cpup(cur);
+	layout->olo_num_comps = be32_to_cpup(cur + 1);
+
+	iter->total_comps = layout->olo_num_comps;
+	return 0;
+}
+
+/*
+ * Decode the next layout component into @comp.
+ *
+ * Meant to be called in a loop after pnfs_osd_xdr_decode_layout_map() has
+ * initialized @iter. Returns true when a component was decoded. Returns
+ * false when all iter->total_comps components have been consumed (*@err is
+ * not written in that case) or when decoding failed (*@err holds the
+ * error).
+ */
+bool pnfs_osd_xdr_decode_layout_comp(struct pnfs_osd_object_cred *comp,
+ struct pnfs_osd_xdr_decode_layout_iter *iter, struct xdr_stream *xdr,
+ int *err)
+{
+ BUG_ON(iter->decoded_comps > iter->total_comps);
+ if (iter->decoded_comps == iter->total_comps)
+ return false;
+
+ *err = _osd_xdr_decode_object_cred(comp, xdr);
+ if (unlikely(*err)) {
+ dprintk("%s: _osd_xdr_decode_object_cred=>%d decoded_comps=%d "
+ "total_comps=%d\n", __func__, *err,
+ iter->decoded_comps, iter->total_comps);
+ return false; /* stop the loop */
+ }
+ dprintk("%s: dev(%llx:%llx) par=0x%llx obj=0x%llx "
+ "key_len=%u cap_len=%u\n",
+ __func__,
+ _DEVID_LO(&comp->oc_object_id.oid_device_id),
+ _DEVID_HI(&comp->oc_object_id.oid_device_id),
+ comp->oc_object_id.oid_partition_id,
+ comp->oc_object_id.oid_object_id,
+ comp->oc_cap_key.cred_len, comp->oc_cap.cred_len);
+
+ iter->decoded_comps++;
+ return true;
+}
+
+/*
+ * Get Device Information Decoding
+ *
+ * Note: since Device Information is currently done synchronously, all
+ * variable strings fields are left inside the rpc buffer and are only
+ * pointed to by the pnfs_osd_deviceaddr members. So the read buffer
+ * should not be freed while the returned information is in use.
+ */
+/*
+ * struct nfs4_string {
+ *	unsigned int len;
+ *	char *data;
+ * }; // size [variable]
+ *
+ * Read a length-prefixed opaque at @p. The returned string is not copied:
+ * str->data points inside the XDR buffer. Returns the position just past
+ * the (quad-padded) data.
+ */
+static __be32 *
+__read_u8_opaque(__be32 *p, struct nfs4_string *str)
+{
+	__be32 *payload = p + 1;
+
+	str->len = be32_to_cpup(p);
+	str->data = (char *)payload;
+
+	return payload + XDR_QUADLEN(str->len);
+}
+
+/*
+ * struct pnfs_osd_targetid {
+ *	u32			oti_type;
+ *	struct nfs4_string	oti_scsi_device_id;
+ * }; // size 4 + [variable]
+ *
+ * The device id string is only present on the wire for the SCSI name and
+ * SCSI device id target types; other types leave it untouched.
+ */
+static __be32 *
+__read_targetid(__be32 *p, struct pnfs_osd_targetid* targetid)
+{
+	u32 type = be32_to_cpup(p++);
+
+	targetid->oti_type = type;
+
+	if (type == OBJ_TARGET_SCSI_NAME ||
+	    type == OBJ_TARGET_SCSI_DEVICE_ID)
+		p = __read_u8_opaque(p, &targetid->oti_scsi_device_id);
+
+	return p;
+}
+
+/*
+ * struct pnfs_osd_net_addr {
+ *	struct nfs4_string	r_netid;
+ *	struct nfs4_string	r_addr;
+ * };
+ *
+ * Both strings are left pointing inside the XDR buffer.
+ */
+static __be32 *
+__read_net_addr(__be32 *p, struct pnfs_osd_net_addr* netaddr)
+{
+	__be32 *after_netid = __read_u8_opaque(p, &netaddr->r_netid);
+
+	return __read_u8_opaque(after_netid, &netaddr->r_addr);
+}
+
+/*
+ * struct pnfs_osd_targetaddr {
+ *	u32				ota_available;
+ *	struct pnfs_osd_net_addr	ota_netaddr;
+ * };
+ *
+ * The network address follows on the wire only when ota_available is
+ * non-zero.
+ */
+static __be32 *
+__read_targetaddr(__be32 *p, struct pnfs_osd_targetaddr *targetaddr)
+{
+	u32 available = be32_to_cpup(p++);
+
+	targetaddr->ota_available = available;
+	if (!available)
+		return p;
+
+	return __read_net_addr(p, &targetaddr->ota_netaddr);
+}
+
+/*
+ * struct pnfs_osd_deviceaddr {
+ * struct pnfs_osd_targetid oda_targetid;
+ * struct pnfs_osd_targetaddr oda_targetaddr;
+ * u8 oda_lun[8];
+ * struct nfs4_string oda_systemid;
+ * struct pnfs_osd_object_cred oda_root_obj_cred;
+ * struct nfs4_string oda_osdname;
+ * };
+ */
+
+/*
+ * Raw-pointer variant of the opaque credential decoder, needed by
+ * pnfs_osd_xdr_decode_deviceaddr which does not have an xdr_stream.
+ * The credential is left pointing inside the buffer.
+ */
+static __be32 *
+__read_opaque_cred(__be32 *p,
+		   struct pnfs_osd_opaque_cred *opaque_cred)
+{
+	__be32 *body = p + 1;
+
+	opaque_cred->cred_len = be32_to_cpu(*p);
+	opaque_cred->cred = body;
+
+	return body + XDR_QUADLEN(opaque_cred->cred_len);
+}
+
+/*
+ * Raw-pointer variant of the object credential decoder: object id, osd
+ * version, cap key security, then the two opaque credentials.
+ * Returns the position just past the credential.
+ */
+static __be32 *
+__read_object_cred(__be32 *p, struct pnfs_osd_object_cred *comp)
+{
+	p = _osd_xdr_decode_objid(p, &comp->oc_object_id);
+
+	comp->oc_osd_version = be32_to_cpup(p);
+	comp->oc_cap_key_sec = be32_to_cpup(p + 1);
+	p += 2;
+
+	p = __read_opaque_cred(p, &comp->oc_cap_key);
+	return __read_opaque_cred(p, &comp->oc_cap);
+}
+
+/*
+ * Decode a pnfs_osd_deviceaddr from the raw XDR words at @p, field by field
+ * in wire order. All variable-length members are left pointing into the
+ * buffer at @p, so that buffer must outlive @deviceaddr.
+ *
+ * NOTE(review): none of the helpers called here check lengths against the
+ * end of the buffer -- confirm the caller guarantees a well-formed reply.
+ */
+void pnfs_osd_xdr_decode_deviceaddr(
+ struct pnfs_osd_deviceaddr *deviceaddr, __be32 *p)
+{
+ p = __read_targetid(p, &deviceaddr->oda_targetid);
+
+ p = __read_targetaddr(p, &deviceaddr->oda_targetaddr);
+
+ p = xdr_decode_opaque_fixed(p, deviceaddr->oda_lun,
+ sizeof(deviceaddr->oda_lun));
+
+ p = __read_u8_opaque(p, &deviceaddr->oda_systemid);
+
+ p = __read_object_cred(p, &deviceaddr->oda_root_obj_cred);
+
+ p = __read_u8_opaque(p, &deviceaddr->oda_osdname);
+
+ /* libosd likes this terminated in dbg. It's last, so no problems */
+ /* the terminator is written into the XDR buffer, right after the name */
+ deviceaddr->oda_osdname.data[deviceaddr->oda_osdname.len] = 0;
+}
+
+/*
+ * struct pnfs_osd_layoutupdate {
+ *	u32	dsu_valid;
+ *	s64	dsu_delta;
+ *	u32	olu_ioerr_flag;
+ * }; xdr size 4 + [8 when dsu_valid] + 4
+ *
+ * Encode @lou into @xdr. dsu_delta is only present on the wire when
+ * dsu_valid is set, so only the bytes actually written are reserved.
+ * Reserving the maximum would leave stale bytes inside the length that the
+ * caller computes from the stream position.
+ *
+ * Returns 0 on success or -E2BIG when the stream has no room left.
+ */
+int
+pnfs_osd_xdr_encode_layoutupdate(struct xdr_stream *xdr,
+				 struct pnfs_osd_layoutupdate *lou)
+{
+	size_t nbytes = 4 + (lou->dsu_valid ? 8 : 0) + 4;
+	__be32 *p = xdr_reserve_space(xdr, nbytes);
+
+	if (!p)
+		return -E2BIG;
+
+	*p++ = cpu_to_be32(lou->dsu_valid);
+	if (lou->dsu_valid)
+		p = xdr_encode_hyper(p, lou->dsu_delta);
+	*p++ = cpu_to_be32(lou->olu_ioerr_flag);
+	return 0;
+}
+
+/*
+ * Encode a pnfs_osd_objid (xdr size 32 bytes):
+ *	struct nfs4_deviceid	oid_device_id;
+ *	u64			oid_partition_id;
+ *	u64			oid_object_id;
+ *
+ * Returns the position just past the encoded object id.
+ */
+static inline __be32 *
+pnfs_osd_xdr_encode_objid(__be32 *p, struct pnfs_osd_objid *object_id)
+{
+	struct nfs4_deviceid *devid = &object_id->oid_device_id;
+
+	p = xdr_encode_opaque_fixed(p, &devid->data, sizeof(devid->data));
+	p = xdr_encode_hyper(p, object_id->oid_partition_id);
+
+	return xdr_encode_hyper(p, object_id->oid_object_id);
+}
+
+/*
+ * struct pnfs_osd_ioerr {
+ *	struct pnfs_osd_objid	oer_component;
+ *	u64			oer_comp_offset;
+ *	u64			oer_comp_length;
+ *	u32			oer_iswrite;
+ *	u32			oer_errno;
+ * }; // xdr size 32 + 24 bytes
+ *
+ * Encode @ioerr at @p. The space must already have been reserved, e.g. with
+ * pnfs_osd_xdr_ioerr_reserve_space().
+ */
+void pnfs_osd_xdr_encode_ioerr(__be32 *p, struct pnfs_osd_ioerr *ioerr)
+{
+	__be32 *cur;
+
+	cur = pnfs_osd_xdr_encode_objid(p, &ioerr->oer_component);
+	cur = xdr_encode_hyper(cur, ioerr->oer_comp_offset);
+	cur = xdr_encode_hyper(cur, ioerr->oer_comp_length);
+
+	cur[0] = cpu_to_be32(ioerr->oer_iswrite);
+	cur[1] = cpu_to_be32(ioerr->oer_errno);
+}
+
+/*
+ * Reserve room for one pnfs_osd_ioerr (32 + 24 bytes) in @xdr.
+ * Returns the reserved position, or NULL when the stream is full.
+ */
+__be32 *pnfs_osd_xdr_ioerr_reserve_space(struct xdr_stream *xdr)
+{
+	__be32 *slot = xdr_reserve_space(xdr, 32 + 24);
+
+	if (unlikely(!slot))
+		dprintk("%s: out of xdr space\n", __func__);
+
+	return slot;
+}
diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
index c80add6e221..7913961aff2 100644
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -204,6 +204,21 @@ nfs_wait_on_request(struct nfs_page *req)
TASK_UNINTERRUPTIBLE);
}
+/*
+ * Default coalescing test: @req may join the descriptor only if the
+ * combined byte count still fits in one block of desc->pg_bsize bytes.
+ * @prev is not examined here.
+ */
+static bool nfs_generic_pg_test(struct nfs_pageio_descriptor *desc, struct nfs_page *prev, struct nfs_page *req)
+{
+	/*
+	 * FIXME: ideally we should be able to coalesce all requests
+	 * that are not block boundary aligned, but currently this
+	 * is problematic for the case of bsize < PAGE_CACHE_SIZE,
+	 * since nfs_flush_multi and nfs_pagein_multi assume you
+	 * can have only one struct nfs_page.
+	 */
+	if (desc->pg_bsize < PAGE_SIZE)
+		return false;
+
+	return desc->pg_count + req->wb_bytes <= desc->pg_bsize;
+}
+
/**
* nfs_pageio_init - initialise a page io descriptor
* @desc: pointer to descriptor
@@ -229,6 +244,8 @@ void nfs_pageio_init(struct nfs_pageio_descriptor *desc,
desc->pg_ioflags = io_flags;
desc->pg_error = 0;
desc->pg_lseg = NULL;
+ desc->pg_test = nfs_generic_pg_test;
+ pnfs_pageio_init(desc, inode);
}
/**
@@ -242,29 +259,23 @@ void nfs_pageio_init(struct nfs_pageio_descriptor *desc,
*
* Return 'true' if this is the case, else return 'false'.
*/
-static int nfs_can_coalesce_requests(struct nfs_page *prev,
- struct nfs_page *req,
- struct nfs_pageio_descriptor *pgio)
+static bool nfs_can_coalesce_requests(struct nfs_page *prev,
+ struct nfs_page *req,
+ struct nfs_pageio_descriptor *pgio)
{
if (req->wb_context->cred != prev->wb_context->cred)
- return 0;
+ return false;
if (req->wb_lock_context->lockowner != prev->wb_lock_context->lockowner)
- return 0;
+ return false;
if (req->wb_context->state != prev->wb_context->state)
- return 0;
+ return false;
if (req->wb_index != (prev->wb_index + 1))
- return 0;
+ return false;
if (req->wb_pgbase != 0)
- return 0;
+ return false;
if (prev->wb_pgbase + prev->wb_bytes != PAGE_CACHE_SIZE)
- return 0;
- /*
- * Non-whole file layouts need to check that req is inside of
- * pgio->pg_lseg.
- */
- if (pgio->pg_test && !pgio->pg_test(pgio, prev, req))
- return 0;
- return 1;
+ return false;
+ return pgio->pg_test(pgio, prev, req);
}
/**
@@ -278,31 +289,18 @@ static int nfs_can_coalesce_requests(struct nfs_page *prev,
static int nfs_pageio_do_add_request(struct nfs_pageio_descriptor *desc,
struct nfs_page *req)
{
- size_t newlen = req->wb_bytes;
-
if (desc->pg_count != 0) {
struct nfs_page *prev;
- /*
- * FIXME: ideally we should be able to coalesce all requests
- * that are not block boundary aligned, but currently this
- * is problematic for the case of bsize < PAGE_CACHE_SIZE,
- * since nfs_flush_multi and nfs_pagein_multi assume you
- * can have only one struct nfs_page.
- */
- if (desc->pg_bsize < PAGE_SIZE)
- return 0;
- newlen += desc->pg_count;
- if (newlen > desc->pg_bsize)
- return 0;
prev = nfs_list_entry(desc->pg_list.prev);
if (!nfs_can_coalesce_requests(prev, req, desc))
return 0;
- } else
+ } else {
desc->pg_base = req->wb_pgbase;
+ }
nfs_list_remove_request(req);
nfs_list_add_request(req, &desc->pg_list);
- desc->pg_count = newlen;
+ desc->pg_count += req->wb_bytes;
return 1;
}
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index f57f5281a52..8c1309d852a 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -177,13 +177,28 @@ get_layout_hdr(struct pnfs_layout_hdr *lo)
atomic_inc(&lo->plh_refcount);
}
+/*
+ * Allocate a layout header for @ino: through the layout driver's
+ * alloc_layout_hdr hook when it provides one, otherwise as a zeroed plain
+ * pnfs_layout_hdr.
+ */
+static struct pnfs_layout_hdr *
+pnfs_alloc_layout_hdr(struct inode *ino, gfp_t gfp_flags)
+{
+	struct pnfs_layoutdriver_type *ld = NFS_SERVER(ino)->pnfs_curr_ld;
+
+	if (ld->alloc_layout_hdr)
+		return ld->alloc_layout_hdr(ino, gfp_flags);
+
+	return kzalloc(sizeof(struct pnfs_layout_hdr), gfp_flags);
+}
+
+/*
+ * Free a layout header. A driver that supplies alloc_layout_hdr is expected
+ * to supply the matching free_layout_hdr; headers from the kzalloc fallback
+ * are simply kfree'd.
+ */
+static void
+pnfs_free_layout_hdr(struct pnfs_layout_hdr *lo)
+{
+	struct pnfs_layoutdriver_type *ld = NFS_SERVER(lo->plh_inode)->pnfs_curr_ld;
+
+	if (ld->alloc_layout_hdr)
+		ld->free_layout_hdr(lo);
+	else
+		kfree(lo);
+}
+
static void
destroy_layout_hdr(struct pnfs_layout_hdr *lo)
{
dprintk("%s: freeing layout cache %p\n", __func__, lo);
BUG_ON(!list_empty(&lo->plh_layouts));
NFS_I(lo->plh_inode)->layout = NULL;
- kfree(lo);
+ pnfs_free_layout_hdr(lo);
}
static void
@@ -228,7 +243,7 @@ put_lseg_common(struct pnfs_layout_segment *lseg)
{
struct inode *inode = lseg->pls_layout->plh_inode;
- BUG_ON(test_bit(NFS_LSEG_VALID, &lseg->pls_flags));
+ WARN_ON(test_bit(NFS_LSEG_VALID, &lseg->pls_flags));
list_del_init(&lseg->pls_list);
if (list_empty(&lseg->pls_layout->plh_segs)) {
set_bit(NFS_LAYOUT_DESTROYED, &lseg->pls_layout->plh_flags);
@@ -261,11 +276,72 @@ put_lseg(struct pnfs_layout_segment *lseg)
}
EXPORT_SYMBOL_GPL(put_lseg);
+/* exclusive end of a range, saturating at NFS4_MAX_UINT64 on wrap-around */
+static inline u64
+end_offset(u64 start, u64 len)
+{
+	u64 end = start + len;
+
+	if (end < start)
+		return NFS4_MAX_UINT64;
+	return end;
+}
+
+/*
+ * last octet in a range; @len must not be zero.
+ * Saturates at NFS4_MAX_UINT64 when start + len wraps around.
+ */
+static inline u64
+last_byte_offset(u64 start, u64 len)
+{
+	u64 end;
+
+	BUG_ON(!len);
+	end = start + len;
+	if (end <= start)
+		return NFS4_MAX_UINT64;
+	return end - 1;
+}
+
+/*
+ * is l2 fully contained in l1?
+ *   start1                             end1
+ *   [----------------------------------)
+ *           start2           end2
+ *           [----------------)
+ */
+static inline int
+lo_seg_contained(struct pnfs_layout_range *l1,
+		 struct pnfs_layout_range *l2)
+{
+	u64 end1 = end_offset(l1->offset, l1->length);
+	u64 end2 = end_offset(l2->offset, l2->length);
+
+	/* l2 must not start before l1 ... */
+	if (l1->offset > l2->offset)
+		return 0;
+	/* ... and must not end after it */
+	return end1 >= end2;
+}
+
+/*
+ * do l1 and l2 intersect?
+ *   start1                             end1
+ *   [----------------------------------)
+ *                              start2           end2
+ *                              [----------------)
+ *
+ * An end of NFS4_MAX_UINT64 is treated as reaching past any start.
+ */
+static inline int
+lo_seg_intersecting(struct pnfs_layout_range *l1,
+		    struct pnfs_layout_range *l2)
+{
+	u64 end1 = end_offset(l1->offset, l1->length);
+	u64 end2 = end_offset(l2->offset, l2->length);
+
+	/* l1 ends at or before the start of l2 */
+	if (end1 != NFS4_MAX_UINT64 && end1 <= l2->offset)
+		return 0;
+
+	return end2 == NFS4_MAX_UINT64 || end2 > l1->offset;
+}
+
static bool
-should_free_lseg(u32 lseg_iomode, u32 recall_iomode)
+should_free_lseg(struct pnfs_layout_range *lseg_range,
+ struct pnfs_layout_range *recall_range)
{
- return (recall_iomode == IOMODE_ANY ||
- lseg_iomode == recall_iomode);
+ return (recall_range->iomode == IOMODE_ANY ||
+ lseg_range->iomode == recall_range->iomode) &&
+ lo_seg_intersecting(lseg_range, recall_range);
}
/* Returns 1 if lseg is removed from list, 0 otherwise */
@@ -296,7 +372,7 @@ static int mark_lseg_invalid(struct pnfs_layout_segment *lseg,
int
mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo,
struct list_head *tmp_list,
- u32 iomode)
+ struct pnfs_layout_range *recall_range)
{
struct pnfs_layout_segment *lseg, *next;
int invalid = 0, removed = 0;
@@ -309,7 +385,8 @@ mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo,
return 0;
}
list_for_each_entry_safe(lseg, next, &lo->plh_segs, pls_list)
- if (should_free_lseg(lseg->pls_range.iomode, iomode)) {
+ if (!recall_range ||
+ should_free_lseg(&lseg->pls_range, recall_range)) {
dprintk("%s: freeing lseg %p iomode %d "
"offset %llu length %llu\n", __func__,
lseg, lseg->pls_range.iomode, lseg->pls_range.offset,
@@ -358,7 +435,7 @@ pnfs_destroy_layout(struct nfs_inode *nfsi)
lo = nfsi->layout;
if (lo) {
lo->plh_block_lgets++; /* permanently block new LAYOUTGETs */
- mark_matching_lsegs_invalid(lo, &tmp_list, IOMODE_ANY);
+ mark_matching_lsegs_invalid(lo, &tmp_list, NULL);
}
spin_unlock(&nfsi->vfs_inode.i_lock);
pnfs_free_lseg_list(&tmp_list);
@@ -467,7 +544,7 @@ pnfs_choose_layoutget_stateid(nfs4_stateid *dst, struct pnfs_layout_hdr *lo,
static struct pnfs_layout_segment *
send_layoutget(struct pnfs_layout_hdr *lo,
struct nfs_open_context *ctx,
- u32 iomode,
+ struct pnfs_layout_range *range,
gfp_t gfp_flags)
{
struct inode *ino = lo->plh_inode;
@@ -499,11 +576,11 @@ send_layoutget(struct pnfs_layout_hdr *lo,
goto out_err_free;
}
- lgp->args.minlength = NFS4_MAX_UINT64;
+ lgp->args.minlength = PAGE_CACHE_SIZE;
+ if (lgp->args.minlength > range->length)
+ lgp->args.minlength = range->length;
lgp->args.maxcount = PNFS_LAYOUT_MAXSIZE;
- lgp->args.range.iomode = iomode;
- lgp->args.range.offset = 0;
- lgp->args.range.length = NFS4_MAX_UINT64;
+ lgp->args.range = *range;
lgp->args.type = server->pnfs_curr_ld->id;
lgp->args.inode = ino;
lgp->args.ctx = get_nfs_open_context(ctx);
@@ -518,7 +595,7 @@ send_layoutget(struct pnfs_layout_hdr *lo,
nfs4_proc_layoutget(lgp);
if (!lseg) {
/* remember that LAYOUTGET failed and suspend trying */
- set_bit(lo_fail_bit(iomode), &lo->plh_flags);
+ set_bit(lo_fail_bit(range->iomode), &lo->plh_flags);
}
/* free xdr pages */
@@ -542,6 +619,51 @@ out_err_free:
return NULL;
}
+/*
+ * Initiates a LAYOUTRETURN(FILE)
+ *
+ * Invalidates every layout segment of @ino and hands a layoutreturn
+ * request to nfs4_proc_layoutreturn().
+ *
+ * Returns 0 when there is nothing to return, -ENOMEM when the request
+ * cannot be allocated, otherwise the status of nfs4_proc_layoutreturn().
+ */
+int
+_pnfs_return_layout(struct inode *ino)
+{
+	struct pnfs_layout_hdr *lo = NULL;
+	struct nfs_inode *nfsi = NFS_I(ino);
+	LIST_HEAD(tmp_list);
+	struct nfs4_layoutreturn *lrp;
+	int status = 0;
+
+	dprintk("--> %s\n", __func__);
+
+	/*
+	 * Allocate the request before touching the layout: once
+	 * get_layout_hdr() has been called below, only
+	 * nfs4_layoutreturn_release drops that reference, so there must be
+	 * no failure point between taking it and issuing the request.
+	 */
+	lrp = kzalloc(sizeof(*lrp), GFP_KERNEL);
+	if (unlikely(lrp == NULL)) {
+		status = -ENOMEM;
+		goto out;
+	}
+
+	spin_lock(&ino->i_lock);
+	lo = nfsi->layout;
+	if (!lo || !mark_matching_lsegs_invalid(lo, &tmp_list, NULL)) {
+		spin_unlock(&ino->i_lock);
+		/*
+		 * Segments may already have been moved onto tmp_list even
+		 * though nothing is reported as left to return (the return
+		 * value computation is not visible here), so always release
+		 * the list, as pnfs_destroy_layout() does.
+		 */
+		pnfs_free_lseg_list(&tmp_list);
+		kfree(lrp);
+		dprintk("%s: no layout segments to return\n", __func__);
+		goto out;
+	}
+	/* Snapshot the stateid while i_lock is still held. */
+	lrp->args.stateid = lo->plh_stateid;
+	/* Reference matched in nfs4_layoutreturn_release */
+	get_layout_hdr(lo);
+	spin_unlock(&ino->i_lock);
+	pnfs_free_lseg_list(&tmp_list);
+
+	WARN_ON(test_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags));
+
+	lrp->args.layout_type = NFS_SERVER(ino)->pnfs_curr_ld->id;
+	lrp->args.inode = ino;
+	lrp->clp = NFS_SERVER(ino)->nfs_client;
+
+	status = nfs4_proc_layoutreturn(lrp);
+out:
+	dprintk("<-- %s status: %d\n", __func__, status);
+	return status;
+}
+
bool pnfs_roc(struct inode *ino)
{
struct pnfs_layout_hdr *lo;
@@ -625,10 +747,23 @@ bool pnfs_roc_drain(struct inode *ino, u32 *barrier)
* are seen first.
*/
static s64
-cmp_layout(u32 iomode1, u32 iomode2)
+cmp_layout(struct pnfs_layout_range *l1,
+ struct pnfs_layout_range *l2)
{
+ s64 d;
+
+ /* high offset > low offset */
+ d = l1->offset - l2->offset;
+ if (d)
+ return d;
+
+ /* short length > long length */
+ d = l2->length - l1->length;
+ if (d)
+ return d;
+
/* read > read/write */
- return (int)(iomode2 == IOMODE_READ) - (int)(iomode1 == IOMODE_READ);
+ return (int)(l1->iomode == IOMODE_READ) - (int)(l2->iomode == IOMODE_READ);
}
static void
@@ -636,13 +771,12 @@ pnfs_insert_layout(struct pnfs_layout_hdr *lo,
struct pnfs_layout_segment *lseg)
{
struct pnfs_layout_segment *lp;
- int found = 0;
dprintk("%s:Begin\n", __func__);
assert_spin_locked(&lo->plh_inode->i_lock);
list_for_each_entry(lp, &lo->plh_segs, pls_list) {
- if (cmp_layout(lp->pls_range.iomode, lseg->pls_range.iomode) > 0)
+ if (cmp_layout(&lseg->pls_range, &lp->pls_range) > 0)
continue;
list_add_tail(&lseg->pls_list, &lp->pls_list);
dprintk("%s: inserted lseg %p "
@@ -652,16 +786,14 @@ pnfs_insert_layout(struct pnfs_layout_hdr *lo,
lseg->pls_range.offset, lseg->pls_range.length,
lp, lp->pls_range.iomode, lp->pls_range.offset,
lp->pls_range.length);
- found = 1;
- break;
- }
- if (!found) {
- list_add_tail(&lseg->pls_list, &lo->plh_segs);
- dprintk("%s: inserted lseg %p "
- "iomode %d offset %llu length %llu at tail\n",
- __func__, lseg, lseg->pls_range.iomode,
- lseg->pls_range.offset, lseg->pls_range.length);
+ goto out;
}
+ list_add_tail(&lseg->pls_list, &lo->plh_segs);
+ dprintk("%s: inserted lseg %p "
+ "iomode %d offset %llu length %llu at tail\n",
+ __func__, lseg, lseg->pls_range.iomode,
+ lseg->pls_range.offset, lseg->pls_range.length);
+out:
get_layout_hdr(lo);
dprintk("%s:Return\n", __func__);
@@ -672,7 +804,7 @@ alloc_init_layout_hdr(struct inode *ino, gfp_t gfp_flags)
{
struct pnfs_layout_hdr *lo;
- lo = kzalloc(sizeof(struct pnfs_layout_hdr), gfp_flags);
+ lo = pnfs_alloc_layout_hdr(ino, gfp_flags);
if (!lo)
return NULL;
atomic_set(&lo->plh_refcount, 1);
@@ -705,7 +837,7 @@ pnfs_find_alloc_layout(struct inode *ino, gfp_t gfp_flags)
if (likely(nfsi->layout == NULL)) /* Won the race? */
nfsi->layout = new;
else
- kfree(new);
+ pnfs_free_layout_hdr(new);
return nfsi->layout;
}
@@ -721,16 +853,28 @@ pnfs_find_alloc_layout(struct inode *ino, gfp_t gfp_flags)
* READ RW true
*/
static int
-is_matching_lseg(struct pnfs_layout_segment *lseg, u32 iomode)
+is_matching_lseg(struct pnfs_layout_range *ls_range,
+ struct pnfs_layout_range *range)
{
- return (iomode != IOMODE_RW || lseg->pls_range.iomode == IOMODE_RW);
+ struct pnfs_layout_range range1;
+
+ if ((range->iomode == IOMODE_RW &&
+ ls_range->iomode != IOMODE_RW) ||
+ !lo_seg_intersecting(ls_range, range))
+ return 0;
+
+ /* range1 covers only the first byte in the range */
+ range1 = *range;
+ range1.length = 1;
+ return lo_seg_contained(ls_range, &range1);
}
/*
* lookup range in layout
*/
static struct pnfs_layout_segment *
-pnfs_find_lseg(struct pnfs_layout_hdr *lo, u32 iomode)
+pnfs_find_lseg(struct pnfs_layout_hdr *lo,
+ struct pnfs_layout_range *range)
{
struct pnfs_layout_segment *lseg, *ret = NULL;
@@ -739,11 +883,11 @@ pnfs_find_lseg(struct pnfs_layout_hdr *lo, u32 iomode)
assert_spin_locked(&lo->plh_inode->i_lock);
list_for_each_entry(lseg, &lo->plh_segs, pls_list) {
if (test_bit(NFS_LSEG_VALID, &lseg->pls_flags) &&
- is_matching_lseg(lseg, iomode)) {
+ is_matching_lseg(&lseg->pls_range, range)) {
ret = get_lseg(lseg);
break;
}
- if (cmp_layout(iomode, lseg->pls_range.iomode) > 0)
+ if (cmp_layout(range, &lseg->pls_range) > 0)
break;
}
@@ -759,9 +903,17 @@ pnfs_find_lseg(struct pnfs_layout_hdr *lo, u32 iomode)
struct pnfs_layout_segment *
pnfs_update_layout(struct inode *ino,
struct nfs_open_context *ctx,
+ loff_t pos,
+ u64 count,
enum pnfs_iomode iomode,
gfp_t gfp_flags)
{
+ struct pnfs_layout_range arg = {
+ .iomode = iomode,
+ .offset = pos,
+ .length = count,
+ };
+ unsigned pg_offset;
struct nfs_inode *nfsi = NFS_I(ino);
struct nfs_client *clp = NFS_SERVER(ino)->nfs_client;
struct pnfs_layout_hdr *lo;
@@ -789,7 +941,7 @@ pnfs_update_layout(struct inode *ino,
goto out_unlock;
/* Check to see if the layout for the given range already exists */
- lseg = pnfs_find_lseg(lo, iomode);
+ lseg = pnfs_find_lseg(lo, &arg);
if (lseg)
goto out_unlock;
@@ -811,7 +963,14 @@ pnfs_update_layout(struct inode *ino,
spin_unlock(&clp->cl_lock);
}
- lseg = send_layoutget(lo, ctx, iomode, gfp_flags);
+ pg_offset = arg.offset & ~PAGE_CACHE_MASK;
+ if (pg_offset) {
+ arg.offset -= pg_offset;
+ arg.length += pg_offset;
+ }
+ arg.length = PAGE_CACHE_ALIGN(arg.length);
+
+ lseg = send_layoutget(lo, ctx, &arg, gfp_flags);
if (!lseg && first) {
spin_lock(&clp->cl_lock);
list_del_init(&lo->plh_layouts);
@@ -838,17 +997,6 @@ pnfs_layout_process(struct nfs4_layoutget *lgp)
struct nfs_client *clp = NFS_SERVER(ino)->nfs_client;
int status = 0;
- /* Verify we got what we asked for.
- * Note that because the xdr parsing only accepts a single
- * element array, this can fail even if the server is behaving
- * correctly.
- */
- if (lgp->args.range.iomode > res->range.iomode ||
- res->range.offset != 0 ||
- res->range.length != NFS4_MAX_UINT64) {
- status = -EINVAL;
- goto out;
- }
/* Inject layout blob into I/O device driver */
lseg = NFS_SERVER(ino)->pnfs_curr_ld->alloc_lseg(lo, res, lgp->gfp_flags);
if (!lseg || IS_ERR(lseg)) {
@@ -895,51 +1043,64 @@ out_forget_reply:
goto out;
}
-static int pnfs_read_pg_test(struct nfs_pageio_descriptor *pgio,
- struct nfs_page *prev,
- struct nfs_page *req)
+bool
+pnfs_generic_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev,
+ struct nfs_page *req)
{
+ enum pnfs_iomode access_type;
+ gfp_t gfp_flags;
+
+ /* We assume that pg_ioflags == 0 iff we're reading a page */
+ if (pgio->pg_ioflags == 0) {
+ access_type = IOMODE_READ;
+ gfp_flags = GFP_KERNEL;
+ } else {
+ access_type = IOMODE_RW;
+ gfp_flags = GFP_NOFS;
+ }
+
if (pgio->pg_count == prev->wb_bytes) {
/* This is first coelesce call for a series of nfs_pages */
pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode,
prev->wb_context,
- IOMODE_READ,
- GFP_KERNEL);
+ req_offset(req),
+ pgio->pg_count,
+ access_type,
+ gfp_flags);
+ return true;
}
- return NFS_SERVER(pgio->pg_inode)->pnfs_curr_ld->pg_test(pgio, prev, req);
-}
-void
-pnfs_pageio_init_read(struct nfs_pageio_descriptor *pgio, struct inode *inode)
-{
- struct pnfs_layoutdriver_type *ld;
+ if (pgio->pg_lseg &&
+ req_offset(req) > end_offset(pgio->pg_lseg->pls_range.offset,
+ pgio->pg_lseg->pls_range.length))
+ return false;
- ld = NFS_SERVER(inode)->pnfs_curr_ld;
- pgio->pg_test = (ld && ld->pg_test) ? pnfs_read_pg_test : NULL;
+ return true;
}
+EXPORT_SYMBOL_GPL(pnfs_generic_pg_test);
-static int pnfs_write_pg_test(struct nfs_pageio_descriptor *pgio,
- struct nfs_page *prev,
- struct nfs_page *req)
+/*
+ * Called by non rpc-based layout drivers
+ */
+int
+pnfs_ld_write_done(struct nfs_write_data *data)
{
- if (pgio->pg_count == prev->wb_bytes) {
- /* This is first coelesce call for a series of nfs_pages */
- pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode,
- prev->wb_context,
- IOMODE_RW,
- GFP_NOFS);
- }
- return NFS_SERVER(pgio->pg_inode)->pnfs_curr_ld->pg_test(pgio, prev, req);
-}
+ int status;
-void
-pnfs_pageio_init_write(struct nfs_pageio_descriptor *pgio, struct inode *inode)
-{
- struct pnfs_layoutdriver_type *ld;
+ if (!data->pnfs_error) {
+ pnfs_set_layoutcommit(data);
+ data->mds_ops->rpc_call_done(&data->task, data);
+ data->mds_ops->rpc_release(data);
+ return 0;
+ }
- ld = NFS_SERVER(inode)->pnfs_curr_ld;
- pgio->pg_test = (ld && ld->pg_test) ? pnfs_write_pg_test : NULL;
+ dprintk("%s: pnfs_error=%d, retry via MDS\n", __func__,
+ data->pnfs_error);
+ status = nfs_initiate_write(data, NFS_CLIENT(data->inode),
+ data->mds_ops, NFS_FILE_SYNC);
+ return status ? : -EAGAIN;
}
+EXPORT_SYMBOL_GPL(pnfs_ld_write_done);
enum pnfs_try_status
pnfs_try_to_write_data(struct nfs_write_data *wdata,
@@ -966,6 +1127,29 @@ pnfs_try_to_write_data(struct nfs_write_data *wdata,
}
/*
+ * Called by non rpc-based layout drivers
+ */
+int
+pnfs_ld_read_done(struct nfs_read_data *data)
+{
+	int err;
+
+	if (data->pnfs_error) {
+		/* The layout driver failed: reissue the read with mds_ops. */
+		dprintk("%s: pnfs_error=%d, retry via MDS\n", __func__,
+			data->pnfs_error);
+		err = nfs_initiate_read(data, NFS_CLIENT(data->inode),
+					data->mds_ops);
+		if (err)
+			return err;
+		/* Resend was started; report -EAGAIN to the caller. */
+		return -EAGAIN;
+	}
+
+	/* Success: run the completion callbacks by hand. */
+	__nfs4_read_done_cb(data);
+	data->mds_ops->rpc_call_done(&data->task, data);
+	data->mds_ops->rpc_release(data);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(pnfs_ld_read_done);
+
+/*
* Call the appropriate parallel I/O subsystem read function.
*/
enum pnfs_try_status
@@ -1009,7 +1193,7 @@ void
pnfs_set_layoutcommit(struct nfs_write_data *wdata)
{
struct nfs_inode *nfsi = NFS_I(wdata->inode);
- loff_t end_pos = wdata->args.offset + wdata->res.count;
+ loff_t end_pos = wdata->mds_offset + wdata->res.count;
bool mark_as_dirty = false;
spin_lock(&nfsi->vfs_inode.i_lock);
diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
index 0c015bad9e7..48d0a8e4d06 100644
--- a/fs/nfs/pnfs.h
+++ b/fs/nfs/pnfs.h
@@ -30,6 +30,7 @@
#ifndef FS_NFS_PNFS_H
#define FS_NFS_PNFS_H
+#include <linux/nfs_fs.h>
#include <linux/nfs_page.h>
enum {
@@ -64,17 +65,29 @@ enum {
NFS_LAYOUT_DESTROYED, /* no new use of layout allowed */
};
+/*
+ * Policy bits a layout driver may set in pnfs_layoutdriver_type.flags.
+ * Each enumerator is a distinct bit so that values can be tested with '&'.
+ */
+enum layoutdriver_policy_flags {
+ /* Should the pNFS client commit and return the layout upon a setattr */
+ PNFS_LAYOUTRET_ON_SETATTR = 1 << 0,
+};
+
+struct nfs4_deviceid_node;
+
/* Per-layout driver specific registration structure */
struct pnfs_layoutdriver_type {
struct list_head pnfs_tblid;
const u32 id;
const char *name;
struct module *owner;
+ unsigned flags;
+
+ struct pnfs_layout_hdr * (*alloc_layout_hdr) (struct inode *inode, gfp_t gfp_flags);
+ void (*free_layout_hdr) (struct pnfs_layout_hdr *);
+
struct pnfs_layout_segment * (*alloc_lseg) (struct pnfs_layout_hdr *layoutid, struct nfs4_layoutget_res *lgr, gfp_t gfp_flags);
void (*free_lseg) (struct pnfs_layout_segment *lseg);
/* test for nfs page cache coalescing */
- int (*pg_test)(struct nfs_pageio_descriptor *, struct nfs_page *, struct nfs_page *);
+ bool (*pg_test)(struct nfs_pageio_descriptor *, struct nfs_page *, struct nfs_page *);
/* Returns true if layoutdriver wants to divert this request to
* driver's commit routine.
@@ -89,6 +102,16 @@ struct pnfs_layoutdriver_type {
*/
enum pnfs_try_status (*read_pagelist) (struct nfs_read_data *nfs_data);
enum pnfs_try_status (*write_pagelist) (struct nfs_write_data *nfs_data, int how);
+
+ void (*free_deviceid_node) (struct nfs4_deviceid_node *);
+
+ void (*encode_layoutreturn) (struct pnfs_layout_hdr *layoutid,
+ struct xdr_stream *xdr,
+ const struct nfs4_layoutreturn_args *args);
+
+ void (*encode_layoutcommit) (struct pnfs_layout_hdr *layoutid,
+ struct xdr_stream *xdr,
+ const struct nfs4_layoutcommit_args *args);
};
struct pnfs_layout_hdr {
@@ -120,21 +143,22 @@ extern void pnfs_unregister_layoutdriver(struct pnfs_layoutdriver_type *);
extern int nfs4_proc_getdeviceinfo(struct nfs_server *server,
struct pnfs_device *dev);
extern int nfs4_proc_layoutget(struct nfs4_layoutget *lgp);
+extern int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp);
/* pnfs.c */
void get_layout_hdr(struct pnfs_layout_hdr *lo);
void put_lseg(struct pnfs_layout_segment *lseg);
struct pnfs_layout_segment *
pnfs_update_layout(struct inode *ino, struct nfs_open_context *ctx,
- enum pnfs_iomode access_type, gfp_t gfp_flags);
+ loff_t pos, u64 count, enum pnfs_iomode access_type,
+ gfp_t gfp_flags);
void set_pnfs_layoutdriver(struct nfs_server *, u32 id);
void unset_pnfs_layoutdriver(struct nfs_server *);
enum pnfs_try_status pnfs_try_to_write_data(struct nfs_write_data *,
const struct rpc_call_ops *, int);
enum pnfs_try_status pnfs_try_to_read_data(struct nfs_read_data *,
const struct rpc_call_ops *);
-void pnfs_pageio_init_read(struct nfs_pageio_descriptor *, struct inode *);
-void pnfs_pageio_init_write(struct nfs_pageio_descriptor *, struct inode *);
+bool pnfs_generic_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev, struct nfs_page *req);
int pnfs_layout_process(struct nfs4_layoutget *lgp);
void pnfs_free_lseg_list(struct list_head *tmp_list);
void pnfs_destroy_layout(struct nfs_inode *);
@@ -148,13 +172,37 @@ int pnfs_choose_layoutget_stateid(nfs4_stateid *dst,
struct nfs4_state *open_state);
int mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo,
struct list_head *tmp_list,
- u32 iomode);
+ struct pnfs_layout_range *recall_range);
bool pnfs_roc(struct inode *ino);
void pnfs_roc_release(struct inode *ino);
void pnfs_roc_set_barrier(struct inode *ino, u32 barrier);
bool pnfs_roc_drain(struct inode *ino, u32 *barrier);
void pnfs_set_layoutcommit(struct nfs_write_data *wdata);
int pnfs_layoutcommit_inode(struct inode *inode, bool sync);
+int _pnfs_return_layout(struct inode *);
+int pnfs_ld_write_done(struct nfs_write_data *);
+int pnfs_ld_read_done(struct nfs_read_data *);
+
+/* pnfs_dev.c */
+/*
+ * One entry of the device ID cache implemented in pnfs_dev.c.
+ * Lookups match an entry on the (ld, nfs_client, deviceid) triple.
+ */
+struct nfs4_deviceid_node {
+ /* link in the nfs4_deviceid_cache hash bucket */
+ struct hlist_node node;
+ /* owning layout driver; its free_deviceid_node() releases the entry */
+ const struct pnfs_layoutdriver_type *ld;
+ /* client the device ID belongs to */
+ const struct nfs_client *nfs_client;
+ /* the device ID itself (lookup key) */
+ struct nfs4_deviceid deviceid;
+ /* reference count; nfs4_init_deviceid_node() starts it at 1 */
+ atomic_t ref;
+};
+
+void nfs4_print_deviceid(const struct nfs4_deviceid *dev_id);
+struct nfs4_deviceid_node *nfs4_find_get_deviceid(const struct pnfs_layoutdriver_type *, const struct nfs_client *, const struct nfs4_deviceid *);
+struct nfs4_deviceid_node *nfs4_unhash_put_deviceid(const struct pnfs_layoutdriver_type *, const struct nfs_client *, const struct nfs4_deviceid *);
+void nfs4_delete_deviceid(const struct pnfs_layoutdriver_type *, const struct nfs_client *, const struct nfs4_deviceid *);
+void nfs4_init_deviceid_node(struct nfs4_deviceid_node *,
+ const struct pnfs_layoutdriver_type *,
+ const struct nfs_client *,
+ const struct nfs4_deviceid *);
+struct nfs4_deviceid_node *nfs4_insert_deviceid_node(struct nfs4_deviceid_node *);
+bool nfs4_put_deviceid_node(struct nfs4_deviceid_node *);
+void nfs4_deviceid_purge_client(const struct nfs_client *);
static inline int lo_fail_bit(u32 iomode)
{
@@ -223,6 +271,36 @@ static inline void pnfs_clear_request_commit(struct nfs_page *req)
put_lseg(req->wb_commit_lseg);
}
+/*
+ * Tell whether the layout driver in use on @inode's server asks for the
+ * layout to be committed and returned upon a setattr
+ * (PNFS_LAYOUTRET_ON_SETATTR). Always false when pNFS is not enabled.
+ */
+static inline bool
+pnfs_ld_layoutret_on_setattr(struct inode *inode)
+{
+	struct nfs_server *server = NFS_SERVER(inode);
+
+	if (pnfs_enabled_sb(server))
+		return server->pnfs_curr_ld->flags & PNFS_LAYOUTRET_ON_SETATTR;
+
+	return false;
+}
+
+/*
+ * Return the layout held on @ino, if any.
+ * Does nothing and reports 0 when pNFS is not enabled on the server or
+ * the inode has no layout; otherwise defers to _pnfs_return_layout().
+ */
+static inline int pnfs_return_layout(struct inode *ino)
+{
+	if (!pnfs_enabled_sb(NFS_SERVER(ino)))
+		return 0;
+	if (!NFS_I(ino)->layout)
+		return 0;
+
+	return _pnfs_return_layout(ino);
+}
+
+/*
+ * Install the current layout driver's pg_test callback in @pgio.
+ * @pgio is left untouched when no layout driver is set on the server.
+ */
+static inline void pnfs_pageio_init(struct nfs_pageio_descriptor *pgio,
+				    struct inode *inode)
+{
+	struct pnfs_layoutdriver_type *driver;
+
+	driver = NFS_SERVER(inode)->pnfs_curr_ld;
+	if (!driver)
+		return;
+
+	pgio->pg_test = driver->pg_test;
+}
+
#else /* CONFIG_NFS_V4_1 */
static inline void pnfs_destroy_all_layouts(struct nfs_client *clp)
@@ -245,7 +323,8 @@ static inline void put_lseg(struct pnfs_layout_segment *lseg)
static inline struct pnfs_layout_segment *
pnfs_update_layout(struct inode *ino, struct nfs_open_context *ctx,
- enum pnfs_iomode access_type, gfp_t gfp_flags)
+ loff_t pos, u64 count, enum pnfs_iomode access_type,
+ gfp_t gfp_flags)
{
return NULL;
}
@@ -264,6 +343,17 @@ pnfs_try_to_write_data(struct nfs_write_data *data,
return PNFS_NOT_ATTEMPTED;
}
+/* Stub for the #else branch of CONFIG_NFS_V4_1: nothing to return, report 0. */
+static inline int pnfs_return_layout(struct inode *ino)
+{
+ return 0;
+}
+
+/* Stub for the #else branch of CONFIG_NFS_V4_1: never return on setattr. */
+static inline bool
+pnfs_ld_layoutret_on_setattr(struct inode *inode)
+{
+ return false;
+}
+
static inline bool
pnfs_roc(struct inode *ino)
{
@@ -294,16 +384,9 @@ static inline void unset_pnfs_layoutdriver(struct nfs_server *s)
{
}
-static inline void
-pnfs_pageio_init_read(struct nfs_pageio_descriptor *pgio, struct inode *ino)
-{
- pgio->pg_test = NULL;
-}
-
-static inline void
-pnfs_pageio_init_write(struct nfs_pageio_descriptor *pgio, struct inode *ino)
+static inline void pnfs_pageio_init(struct nfs_pageio_descriptor *pgio,
+ struct inode *inode)
{
- pgio->pg_test = NULL;
}
static inline void
@@ -331,6 +414,10 @@ static inline int pnfs_layoutcommit_inode(struct inode *inode, bool sync)
{
return 0;
}
+
+/*
+ * Stub for the #else branch of CONFIG_NFS_V4_1: nothing to purge.
+ * The parameter is const-qualified to match the prototype of the real
+ * implementation, so callers compile identically in both configurations.
+ */
+static inline void nfs4_deviceid_purge_client(const struct nfs_client *ncl)
+{
+}
#endif /* CONFIG_NFS_V4_1 */
#endif /* FS_NFS_PNFS_H */
diff --git a/fs/nfs/pnfs_dev.c b/fs/nfs/pnfs_dev.c
new file mode 100644
index 00000000000..c65e133ce9c
--- /dev/null
+++ b/fs/nfs/pnfs_dev.c
@@ -0,0 +1,270 @@
+/*
+ * Device operations for the pnfs client.
+ *
+ * Copyright (c) 2002
+ * The Regents of the University of Michigan
+ * All Rights Reserved
+ *
+ * Dean Hildebrand <dhildebz@umich.edu>
+ * Garth Goodson <Garth.Goodson@netapp.com>
+ *
+ * Permission is granted to use, copy, create derivative works, and
+ * redistribute this software and such derivative works for any purpose,
+ * so long as the name of the University of Michigan is not used in
+ * any advertising or publicity pertaining to the use or distribution
+ * of this software without specific, written prior authorization. If
+ * the above copyright notice or any other identification of the
+ * University of Michigan is included in any copy of any portion of
+ * this software, then the disclaimer below must also be included.
+ *
+ * This software is provided as is, without representation or warranty
+ * of any kind either express or implied, including without limitation
+ * the implied warranties of merchantability, fitness for a particular
+ * purpose, or noninfringement. The Regents of the University of
+ * Michigan shall not be liable for any damages, including special,
+ * indirect, incidental, or consequential damages, with respect to any
+ * claim arising out of or in connection with the use of the software,
+ * even if it has been or is hereafter advised of the possibility of
+ * such damages.
+ */
+
+#include "pnfs.h"
+
+#define NFSDBG_FACILITY NFSDBG_PNFS
+
+/*
+ * Device ID RCU cache. A device ID is unique per server and layout type.
+ */
+#define NFS4_DEVICE_ID_HASH_BITS 5
+#define NFS4_DEVICE_ID_HASH_SIZE (1 << NFS4_DEVICE_ID_HASH_BITS)
+#define NFS4_DEVICE_ID_HASH_MASK (NFS4_DEVICE_ID_HASH_SIZE - 1)
+
+static struct hlist_head nfs4_deviceid_cache[NFS4_DEVICE_ID_HASH_SIZE];
+static DEFINE_SPINLOCK(nfs4_deviceid_lock);
+
+/* Debug helper: dump a device ID through dprintk() as four u32 hex words. */
+void
+nfs4_print_deviceid(const struct nfs4_deviceid *id)
+{
+	u32 *words = (u32 *)id;
+
+	dprintk("%s: device id= [%x%x%x%x]\n", __func__,
+		words[0], words[1], words[2], words[3]);
+}
+EXPORT_SYMBOL_GPL(nfs4_print_deviceid);
+
+/*
+ * Hash a device ID into a bucket index of nfs4_deviceid_cache.
+ * Folds the NFS4_DEVICEID4_SIZE bytes of id->data with a multiply-by-37
+ * accumulator and masks the result with NFS4_DEVICE_ID_HASH_MASK.
+ */
+static inline u32
+nfs4_deviceid_hash(const struct nfs4_deviceid *id)
+{
+	unsigned char *bytes = (unsigned char *)id->data;
+	unsigned int i;
+	u32 acc = 0;
+
+	for (i = 0; i < NFS4_DEVICEID4_SIZE; i++)
+		acc = acc * 37 + bytes[i];
+
+	return acc & NFS4_DEVICE_ID_HASH_MASK;
+}
+
+/*
+ * Walk bucket @hash for an entry matching (@ld, @clp, @id) whose
+ * reference count is non-zero. No reference is taken on the result.
+ * Uses the _rcu list iterator; every caller in this file wraps the call
+ * in rcu_read_lock()/rcu_read_unlock().
+ */
+static struct nfs4_deviceid_node *
+_lookup_deviceid(const struct pnfs_layoutdriver_type *ld,
+		 const struct nfs_client *clp, const struct nfs4_deviceid *id,
+		 long hash)
+{
+	struct nfs4_deviceid_node *entry;
+	struct hlist_node *pos;
+
+	hlist_for_each_entry_rcu(entry, pos, &nfs4_deviceid_cache[hash], node) {
+		if (entry->ld != ld || entry->nfs_client != clp)
+			continue;
+		if (memcmp(&entry->deviceid, id, sizeof(*id)) != 0)
+			continue;
+		/* Entries whose refcount already hit zero are skipped. */
+		if (atomic_read(&entry->ref))
+			return entry;
+	}
+
+	return NULL;
+}
+
+/*
+ * Look up a deviceid in bucket @hash and take a reference on it if found
+ *
+ * @ld   layout driver the deviceid belongs to
+ * @clp  nfs_client associated with deviceid
+ * @id   deviceid to look up
+ * @hash bucket index, as computed by nfs4_deviceid_hash(@id)
+ *
+ * @ret the referenced node, or NULL when there is no match or its
+ *      reference count dropped to zero before it could be raised.
+ */
+struct nfs4_deviceid_node *
+_find_get_deviceid(const struct pnfs_layoutdriver_type *ld,
+		   const struct nfs_client *clp, const struct nfs4_deviceid *id,
+		   long hash)
+{
+	struct nfs4_deviceid_node *found;
+
+	rcu_read_lock();
+	found = _lookup_deviceid(ld, clp, id, hash);
+	if (found != NULL) {
+		if (!atomic_inc_not_zero(&found->ref))
+			found = NULL;
+	}
+	rcu_read_unlock();
+
+	return found;
+}
+
+/*
+ * Exported lookup: hash @id, then find the matching cache entry and take
+ * a reference on it. Returns NULL when no live entry matches.
+ */
+struct nfs4_deviceid_node *
+nfs4_find_get_deviceid(const struct pnfs_layoutdriver_type *ld,
+		       const struct nfs_client *clp, const struct nfs4_deviceid *id)
+{
+	long bucket = nfs4_deviceid_hash(id);
+
+	return _find_get_deviceid(ld, clp, id, bucket);
+}
+EXPORT_SYMBOL_GPL(nfs4_find_get_deviceid);
+
+/*
+ * Unhash and put deviceid
+ *
+ * @ld layout driver the deviceid belongs to
+ * @clp nfs_client associated with deviceid
+ * @id the deviceid to unhash
+ *
+ * The entry is removed from the cache under nfs4_deviceid_lock; the lock
+ * is dropped before synchronize_rcu() is called, and only afterwards is
+ * the reference dropped.
+ *
+ * @ret the unhashed node, if found and dereferenced to zero, NULL otherwise.
+ * A non-NULL result is no longer referenced by anyone; nfs4_delete_deviceid()
+ * passes it to the driver's free_deviceid_node().
+ */
+struct nfs4_deviceid_node *
+nfs4_unhash_put_deviceid(const struct pnfs_layoutdriver_type *ld,
+ const struct nfs_client *clp, const struct nfs4_deviceid *id)
+{
+ struct nfs4_deviceid_node *d;
+
+ spin_lock(&nfs4_deviceid_lock);
+ rcu_read_lock();
+ d = _lookup_deviceid(ld, clp, id, nfs4_deviceid_hash(id));
+ rcu_read_unlock();
+ if (!d) {
+ spin_unlock(&nfs4_deviceid_lock);
+ return NULL;
+ }
+ hlist_del_init_rcu(&d->node);
+ spin_unlock(&nfs4_deviceid_lock);
+ synchronize_rcu();
+
+ /* balance the initial ref set in nfs4_init_deviceid_node */
+ if (atomic_dec_and_test(&d->ref))
+ return d;
+
+ return NULL;
+}
+EXPORT_SYMBOL_GPL(nfs4_unhash_put_deviceid);
+
+/*
+ * Delete a deviceid from cache
+ *
+ * @ld  layout driver the deviceid belongs to
+ * @clp struct nfs_client qualifying the deviceid
+ * @id  deviceid to delete
+ *
+ * The entry is unhashed and its reference dropped; it is handed to the
+ * driver's free_deviceid_node() only when that was the last reference.
+ */
+void
+nfs4_delete_deviceid(const struct pnfs_layoutdriver_type *ld,
+		     const struct nfs_client *clp, const struct nfs4_deviceid *id)
+{
+	struct nfs4_deviceid_node *node;
+
+	node = nfs4_unhash_put_deviceid(ld, clp, id);
+	if (node)
+		node->ld->free_deviceid_node(node);
+}
+EXPORT_SYMBOL_GPL(nfs4_delete_deviceid);
+
+/*
+ * Prepare @d for insertion into the cache: record its identifying
+ * (ld, nfs_client, id) triple, leave it unhashed and give it one
+ * initial reference.
+ */
+void
+nfs4_init_deviceid_node(struct nfs4_deviceid_node *d,
+			const struct pnfs_layoutdriver_type *ld,
+			const struct nfs_client *nfs_client,
+			const struct nfs4_deviceid *id)
+{
+	d->deviceid = *id;
+	d->nfs_client = nfs_client;
+	d->ld = ld;
+	atomic_set(&d->ref, 1);
+	INIT_HLIST_NODE(&d->node);
+}
+EXPORT_SYMBOL_GPL(nfs4_init_deviceid_node);
+
+/*
+ * Insert a deviceid node into the cache unless an equivalent one exists
+ *
+ * @new new deviceid node
+ * Note that the caller must set up the following members:
+ *   new->ld
+ *   new->nfs_client
+ *   new->deviceid
+ *
+ * @ret @new when it was inserted; otherwise the entry already in the
+ *      cache, on which an additional reference has been taken.
+ */
+struct nfs4_deviceid_node *
+nfs4_insert_deviceid_node(struct nfs4_deviceid_node *new)
+{
+	struct nfs4_deviceid_node *existing;
+	/* The bucket depends only on the ID, so compute it outside the lock. */
+	long bucket = nfs4_deviceid_hash(&new->deviceid);
+
+	spin_lock(&nfs4_deviceid_lock);
+	existing = _find_get_deviceid(new->ld, new->nfs_client,
+				      &new->deviceid, bucket);
+	if (!existing)
+		hlist_add_head_rcu(&new->node, &nfs4_deviceid_cache[bucket]);
+	spin_unlock(&nfs4_deviceid_lock);
+
+	return existing ? existing : new;
+}
+EXPORT_SYMBOL_GPL(nfs4_insert_deviceid_node);
+
+/*
+ * Drop one reference on a deviceid node; on the last reference unhash it,
+ * wait for an RCU grace period and free it through the layout driver.
+ *
+ * @d deviceid node to put
+ *
+ * @ret true iff the node was deleted
+ */
+bool
+nfs4_put_deviceid_node(struct nfs4_deviceid_node *d)
+{
+	/* Takes nfs4_deviceid_lock only when the count reaches zero. */
+	if (atomic_dec_and_lock(&d->ref, &nfs4_deviceid_lock)) {
+		hlist_del_init_rcu(&d->node);
+		spin_unlock(&nfs4_deviceid_lock);
+		synchronize_rcu();
+		d->ld->free_deviceid_node(d);
+		return true;
+	}
+
+	return false;
+}
+EXPORT_SYMBOL_GPL(nfs4_put_deviceid_node);
+
+/*
+ * Purge from bucket @hash every cached device ID that belongs to @clp.
+ *
+ * Entries are handled one at a time: the entry is unhashed under
+ * nfs4_deviceid_lock, the lock is dropped, and only then do we wait for
+ * an RCU grace period and drop the cache's reference. This keeps
+ * synchronize_rcu(), which may sleep, out of the spinlocked section, and
+ * never re-links d->node while a concurrent RCU reader could still be
+ * following its ->next pointer. The price is one grace period per purged
+ * entry.
+ */
+static void
+_deviceid_purge_client(const struct nfs_client *clp, long hash)
+{
+	struct nfs4_deviceid_node *d, *victim;
+	struct hlist_node *n;
+
+	for (;;) {
+		victim = NULL;
+
+		spin_lock(&nfs4_deviceid_lock);
+		rcu_read_lock();
+		hlist_for_each_entry_rcu(d, n, &nfs4_deviceid_cache[hash], node)
+			if (d->nfs_client == clp && atomic_read(&d->ref)) {
+				victim = d;
+				break;
+			}
+		rcu_read_unlock();
+		if (victim)
+			hlist_del_init_rcu(&victim->node);
+		spin_unlock(&nfs4_deviceid_lock);
+
+		/* Bucket holds no further entry for this client. */
+		if (!victim)
+			return;
+
+		synchronize_rcu();
+		/* Drop the reference the cache held on the entry. */
+		if (atomic_dec_and_test(&victim->ref))
+			victim->ld->free_deviceid_node(victim);
+	}
+}
+
+/*
+ * Purge every cached device ID that belongs to @clp, bucket by bucket.
+ * Locking is done per bucket in _deviceid_purge_client(); no lock is
+ * held across the loop.
+ */
+void
+nfs4_deviceid_purge_client(const struct nfs_client *clp)
+{
+	long h;
+
+	for (h = 0; h < NFS4_DEVICE_ID_HASH_SIZE; h++)
+		_deviceid_purge_client(clp, h);
+}
diff --git a/fs/nfs/read.c b/fs/nfs/read.c
index 2bcf0dc306a..20a7f952e24 100644
--- a/fs/nfs/read.c
+++ b/fs/nfs/read.c
@@ -288,7 +288,9 @@ static int nfs_pagein_multi(struct nfs_pageio_descriptor *desc)
atomic_set(&req->wb_complete, requests);
BUG_ON(desc->pg_lseg != NULL);
- lseg = pnfs_update_layout(desc->pg_inode, req->wb_context, IOMODE_READ, GFP_KERNEL);
+ lseg = pnfs_update_layout(desc->pg_inode, req->wb_context,
+ req_offset(req), desc->pg_count,
+ IOMODE_READ, GFP_KERNEL);
ClearPageError(page);
offset = 0;
nbytes = desc->pg_count;
@@ -351,7 +353,9 @@ static int nfs_pagein_one(struct nfs_pageio_descriptor *desc)
}
req = nfs_list_entry(data->pages.next);
if ((!lseg) && list_is_singular(&data->pages))
- lseg = pnfs_update_layout(desc->pg_inode, req->wb_context, IOMODE_READ, GFP_KERNEL);
+ lseg = pnfs_update_layout(desc->pg_inode, req->wb_context,
+ req_offset(req), desc->pg_count,
+ IOMODE_READ, GFP_KERNEL);
ret = nfs_read_rpcsetup(req, data, &nfs_read_full_ops, desc->pg_count,
0, lseg);
@@ -660,7 +664,6 @@ int nfs_readpages(struct file *filp, struct address_space *mapping,
if (ret == 0)
goto read_complete; /* all pages were read */
- pnfs_pageio_init_read(&pgio, inode);
if (rsize < PAGE_CACHE_SIZE)
nfs_pageio_init(&pgio, inode, nfs_pagein_multi, rsize, 0);
else
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index e288f06d3fa..ce40e5c568b 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -63,6 +63,7 @@
#include "iostat.h"
#include "internal.h"
#include "fscache.h"
+#include "pnfs.h"
#define NFSDBG_FACILITY NFSDBG_VFS
@@ -732,6 +733,28 @@ static int nfs_show_options(struct seq_file *m, struct vfsmount *mnt)
return 0;
}
+#ifdef CONFIG_NFS_V4_1
+/* Append ",sessions" to @m when the server's client has a session. */
+void show_sessions(struct seq_file *m, struct nfs_server *server)
+{
+	if (!nfs4_has_session(server->nfs_client))
+		return;
+
+	seq_printf(m, ",sessions");
+}
+#else /* CONFIG_NFS_V4_1 */
+/* Without CONFIG_NFS_V4_1 nothing is printed. */
+void show_sessions(struct seq_file *m, struct nfs_server *server)
+{
+}
+#endif /* CONFIG_NFS_V4_1 */
+
+#ifdef CONFIG_NFS_V4_1
+/*
+ * Append ",pnfs=<name>" to @m, where <name> is the name of the layout
+ * driver currently set on @server, or "not configured" when none is set.
+ */
+void show_pnfs(struct seq_file *m, struct nfs_server *server)
+{
+	seq_printf(m, ",pnfs=");
+
+	if (!server->pnfs_curr_ld) {
+		seq_printf(m, "not configured");
+		return;
+	}
+
+	seq_printf(m, "%s", server->pnfs_curr_ld->name);
+}
+#else /* CONFIG_NFS_V4_1 */
+/* Without CONFIG_NFS_V4_1 nothing is printed. */
+void show_pnfs(struct seq_file *m, struct nfs_server *server)
+{
+}
+#endif /* CONFIG_NFS_V4_1 */
static int nfs_show_devname(struct seq_file *m, struct vfsmount *mnt)
{
@@ -792,6 +815,8 @@ static int nfs_show_stats(struct seq_file *m, struct vfsmount *mnt)
seq_printf(m, "bm0=0x%x", nfss->attr_bitmask[0]);
seq_printf(m, ",bm1=0x%x", nfss->attr_bitmask[1]);
seq_printf(m, ",acl=0x%x", nfss->acl_bitmask);
+ show_sessions(m, nfss);
+ show_pnfs(m, nfss);
}
#endif
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 49c715b4ac9..e268e3b2349 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -939,7 +939,9 @@ static int nfs_flush_multi(struct nfs_pageio_descriptor *desc)
atomic_set(&req->wb_complete, requests);
BUG_ON(desc->pg_lseg);
- lseg = pnfs_update_layout(desc->pg_inode, req->wb_context, IOMODE_RW, GFP_NOFS);
+ lseg = pnfs_update_layout(desc->pg_inode, req->wb_context,
+ req_offset(req), desc->pg_count,
+ IOMODE_RW, GFP_NOFS);
ClearPageError(page);
offset = 0;
nbytes = desc->pg_count;
@@ -1013,7 +1015,9 @@ static int nfs_flush_one(struct nfs_pageio_descriptor *desc)
}
req = nfs_list_entry(data->pages.next);
if ((!lseg) && list_is_singular(&data->pages))
- lseg = pnfs_update_layout(desc->pg_inode, req->wb_context, IOMODE_RW, GFP_NOFS);
+ lseg = pnfs_update_layout(desc->pg_inode, req->wb_context,
+ req_offset(req), desc->pg_count,
+ IOMODE_RW, GFP_NOFS);
if ((desc->pg_ioflags & FLUSH_COND_STABLE) &&
(desc->pg_moreio || NFS_I(desc->pg_inode)->ncommit))
@@ -1032,8 +1036,6 @@ static void nfs_pageio_init_write(struct nfs_pageio_descriptor *pgio,
{
size_t wsize = NFS_SERVER(inode)->wsize;
- pnfs_pageio_init_write(pgio, inode);
-
if (wsize < PAGE_CACHE_SIZE)
nfs_pageio_init(pgio, inode, nfs_flush_multi, wsize, ioflags);
else
diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c
index ad000aeb21a..b9566e46219 100644
--- a/fs/nfsd/export.c
+++ b/fs/nfsd/export.c
@@ -1354,12 +1354,6 @@ exp_pseudoroot(struct svc_rqst *rqstp, struct svc_fh *fhp)
if (IS_ERR(exp))
return nfserrno(PTR_ERR(exp));
rv = fh_compose(fhp, exp, exp->ex_path.dentry, NULL);
- if (rv)
- goto out;
- rv = check_nfsd_access(exp, rqstp);
- if (rv)
- fh_put(fhp);
-out:
exp_put(exp);
return rv;
}
diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c
index 2247fc91d5e..9095f3c21df 100644
--- a/fs/nfsd/nfs3proc.c
+++ b/fs/nfsd/nfs3proc.c
@@ -245,7 +245,7 @@ nfsd3_proc_create(struct svc_rqst *rqstp, struct nfsd3_createargs *argp,
}
/* Now create the file and set attributes */
- nfserr = nfsd_create_v3(rqstp, dirfhp, argp->name, argp->len,
+ nfserr = do_nfsd_create(rqstp, dirfhp, argp->name, argp->len,
attr, newfhp,
argp->createmode, argp->verf, NULL, NULL);
diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c
index ad48faca20f..08c6e36ab2e 100644
--- a/fs/nfsd/nfs3xdr.c
+++ b/fs/nfsd/nfs3xdr.c
@@ -842,7 +842,7 @@ out:
return rv;
}
-__be32 *encode_entryplus_baggage(struct nfsd3_readdirres *cd, __be32 *p, const char *name, int namlen)
+static __be32 *encode_entryplus_baggage(struct nfsd3_readdirres *cd, __be32 *p, const char *name, int namlen)
{
struct svc_fh fh;
int err;
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index 5fcb1396a7e..3a6dbd70b34 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -196,9 +196,9 @@ do_open_lookup(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_o
/*
* Note: create modes (UNCHECKED,GUARDED...) are the same
- * in NFSv4 as in v3.
+ * in NFSv4 as in v3 except EXCLUSIVE4_1.
*/
- status = nfsd_create_v3(rqstp, current_fh, open->op_fname.data,
+ status = do_nfsd_create(rqstp, current_fh, open->op_fname.data,
open->op_fname.len, &open->op_iattr,
&resfh, open->op_createmode,
(u32 *)open->op_verf.data,
@@ -403,7 +403,7 @@ nfsd4_putfh(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
cstate->current_fh.fh_handle.fh_size = putfh->pf_fhlen;
memcpy(&cstate->current_fh.fh_handle.fh_base, putfh->pf_fhval,
putfh->pf_fhlen);
- return fh_verify(rqstp, &cstate->current_fh, 0, NFSD_MAY_NOP);
+ return fh_verify(rqstp, &cstate->current_fh, 0, NFSD_MAY_BYPASS_GSS);
}
static __be32
@@ -762,6 +762,9 @@ nfsd4_secinfo(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
__be32 err;
fh_init(&resfh, NFS4_FHSIZE);
+ err = fh_verify(rqstp, &cstate->current_fh, S_IFDIR, NFSD_MAY_EXEC);
+ if (err)
+ return err;
err = nfsd_lookup_dentry(rqstp, &cstate->current_fh,
secinfo->si_name, secinfo->si_namelen,
&exp, &dentry);
@@ -986,6 +989,9 @@ enum nfsd4_op_flags {
ALLOWED_WITHOUT_FH = 1 << 0, /* No current filehandle required */
ALLOWED_ON_ABSENT_FS = 1 << 1, /* ops processed on absent fs */
ALLOWED_AS_FIRST_OP = 1 << 2, /* ops reqired first in compound */
+ /* For rfc 5661 section 2.6.3.1.1: */
+ OP_HANDLES_WRONGSEC = 1 << 3,
+ OP_IS_PUTFH_LIKE = 1 << 4,
};
struct nfsd4_operation {
@@ -1031,6 +1037,44 @@ static __be32 nfs41_check_op_ordering(struct nfsd4_compoundargs *args)
return nfs_ok;
}
+static inline struct nfsd4_operation *OPDESC(struct nfsd4_op *op)
+{
+ return &nfsd4_ops[op->opnum];
+}
+
+static bool need_wrongsec_check(struct svc_rqst *rqstp)
+{
+ struct nfsd4_compoundres *resp = rqstp->rq_resp;
+ struct nfsd4_compoundargs *argp = rqstp->rq_argp;
+ struct nfsd4_op *this = &argp->ops[resp->opcnt - 1];
+ struct nfsd4_op *next = &argp->ops[resp->opcnt];
+ struct nfsd4_operation *thisd;
+ struct nfsd4_operation *nextd;
+
+ thisd = OPDESC(this);
+ /*
+ * Most ops check wrongsec on our own; only the putfh-like ops
+ * have special rules.
+ */
+ if (!(thisd->op_flags & OP_IS_PUTFH_LIKE))
+ return false;
+ /*
+ * rfc 5661 2.6.3.1.1.6: don't bother erroring out a
+ * put-filehandle operation if we're not going to use the
+ * result:
+ */
+ if (argp->opcnt == resp->opcnt)
+ return false;
+
+ nextd = OPDESC(next);
+ /*
+ * Rest of 2.6.3.1.1: certain operations will return WRONGSEC
+ * errors themselves as necessary; others should check for them
+ * now:
+ */
+ return !(nextd->op_flags & OP_HANDLES_WRONGSEC);
+}
+
/*
* COMPOUND call.
*/
@@ -1108,7 +1152,7 @@ nfsd4_proc_compound(struct svc_rqst *rqstp,
goto encode_op;
}
- opdesc = &nfsd4_ops[op->opnum];
+ opdesc = OPDESC(op);
if (!cstate->current_fh.fh_dentry) {
if (!(opdesc->op_flags & ALLOWED_WITHOUT_FH)) {
@@ -1126,6 +1170,9 @@ nfsd4_proc_compound(struct svc_rqst *rqstp,
else
BUG_ON(op->status == nfs_ok);
+ if (!op->status && need_wrongsec_check(rqstp))
+ op->status = check_nfsd_access(cstate->current_fh.fh_export, rqstp);
+
encode_op:
/* Only from SEQUENCE */
if (resp->cstate.status == nfserr_replay_cache) {
@@ -1217,10 +1264,12 @@ static struct nfsd4_operation nfsd4_ops[] = {
},
[OP_LOOKUP] = {
.op_func = (nfsd4op_func)nfsd4_lookup,
+ .op_flags = OP_HANDLES_WRONGSEC,
.op_name = "OP_LOOKUP",
},
[OP_LOOKUPP] = {
.op_func = (nfsd4op_func)nfsd4_lookupp,
+ .op_flags = OP_HANDLES_WRONGSEC,
.op_name = "OP_LOOKUPP",
},
[OP_NVERIFY] = {
@@ -1229,6 +1278,7 @@ static struct nfsd4_operation nfsd4_ops[] = {
},
[OP_OPEN] = {
.op_func = (nfsd4op_func)nfsd4_open,
+ .op_flags = OP_HANDLES_WRONGSEC,
.op_name = "OP_OPEN",
},
[OP_OPEN_CONFIRM] = {
@@ -1241,17 +1291,20 @@ static struct nfsd4_operation nfsd4_ops[] = {
},
[OP_PUTFH] = {
.op_func = (nfsd4op_func)nfsd4_putfh,
- .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS,
+ .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS
+ | OP_IS_PUTFH_LIKE,
.op_name = "OP_PUTFH",
},
[OP_PUTPUBFH] = {
.op_func = (nfsd4op_func)nfsd4_putrootfh,
- .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS,
+ .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS
+ | OP_IS_PUTFH_LIKE,
.op_name = "OP_PUTPUBFH",
},
[OP_PUTROOTFH] = {
.op_func = (nfsd4op_func)nfsd4_putrootfh,
- .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS,
+ .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS
+ | OP_IS_PUTFH_LIKE,
.op_name = "OP_PUTROOTFH",
},
[OP_READ] = {
@@ -1281,15 +1334,18 @@ static struct nfsd4_operation nfsd4_ops[] = {
},
[OP_RESTOREFH] = {
.op_func = (nfsd4op_func)nfsd4_restorefh,
- .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS,
+ .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS
+ | OP_IS_PUTFH_LIKE,
.op_name = "OP_RESTOREFH",
},
[OP_SAVEFH] = {
.op_func = (nfsd4op_func)nfsd4_savefh,
+ .op_flags = OP_HANDLES_WRONGSEC,
.op_name = "OP_SAVEFH",
},
[OP_SECINFO] = {
.op_func = (nfsd4op_func)nfsd4_secinfo,
+ .op_flags = OP_HANDLES_WRONGSEC,
.op_name = "OP_SECINFO",
},
[OP_SETATTR] = {
@@ -1353,6 +1409,7 @@ static struct nfsd4_operation nfsd4_ops[] = {
},
[OP_SECINFO_NO_NAME] = {
.op_func = (nfsd4op_func)nfsd4_secinfo_no_name,
+ .op_flags = OP_HANDLES_WRONGSEC,
.op_name = "OP_SECINFO_NO_NAME",
},
};
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 4cf04e11c66..e98f3c2e949 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -1519,6 +1519,9 @@ nfsd4_create_session(struct svc_rqst *rqstp,
bool confirm_me = false;
int status = 0;
+ if (cr_ses->flags & ~SESSION4_FLAG_MASK_A)
+ return nfserr_inval;
+
nfs4_lock_state();
unconf = find_unconfirmed_client(&cr_ses->clientid);
conf = find_confirmed_client(&cr_ses->clientid);
@@ -1637,8 +1640,9 @@ __be32 nfsd4_bind_conn_to_session(struct svc_rqst *rqstp,
return nfserr_badsession;
status = nfsd4_map_bcts_dir(&bcts->dir);
- nfsd4_new_conn(rqstp, cstate->session, bcts->dir);
- return nfs_ok;
+ if (!status)
+ nfsd4_new_conn(rqstp, cstate->session, bcts->dir);
+ return status;
}
static bool nfsd4_compound_in_session(struct nfsd4_session *session, struct nfs4_sessionid *sid)
@@ -1725,6 +1729,13 @@ static void nfsd4_sequence_check_conn(struct nfsd4_conn *new, struct nfsd4_sessi
return;
}
+static bool nfsd4_session_too_many_ops(struct svc_rqst *rqstp, struct nfsd4_session *session)
+{
+ struct nfsd4_compoundargs *args = rqstp->rq_argp;
+
+ return args->opcnt > session->se_fchannel.maxops;
+}
+
__be32
nfsd4_sequence(struct svc_rqst *rqstp,
struct nfsd4_compound_state *cstate,
@@ -1753,6 +1764,10 @@ nfsd4_sequence(struct svc_rqst *rqstp,
if (!session)
goto out;
+ status = nfserr_too_many_ops;
+ if (nfsd4_session_too_many_ops(rqstp, session))
+ goto out;
+
status = nfserr_badslot;
if (seq->slotid >= session->se_fchannel.maxreqs)
goto out;
@@ -1808,6 +1823,8 @@ out:
__be32
nfsd4_reclaim_complete(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_reclaim_complete *rc)
{
+ int status = 0;
+
if (rc->rca_one_fs) {
if (!cstate->current_fh.fh_dentry)
return nfserr_nofilehandle;
@@ -1817,9 +1834,14 @@ nfsd4_reclaim_complete(struct svc_rqst *rqstp, struct nfsd4_compound_state *csta
*/
return nfs_ok;
}
+
nfs4_lock_state();
- if (is_client_expired(cstate->session->se_client)) {
- nfs4_unlock_state();
+ status = nfserr_complete_already;
+ if (cstate->session->se_client->cl_firststate)
+ goto out;
+
+ status = nfserr_stale_clientid;
+ if (is_client_expired(cstate->session->se_client))
/*
* The following error isn't really legal.
* But we only get here if the client just explicitly
@@ -1827,11 +1849,13 @@ nfsd4_reclaim_complete(struct svc_rqst *rqstp, struct nfsd4_compound_state *csta
* error it gets back on an operation for the dead
* client.
*/
- return nfserr_stale_clientid;
- }
+ goto out;
+
+ status = nfs_ok;
nfsd4_create_clid_dir(cstate->session->se_client);
+out:
nfs4_unlock_state();
- return nfs_ok;
+ return status;
}
__be32
@@ -2462,7 +2486,7 @@ find_delegation_file(struct nfs4_file *fp, stateid_t *stid)
return NULL;
}
-int share_access_to_flags(u32 share_access)
+static int share_access_to_flags(u32 share_access)
{
share_access &= ~NFS4_SHARE_WANT_MASK;
@@ -2882,7 +2906,7 @@ out:
return status;
}
-struct lock_manager nfsd4_manager = {
+static struct lock_manager nfsd4_manager = {
};
static void
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index c6766af00d9..99018110321 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -424,15 +424,12 @@ nfsd4_decode_access(struct nfsd4_compoundargs *argp, struct nfsd4_access *access
static __be32 nfsd4_decode_bind_conn_to_session(struct nfsd4_compoundargs *argp, struct nfsd4_bind_conn_to_session *bcts)
{
DECODE_HEAD;
- u32 dummy;
READ_BUF(NFS4_MAX_SESSIONID_LEN + 8);
COPYMEM(bcts->sessionid.data, NFS4_MAX_SESSIONID_LEN);
READ32(bcts->dir);
- /* XXX: Perhaps Tom Tucker could help us figure out how we
- * should be using ctsa_use_conn_in_rdma_mode: */
- READ32(dummy);
-
+ /* XXX: skipping ctsa_use_conn_in_rdma_mode. Perhaps Tom Tucker
+ * could help us figure out how we should be using it. */
DECODE_TAIL;
}
@@ -588,8 +585,6 @@ nfsd4_decode_lockt(struct nfsd4_compoundargs *argp, struct nfsd4_lockt *lockt)
READ_BUF(lockt->lt_owner.len);
READMEM(lockt->lt_owner.data, lockt->lt_owner.len);
- if (argp->minorversion && !zero_clientid(&lockt->lt_clientid))
- return nfserr_inval;
DECODE_TAIL;
}
@@ -3120,7 +3115,7 @@ nfsd4_encode_destroy_session(struct nfsd4_compoundres *resp, int nfserr,
return nfserr;
}
-__be32
+static __be32
nfsd4_encode_sequence(struct nfsd4_compoundres *resp, int nfserr,
struct nfsd4_sequence *seq)
{
diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c
index 55c8e63af0b..90c6aa6d5e0 100644
--- a/fs/nfsd/nfsfh.c
+++ b/fs/nfsd/nfsfh.c
@@ -344,7 +344,7 @@ fh_verify(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, int access)
* which clients virtually always use auth_sys for,
* even while using RPCSEC_GSS for NFS.
*/
- if (access & NFSD_MAY_LOCK)
+ if (access & NFSD_MAY_LOCK || access & NFSD_MAY_BYPASS_GSS)
goto skip_pseudoflavor_check;
/*
* Clients may expect to be able to use auth_sys during mount,
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 129f3c9f62d..d5718273bb3 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -181,16 +181,10 @@ nfsd_lookup_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp,
struct svc_export *exp;
struct dentry *dparent;
struct dentry *dentry;
- __be32 err;
int host_err;
dprintk("nfsd: nfsd_lookup(fh %s, %.*s)\n", SVCFH_fmt(fhp), len,name);
- /* Obtain dentry and export. */
- err = fh_verify(rqstp, fhp, S_IFDIR, NFSD_MAY_EXEC);
- if (err)
- return err;
-
dparent = fhp->fh_dentry;
exp = fhp->fh_export;
exp_get(exp);
@@ -254,6 +248,9 @@ nfsd_lookup(struct svc_rqst *rqstp, struct svc_fh *fhp, const char *name,
struct dentry *dentry;
__be32 err;
+ err = fh_verify(rqstp, fhp, S_IFDIR, NFSD_MAY_EXEC);
+ if (err)
+ return err;
err = nfsd_lookup_dentry(rqstp, fhp, name, len, &exp, &dentry);
if (err)
return err;
@@ -877,13 +874,11 @@ static __be32
nfsd_vfs_read(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
loff_t offset, struct kvec *vec, int vlen, unsigned long *count)
{
- struct inode *inode;
mm_segment_t oldfs;
__be32 err;
int host_err;
err = nfserr_perm;
- inode = file->f_path.dentry->d_inode;
if (file->f_op->splice_read && rqstp->rq_splice_ok) {
struct splice_desc sd = {
@@ -1340,11 +1335,18 @@ out_nfserr:
}
#ifdef CONFIG_NFSD_V3
+
+static inline int nfsd_create_is_exclusive(int createmode)
+{
+ return createmode == NFS3_CREATE_EXCLUSIVE
+ || createmode == NFS4_CREATE_EXCLUSIVE4_1;
+}
+
/*
- * NFSv3 version of nfsd_create
+ * NFSv3 and NFSv4 version of nfsd_create
*/
__be32
-nfsd_create_v3(struct svc_rqst *rqstp, struct svc_fh *fhp,
+do_nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp,
char *fname, int flen, struct iattr *iap,
struct svc_fh *resfhp, int createmode, u32 *verifier,
int *truncp, int *created)
@@ -1396,7 +1398,7 @@ nfsd_create_v3(struct svc_rqst *rqstp, struct svc_fh *fhp,
if (err)
goto out;
- if (createmode == NFS3_CREATE_EXCLUSIVE) {
+ if (nfsd_create_is_exclusive(createmode)) {
/* solaris7 gets confused (bugid 4218508) if these have
* the high bit set, so just clear the high bits. If this is
* ever changed to use different attrs for storing the
@@ -1437,6 +1439,11 @@ nfsd_create_v3(struct svc_rqst *rqstp, struct svc_fh *fhp,
&& dchild->d_inode->i_atime.tv_sec == v_atime
&& dchild->d_inode->i_size == 0 )
break;
+ case NFS4_CREATE_EXCLUSIVE4_1:
+ if ( dchild->d_inode->i_mtime.tv_sec == v_mtime
+ && dchild->d_inode->i_atime.tv_sec == v_atime
+ && dchild->d_inode->i_size == 0 )
+ goto set_attr;
/* fallthru */
case NFS3_CREATE_GUARDED:
err = nfserr_exist;
@@ -1455,7 +1462,7 @@ nfsd_create_v3(struct svc_rqst *rqstp, struct svc_fh *fhp,
nfsd_check_ignore_resizing(iap);
- if (createmode == NFS3_CREATE_EXCLUSIVE) {
+ if (nfsd_create_is_exclusive(createmode)) {
/* Cram the verifier into atime/mtime */
iap->ia_valid = ATTR_MTIME|ATTR_ATIME
| ATTR_MTIME_SET|ATTR_ATIME_SET;
@@ -2034,7 +2041,7 @@ nfsd_permission(struct svc_rqst *rqstp, struct svc_export *exp,
struct inode *inode = dentry->d_inode;
int err;
- if (acc == NFSD_MAY_NOP)
+ if ((acc & NFSD_MAY_MASK) == NFSD_MAY_NOP)
return 0;
#if 0
dprintk("nfsd: permission 0x%x%s%s%s%s%s%s%s mode 0%o%s%s%s\n",
diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h
index 9a370a5e36b..e0bbac04d1d 100644
--- a/fs/nfsd/vfs.h
+++ b/fs/nfsd/vfs.h
@@ -17,10 +17,14 @@
#define NFSD_MAY_SATTR 8
#define NFSD_MAY_TRUNC 16
#define NFSD_MAY_LOCK 32
+#define NFSD_MAY_MASK 63
+
+/* extra hints to permission and open routines: */
#define NFSD_MAY_OWNER_OVERRIDE 64
#define NFSD_MAY_LOCAL_ACCESS 128 /* IRIX doing local access check on device special file*/
#define NFSD_MAY_BYPASS_GSS_ON_ROOT 256
#define NFSD_MAY_NOT_BREAK_LEASE 512
+#define NFSD_MAY_BYPASS_GSS 1024
#define NFSD_MAY_CREATE (NFSD_MAY_EXEC|NFSD_MAY_WRITE)
#define NFSD_MAY_REMOVE (NFSD_MAY_EXEC|NFSD_MAY_WRITE|NFSD_MAY_TRUNC)
@@ -54,7 +58,7 @@ __be32 nfsd_create(struct svc_rqst *, struct svc_fh *,
int type, dev_t rdev, struct svc_fh *res);
#ifdef CONFIG_NFSD_V3
__be32 nfsd_access(struct svc_rqst *, struct svc_fh *, u32 *, u32 *);
-__be32 nfsd_create_v3(struct svc_rqst *, struct svc_fh *,
+__be32 do_nfsd_create(struct svc_rqst *, struct svc_fh *,
char *name, int len, struct iattr *attrs,
struct svc_fh *res, int createmode,
u32 *verifier, int *truncp, int *created);
diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c
index 587f1843283..b954878ad6c 100644
--- a/fs/nilfs2/inode.c
+++ b/fs/nilfs2/inode.c
@@ -917,7 +917,7 @@ int nilfs_mark_inode_dirty(struct inode *inode)
* construction. This function can be called both as a single operation
* and as a part of indivisible file operations.
*/
-void nilfs_dirty_inode(struct inode *inode)
+void nilfs_dirty_inode(struct inode *inode, int flags)
{
struct nilfs_transaction_info ti;
struct nilfs_mdt_info *mdi = NILFS_MDT(inode);
diff --git a/fs/nilfs2/namei.c b/fs/nilfs2/namei.c
index 1102a5fbb74..546849b3e88 100644
--- a/fs/nilfs2/namei.c
+++ b/fs/nilfs2/namei.c
@@ -334,8 +334,6 @@ static int nilfs_rmdir(struct inode *dir, struct dentry *dentry)
struct nilfs_transaction_info ti;
int err;
- dentry_unhash(dentry);
-
err = nilfs_transaction_begin(dir->i_sb, &ti, 0);
if (err)
return err;
@@ -371,9 +369,6 @@ static int nilfs_rename(struct inode *old_dir, struct dentry *old_dentry,
struct nilfs_transaction_info ti;
int err;
- if (new_inode && S_ISDIR(new_inode->i_mode))
- dentry_unhash(new_dentry);
-
err = nilfs_transaction_begin(old_dir->i_sb, &ti, 1);
if (unlikely(err))
return err;
diff --git a/fs/nilfs2/nilfs.h b/fs/nilfs2/nilfs.h
index a9c6a531f80..f02b9ad43a2 100644
--- a/fs/nilfs2/nilfs.h
+++ b/fs/nilfs2/nilfs.h
@@ -269,7 +269,7 @@ int nilfs_load_inode_block(struct inode *inode, struct buffer_head **pbh);
extern int nilfs_inode_dirty(struct inode *);
int nilfs_set_file_dirty(struct inode *inode, unsigned nr_dirty);
extern int nilfs_mark_inode_dirty(struct inode *);
-extern void nilfs_dirty_inode(struct inode *);
+extern void nilfs_dirty_inode(struct inode *, int flags);
int nilfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
__u64 start, __u64 len);
diff --git a/fs/omfs/dir.c b/fs/omfs/dir.c
index c368360c35a..3b8d3979e03 100644
--- a/fs/omfs/dir.c
+++ b/fs/omfs/dir.c
@@ -241,11 +241,9 @@ static int omfs_remove(struct inode *dir, struct dentry *dentry)
int ret;
- if (S_ISDIR(inode->i_mode)) {
- dentry_unhash(dentry);
- if (!omfs_dir_is_empty(inode))
- return -ENOTEMPTY;
- }
+ if (S_ISDIR(inode->i_mode) &&
+ !omfs_dir_is_empty(inode))
+ return -ENOTEMPTY;
ret = omfs_delete_entry(dentry);
if (ret)
@@ -382,9 +380,6 @@ static int omfs_rename(struct inode *old_dir, struct dentry *old_dentry,
int err;
if (new_inode) {
- if (S_ISDIR(new_inode->i_mode))
- dentry_unhash(new_dentry);
-
/* overwriting existing file/dir */
err = omfs_remove(new_dir, new_dentry);
if (err)
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 4ede550517a..14def991d9d 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -83,6 +83,9 @@
#include <linux/pid_namespace.h>
#include <linux/fs_struct.h>
#include <linux/slab.h>
+#ifdef CONFIG_HARDWALL
+#include <asm/hardwall.h>
+#endif
#include "internal.h"
/* NOTE:
@@ -2842,6 +2845,9 @@ static const struct pid_entry tgid_base_stuff[] = {
#ifdef CONFIG_TASK_IO_ACCOUNTING
INF("io", S_IRUGO, proc_tgid_io_accounting),
#endif
+#ifdef CONFIG_HARDWALL
+ INF("hardwall", S_IRUGO, proc_pid_hardwall),
+#endif
};
static int proc_tgid_base_readdir(struct file * filp,
@@ -3181,6 +3187,9 @@ static const struct pid_entry tid_base_stuff[] = {
#ifdef CONFIG_TASK_IO_ACCOUNTING
INF("io", S_IRUGO, proc_tid_io_accounting),
#endif
+#ifdef CONFIG_HARDWALL
+ INF("hardwall", S_IRUGO, proc_pid_hardwall),
+#endif
};
static int proc_tid_base_readdir(struct file * filp,
diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c
index 76c8164d565..118662690cd 100644
--- a/fs/reiserfs/namei.c
+++ b/fs/reiserfs/namei.c
@@ -831,8 +831,6 @@ static int reiserfs_rmdir(struct inode *dir, struct dentry *dentry)
INITIALIZE_PATH(path);
struct reiserfs_dir_entry de;
- dentry_unhash(dentry);
-
/* we will be doing 2 balancings and update 2 stat data, we change quotas
* of the owner of the directory and of the owner of the parent directory.
* The quota structure is possibly deleted only on last iput => outside
@@ -1227,9 +1225,6 @@ static int reiserfs_rename(struct inode *old_dir, struct dentry *old_dentry,
unsigned long savelink = 1;
struct timespec ctime;
- if (new_dentry->d_inode && S_ISDIR(new_dentry->d_inode->i_mode))
- dentry_unhash(new_dentry);
-
/* three balancings: (1) old name removal, (2) new name insertion
and (3) maybe "save" link insertion
stat data updates: (1) old directory,
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
index b216ff6be1c..aa91089162c 100644
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -568,7 +568,7 @@ static void destroy_inodecache(void)
}
/* we don't mark inodes dirty, we just log them */
-static void reiserfs_dirty_inode(struct inode *inode)
+static void reiserfs_dirty_inode(struct inode *inode, int flags)
{
struct reiserfs_transaction_handle th;
diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c
index 50f1abccd1c..e8a62f41b45 100644
--- a/fs/reiserfs/xattr.c
+++ b/fs/reiserfs/xattr.c
@@ -98,7 +98,6 @@ static int xattr_rmdir(struct inode *dir, struct dentry *dentry)
reiserfs_mutex_lock_nested_safe(&dentry->d_inode->i_mutex,
I_MUTEX_CHILD, dir->i_sb);
- dentry_unhash(dentry);
error = dir->i_op->rmdir(dir, dentry);
if (!error)
dentry->d_inode->i_flags |= S_DEAD;
diff --git a/fs/squashfs/export.c b/fs/squashfs/export.c
index 730c56248c9..5e1101ff276 100644
--- a/fs/squashfs/export.c
+++ b/fs/squashfs/export.c
@@ -147,7 +147,7 @@ __le64 *squashfs_read_inode_lookup_table(struct super_block *sb,
* table[0] points to the first inode lookup table metadata block,
* this should be less than lookup_table_start
*/
- if (!IS_ERR(table) && table[0] >= lookup_table_start) {
+ if (!IS_ERR(table) && le64_to_cpu(table[0]) >= lookup_table_start) {
kfree(table);
return ERR_PTR(-EINVAL);
}
diff --git a/fs/squashfs/fragment.c b/fs/squashfs/fragment.c
index 1516a6490bf..0ed6edbc5c7 100644
--- a/fs/squashfs/fragment.c
+++ b/fs/squashfs/fragment.c
@@ -90,7 +90,7 @@ __le64 *squashfs_read_fragment_index_table(struct super_block *sb,
* table[0] points to the first fragment table metadata block, this
* should be less than fragment_table_start
*/
- if (!IS_ERR(table) && table[0] >= fragment_table_start) {
+ if (!IS_ERR(table) && le64_to_cpu(table[0]) >= fragment_table_start) {
kfree(table);
return ERR_PTR(-EINVAL);
}
diff --git a/fs/squashfs/id.c b/fs/squashfs/id.c
index a70858e0fb4..d38ea3dab95 100644
--- a/fs/squashfs/id.c
+++ b/fs/squashfs/id.c
@@ -93,7 +93,7 @@ __le64 *squashfs_read_id_index_table(struct super_block *sb,
* table[0] points to the first id lookup table metadata block, this
* should be less than id_table_start
*/
- if (!IS_ERR(table) && table[0] >= id_table_start) {
+ if (!IS_ERR(table) && le64_to_cpu(table[0]) >= id_table_start) {
kfree(table);
return ERR_PTR(-EINVAL);
}
diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c
index 6f26abee359..7438850c62d 100644
--- a/fs/squashfs/super.c
+++ b/fs/squashfs/super.c
@@ -245,7 +245,7 @@ allocate_id_index_table:
msblk->id_table = NULL;
goto failed_mount;
}
- next_table = msblk->id_table[0];
+ next_table = le64_to_cpu(msblk->id_table[0]);
/* Handle inode lookup table */
lookup_table_start = le64_to_cpu(sblk->lookup_table_start);
@@ -261,7 +261,7 @@ allocate_id_index_table:
msblk->inode_lookup_table = NULL;
goto failed_mount;
}
- next_table = msblk->inode_lookup_table[0];
+ next_table = le64_to_cpu(msblk->inode_lookup_table[0]);
sb->s_export_op = &squashfs_export_ops;
@@ -286,7 +286,7 @@ handle_fragments:
msblk->fragment_index = NULL;
goto failed_mount;
}
- next_table = msblk->fragment_index[0];
+ next_table = le64_to_cpu(msblk->fragment_index[0]);
check_directory_table:
/* Sanity check directory_table */
diff --git a/fs/sysv/namei.c b/fs/sysv/namei.c
index e2cc6756f3b..e474fbcf8bd 100644
--- a/fs/sysv/namei.c
+++ b/fs/sysv/namei.c
@@ -196,8 +196,6 @@ static int sysv_rmdir(struct inode * dir, struct dentry * dentry)
struct inode *inode = dentry->d_inode;
int err = -ENOTEMPTY;
- dentry_unhash(dentry);
-
if (sysv_empty_dir(inode)) {
err = sysv_unlink(dir, dentry);
if (!err) {
@@ -224,9 +222,6 @@ static int sysv_rename(struct inode * old_dir, struct dentry * old_dentry,
struct sysv_dir_entry * old_de;
int err = -ENOENT;
- if (new_inode && S_ISDIR(new_inode->i_mode))
- dentry_unhash(new_dentry);
-
old_de = sysv_find_entry(old_dentry, &old_page);
if (!old_de)
goto out;
diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c
index c2b80943560..ef5abd38f0b 100644
--- a/fs/ubifs/dir.c
+++ b/fs/ubifs/dir.c
@@ -656,8 +656,6 @@ static int ubifs_rmdir(struct inode *dir, struct dentry *dentry)
struct ubifs_inode *dir_ui = ubifs_inode(dir);
struct ubifs_budget_req req = { .mod_dent = 1, .dirtied_ino = 2 };
- dentry_unhash(dentry);
-
/*
* Budget request settings: deletion direntry, deletion inode and
* changing the parent inode. If budgeting fails, go ahead anyway
@@ -978,9 +976,6 @@ static int ubifs_rename(struct inode *old_dir, struct dentry *old_dentry,
.dirtied_ino_d = ALIGN(old_inode_ui->data_len, 8) };
struct timespec time;
- if (new_inode && S_ISDIR(new_inode->i_mode))
- dentry_unhash(new_dentry);
-
/*
* Budget request settings: deletion direntry, new direntry, removing
* the old inode, and changing old and new parent directory inodes.
diff --git a/fs/ubifs/io.c b/fs/ubifs/io.c
index 166951e0dcd..3be645e012c 100644
--- a/fs/ubifs/io.c
+++ b/fs/ubifs/io.c
@@ -581,6 +581,7 @@ int ubifs_wbuf_write_nolock(struct ubifs_wbuf *wbuf, void *buf, int len)
ubifs_assert(wbuf->size % c->min_io_size == 0);
ubifs_assert(mutex_is_locked(&wbuf->io_mutex));
ubifs_assert(!c->ro_media && !c->ro_mount);
+ ubifs_assert(!c->space_fixup);
if (c->leb_size - wbuf->offs >= c->max_write_size)
ubifs_assert(!((wbuf->offs + wbuf->size) % c->max_write_size));
@@ -759,6 +760,7 @@ int ubifs_write_node(struct ubifs_info *c, void *buf, int len, int lnum,
ubifs_assert(lnum >= 0 && lnum < c->leb_cnt && offs >= 0);
ubifs_assert(offs % c->min_io_size == 0 && offs < c->leb_size);
ubifs_assert(!c->ro_media && !c->ro_mount);
+ ubifs_assert(!c->space_fixup);
if (c->ro_error)
return -EROFS;
diff --git a/fs/ubifs/journal.c b/fs/ubifs/journal.c
index 34b1679e6e3..cef0460f4c5 100644
--- a/fs/ubifs/journal.c
+++ b/fs/ubifs/journal.c
@@ -669,6 +669,7 @@ out_free:
out_release:
release_head(c, BASEHD);
+ kfree(dent);
out_ro:
ubifs_ro_mode(c, err);
if (last_reference)
diff --git a/fs/ubifs/orphan.c b/fs/ubifs/orphan.c
index bd644bf587a..a5422fffbd6 100644
--- a/fs/ubifs/orphan.c
+++ b/fs/ubifs/orphan.c
@@ -674,7 +674,7 @@ static int kill_orphans(struct ubifs_info *c)
if (IS_ERR(sleb)) {
if (PTR_ERR(sleb) == -EUCLEAN)
sleb = ubifs_recover_leb(c, lnum, 0,
- c->sbuf, 0);
+ c->sbuf, -1);
if (IS_ERR(sleb)) {
err = PTR_ERR(sleb);
break;
diff --git a/fs/ubifs/recovery.c b/fs/ubifs/recovery.c
index 731d9e2e7b5..783d8e0beb7 100644
--- a/fs/ubifs/recovery.c
+++ b/fs/ubifs/recovery.c
@@ -564,19 +564,15 @@ static int fix_unclean_leb(struct ubifs_info *c, struct ubifs_scan_leb *sleb,
}
/**
- * drop_last_node - drop the last node or group of nodes.
+ * drop_last_group - drop the last group of nodes.
* @sleb: scanned LEB information
* @offs: offset of dropped nodes is returned here
- * @grouped: non-zero if whole group of nodes have to be dropped
*
* This is a helper function for 'ubifs_recover_leb()' which drops the last
- * node of the scanned LEB or the last group of nodes if @grouped is not zero.
- * This function returns %1 if a node was dropped and %0 otherwise.
+ * group of nodes of the scanned LEB.
*/
-static int drop_last_node(struct ubifs_scan_leb *sleb, int *offs, int grouped)
+static void drop_last_group(struct ubifs_scan_leb *sleb, int *offs)
{
- int dropped = 0;
-
while (!list_empty(&sleb->nodes)) {
struct ubifs_scan_node *snod;
struct ubifs_ch *ch;
@@ -585,17 +581,40 @@ static int drop_last_node(struct ubifs_scan_leb *sleb, int *offs, int grouped)
list);
ch = snod->node;
if (ch->group_type != UBIFS_IN_NODE_GROUP)
- return dropped;
- dbg_rcvry("dropping node at %d:%d", sleb->lnum, snod->offs);
+ break;
+
+ dbg_rcvry("dropping grouped node at %d:%d",
+ sleb->lnum, snod->offs);
+ *offs = snod->offs;
+ list_del(&snod->list);
+ kfree(snod);
+ sleb->nodes_cnt -= 1;
+ }
+}
+
+/**
+ * drop_last_node - drop the last node.
+ * @sleb: scanned LEB information
+ * @offs: offset of dropped nodes is returned here
+ * @grouped: non-zero if whole group of nodes have to be dropped
+ *
+ * This is a helper function for 'ubifs_recover_leb()' which drops the last
+ * node of the scanned LEB.
+ */
+static void drop_last_node(struct ubifs_scan_leb *sleb, int *offs)
+{
+ struct ubifs_scan_node *snod;
+
+ if (!list_empty(&sleb->nodes)) {
+ snod = list_entry(sleb->nodes.prev, struct ubifs_scan_node,
+ list);
+
+ dbg_rcvry("dropping last node at %d:%d", sleb->lnum, snod->offs);
*offs = snod->offs;
list_del(&snod->list);
kfree(snod);
sleb->nodes_cnt -= 1;
- dropped = 1;
- if (!grouped)
- break;
}
- return dropped;
}
/**
@@ -604,7 +623,8 @@ static int drop_last_node(struct ubifs_scan_leb *sleb, int *offs, int grouped)
* @lnum: LEB number
* @offs: offset
* @sbuf: LEB-sized buffer to use
- * @grouped: nodes may be grouped for recovery
+ * @jhead: journal head number this LEB belongs to (%-1 if the LEB does not
+ * belong to any journal head)
*
* This function does a scan of a LEB, but caters for errors that might have
* been caused by the unclean unmount from which we are attempting to recover.
@@ -612,13 +632,14 @@ static int drop_last_node(struct ubifs_scan_leb *sleb, int *offs, int grouped)
* found, and a negative error code in case of failure.
*/
struct ubifs_scan_leb *ubifs_recover_leb(struct ubifs_info *c, int lnum,
- int offs, void *sbuf, int grouped)
+ int offs, void *sbuf, int jhead)
{
int ret = 0, err, len = c->leb_size - offs, start = offs, min_io_unit;
+ int grouped = jhead == -1 ? 0 : c->jheads[jhead].grouped;
struct ubifs_scan_leb *sleb;
void *buf = sbuf + offs;
- dbg_rcvry("%d:%d", lnum, offs);
+ dbg_rcvry("%d:%d, jhead %d, grouped %d", lnum, offs, jhead, grouped);
sleb = ubifs_start_scan(c, lnum, offs, sbuf);
if (IS_ERR(sleb))
@@ -635,7 +656,7 @@ struct ubifs_scan_leb *ubifs_recover_leb(struct ubifs_info *c, int lnum,
* Scan quietly until there is an error from which we cannot
* recover
*/
- ret = ubifs_scan_a_node(c, buf, len, lnum, offs, 0);
+ ret = ubifs_scan_a_node(c, buf, len, lnum, offs, 1);
if (ret == SCANNED_A_NODE) {
/* A valid node, and not a padding node */
struct ubifs_ch *ch = buf;
@@ -695,59 +716,62 @@ struct ubifs_scan_leb *ubifs_recover_leb(struct ubifs_info *c, int lnum,
* If nodes are grouped, always drop the incomplete group at
* the end.
*/
- drop_last_node(sleb, &offs, 1);
+ drop_last_group(sleb, &offs);
- /*
- * While we are in the middle of the same min. I/O unit keep dropping
- * nodes. So basically, what we want is to make sure that the last min.
- * I/O unit where we saw the corruption is dropped completely with all
- * the uncorrupted node which may possibly sit there.
- *
- * In other words, let's name the min. I/O unit where the corruption
- * starts B, and the previous min. I/O unit A. The below code tries to
- * deal with a situation when half of B contains valid nodes or the end
- * of a valid node, and the second half of B contains corrupted data or
- * garbage. This means that UBIFS had been writing to B just before the
- * power cut happened. I do not know how realistic is this scenario
- * that half of the min. I/O unit had been written successfully and the
- * other half not, but this is possible in our 'failure mode emulation'
- * infrastructure at least.
- *
- * So what is the problem, why we need to drop those nodes? Whey can't
- * we just clean-up the second half of B by putting a padding node
- * there? We can, and this works fine with one exception which was
- * reproduced with power cut emulation testing and happens extremely
- * rarely. The description follows, but it is worth noting that that is
- * only about the GC head, so we could do this trick only if the bud
- * belongs to the GC head, but it does not seem to be worth an
- * additional "if" statement.
- *
- * So, imagine the file-system is full, we run GC which is moving valid
- * nodes from LEB X to LEB Y (obviously, LEB Y is the current GC head
- * LEB). The @c->gc_lnum is -1, which means that GC will retain LEB X
- * and will try to continue. Imagine that LEB X is currently the
- * dirtiest LEB, and the amount of used space in LEB Y is exactly the
- * same as amount of free space in LEB X.
- *
- * And a power cut happens when nodes are moved from LEB X to LEB Y. We
- * are here trying to recover LEB Y which is the GC head LEB. We find
- * the min. I/O unit B as described above. Then we clean-up LEB Y by
- * padding min. I/O unit. And later 'ubifs_rcvry_gc_commit()' function
- * fails, because it cannot find a dirty LEB which could be GC'd into
- * LEB Y! Even LEB X does not match because the amount of valid nodes
- * there does not fit the free space in LEB Y any more! And this is
- * because of the padding node which we added to LEB Y. The
- * user-visible effect of this which I once observed and analysed is
- * that we cannot mount the file-system with -ENOSPC error.
- *
- * So obviously, to make sure that situation does not happen we should
- * free min. I/O unit B in LEB Y completely and the last used min. I/O
- * unit in LEB Y should be A. This is basically what the below code
- * tries to do.
- */
- while (min_io_unit == round_down(offs, c->min_io_size) &&
- min_io_unit != offs &&
- drop_last_node(sleb, &offs, grouped));
+ if (jhead == GCHD) {
+ /*
+ * If this LEB belongs to the GC head then while we are in the
+ * middle of the same min. I/O unit keep dropping nodes. So
+ * basically, what we want is to make sure that the last min.
+ * I/O unit where we saw the corruption is dropped completely
+ * with all the uncorrupted nodes which may possibly sit there.
+ *
+ * In other words, let's name the min. I/O unit where the
+ * corruption starts B, and the previous min. I/O unit A. The
+ * below code tries to deal with a situation when half of B
+ * contains valid nodes or the end of a valid node, and the
+ * second half of B contains corrupted data or garbage. This
+ * means that UBIFS had been writing to B just before the power
+ * cut happened. I do not know how realistic is this scenario
+ * that half of the min. I/O unit had been written successfully
+ * and the other half not, but this is possible in our 'failure
+ * mode emulation' infrastructure at least.
+ *
+ * So what is the problem, why we need to drop those nodes? Why
+ * can't we just clean-up the second half of B by putting a
+ * padding node there? We can, and this works fine with one
+ * exception which was reproduced with power cut emulation
+ * testing and happens extremely rarely.
+ *
+ * Imagine the file-system is full, we run GC which starts
+ * moving valid nodes from LEB X to LEB Y (obviously, LEB Y is
+ * the current GC head LEB). The @c->gc_lnum is -1, which means
+ * that GC will retain LEB X and will try to continue. Imagine
+ * that LEB X is currently the dirtiest LEB, and the amount of
+ * used space in LEB Y is exactly the same as amount of free
+ * space in LEB X.
+ *
+ * And a power cut happens when nodes are moved from LEB X to
+ * LEB Y. We are here trying to recover LEB Y which is the GC
+ * head LEB. We find the min. I/O unit B as described above.
+ * Then we clean-up LEB Y by padding min. I/O unit. And later
+ * 'ubifs_rcvry_gc_commit()' function fails, because it cannot
+ * find a dirty LEB which could be GC'd into LEB Y! Even LEB X
+ * does not match because the amount of valid nodes there does
+ * not fit the free space in LEB Y any more! And this is
+ * because of the padding node which we added to LEB Y. The
+ * user-visible effect of this which I once observed and
+ * analysed is that we cannot mount the file-system with
+ * -ENOSPC error.
+ *
+ * So obviously, to make sure that situation does not happen we
+ * should free min. I/O unit B in LEB Y completely and the last
+ * used min. I/O unit in LEB Y should be A. This is basically
+ * what the below code tries to do.
+ */
+ while (offs > min_io_unit)
+ drop_last_node(sleb, &offs);
+ }
buf = sbuf + offs;
len = c->leb_size - offs;
@@ -881,7 +905,7 @@ struct ubifs_scan_leb *ubifs_recover_log_leb(struct ubifs_info *c, int lnum,
}
ubifs_scan_destroy(sleb);
}
- return ubifs_recover_leb(c, lnum, offs, sbuf, 0);
+ return ubifs_recover_leb(c, lnum, offs, sbuf, -1);
}
/**
diff --git a/fs/ubifs/replay.c b/fs/ubifs/replay.c
index 6617280d167..5e97161ce4d 100644
--- a/fs/ubifs/replay.c
+++ b/fs/ubifs/replay.c
@@ -557,8 +557,7 @@ static int replay_bud(struct ubifs_info *c, struct bud_entry *b)
* these LEBs could possibly be written to at the power cut
* time.
*/
- sleb = ubifs_recover_leb(c, lnum, offs, c->sbuf,
- b->bud->jhead != GCHD);
+ sleb = ubifs_recover_leb(c, lnum, offs, c->sbuf, b->bud->jhead);
else
sleb = ubifs_scan(c, lnum, offs, c->sbuf, 0);
if (IS_ERR(sleb))
diff --git a/fs/ubifs/shrinker.c b/fs/ubifs/shrinker.c
index 46961c00323..9e1d05666fe 100644
--- a/fs/ubifs/shrinker.c
+++ b/fs/ubifs/shrinker.c
@@ -277,13 +277,18 @@ static int kick_a_thread(void)
return 0;
}
-int ubifs_shrinker(struct shrinker *shrink, int nr, gfp_t gfp_mask)
+int ubifs_shrinker(struct shrinker *shrink, struct shrink_control *sc)
{
+ int nr = sc->nr_to_scan;
int freed, contention = 0;
long clean_zn_cnt = atomic_long_read(&ubifs_clean_zn_cnt);
if (nr == 0)
- return clean_zn_cnt;
+ /*
+ * Due to the way UBIFS updates the clean znode counter it may
+ * temporarily be negative.
+ */
+ return clean_zn_cnt >= 0 ? clean_zn_cnt : 1;
if (!clean_zn_cnt) {
/*
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index 6db0bdaa9f7..b5aeb5a8ebe 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -382,7 +382,7 @@ done:
end_writeback(inode);
}
-static void ubifs_dirty_inode(struct inode *inode)
+static void ubifs_dirty_inode(struct inode *inode, int flags)
{
struct ubifs_inode *ui = ubifs_inode(inode);
@@ -811,15 +811,18 @@ static int alloc_wbufs(struct ubifs_info *c)
c->jheads[i].wbuf.sync_callback = &bud_wbuf_callback;
c->jheads[i].wbuf.jhead = i;
+ c->jheads[i].grouped = 1;
}
c->jheads[BASEHD].wbuf.dtype = UBI_SHORTTERM;
/*
* Garbage Collector head likely contains long-term data and
- * does not need to be synchronized by timer.
+ * does not need to be synchronized by timer. Also GC head nodes are
+ * not grouped.
*/
c->jheads[GCHD].wbuf.dtype = UBI_LONGTERM;
c->jheads[GCHD].wbuf.no_timer = 1;
+ c->jheads[GCHD].grouped = 0;
return 0;
}
@@ -1284,12 +1287,25 @@ static int mount_ubifs(struct ubifs_info *c)
if ((c->mst_node->flags & cpu_to_le32(UBIFS_MST_DIRTY)) != 0) {
ubifs_msg("recovery needed");
c->need_recovery = 1;
- if (!c->ro_mount) {
- err = ubifs_recover_inl_heads(c, c->sbuf);
- if (err)
- goto out_master;
- }
- } else if (!c->ro_mount) {
+ }
+
+ if (c->need_recovery && !c->ro_mount) {
+ err = ubifs_recover_inl_heads(c, c->sbuf);
+ if (err)
+ goto out_master;
+ }
+
+ err = ubifs_lpt_init(c, 1, !c->ro_mount);
+ if (err)
+ goto out_master;
+
+ if (!c->ro_mount && c->space_fixup) {
+ err = ubifs_fixup_free_space(c);
+ if (err)
+ goto out_master;
+ }
+
+ if (!c->ro_mount) {
/*
* Set the "dirty" flag so that if we reboot uncleanly we
* will notice this immediately on the next mount.
@@ -1297,13 +1313,9 @@ static int mount_ubifs(struct ubifs_info *c)
c->mst_node->flags |= cpu_to_le32(UBIFS_MST_DIRTY);
err = ubifs_write_master(c);
if (err)
- goto out_master;
+ goto out_lpt;
}
- err = ubifs_lpt_init(c, 1, !c->ro_mount);
- if (err)
- goto out_lpt;
-
err = dbg_check_idx_size(c, c->bi.old_idx_sz);
if (err)
goto out_lpt;
@@ -1396,12 +1408,6 @@ static int mount_ubifs(struct ubifs_info *c)
} else
ubifs_assert(c->lst.taken_empty_lebs > 0);
- if (!c->ro_mount && c->space_fixup) {
- err = ubifs_fixup_free_space(c);
- if (err)
- goto out_infos;
- }
-
err = dbg_check_filesystem(c);
if (err)
goto out_infos;
diff --git a/fs/ubifs/tnc.c b/fs/ubifs/tnc.c
index 8119b1fd8d9..91b4213dde8 100644
--- a/fs/ubifs/tnc.c
+++ b/fs/ubifs/tnc.c
@@ -2876,12 +2876,13 @@ static void tnc_destroy_cnext(struct ubifs_info *c)
*/
void ubifs_tnc_close(struct ubifs_info *c)
{
- long clean_freed;
-
tnc_destroy_cnext(c);
if (c->zroot.znode) {
- clean_freed = ubifs_destroy_tnc_subtree(c->zroot.znode);
- atomic_long_sub(clean_freed, &ubifs_clean_zn_cnt);
+ long n;
+
+ ubifs_destroy_tnc_subtree(c->zroot.znode);
+ n = atomic_long_read(&c->clean_zn_cnt);
+ atomic_long_sub(n, &ubifs_clean_zn_cnt);
}
kfree(c->gap_lebs);
kfree(c->ilebs);
diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h
index 93d1412a06f..f79983d6f86 100644
--- a/fs/ubifs/ubifs.h
+++ b/fs/ubifs/ubifs.h
@@ -722,12 +722,14 @@ struct ubifs_bud {
* struct ubifs_jhead - journal head.
* @wbuf: head's write-buffer
* @buds_list: list of bud LEBs belonging to this journal head
+ * @grouped: non-zero if UBIFS groups nodes when writing to this journal head
*
* Note, the @buds list is protected by the @c->buds_lock.
*/
struct ubifs_jhead {
struct ubifs_wbuf wbuf;
struct list_head buds_list;
+ unsigned int grouped:1;
};
/**
@@ -1614,7 +1616,7 @@ int ubifs_tnc_start_commit(struct ubifs_info *c, struct ubifs_zbranch *zroot);
int ubifs_tnc_end_commit(struct ubifs_info *c);
/* shrinker.c */
-int ubifs_shrinker(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask);
+int ubifs_shrinker(struct shrinker *shrink, struct shrink_control *sc);
/* commit.c */
int ubifs_bg_thread(void *info);
@@ -1742,7 +1744,7 @@ struct inode *ubifs_iget(struct super_block *sb, unsigned long inum);
int ubifs_recover_master_node(struct ubifs_info *c);
int ubifs_write_rcvrd_mst_node(struct ubifs_info *c);
struct ubifs_scan_leb *ubifs_recover_leb(struct ubifs_info *c, int lnum,
- int offs, void *sbuf, int grouped);
+ int offs, void *sbuf, int jhead);
struct ubifs_scan_leb *ubifs_recover_log_leb(struct ubifs_info *c, int lnum,
int offs, void *sbuf);
int ubifs_recover_inl_heads(const struct ubifs_info *c, void *sbuf);
diff --git a/fs/udf/namei.c b/fs/udf/namei.c
index 4d76594c2a8..f1dce848ef9 100644
--- a/fs/udf/namei.c
+++ b/fs/udf/namei.c
@@ -783,8 +783,6 @@ static int udf_rmdir(struct inode *dir, struct dentry *dentry)
struct fileIdentDesc *fi, cfi;
struct kernel_lb_addr tloc;
- dentry_unhash(dentry);
-
retval = -ENOENT;
fi = udf_find_entry(dir, &dentry->d_name, &fibh, &cfi);
if (!fi)
@@ -1083,9 +1081,6 @@ static int udf_rename(struct inode *old_dir, struct dentry *old_dentry,
struct kernel_lb_addr tloc;
struct udf_inode_info *old_iinfo = UDF_I(old_inode);
- if (new_inode && S_ISDIR(new_inode->i_mode))
- dentry_unhash(new_dentry);
-
ofi = udf_find_entry(old_dir, &old_dentry->d_name, &ofibh, &ocfi);
if (ofi) {
if (ofibh.sbh != ofibh.ebh)
diff --git a/fs/ufs/namei.c b/fs/ufs/namei.c
index 953ebdfc5bf..29309e25417 100644
--- a/fs/ufs/namei.c
+++ b/fs/ufs/namei.c
@@ -258,8 +258,6 @@ static int ufs_rmdir (struct inode * dir, struct dentry *dentry)
struct inode * inode = dentry->d_inode;
int err= -ENOTEMPTY;
- dentry_unhash(dentry);
-
lock_ufs(dir->i_sb);
if (ufs_empty_dir (inode)) {
err = ufs_unlink(dir, dentry);
@@ -284,9 +282,6 @@ static int ufs_rename(struct inode *old_dir, struct dentry *old_dentry,
struct ufs_dir_entry *old_de;
int err = -ENOENT;
- if (new_inode && S_ISDIR(new_inode->i_mode))
- dentry_unhash(new_dentry);
-
old_de = ufs_find_entry(old_dir, &old_dentry->d_name, &old_page);
if (!old_de)
goto out;
diff --git a/fs/xattr.c b/fs/xattr.c
index f1ef94974de..f060663ab70 100644
--- a/fs/xattr.c
+++ b/fs/xattr.c
@@ -46,18 +46,22 @@ xattr_permission(struct inode *inode, const char *name, int mask)
return 0;
/*
- * The trusted.* namespace can only be accessed by a privileged user.
+ * The trusted.* namespace can only be accessed by privileged users.
*/
- if (!strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN))
- return (capable(CAP_SYS_ADMIN) ? 0 : -EPERM);
+ if (!strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN)) {
+ if (!capable(CAP_SYS_ADMIN))
+ return (mask & MAY_WRITE) ? -EPERM : -ENODATA;
+ return 0;
+ }
- /* In user.* namespace, only regular files and directories can have
+ /*
+ * In the user.* namespace, only regular files and directories can have
* extended attributes. For sticky directories, only the owner and
- * privileged user can write attributes.
+ * privileged users can write attributes.
*/
if (!strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN)) {
if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode))
- return -EPERM;
+ return (mask & MAY_WRITE) ? -EPERM : -ENODATA;
if (S_ISDIR(inode->i_mode) && (inode->i_mode & S_ISVTX) &&
(mask & MAY_WRITE) && !inode_owner_or_capable(inode))
return -EPERM;
@@ -87,7 +91,11 @@ int __vfs_setxattr_noperm(struct dentry *dentry, const char *name,
{
struct inode *inode = dentry->d_inode;
int error = -EOPNOTSUPP;
+ int issec = !strncmp(name, XATTR_SECURITY_PREFIX,
+ XATTR_SECURITY_PREFIX_LEN);
+ if (issec)
+ inode->i_flags &= ~S_NOSEC;
if (inode->i_op->setxattr) {
error = inode->i_op->setxattr(dentry, name, value, size, flags);
if (!error) {
@@ -95,8 +103,7 @@ int __vfs_setxattr_noperm(struct dentry *dentry, const char *name,
security_inode_post_setxattr(dentry, name, value,
size, flags);
}
- } else if (!strncmp(name, XATTR_SECURITY_PREFIX,
- XATTR_SECURITY_PREFIX_LEN)) {
+ } else if (issec) {
const char *suffix = name + XATTR_SECURITY_PREFIX_LEN;
error = security_inode_setsecurity(inode, suffix, value,
size, flags);
diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
index 98b9c91fcdf..1e3a7ce804d 100644
--- a/fs/xfs/linux-2.6/xfs_super.c
+++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -925,7 +925,8 @@ xfs_fs_inode_init_once(
*/
STATIC void
xfs_fs_dirty_inode(
- struct inode *inode)
+ struct inode *inode,
+ int flags)
{
barrier();
XFS_I(inode)->i_update_core = 1;