summaryrefslogtreecommitdiffstats
path: root/fs
diff options
context:
space:
mode:
Diffstat (limited to 'fs')
-rw-r--r--fs/autofs4/root.c12
-rw-r--r--fs/bio.c23
-rw-r--r--fs/block_dev.c1
-rw-r--r--fs/btrfs/compression.c15
-rw-r--r--fs/btrfs/ctree.h6
-rw-r--r--fs/btrfs/disk-io.c41
-rw-r--r--fs/btrfs/export.c78
-rw-r--r--fs/btrfs/extent-tree.c77
-rw-r--r--fs/btrfs/extent_io.c77
-rw-r--r--fs/btrfs/extent_io.h3
-rw-r--r--fs/btrfs/file.c99
-rw-r--r--fs/btrfs/free-space-cache.c12
-rw-r--r--fs/btrfs/inode.c299
-rw-r--r--fs/btrfs/ioctl.c87
-rw-r--r--fs/btrfs/ioctl.h14
-rw-r--r--fs/btrfs/ordered-data.c67
-rw-r--r--fs/btrfs/ordered-data.h3
-rw-r--r--fs/btrfs/orphan.c6
-rw-r--r--fs/btrfs/super.c43
-rw-r--r--fs/btrfs/transaction.c5
-rw-r--r--fs/btrfs/tree-log.c21
-rw-r--r--fs/btrfs/volumes.c20
-rw-r--r--fs/btrfs/volumes.h2
-rw-r--r--fs/ceph/addr.c6
-rw-r--r--fs/ceph/caps.c17
-rw-r--r--fs/ceph/dir.c23
-rw-r--r--fs/ceph/file.c65
-rw-r--r--fs/ceph/inode.c50
-rw-r--r--fs/ceph/ioctl.h2
-rw-r--r--fs/ceph/locks.c94
-rw-r--r--fs/ceph/mds_client.c49
-rw-r--r--fs/ceph/mds_client.h33
-rw-r--r--fs/ceph/super.h4
-rw-r--r--fs/cifs/Kconfig8
-rw-r--r--fs/cifs/Makefile4
-rw-r--r--fs/cifs/README9
-rw-r--r--fs/cifs/TODO2
-rw-r--r--fs/cifs/cifs_fs_sb.h7
-rw-r--r--fs/cifs/cifsacl.c51
-rw-r--r--fs/cifs/cifsacl.h4
-rw-r--r--fs/cifs/cifsfs.c10
-rw-r--r--fs/cifs/cifsglob.h15
-rw-r--r--fs/cifs/cifsproto.h12
-rw-r--r--fs/cifs/cifssmb.c183
-rw-r--r--fs/cifs/connect.c242
-rw-r--r--fs/cifs/dns_resolve.c2
-rw-r--r--fs/cifs/file.c78
-rw-r--r--fs/cifs/fscache.c12
-rw-r--r--fs/cifs/inode.c58
-rw-r--r--fs/cifs/ioctl.c16
-rw-r--r--fs/cifs/misc.c25
-rw-r--r--fs/cifs/readdir.c41
-rw-r--r--fs/cifs/xattr.c55
-rw-r--r--fs/compat.c28
-rw-r--r--fs/compat_ioctl.c1
-rw-r--r--fs/ecryptfs/super.c1
-rw-r--r--fs/exec.c41
-rw-r--r--fs/ext3/super.c1
-rw-r--r--fs/ext4/ext4.h5
-rw-r--r--fs/ext4/inode.c10
-rw-r--r--fs/ext4/ioctl.c24
-rw-r--r--fs/ext4/mballoc.c2
-rw-r--r--fs/ext4/namei.c2
-rw-r--r--fs/ext4/page-io.c99
-rw-r--r--fs/ext4/resize.c5
-rw-r--r--fs/ext4/super.c125
-rw-r--r--fs/fuse/file.c82
-rw-r--r--fs/gfs2/bmap.c11
-rw-r--r--fs/gfs2/export.c46
-rw-r--r--fs/gfs2/glock.c92
-rw-r--r--fs/gfs2/glock.h28
-rw-r--r--fs/gfs2/glops.c1
-rw-r--r--fs/gfs2/incore.h12
-rw-r--r--fs/gfs2/inode.c161
-rw-r--r--fs/gfs2/inode.h4
-rw-r--r--fs/gfs2/lock_dlm.c15
-rw-r--r--fs/gfs2/ops_inode.c18
-rw-r--r--fs/gfs2/quota.c28
-rw-r--r--fs/gfs2/rgrp.c146
-rw-r--r--fs/gfs2/rgrp.h1
-rw-r--r--fs/gfs2/xattr.c23
-rw-r--r--fs/hugetlbfs/inode.c3
-rw-r--r--fs/ioctl.c40
-rw-r--r--fs/ioprio.c13
-rw-r--r--fs/jbd2/journal.c16
-rw-r--r--fs/lockd/clntlock.c1
-rw-r--r--fs/lockd/clntproc.c1
-rw-r--r--fs/lockd/host.c11
-rw-r--r--fs/lockd/svc4proc.c1
-rw-r--r--fs/lockd/svclock.c1
-rw-r--r--fs/lockd/svcproc.c1
-rw-r--r--fs/locks.c20
-rw-r--r--fs/logfs/journal.c2
-rw-r--r--fs/logfs/logfs.h2
-rw-r--r--fs/logfs/readwrite.c3
-rw-r--r--fs/namei.c3
-rw-r--r--fs/namespace.c1
-rw-r--r--fs/ncpfs/dir.c1
-rw-r--r--fs/ncpfs/file.c1
-rw-r--r--fs/ncpfs/inode.c1
-rw-r--r--fs/ncpfs/ioctl.c1
-rw-r--r--fs/nfs/callback.c1
-rw-r--r--fs/nfs/delegation.c1
-rw-r--r--fs/nfs/dir.c220
-rw-r--r--fs/nfs/direct.c2
-rw-r--r--fs/nfs/file.c2
-rw-r--r--fs/nfs/inode.c1
-rw-r--r--fs/nfs/internal.h9
-rw-r--r--fs/nfs/mount_clnt.c4
-rw-r--r--fs/nfs/nfs2xdr.c8
-rw-r--r--fs/nfs/nfs3xdr.c8
-rw-r--r--fs/nfs/nfs4proc.c13
-rw-r--r--fs/nfs/nfs4xdr.c8
-rw-r--r--fs/nfs/pagelist.c4
-rw-r--r--fs/nfs/read.c1
-rw-r--r--fs/nfs/super.c13
-rw-r--r--fs/nfs/write.c3
-rw-r--r--fs/nfsd/nfs3xdr.c6
-rw-r--r--fs/nfsd/nfs4state.c24
-rw-r--r--fs/nfsd/xdr4.h21
-rw-r--r--fs/nilfs2/dat.c2
-rw-r--r--fs/nilfs2/gcinode.c9
-rw-r--r--fs/nilfs2/ioctl.c16
-rw-r--r--fs/notify/fanotify/fanotify.c6
-rw-r--r--fs/notify/fanotify/fanotify_user.c81
-rw-r--r--fs/notify/inotify/inotify_user.c1
-rw-r--r--fs/ocfs2/aops.c7
-rw-r--r--fs/ocfs2/aops.h23
-rw-r--r--fs/ocfs2/cluster/heartbeat.c14
-rw-r--r--fs/ocfs2/cluster/masklog.c3
-rw-r--r--fs/ocfs2/cluster/masklog.h15
-rw-r--r--fs/ocfs2/dcache.c1
-rw-r--r--fs/ocfs2/dir.c4
-rw-r--r--fs/ocfs2/dlm/dlmdomain.c2
-rw-r--r--fs/ocfs2/dlm/dlmmaster.c40
-rw-r--r--fs/ocfs2/file.c15
-rw-r--r--fs/ocfs2/ocfs2.h6
-rw-r--r--fs/ocfs2/ocfs2_fs.h2
-rw-r--r--fs/ocfs2/stack_user.c2
-rw-r--r--fs/ocfs2/super.c1
-rw-r--r--fs/openpromfs/inode.c2
-rw-r--r--fs/pipe.c14
-rw-r--r--fs/proc/base.c81
-rw-r--r--fs/proc/inode.c1
-rw-r--r--fs/proc/task_mmu.c3
-rw-r--r--fs/read_write.c1
-rw-r--r--fs/reiserfs/inode.c1
-rw-r--r--fs/reiserfs/ioctl.c8
-rw-r--r--fs/reiserfs/journal.c1
-rw-r--r--fs/reiserfs/super.c1
-rw-r--r--fs/reiserfs/xattr_acl.c6
-rw-r--r--fs/splice.c24
-rw-r--r--fs/xfs/linux-2.6/xfs_aops.c101
-rw-r--r--fs/xfs/linux-2.6/xfs_buf.c37
-rw-r--r--fs/xfs/linux-2.6/xfs_ioctl.c2
-rw-r--r--fs/xfs/linux-2.6/xfs_iops.c3
-rw-r--r--fs/xfs/linux-2.6/xfs_super.c3
-rw-r--r--fs/xfs/linux-2.6/xfs_sync.c1
-rw-r--r--fs/xfs/xfs_bmap.c85
-rw-r--r--fs/xfs/xfs_bmap.h5
-rw-r--r--fs/xfs/xfs_dfrag.c13
-rw-r--r--fs/xfs/xfs_error.c3
-rw-r--r--fs/xfs/xfs_error.h5
-rw-r--r--fs/xfs/xfs_filestream.c8
-rw-r--r--fs/xfs/xfs_inode_item.c31
-rw-r--r--fs/xfs/xfs_mount.c1
-rw-r--r--fs/xfs/xfs_quota.h20
-rw-r--r--fs/xfs/xfs_rename.c1
168 files changed, 2687 insertions, 1697 deletions
diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c
index d5c1401f003..d34896cfb19 100644
--- a/fs/autofs4/root.c
+++ b/fs/autofs4/root.c
@@ -980,19 +980,11 @@ static int autofs4_root_ioctl_unlocked(struct inode *inode, struct file *filp,
}
}
-static DEFINE_MUTEX(autofs4_ioctl_mutex);
-
static long autofs4_root_ioctl(struct file *filp,
unsigned int cmd, unsigned long arg)
{
- long ret;
struct inode *inode = filp->f_dentry->d_inode;
-
- mutex_lock(&autofs4_ioctl_mutex);
- ret = autofs4_root_ioctl_unlocked(inode, filp, cmd, arg);
- mutex_unlock(&autofs4_ioctl_mutex);
-
- return ret;
+ return autofs4_root_ioctl_unlocked(inode, filp, cmd, arg);
}
#ifdef CONFIG_COMPAT
@@ -1002,13 +994,11 @@ static long autofs4_root_compat_ioctl(struct file *filp,
struct inode *inode = filp->f_path.dentry->d_inode;
int ret;
- mutex_lock(&autofs4_ioctl_mutex);
if (cmd == AUTOFS_IOC_READY || cmd == AUTOFS_IOC_FAIL)
ret = autofs4_root_ioctl_unlocked(inode, filp, cmd, arg);
else
ret = autofs4_root_ioctl_unlocked(inode, filp, cmd,
(unsigned long)compat_ptr(arg));
- mutex_unlock(&autofs4_ioctl_mutex);
return ret;
}
diff --git a/fs/bio.c b/fs/bio.c
index 8abb2dfb2e7..4bd454fa844 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -370,6 +370,9 @@ struct bio *bio_kmalloc(gfp_t gfp_mask, int nr_iovecs)
{
struct bio *bio;
+ if (nr_iovecs > UIO_MAXIOV)
+ return NULL;
+
bio = kmalloc(sizeof(struct bio) + nr_iovecs * sizeof(struct bio_vec),
gfp_mask);
if (unlikely(!bio))
@@ -697,8 +700,12 @@ static void bio_free_map_data(struct bio_map_data *bmd)
static struct bio_map_data *bio_alloc_map_data(int nr_segs, int iov_count,
gfp_t gfp_mask)
{
- struct bio_map_data *bmd = kmalloc(sizeof(*bmd), gfp_mask);
+ struct bio_map_data *bmd;
+ if (iov_count > UIO_MAXIOV)
+ return NULL;
+
+ bmd = kmalloc(sizeof(*bmd), gfp_mask);
if (!bmd)
return NULL;
@@ -827,6 +834,12 @@ struct bio *bio_copy_user_iov(struct request_queue *q,
end = (uaddr + iov[i].iov_len + PAGE_SIZE - 1) >> PAGE_SHIFT;
start = uaddr >> PAGE_SHIFT;
+ /*
+ * Overflow, abort
+ */
+ if (end < start)
+ return ERR_PTR(-EINVAL);
+
nr_pages += end - start;
len += iov[i].iov_len;
}
@@ -955,6 +968,12 @@ static struct bio *__bio_map_user_iov(struct request_queue *q,
unsigned long end = (uaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
unsigned long start = uaddr >> PAGE_SHIFT;
+ /*
+ * Overflow, abort
+ */
+ if (end < start)
+ return ERR_PTR(-EINVAL);
+
nr_pages += end - start;
/*
* buffer must be aligned to at least hardsector size for now
@@ -982,7 +1001,7 @@ static struct bio *__bio_map_user_iov(struct request_queue *q,
unsigned long start = uaddr >> PAGE_SHIFT;
const int local_nr_pages = end - start;
const int page_limit = cur_page + local_nr_pages;
-
+
ret = get_user_pages_fast(uaddr, local_nr_pages,
write_to_vm, &pages[cur_page]);
if (ret < local_nr_pages) {
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 06e8ff12b97..4230252fd68 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -11,7 +11,6 @@
#include <linux/slab.h>
#include <linux/kmod.h>
#include <linux/major.h>
-#include <linux/smp_lock.h>
#include <linux/device_cgroup.h>
#include <linux/highmem.h>
#include <linux/blkdev.h>
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index 7845d1f7d1d..b50bc4bd5c5 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -91,23 +91,10 @@ static inline int compressed_bio_size(struct btrfs_root *root,
static struct bio *compressed_bio_alloc(struct block_device *bdev,
u64 first_byte, gfp_t gfp_flags)
{
- struct bio *bio;
int nr_vecs;
nr_vecs = bio_get_nr_vecs(bdev);
- bio = bio_alloc(gfp_flags, nr_vecs);
-
- if (bio == NULL && (current->flags & PF_MEMALLOC)) {
- while (!bio && (nr_vecs /= 2))
- bio = bio_alloc(gfp_flags, nr_vecs);
- }
-
- if (bio) {
- bio->bi_size = 0;
- bio->bi_bdev = bdev;
- bio->bi_sector = first_byte >> 9;
- }
- return bio;
+ return btrfs_bio_alloc(bdev, first_byte >> 9, nr_vecs, gfp_flags);
}
static int check_compressed_csum(struct inode *inode,
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 8db9234f6b4..af52f6d7a4d 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -808,9 +808,9 @@ struct btrfs_block_group_cache {
int extents_thresh;
int free_extents;
int total_bitmaps;
- int ro:1;
- int dirty:1;
- int iref:1;
+ unsigned int ro:1;
+ unsigned int dirty:1;
+ unsigned int iref:1;
int disk_cache_state;
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index fb827d0d718..51d2e4de34e 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -28,6 +28,7 @@
#include <linux/freezer.h>
#include <linux/crc32c.h>
#include <linux/slab.h>
+#include <linux/migrate.h>
#include "compat.h"
#include "ctree.h"
#include "disk-io.h"
@@ -355,6 +356,8 @@ static int csum_dirty_buffer(struct btrfs_root *root, struct page *page)
ret = btree_read_extent_buffer_pages(root, eb, start + PAGE_CACHE_SIZE,
btrfs_header_generation(eb));
BUG_ON(ret);
+ WARN_ON(!btrfs_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN));
+
found_start = btrfs_header_bytenr(eb);
if (found_start != start) {
WARN_ON(1);
@@ -693,6 +696,27 @@ static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
__btree_submit_bio_done);
}
+#ifdef CONFIG_MIGRATION
+static int btree_migratepage(struct address_space *mapping,
+ struct page *newpage, struct page *page)
+{
+ /*
+ * we can't safely write a btree page from here,
+ * we haven't done the locking hook
+ */
+ if (PageDirty(page))
+ return -EAGAIN;
+ /*
+ * Buffers may be managed in a filesystem specific way.
+ * We must have no buffers or drop them.
+ */
+ if (page_has_private(page) &&
+ !try_to_release_page(page, GFP_KERNEL))
+ return -EAGAIN;
+ return migrate_page(mapping, newpage, page);
+}
+#endif
+
static int btree_writepage(struct page *page, struct writeback_control *wbc)
{
struct extent_io_tree *tree;
@@ -707,8 +731,7 @@ static int btree_writepage(struct page *page, struct writeback_control *wbc)
}
redirty_page_for_writepage(wbc, page);
- eb = btrfs_find_tree_block(root, page_offset(page),
- PAGE_CACHE_SIZE);
+ eb = btrfs_find_tree_block(root, page_offset(page), PAGE_CACHE_SIZE);
WARN_ON(!eb);
was_dirty = test_and_set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags);
@@ -799,6 +822,9 @@ static const struct address_space_operations btree_aops = {
.releasepage = btree_releasepage,
.invalidatepage = btree_invalidatepage,
.sync_page = block_sync_page,
+#ifdef CONFIG_MIGRATION
+ .migratepage = btree_migratepage,
+#endif
};
int readahead_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize,
@@ -981,7 +1007,10 @@ static int find_and_setup_root(struct btrfs_root *tree_root,
blocksize = btrfs_level_size(root, btrfs_root_level(&root->root_item));
root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item),
blocksize, generation);
- BUG_ON(!root->node);
+ if (!root->node || !btrfs_buffer_uptodate(root->node, generation)) {
+ free_extent_buffer(root->node);
+ return -EIO;
+ }
root->commit_root = btrfs_root_node(root);
return 0;
}
@@ -1538,10 +1567,8 @@ struct btrfs_root *open_ctree(struct super_block *sb,
GFP_NOFS);
struct btrfs_root *csum_root = kzalloc(sizeof(struct btrfs_root),
GFP_NOFS);
- struct btrfs_root *tree_root = kzalloc(sizeof(struct btrfs_root),
- GFP_NOFS);
- struct btrfs_fs_info *fs_info = kzalloc(sizeof(*fs_info),
- GFP_NOFS);
+ struct btrfs_root *tree_root = btrfs_sb(sb);
+ struct btrfs_fs_info *fs_info = tree_root->fs_info;
struct btrfs_root *chunk_root = kzalloc(sizeof(struct btrfs_root),
GFP_NOFS);
struct btrfs_root *dev_root = kzalloc(sizeof(struct btrfs_root),
diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c
index 951ef09b82f..659f532d26a 100644
--- a/fs/btrfs/export.c
+++ b/fs/btrfs/export.c
@@ -166,7 +166,7 @@ static struct dentry *btrfs_fh_to_dentry(struct super_block *sb, struct fid *fh,
static struct dentry *btrfs_get_parent(struct dentry *child)
{
struct inode *dir = child->d_inode;
- static struct dentry *dentry;
+ struct dentry *dentry;
struct btrfs_root *root = BTRFS_I(dir)->root;
struct btrfs_path *path;
struct extent_buffer *leaf;
@@ -232,9 +232,85 @@ fail:
return ERR_PTR(ret);
}
+static int btrfs_get_name(struct dentry *parent, char *name,
+ struct dentry *child)
+{
+ struct inode *inode = child->d_inode;
+ struct inode *dir = parent->d_inode;
+ struct btrfs_path *path;
+ struct btrfs_root *root = BTRFS_I(dir)->root;
+ struct btrfs_inode_ref *iref;
+ struct btrfs_root_ref *rref;
+ struct extent_buffer *leaf;
+ unsigned long name_ptr;
+ struct btrfs_key key;
+ int name_len;
+ int ret;
+
+ if (!dir || !inode)
+ return -EINVAL;
+
+ if (!S_ISDIR(dir->i_mode))
+ return -EINVAL;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+ path->leave_spinning = 1;
+
+ if (inode->i_ino == BTRFS_FIRST_FREE_OBJECTID) {
+ key.objectid = BTRFS_I(inode)->root->root_key.objectid;
+ key.type = BTRFS_ROOT_BACKREF_KEY;
+ key.offset = (u64)-1;
+ root = root->fs_info->tree_root;
+ } else {
+ key.objectid = inode->i_ino;
+ key.offset = dir->i_ino;
+ key.type = BTRFS_INODE_REF_KEY;
+ }
+
+ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+ if (ret < 0) {
+ btrfs_free_path(path);
+ return ret;
+ } else if (ret > 0) {
+ if (inode->i_ino == BTRFS_FIRST_FREE_OBJECTID) {
+ path->slots[0]--;
+ } else {
+ btrfs_free_path(path);
+ return -ENOENT;
+ }
+ }
+ leaf = path->nodes[0];
+
+ if (inode->i_ino == BTRFS_FIRST_FREE_OBJECTID) {
+ rref = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_root_ref);
+ name_ptr = (unsigned long)(rref + 1);
+ name_len = btrfs_root_ref_name_len(leaf, rref);
+ } else {
+ iref = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_inode_ref);
+ name_ptr = (unsigned long)(iref + 1);
+ name_len = btrfs_inode_ref_name_len(leaf, iref);
+ }
+
+ read_extent_buffer(leaf, name, name_ptr, name_len);
+ btrfs_free_path(path);
+
+ /*
+ * have to add the null termination to make sure that reconnect_path
+ * gets the right len for strlen
+ */
+ name[name_len] = '\0';
+
+ return 0;
+}
+
const struct export_operations btrfs_export_ops = {
.encode_fh = btrfs_encode_fh,
.fh_to_dentry = btrfs_fh_to_dentry,
.fh_to_parent = btrfs_fh_to_parent,
.get_parent = btrfs_get_parent,
+ .get_name = btrfs_get_name,
};
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 0c097f3aec4..227e5815d83 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -429,6 +429,7 @@ err:
static int cache_block_group(struct btrfs_block_group_cache *cache,
struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
int load_cache_only)
{
struct btrfs_fs_info *fs_info = cache->fs_info;
@@ -442,9 +443,12 @@ static int cache_block_group(struct btrfs_block_group_cache *cache,
/*
* We can't do the read from on-disk cache during a commit since we need
- * to have the normal tree locking.
+ * to have the normal tree locking. Also if we are currently trying to
+ * allocate blocks for the tree root we can't do the fast caching since
+ * we likely hold important locks.
*/
- if (!trans->transaction->in_commit) {
+ if (!trans->transaction->in_commit &&
+ (root && root != root->fs_info->tree_root)) {
spin_lock(&cache->lock);
if (cache->cached != BTRFS_CACHE_NO) {
spin_unlock(&cache->lock);
@@ -2741,6 +2745,7 @@ static int cache_save_setup(struct btrfs_block_group_cache *block_group,
struct btrfs_root *root = block_group->fs_info->tree_root;
struct inode *inode = NULL;
u64 alloc_hint = 0;
+ int dcs = BTRFS_DC_ERROR;
int num_pages = 0;
int retries = 0;
int ret = 0;
@@ -2795,6 +2800,8 @@ again:
spin_lock(&block_group->lock);
if (block_group->cached != BTRFS_CACHE_FINISHED) {
+ /* We're not cached, don't bother trying to write stuff out */
+ dcs = BTRFS_DC_WRITTEN;
spin_unlock(&block_group->lock);
goto out_put;
}
@@ -2821,6 +2828,8 @@ again:
ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, num_pages,
num_pages, num_pages,
&alloc_hint);
+ if (!ret)
+ dcs = BTRFS_DC_SETUP;
btrfs_free_reserved_data_space(inode, num_pages);
out_put:
iput(inode);
@@ -2828,10 +2837,7 @@ out_free:
btrfs_release_path(root, path);
out:
spin_lock(&block_group->lock);
- if (ret)
- block_group->disk_cache_state = BTRFS_DC_ERROR;
- else
- block_group->disk_cache_state = BTRFS_DC_SETUP;
+ block_group->disk_cache_state = dcs;
spin_unlock(&block_group->lock);
return ret;
@@ -3037,7 +3043,13 @@ static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags)
{
- u64 num_devices = root->fs_info->fs_devices->rw_devices;
+ /*
+ * we add in the count of missing devices because we want
+ * to make sure that any RAID levels on a degraded FS
+ * continue to be honored.
+ */
+ u64 num_devices = root->fs_info->fs_devices->rw_devices +
+ root->fs_info->fs_devices->missing_devices;
if (num_devices == 1)
flags &= ~(BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID0);
@@ -3412,7 +3424,7 @@ again:
* our reservation.
*/
if (unused <= space_info->total_bytes) {
- unused -= space_info->total_bytes;
+ unused = space_info->total_bytes - unused;
if (unused >= num_bytes) {
if (!reserved)
space_info->bytes_reserved += orig_bytes;
@@ -4080,7 +4092,7 @@ static int update_block_group(struct btrfs_trans_handle *trans,
* space back to the block group, otherwise we will leak space.
*/
if (!alloc && cache->cached == BTRFS_CACHE_NO)
- cache_block_group(cache, trans, 1);
+ cache_block_group(cache, trans, NULL, 1);
byte_in_group = bytenr - cache->key.objectid;
WARN_ON(byte_in_group > cache->key.offset);
@@ -4930,11 +4942,31 @@ search:
btrfs_get_block_group(block_group);
search_start = block_group->key.objectid;
+ /*
+ * this can happen if we end up cycling through all the
+ * raid types, but we want to make sure we only allocate
+ * for the proper type.
+ */
+ if (!block_group_bits(block_group, data)) {
+ u64 extra = BTRFS_BLOCK_GROUP_DUP |
+ BTRFS_BLOCK_GROUP_RAID1 |
+ BTRFS_BLOCK_GROUP_RAID10;
+
+ /*
+ * if they asked for extra copies and this block group
+ * doesn't provide them, bail. This does allow us to
+ * fill raid0 from raid1.
+ */
+ if ((data & extra) && !(block_group->flags & extra))
+ goto loop;
+ }
+
have_block_group:
if (unlikely(block_group->cached == BTRFS_CACHE_NO)) {
u64 free_percent;
- ret = cache_block_group(block_group, trans, 1);
+ ret = cache_block_group(block_group, trans,
+ orig_root, 1);
if (block_group->cached == BTRFS_CACHE_FINISHED)
goto have_block_group;
@@ -4958,7 +4990,8 @@ have_block_group:
if (loop > LOOP_CACHING_NOWAIT ||
(loop > LOOP_FIND_IDEAL &&
atomic_read(&space_info->caching_threads) < 2)) {
- ret = cache_block_group(block_group, trans, 0);
+ ret = cache_block_group(block_group, trans,
+ orig_root, 0);
BUG_ON(ret);
}
found_uncached_bg = true;
@@ -5515,7 +5548,7 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
u64 num_bytes = ins->offset;
block_group = btrfs_lookup_block_group(root->fs_info, ins->objectid);
- cache_block_group(block_group, trans, 0);
+ cache_block_group(block_group, trans, NULL, 0);
caching_ctl = get_caching_control(block_group);
if (!caching_ctl) {
@@ -6300,9 +6333,13 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
NULL, NULL);
BUG_ON(ret < 0);
if (ret > 0) {
- ret = btrfs_del_orphan_item(trans, tree_root,
- root->root_key.objectid);
- BUG_ON(ret);
+ /* if we fail to delete the orphan item this time
+ * around, it'll get picked up the next time.
+ *
+ * The most common failure here is just -ENOENT.
+ */
+ btrfs_del_orphan_item(trans, tree_root,
+ root->root_key.objectid);
}
}
@@ -7878,7 +7915,14 @@ static u64 update_block_group_flags(struct btrfs_root *root, u64 flags)
u64 stripped = BTRFS_BLOCK_GROUP_RAID0 |
BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10;
- num_devices = root->fs_info->fs_devices->rw_devices;
+ /*
+ * we add in the count of missing devices because we want
+ * to make sure that any RAID levels on a degraded FS
+ * continue to be honored.
+ */
+ num_devices = root->fs_info->fs_devices->rw_devices +
+ root->fs_info->fs_devices->missing_devices;
+
if (num_devices == 1) {
stripped |= BTRFS_BLOCK_GROUP_DUP;
stripped = flags & ~stripped;
@@ -8247,7 +8291,6 @@ int btrfs_read_block_groups(struct btrfs_root *root)
break;
if (ret != 0)
goto error;
-
leaf = path->nodes[0];
btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
cache = kzalloc(sizeof(*cache), GFP_NOFS);
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index eac10e3260a..3e86b9f3650 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -1828,9 +1828,9 @@ static void end_bio_extent_preparewrite(struct bio *bio, int err)
bio_put(bio);
}
-static struct bio *
-extent_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs,
- gfp_t gfp_flags)
+struct bio *
+btrfs_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs,
+ gfp_t gfp_flags)
{
struct bio *bio;
@@ -1919,7 +1919,7 @@ static int submit_extent_page(int rw, struct extent_io_tree *tree,
else
nr = bio_get_nr_vecs(bdev);
- bio = extent_bio_alloc(bdev, sector, nr, GFP_NOFS | __GFP_HIGH);
+ bio = btrfs_bio_alloc(bdev, sector, nr, GFP_NOFS | __GFP_HIGH);
bio_add_page(bio, page, page_size, offset);
bio->bi_end_io = end_io_func;
@@ -2901,21 +2901,53 @@ out:
int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
__u64 start, __u64 len, get_extent_t *get_extent)
{
- int ret;
+ int ret = 0;
u64 off = start;
u64 max = start + len;
u32 flags = 0;
+ u32 found_type;
+ u64 last;
u64 disko = 0;
+ struct btrfs_key found_key;
struct extent_map *em = NULL;
struct extent_state *cached_state = NULL;
+ struct btrfs_path *path;
+ struct btrfs_file_extent_item *item;
int end = 0;
u64 em_start = 0, em_len = 0;
unsigned long emflags;
- ret = 0;
+ int hole = 0;
if (len == 0)
return -EINVAL;
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+ path->leave_spinning = 1;
+
+ ret = btrfs_lookup_file_extent(NULL, BTRFS_I(inode)->root,
+ path, inode->i_ino, -1, 0);
+ if (ret < 0) {
+ btrfs_free_path(path);
+ return ret;
+ }
+ WARN_ON(!ret);
+ path->slots[0]--;
+ item = btrfs_item_ptr(path->nodes[0], path->slots[0],
+ struct btrfs_file_extent_item);
+ btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]);
+ found_type = btrfs_key_type(&found_key);
+
+ /* No extents, just return */
+ if (found_key.objectid != inode->i_ino ||
+ found_type != BTRFS_EXTENT_DATA_KEY) {
+ btrfs_free_path(path);
+ return 0;
+ }
+ last = found_key.offset;
+ btrfs_free_path(path);
+
lock_extent_bits(&BTRFS_I(inode)->io_tree, start, start + len, 0,
&cached_state, GFP_NOFS);
em = get_extent(inode, NULL, 0, off, max - off, 0);
@@ -2925,11 +2957,18 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
ret = PTR_ERR(em);
goto out;
}
+
while (!end) {
+ hole = 0;
off = em->start + em->len;
if (off >= max)
end = 1;
+ if (em->block_start == EXTENT_MAP_HOLE) {
+ hole = 1;
+ goto next;
+ }
+
em_start = em->start;
em_len = em->len;
@@ -2939,8 +2978,6 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
if (em->block_start == EXTENT_MAP_LAST_BYTE) {
end = 1;
flags |= FIEMAP_EXTENT_LAST;
- } else if (em->block_start == EXTENT_MAP_HOLE) {
- flags |= FIEMAP_EXTENT_UNWRITTEN;
} else if (em->block_start == EXTENT_MAP_INLINE) {
flags |= (FIEMAP_EXTENT_DATA_INLINE |
FIEMAP_EXTENT_NOT_ALIGNED);
@@ -2953,10 +2990,10 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
flags |= FIEMAP_EXTENT_ENCODED;
+next:
emflags = em->flags;
free_extent_map(em);
em = NULL;
-
if (!end) {
em = get_extent(inode, NULL, 0, off, max - off, 0);
if (!em)
@@ -2967,15 +3004,23 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
}
emflags = em->flags;
}
+
if (test_bit(EXTENT_FLAG_VACANCY, &emflags)) {
flags |= FIEMAP_EXTENT_LAST;
end = 1;
}
- ret = fiemap_fill_next_extent(fieinfo, em_start, disko,
- em_len, flags);
- if (ret)
- goto out_free;
+ if (em_start == last) {
+ flags |= FIEMAP_EXTENT_LAST;
+ end = 1;
+ }
+
+ if (!hole) {
+ ret = fiemap_fill_next_extent(fieinfo, em_start, disko,
+ em_len, flags);
+ if (ret)
+ goto out_free;
+ }
}
out_free:
free_extent_map(em);
@@ -3836,8 +3881,10 @@ int try_release_extent_buffer(struct extent_io_tree *tree, struct page *page)
spin_lock(&tree->buffer_lock);
eb = radix_tree_lookup(&tree->buffer, start >> PAGE_CACHE_SHIFT);
- if (!eb)
- goto out;
+ if (!eb) {
+ spin_unlock(&tree->buffer_lock);
+ return ret;
+ }
if (test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) {
ret = 0;
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 1c6d4f342ef..4183c8178f0 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -310,4 +310,7 @@ int extent_clear_unlock_delalloc(struct inode *inode,
struct extent_io_tree *tree,
u64 start, u64 end, struct page *locked_page,
unsigned long op);
+struct bio *
+btrfs_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs,
+ gfp_t gfp_flags);
#endif
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index e354c33df08..66836d85763 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -48,30 +48,34 @@ static noinline int btrfs_copy_from_user(loff_t pos, int num_pages,
struct page **prepared_pages,
struct iov_iter *i)
{
- size_t copied;
+ size_t copied = 0;
int pg = 0;
int offset = pos & (PAGE_CACHE_SIZE - 1);
+ int total_copied = 0;
while (write_bytes > 0) {
size_t count = min_t(size_t,
PAGE_CACHE_SIZE - offset, write_bytes);
struct page *page = prepared_pages[pg];
-again:
- if (unlikely(iov_iter_fault_in_readable(i, count)))
- return -EFAULT;
-
- /* Copy data from userspace to the current page */
- copied = iov_iter_copy_from_user(page, i, offset, count);
+ /*
+ * Copy data from userspace to the current page
+ *
+ * Disable pagefault to avoid recursive lock since
+ * the pages are already locked
+ */
+ pagefault_disable();
+ copied = iov_iter_copy_from_user_atomic(page, i, offset, count);
+ pagefault_enable();
/* Flush processor's dcache for this page */
flush_dcache_page(page);
iov_iter_advance(i, copied);
write_bytes -= copied;
+ total_copied += copied;
+ /* Return to btrfs_file_aio_write to fault page */
if (unlikely(copied == 0)) {
- count = min_t(size_t, PAGE_CACHE_SIZE - offset,
- iov_iter_single_seg_count(i));
- goto again;
+ break;
}
if (unlikely(copied < PAGE_CACHE_SIZE - offset)) {
@@ -81,7 +85,7 @@ again:
offset = 0;
}
}
- return 0;
+ return total_copied;
}
/*
@@ -854,6 +858,8 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
unsigned long last_index;
int will_write;
int buffered = 0;
+ int copied = 0;
+ int dirty_pages = 0;
will_write = ((file->f_flags & O_DSYNC) || IS_SYNC(inode) ||
(file->f_flags & O_DIRECT));
@@ -970,7 +976,17 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
WARN_ON(num_pages > nrptrs);
memset(pages, 0, sizeof(struct page *) * nrptrs);
- ret = btrfs_delalloc_reserve_space(inode, write_bytes);
+ /*
+ * Fault pages before locking them in prepare_pages
+ * to avoid recursive lock
+ */
+ if (unlikely(iov_iter_fault_in_readable(&i, write_bytes))) {
+ ret = -EFAULT;
+ goto out;
+ }
+
+ ret = btrfs_delalloc_reserve_space(inode,
+ num_pages << PAGE_CACHE_SHIFT);
if (ret)
goto out;
@@ -978,37 +994,49 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
pos, first_index, last_index,
write_bytes);
if (ret) {
- btrfs_delalloc_release_space(inode, write_bytes);
+ btrfs_delalloc_release_space(inode,
+ num_pages << PAGE_CACHE_SHIFT);
goto out;
}
- ret = btrfs_copy_from_user(pos, num_pages,
+ copied = btrfs_copy_from_user(pos, num_pages,
write_bytes, pages, &i);
- if (ret == 0) {
+ dirty_pages = (copied + PAGE_CACHE_SIZE - 1) >>
+ PAGE_CACHE_SHIFT;
+
+ if (num_pages > dirty_pages) {
+ if (copied > 0)
+ atomic_inc(
+ &BTRFS_I(inode)->outstanding_extents);
+ btrfs_delalloc_release_space(inode,
+ (num_pages - dirty_pages) <<
+ PAGE_CACHE_SHIFT);
+ }
+
+ if (copied > 0) {
dirty_and_release_pages(NULL, root, file, pages,
- num_pages, pos, write_bytes);
+ dirty_pages, pos, copied);
}
btrfs_drop_pages(pages, num_pages);
- if (ret) {
- btrfs_delalloc_release_space(inode, write_bytes);
- goto out;
- }
- if (will_write) {
- filemap_fdatawrite_range(inode->i_mapping, pos,
- pos + write_bytes - 1);
- } else {
- balance_dirty_pages_ratelimited_nr(inode->i_mapping,
- num_pages);
- if (num_pages <
- (root->leafsize >> PAGE_CACHE_SHIFT) + 1)
- btrfs_btree_balance_dirty(root, 1);
- btrfs_throttle(root);
+ if (copied > 0) {
+ if (will_write) {
+ filemap_fdatawrite_range(inode->i_mapping, pos,
+ pos + copied - 1);
+ } else {
+ balance_dirty_pages_ratelimited_nr(
+ inode->i_mapping,
+ dirty_pages);
+ if (dirty_pages <
+ (root->leafsize >> PAGE_CACHE_SHIFT) + 1)
+ btrfs_btree_balance_dirty(root, 1);
+ btrfs_throttle(root);
+ }
}
- pos += write_bytes;
- num_written += write_bytes;
+ pos += copied;
+ num_written += copied;
cond_resched();
}
@@ -1047,8 +1075,14 @@ out:
if ((file->f_flags & O_DSYNC) || IS_SYNC(inode)) {
trans = btrfs_start_transaction(root, 0);
+ if (IS_ERR(trans)) {
+ num_written = PTR_ERR(trans);
+ goto done;
+ }
+ mutex_lock(&inode->i_mutex);
ret = btrfs_log_dentry_safe(trans, root,
file->f_dentry);
+ mutex_unlock(&inode->i_mutex);
if (ret == 0) {
ret = btrfs_sync_log(trans, root);
if (ret == 0)
@@ -1067,6 +1101,7 @@ out:
(start_pos + num_written - 1) >> PAGE_CACHE_SHIFT);
}
}
+done:
current->backing_dev_info = NULL;
return num_written ? num_written : err;
}
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 22ee0dc2e6b..60d68426695 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -290,7 +290,7 @@ int load_free_space_cache(struct btrfs_fs_info *fs_info,
(unsigned long long)BTRFS_I(inode)->generation,
(unsigned long long)generation,
(unsigned long long)block_group->key.objectid);
- goto out;
+ goto free_cache;
}
if (!num_entries)
@@ -524,6 +524,12 @@ int btrfs_write_out_cache(struct btrfs_root *root,
return 0;
}
+ node = rb_first(&block_group->free_space_offset);
+ if (!node) {
+ iput(inode);
+ return 0;
+ }
+
last_index = (i_size_read(inode) - 1) >> PAGE_CACHE_SHIFT;
filemap_write_and_wait(inode->i_mapping);
btrfs_wait_ordered_range(inode, inode->i_size &
@@ -543,10 +549,6 @@ int btrfs_write_out_cache(struct btrfs_root *root,
*/
first_page_offset = (sizeof(u32) * num_checksums) + sizeof(u64);
- node = rb_first(&block_group->free_space_offset);
- if (!node)
- goto out_free;
-
/*
* Lock all pages first so we can lock the extent safely.
*
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 558cac2dfa5..72f31ecb5c9 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -495,7 +495,7 @@ again:
add_async_extent(async_cow, start, num_bytes,
total_compressed, pages, nr_pages_ret);
- if (start + num_bytes < end && start + num_bytes < actual_end) {
+ if (start + num_bytes < end) {
start += num_bytes;
pages = NULL;
cond_resched();
@@ -4501,6 +4501,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
BTRFS_I(inode)->index_cnt = 2;
BTRFS_I(inode)->root = root;
BTRFS_I(inode)->generation = trans->transid;
+ inode->i_generation = BTRFS_I(inode)->generation;
btrfs_set_inode_space_info(root, inode);
if (mode & S_IFDIR)
@@ -4622,12 +4623,12 @@ int btrfs_add_link(struct btrfs_trans_handle *trans,
}
static int btrfs_add_nondir(struct btrfs_trans_handle *trans,
- struct dentry *dentry, struct inode *inode,
- int backref, u64 index)
+ struct inode *dir, struct dentry *dentry,
+ struct inode *inode, int backref, u64 index)
{
- int err = btrfs_add_link(trans, dentry->d_parent->d_inode,
- inode, dentry->d_name.name,
- dentry->d_name.len, backref, index);
+ int err = btrfs_add_link(trans, dir, inode,
+ dentry->d_name.name, dentry->d_name.len,
+ backref, index);
if (!err) {
d_instantiate(dentry, inode);
return 0;
@@ -4668,8 +4669,7 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
btrfs_set_trans_block_group(trans, dir);
inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
- dentry->d_name.len,
- dentry->d_parent->d_inode->i_ino, objectid,
+ dentry->d_name.len, dir->i_ino, objectid,
BTRFS_I(dir)->block_group, mode, &index);
err = PTR_ERR(inode);
if (IS_ERR(inode))
@@ -4682,7 +4682,7 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
}
btrfs_set_trans_block_group(trans, inode);
- err = btrfs_add_nondir(trans, dentry, inode, 0, index);
+ err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index);
if (err)
drop_inode = 1;
else {
@@ -4730,10 +4730,8 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
btrfs_set_trans_block_group(trans, dir);
inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
- dentry->d_name.len,
- dentry->d_parent->d_inode->i_ino,
- objectid, BTRFS_I(dir)->block_group, mode,
- &index);
+ dentry->d_name.len, dir->i_ino, objectid,
+ BTRFS_I(dir)->block_group, mode, &index);
err = PTR_ERR(inode);
if (IS_ERR(inode))
goto out_unlock;
@@ -4745,7 +4743,7 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
}
btrfs_set_trans_block_group(trans, inode);
- err = btrfs_add_nondir(trans, dentry, inode, 0, index);
+ err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index);
if (err)
drop_inode = 1;
else {
@@ -4787,6 +4785,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
return -EPERM;
btrfs_inc_nlink(inode);
+ inode->i_ctime = CURRENT_TIME;
err = btrfs_set_inode_index(dir, &index);
if (err)
@@ -4805,15 +4804,17 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
btrfs_set_trans_block_group(trans, dir);
ihold(inode);
- err = btrfs_add_nondir(trans, dentry, inode, 1, index);
+ err = btrfs_add_nondir(trans, dir, dentry, inode, 1, index);
if (err) {
drop_inode = 1;
} else {
+ struct dentry *parent = dget_parent(dentry);
btrfs_update_inode_block_group(trans, dir);
err = btrfs_update_inode(trans, root, inode);
BUG_ON(err);
- btrfs_log_new_name(trans, inode, NULL, dentry->d_parent);
+ btrfs_log_new_name(trans, inode, NULL, parent);
+ dput(parent);
}
nr = trans->blocks_used;
@@ -4853,8 +4854,7 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
btrfs_set_trans_block_group(trans, dir);
inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
- dentry->d_name.len,
- dentry->d_parent->d_inode->i_ino, objectid,
+ dentry->d_name.len, dir->i_ino, objectid,
BTRFS_I(dir)->block_group, S_IFDIR | mode,
&index);
if (IS_ERR(inode)) {
@@ -4877,9 +4877,8 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
if (err)
goto out_fail;
- err = btrfs_add_link(trans, dentry->d_parent->d_inode,
- inode, dentry->d_name.name,
- dentry->d_name.len, 0, index);
+ err = btrfs_add_link(trans, dir, inode, dentry->d_name.name,
+ dentry->d_name.len, 0, index);
if (err)
goto out_fail;
@@ -5535,13 +5534,21 @@ struct btrfs_dio_private {
u64 bytes;
u32 *csums;
void *private;
+
+ /* number of bios pending for this dio */
+ atomic_t pending_bios;
+
+ /* IO errors */
+ int errors;
+
+ struct bio *orig_bio;
};
static void btrfs_endio_direct_read(struct bio *bio, int err)
{
+ struct btrfs_dio_private *dip = bio->bi_private;
struct bio_vec *bvec_end = bio->bi_io_vec + bio->bi_vcnt - 1;
struct bio_vec *bvec = bio->bi_io_vec;
- struct btrfs_dio_private *dip = bio->bi_private;
struct inode *inode = dip->inode;
struct btrfs_root *root = BTRFS_I(inode)->root;
u64 start;
@@ -5595,15 +5602,18 @@ static void btrfs_endio_direct_write(struct bio *bio, int err)
struct btrfs_trans_handle *trans;
struct btrfs_ordered_extent *ordered = NULL;
struct extent_state *cached_state = NULL;
+ u64 ordered_offset = dip->logical_offset;
+ u64 ordered_bytes = dip->bytes;
int ret;
if (err)
goto out_done;
-
- ret = btrfs_dec_test_ordered_pending(inode, &ordered,
- dip->logical_offset, dip->bytes);
+again:
+ ret = btrfs_dec_test_first_ordered_pending(inode, &ordered,
+ &ordered_offset,
+ ordered_bytes);
if (!ret)
- goto out_done;
+ goto out_test;
BUG_ON(!ordered);
@@ -5663,8 +5673,20 @@ out_unlock:
out:
btrfs_delalloc_release_metadata(inode, ordered->len);
btrfs_end_transaction(trans, root);
+ ordered_offset = ordered->file_offset + ordered->len;
btrfs_put_ordered_extent(ordered);
btrfs_put_ordered_extent(ordered);
+
+out_test:
+ /*
+ * our bio might span multiple ordered extents. If we haven't
+ * completed the accounting for the whole dio, go back and try again
+ */
+ if (ordered_offset < dip->logical_offset + dip->bytes) {
+ ordered_bytes = dip->logical_offset + dip->bytes -
+ ordered_offset;
+ goto again;
+ }
out_done:
bio->bi_private = dip->private;
@@ -5684,6 +5706,176 @@ static int __btrfs_submit_bio_start_direct_io(struct inode *inode, int rw,
return 0;
}
+static void btrfs_end_dio_bio(struct bio *bio, int err)
+{
+ struct btrfs_dio_private *dip = bio->bi_private;
+
+ if (err) {
+ printk(KERN_ERR "btrfs direct IO failed ino %lu rw %lu "
+ "sector %#Lx len %u err no %d\n",
+ dip->inode->i_ino, bio->bi_rw,
+ (unsigned long long)bio->bi_sector, bio->bi_size, err);
+ dip->errors = 1;
+
+ /*
+ * before atomic variable goto zero, we must make sure
+ * dip->errors is perceived to be set.
+ */
+ smp_mb__before_atomic_dec();
+ }
+
+ /* if there are more bios still pending for this dio, just exit */
+ if (!atomic_dec_and_test(&dip->pending_bios))
+ goto out;
+
+ if (dip->errors)
+ bio_io_error(dip->orig_bio);
+ else {
+ set_bit(BIO_UPTODATE, &dip->orig_bio->bi_flags);
+ bio_endio(dip->orig_bio, 0);
+ }
+out:
+ bio_put(bio);
+}
+
+static struct bio *btrfs_dio_bio_alloc(struct block_device *bdev,
+ u64 first_sector, gfp_t gfp_flags)
+{
+ int nr_vecs = bio_get_nr_vecs(bdev);
+ return btrfs_bio_alloc(bdev, first_sector, nr_vecs, gfp_flags);
+}
+
+static inline int __btrfs_submit_dio_bio(struct bio *bio, struct inode *inode,
+ int rw, u64 file_offset, int skip_sum,
+ u32 *csums)
+{
+ int write = rw & REQ_WRITE;
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ int ret;
+
+ bio_get(bio);
+ ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0);
+ if (ret)
+ goto err;
+
+ if (write && !skip_sum) {
+ ret = btrfs_wq_submit_bio(root->fs_info,
+ inode, rw, bio, 0, 0,
+ file_offset,
+ __btrfs_submit_bio_start_direct_io,
+ __btrfs_submit_bio_done);
+ goto err;
+ } else if (!skip_sum)
+ btrfs_lookup_bio_sums_dio(root, inode, bio,
+ file_offset, csums);
+
+ ret = btrfs_map_bio(root, rw, bio, 0, 1);
+err:
+ bio_put(bio);
+ return ret;
+}
+
+static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip,
+ int skip_sum)
+{
+ struct inode *inode = dip->inode;
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree;
+ struct bio *bio;
+ struct bio *orig_bio = dip->orig_bio;
+ struct bio_vec *bvec = orig_bio->bi_io_vec;
+ u64 start_sector = orig_bio->bi_sector;
+ u64 file_offset = dip->logical_offset;
+ u64 submit_len = 0;
+ u64 map_length;
+ int nr_pages = 0;
+ u32 *csums = dip->csums;
+ int ret = 0;
+
+ bio = btrfs_dio_bio_alloc(orig_bio->bi_bdev, start_sector, GFP_NOFS);
+ if (!bio)
+ return -ENOMEM;
+ bio->bi_private = dip;
+ bio->bi_end_io = btrfs_end_dio_bio;
+ atomic_inc(&dip->pending_bios);
+
+ map_length = orig_bio->bi_size;
+ ret = btrfs_map_block(map_tree, READ, start_sector << 9,
+ &map_length, NULL, 0);
+ if (ret) {
+ bio_put(bio);
+ return -EIO;
+ }
+
+ while (bvec <= (orig_bio->bi_io_vec + orig_bio->bi_vcnt - 1)) {
+ if (unlikely(map_length < submit_len + bvec->bv_len ||
+ bio_add_page(bio, bvec->bv_page, bvec->bv_len,
+ bvec->bv_offset) < bvec->bv_len)) {
+ /*
+ * inc the count before we submit the bio so
+ * we know the end IO handler won't happen before
+ * we inc the count. Otherwise, the dip might get freed
+ * before we're done setting it up
+ */
+ atomic_inc(&dip->pending_bios);
+ ret = __btrfs_submit_dio_bio(bio, inode, rw,
+ file_offset, skip_sum,
+ csums);
+ if (ret) {
+ bio_put(bio);
+ atomic_dec(&dip->pending_bios);
+ goto out_err;
+ }
+
+ if (!skip_sum)
+ csums = csums + nr_pages;
+ start_sector += submit_len >> 9;
+ file_offset += submit_len;
+
+ submit_len = 0;
+ nr_pages = 0;
+
+ bio = btrfs_dio_bio_alloc(orig_bio->bi_bdev,
+ start_sector, GFP_NOFS);
+ if (!bio)
+ goto out_err;
+ bio->bi_private = dip;
+ bio->bi_end_io = btrfs_end_dio_bio;
+
+ map_length = orig_bio->bi_size;
+ ret = btrfs_map_block(map_tree, READ, start_sector << 9,
+ &map_length, NULL, 0);
+ if (ret) {
+ bio_put(bio);
+ goto out_err;
+ }
+ } else {
+ submit_len += bvec->bv_len;
+ nr_pages ++;
+ bvec++;
+ }
+ }
+
+ ret = __btrfs_submit_dio_bio(bio, inode, rw, file_offset, skip_sum,
+ csums);
+ if (!ret)
+ return 0;
+
+ bio_put(bio);
+out_err:
+ dip->errors = 1;
+ /*
+ * before atomic variable goto zero, we must
+ * make sure dip->errors is perceived to be set.
+ */
+ smp_mb__before_atomic_dec();
+ if (atomic_dec_and_test(&dip->pending_bios))
+ bio_io_error(dip->orig_bio);
+
+ /* bio_end_io() will handle error, so we needn't return it */
+ return 0;
+}
+
static void btrfs_submit_direct(int rw, struct bio *bio, struct inode *inode,
loff_t file_offset)
{
@@ -5723,36 +5915,18 @@ static void btrfs_submit_direct(int rw, struct bio *bio, struct inode *inode,
dip->disk_bytenr = (u64)bio->bi_sector << 9;
bio->bi_private = dip;
+ dip->errors = 0;
+ dip->orig_bio = bio;
+ atomic_set(&dip->pending_bios, 0);
if (write)
bio->bi_end_io = btrfs_endio_direct_write;
else
bio->bi_end_io = btrfs_endio_direct_read;
- ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0);
- if (ret)
- goto out_err;
-
- if (write && !skip_sum) {
- ret = btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info,
- inode, rw, bio, 0, 0,
- dip->logical_offset,
- __btrfs_submit_bio_start_direct_io,
- __btrfs_submit_bio_done);
- if (ret)
- goto out_err;
+ ret = btrfs_submit_direct_hook(rw, dip, skip_sum);
+ if (!ret)
return;
- } else if (!skip_sum)
- btrfs_lookup_bio_sums_dio(root, inode, bio,
- dip->logical_offset, dip->csums);
-
- ret = btrfs_map_bio(root, rw, bio, 0, 1);
- if (ret)
- goto out_err;
- return;
-out_err:
- kfree(dip->csums);
- kfree(dip);
free_ordered:
/*
* If this is a write, we need to clean up the reserved space and kill
@@ -5760,8 +5934,7 @@ free_ordered:
*/
if (write) {
struct btrfs_ordered_extent *ordered;
- ordered = btrfs_lookup_ordered_extent(inode,
- dip->logical_offset);
+ ordered = btrfs_lookup_ordered_extent(inode, file_offset);
if (!test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags) &&
!test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags))
btrfs_free_reserved_extent(root, ordered->start,
@@ -6607,8 +6780,9 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
BUG_ON(ret);
if (old_inode->i_ino != BTRFS_FIRST_FREE_OBJECTID) {
- btrfs_log_new_name(trans, old_inode, old_dir,
- new_dentry->d_parent);
+ struct dentry *parent = dget_parent(new_dentry);
+ btrfs_log_new_name(trans, old_inode, old_dir, parent);
+ dput(parent);
btrfs_end_log_trans(root);
}
out_fail:
@@ -6758,8 +6932,7 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
btrfs_set_trans_block_group(trans, dir);
inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
- dentry->d_name.len,
- dentry->d_parent->d_inode->i_ino, objectid,
+ dentry->d_name.len, dir->i_ino, objectid,
BTRFS_I(dir)->block_group, S_IFLNK|S_IRWXUGO,
&index);
err = PTR_ERR(inode);
@@ -6773,7 +6946,7 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
}
btrfs_set_trans_block_group(trans, inode);
- err = btrfs_add_nondir(trans, dentry, inode, 0, index);
+ err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index);
if (err)
drop_inode = 1;
else {
@@ -6844,6 +7017,7 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
struct btrfs_root *root = BTRFS_I(inode)->root;
struct btrfs_key ins;
u64 cur_offset = start;
+ u64 i_size;
int ret = 0;
bool own_trans = true;
@@ -6885,11 +7059,11 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
(actual_len > inode->i_size) &&
(cur_offset > inode->i_size)) {
if (cur_offset > actual_len)
- i_size_write(inode, actual_len);
+ i_size = actual_len;
else
- i_size_write(inode, cur_offset);
- i_size_write(inode, cur_offset);
- btrfs_ordered_update_i_size(inode, cur_offset, NULL);
+ i_size = cur_offset;
+ i_size_write(inode, i_size);
+ btrfs_ordered_update_i_size(inode, i_size, NULL);
}
ret = btrfs_update_inode(trans, root, inode);
@@ -6943,6 +7117,10 @@ static long btrfs_fallocate(struct inode *inode, int mode,
btrfs_wait_ordered_range(inode, alloc_start, alloc_end - alloc_start);
mutex_lock(&inode->i_mutex);
+ ret = inode_newsize_ok(inode, alloc_end);
+ if (ret)
+ goto out;
+
if (alloc_start > inode->i_size) {
ret = btrfs_cont_expand(inode, alloc_start);
if (ret)
@@ -7139,6 +7317,7 @@ static const struct inode_operations btrfs_symlink_inode_operations = {
.readlink = generic_readlink,
.follow_link = page_follow_link_light,
.put_link = page_put_link,
+ .getattr = btrfs_getattr,
.permission = btrfs_permission,
.setxattr = btrfs_setxattr,
.getxattr = btrfs_getxattr,
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 463d91b4dd3..f87552a1d7e 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -233,7 +233,8 @@ static noinline int create_subvol(struct btrfs_root *root,
struct btrfs_inode_item *inode_item;
struct extent_buffer *leaf;
struct btrfs_root *new_root;
- struct inode *dir = dentry->d_parent->d_inode;
+ struct dentry *parent = dget_parent(dentry);
+ struct inode *dir;
int ret;
int err;
u64 objectid;
@@ -242,8 +243,13 @@ static noinline int create_subvol(struct btrfs_root *root,
ret = btrfs_find_free_objectid(NULL, root->fs_info->tree_root,
0, &objectid);
- if (ret)
+ if (ret) {
+ dput(parent);
return ret;
+ }
+
+ dir = parent->d_inode;
+
/*
* 1 - inode item
* 2 - refs
@@ -251,8 +257,10 @@ static noinline int create_subvol(struct btrfs_root *root,
* 2 - dir items
*/
trans = btrfs_start_transaction(root, 6);
- if (IS_ERR(trans))
+ if (IS_ERR(trans)) {
+ dput(parent);
return PTR_ERR(trans);
+ }
leaf = btrfs_alloc_free_block(trans, root, root->leafsize,
0, objectid, NULL, 0, 0, 0);
@@ -339,6 +347,7 @@ static noinline int create_subvol(struct btrfs_root *root,
d_instantiate(dentry, btrfs_lookup_dentry(dir, dentry));
fail:
+ dput(parent);
if (async_transid) {
*async_transid = trans->transid;
err = btrfs_commit_transaction_async(trans, root, 1);
@@ -354,6 +363,7 @@ static int create_snapshot(struct btrfs_root *root, struct dentry *dentry,
char *name, int namelen, u64 *async_transid)
{
struct inode *inode;
+ struct dentry *parent;
struct btrfs_pending_snapshot *pending_snapshot;
struct btrfs_trans_handle *trans;
int ret;
@@ -396,7 +406,9 @@ static int create_snapshot(struct btrfs_root *root, struct dentry *dentry,
btrfs_orphan_cleanup(pending_snapshot->snap);
- inode = btrfs_lookup_dentry(dentry->d_parent->d_inode, dentry);
+ parent = dget_parent(dentry);
+ inode = btrfs_lookup_dentry(parent->d_inode, dentry);
+ dput(parent);
if (IS_ERR(inode)) {
ret = PTR_ERR(inode);
goto fail;
@@ -935,23 +947,42 @@ out:
static noinline int btrfs_ioctl_snap_create(struct file *file,
void __user *arg, int subvol,
- int async)
+ int v2)
{
struct btrfs_ioctl_vol_args *vol_args = NULL;
- struct btrfs_ioctl_async_vol_args *async_vol_args = NULL;
+ struct btrfs_ioctl_vol_args_v2 *vol_args_v2 = NULL;
char *name;
u64 fd;
- u64 transid = 0;
int ret;
- if (async) {
- async_vol_args = memdup_user(arg, sizeof(*async_vol_args));
- if (IS_ERR(async_vol_args))
- return PTR_ERR(async_vol_args);
+ if (v2) {
+ u64 transid = 0;
+ u64 *ptr = NULL;
+
+ vol_args_v2 = memdup_user(arg, sizeof(*vol_args_v2));
+ if (IS_ERR(vol_args_v2))
+ return PTR_ERR(vol_args_v2);
+
+ if (vol_args_v2->flags & ~BTRFS_SUBVOL_CREATE_ASYNC) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ name = vol_args_v2->name;
+ fd = vol_args_v2->fd;
+ vol_args_v2->name[BTRFS_SUBVOL_NAME_MAX] = '\0';
+
+ if (vol_args_v2->flags & BTRFS_SUBVOL_CREATE_ASYNC)
+ ptr = &transid;
+
+ ret = btrfs_ioctl_snap_create_transid(file, name, fd,
+ subvol, ptr);
- name = async_vol_args->name;
- fd = async_vol_args->fd;
- async_vol_args->name[BTRFS_SNAPSHOT_NAME_MAX] = '\0';
+ if (ret == 0 && ptr &&
+ copy_to_user(arg +
+ offsetof(struct btrfs_ioctl_vol_args_v2,
+ transid), ptr, sizeof(*ptr)))
+ ret = -EFAULT;
} else {
vol_args = memdup_user(arg, sizeof(*vol_args));
if (IS_ERR(vol_args))
@@ -959,20 +990,13 @@ static noinline int btrfs_ioctl_snap_create(struct file *file,
name = vol_args->name;
fd = vol_args->fd;
vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
- }
-
- ret = btrfs_ioctl_snap_create_transid(file, name, fd,
- subvol, &transid);
- if (!ret && async) {
- if (copy_to_user(arg +
- offsetof(struct btrfs_ioctl_async_vol_args,
- transid), &transid, sizeof(transid)))
- return -EFAULT;
+ ret = btrfs_ioctl_snap_create_transid(file, name, fd,
+ subvol, NULL);
}
-
+out:
kfree(vol_args);
- kfree(async_vol_args);
+ kfree(vol_args_v2);
return ret;
}
@@ -1669,12 +1693,11 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
olen = len = src->i_size - off;
/* if we extend to eof, continue to block boundary */
if (off + len == src->i_size)
- len = ((src->i_size + bs-1) & ~(bs-1))
- - off;
+ len = ALIGN(src->i_size, bs) - off;
/* verify the end result is block aligned */
- if ((off & (bs-1)) ||
- ((off + len) & (bs-1)))
+ if (!IS_ALIGNED(off, bs) || !IS_ALIGNED(off + len, bs) ||
+ !IS_ALIGNED(destoff, bs))
goto out_unlock;
/* do any pending delalloc/csum calc on src, one way or
@@ -1874,8 +1897,8 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
* but shouldn't round up the file size
*/
endoff = new_key.offset + datal;
- if (endoff > off+olen)
- endoff = off+olen;
+ if (endoff > destoff+olen)
+ endoff = destoff+olen;
if (endoff > inode->i_size)
btrfs_i_size_write(inode, endoff);
@@ -2235,7 +2258,7 @@ long btrfs_ioctl(struct file *file, unsigned int
return btrfs_ioctl_getversion(file, argp);
case BTRFS_IOC_SNAP_CREATE:
return btrfs_ioctl_snap_create(file, argp, 0, 0);
- case BTRFS_IOC_SNAP_CREATE_ASYNC:
+ case BTRFS_IOC_SNAP_CREATE_V2:
return btrfs_ioctl_snap_create(file, argp, 0, 1);
case BTRFS_IOC_SUBVOL_CREATE:
return btrfs_ioctl_snap_create(file, argp, 1, 0);
diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h
index 17c99ebdf96..c344d12c646 100644
--- a/fs/btrfs/ioctl.h
+++ b/fs/btrfs/ioctl.h
@@ -30,11 +30,15 @@ struct btrfs_ioctl_vol_args {
char name[BTRFS_PATH_NAME_MAX + 1];
};
-#define BTRFS_SNAPSHOT_NAME_MAX 4079
-struct btrfs_ioctl_async_vol_args {
+#define BTRFS_SUBVOL_CREATE_ASYNC (1ULL << 0)
+
+#define BTRFS_SUBVOL_NAME_MAX 4039
+struct btrfs_ioctl_vol_args_v2 {
__s64 fd;
__u64 transid;
- char name[BTRFS_SNAPSHOT_NAME_MAX + 1];
+ __u64 flags;
+ __u64 unused[4];
+ char name[BTRFS_SUBVOL_NAME_MAX + 1];
};
#define BTRFS_INO_LOOKUP_PATH_MAX 4080
@@ -187,6 +191,6 @@ struct btrfs_ioctl_space_args {
struct btrfs_ioctl_space_args)
#define BTRFS_IOC_START_SYNC _IOR(BTRFS_IOCTL_MAGIC, 24, __u64)
#define BTRFS_IOC_WAIT_SYNC _IOW(BTRFS_IOCTL_MAGIC, 22, __u64)
-#define BTRFS_IOC_SNAP_CREATE_ASYNC _IOW(BTRFS_IOCTL_MAGIC, 23, \
- struct btrfs_ioctl_async_vol_args)
+#define BTRFS_IOC_SNAP_CREATE_V2 _IOW(BTRFS_IOCTL_MAGIC, 23, \
+ struct btrfs_ioctl_vol_args_v2)
#endif
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index f4621f6deca..ae7737e352c 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -250,6 +250,73 @@ int btrfs_add_ordered_sum(struct inode *inode,
/*
* this is used to account for finished IO across a given range
+ * of the file. The IO may span ordered extents. If
+ * a given ordered_extent is completely done, 1 is returned, otherwise
+ * 0.
+ *
+ * test_and_set_bit on a flag in the struct btrfs_ordered_extent is used
+ * to make sure this function only returns 1 once for a given ordered extent.
+ *
+ * file_offset is updated to one byte past the range that is recorded as
+ * complete. This allows you to walk forward in the file.
+ */
+int btrfs_dec_test_first_ordered_pending(struct inode *inode,
+ struct btrfs_ordered_extent **cached,
+ u64 *file_offset, u64 io_size)
+{
+ struct btrfs_ordered_inode_tree *tree;
+ struct rb_node *node;
+ struct btrfs_ordered_extent *entry = NULL;
+ int ret;
+ u64 dec_end;
+ u64 dec_start;
+ u64 to_dec;
+
+ tree = &BTRFS_I(inode)->ordered_tree;
+ spin_lock(&tree->lock);
+ node = tree_search(tree, *file_offset);
+ if (!node) {
+ ret = 1;
+ goto out;
+ }
+
+ entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
+ if (!offset_in_entry(entry, *file_offset)) {
+ ret = 1;
+ goto out;
+ }
+
+ dec_start = max(*file_offset, entry->file_offset);
+ dec_end = min(*file_offset + io_size, entry->file_offset +
+ entry->len);
+ *file_offset = dec_end;
+ if (dec_start > dec_end) {
+ printk(KERN_CRIT "bad ordering dec_start %llu end %llu\n",
+ (unsigned long long)dec_start,
+ (unsigned long long)dec_end);
+ }
+ to_dec = dec_end - dec_start;
+ if (to_dec > entry->bytes_left) {
+ printk(KERN_CRIT "bad ordered accounting left %llu size %llu\n",
+ (unsigned long long)entry->bytes_left,
+ (unsigned long long)to_dec);
+ }
+ entry->bytes_left -= to_dec;
+ if (entry->bytes_left == 0)
+ ret = test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags);
+ else
+ ret = 1;
+out:
+ if (!ret && cached && entry) {
+ *cached = entry;
+ atomic_inc(&entry->refs);
+ }
+ spin_unlock(&tree->lock);
+ return ret == 0;
+}
+
+/*
+ * this is used to account for finished IO across a given range
* of the file. The IO should not span ordered extents. If
* a given ordered_extent is completely done, 1 is returned, otherwise
* 0.
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index 8ac365492a3..61dca83119d 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -141,6 +141,9 @@ int btrfs_remove_ordered_extent(struct inode *inode,
int btrfs_dec_test_ordered_pending(struct inode *inode,
struct btrfs_ordered_extent **cached,
u64 file_offset, u64 io_size);
+int btrfs_dec_test_first_ordered_pending(struct inode *inode,
+ struct btrfs_ordered_extent **cached,
+ u64 *file_offset, u64 io_size);
int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
u64 start, u64 len, u64 disk_len, int type);
int btrfs_add_ordered_extent_dio(struct inode *inode, u64 file_offset,
diff --git a/fs/btrfs/orphan.c b/fs/btrfs/orphan.c
index 79cba5fbc28..f8be250963a 100644
--- a/fs/btrfs/orphan.c
+++ b/fs/btrfs/orphan.c
@@ -56,8 +56,12 @@ int btrfs_del_orphan_item(struct btrfs_trans_handle *trans,
return -ENOMEM;
ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
- if (ret)
+ if (ret < 0)
goto out;
+ if (ret) {
+ ret = -ENOENT;
+ goto out;
+ }
ret = btrfs_del_item(trans, root, path);
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 8299a25ffc8..883c6fa1367 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -244,6 +244,7 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
case Opt_space_cache:
printk(KERN_INFO "btrfs: enabling disk space caching\n");
btrfs_set_opt(info->mount_opt, SPACE_CACHE);
+ break;
case Opt_clear_cache:
printk(KERN_INFO "btrfs: force clearing of disk cache\n");
btrfs_set_opt(info->mount_opt, CLEAR_CACHE);
@@ -562,12 +563,26 @@ static int btrfs_show_options(struct seq_file *seq, struct vfsmount *vfs)
static int btrfs_test_super(struct super_block *s, void *data)
{
- struct btrfs_fs_devices *test_fs_devices = data;
+ struct btrfs_root *test_root = data;
struct btrfs_root *root = btrfs_sb(s);
- return root->fs_info->fs_devices == test_fs_devices;
+ /*
+ * If this super block is going away, return false as it
+ * can't match as an existing super block.
+ */
+ if (!atomic_read(&s->s_active))
+ return 0;
+ return root->fs_info->fs_devices == test_root->fs_info->fs_devices;
+}
+
+static int btrfs_set_super(struct super_block *s, void *data)
+{
+ s->s_fs_info = data;
+
+ return set_anon_super(s, data);
}
+
/*
* Find a superblock for the given device / mount point.
*
@@ -581,6 +596,8 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
struct super_block *s;
struct dentry *root;
struct btrfs_fs_devices *fs_devices = NULL;
+ struct btrfs_root *tree_root = NULL;
+ struct btrfs_fs_info *fs_info = NULL;
fmode_t mode = FMODE_READ;
char *subvol_name = NULL;
u64 subvol_objectid = 0;
@@ -608,8 +625,24 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
goto error_close_devices;
}
+ /*
+ * Setup a dummy root and fs_info for test/set super. This is because
+ * we don't actually fill this stuff out until open_ctree, but we need
+ * it for searching for existing supers, so this lets us do that and
+ * then open_ctree will properly initialize everything later.
+ */
+ fs_info = kzalloc(sizeof(struct btrfs_fs_info), GFP_NOFS);
+ tree_root = kzalloc(sizeof(struct btrfs_root), GFP_NOFS);
+ if (!fs_info || !tree_root) {
+ error = -ENOMEM;
+ goto error_close_devices;
+ }
+ fs_info->tree_root = tree_root;
+ fs_info->fs_devices = fs_devices;
+ tree_root->fs_info = fs_info;
+
bdev = fs_devices->latest_bdev;
- s = sget(fs_type, btrfs_test_super, set_anon_super, fs_devices);
+ s = sget(fs_type, btrfs_test_super, btrfs_set_super, tree_root);
if (IS_ERR(s))
goto error_s;
@@ -652,9 +685,9 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
mutex_unlock(&root->d_inode->i_mutex);
if (IS_ERR(new_root)) {
+ dput(root);
deactivate_locked_super(s);
error = PTR_ERR(new_root);
- dput(root);
goto error_free_subvol_name;
}
if (!new_root->d_inode) {
@@ -675,6 +708,8 @@ error_s:
error = PTR_ERR(s);
error_close_devices:
btrfs_close_devices(fs_devices);
+ kfree(fs_info);
+ kfree(tree_root);
error_free_subvol_name:
kfree(subvol_name);
return ERR_PTR(error);
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 1fffbc017bd..f50e931fc21 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -902,6 +902,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
struct btrfs_root *root = pending->root;
struct btrfs_root *parent_root;
struct inode *parent_inode;
+ struct dentry *parent;
struct dentry *dentry;
struct extent_buffer *tmp;
struct extent_buffer *old;
@@ -941,7 +942,8 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
trans->block_rsv = &pending->block_rsv;
dentry = pending->dentry;
- parent_inode = dentry->d_parent->d_inode;
+ parent = dget_parent(dentry);
+ parent_inode = parent->d_inode;
parent_root = BTRFS_I(parent_inode)->root;
record_root_in_trans(trans, parent_root);
@@ -989,6 +991,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
parent_inode->i_ino, index,
dentry->d_name.name, dentry->d_name.len);
BUG_ON(ret);
+ dput(parent);
key.offset = (u64)-1;
pending->snap = btrfs_read_fs_root_no_name(root->fs_info, &key);
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index a29f19384a2..054744ac571 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -2869,6 +2869,7 @@ static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans,
{
int ret = 0;
struct btrfs_root *root;
+ struct dentry *old_parent = NULL;
/*
* for regular files, if its inode is already on disk, we don't
@@ -2910,10 +2911,13 @@ static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans,
if (IS_ROOT(parent))
break;
- parent = parent->d_parent;
+ parent = dget_parent(parent);
+ dput(old_parent);
+ old_parent = parent;
inode = parent->d_inode;
}
+ dput(old_parent);
out:
return ret;
}
@@ -2945,6 +2949,7 @@ int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
{
int inode_only = exists_only ? LOG_INODE_EXISTS : LOG_INODE_ALL;
struct super_block *sb;
+ struct dentry *old_parent = NULL;
int ret = 0;
u64 last_committed = root->fs_info->last_trans_committed;
@@ -3016,10 +3021,13 @@ int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
if (IS_ROOT(parent))
break;
- parent = parent->d_parent;
+ parent = dget_parent(parent);
+ dput(old_parent);
+ old_parent = parent;
}
ret = 0;
end_trans:
+ dput(old_parent);
if (ret < 0) {
BUG_ON(ret != -ENOSPC);
root->fs_info->last_trans_log_full_commit = trans->transid;
@@ -3039,8 +3047,13 @@ end_no_trans:
int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans,
struct btrfs_root *root, struct dentry *dentry)
{
- return btrfs_log_inode_parent(trans, root, dentry->d_inode,
- dentry->d_parent, 0);
+ struct dentry *parent = dget_parent(dentry);
+ int ret;
+
+ ret = btrfs_log_inode_parent(trans, root, dentry->d_inode, parent, 0);
+ dput(parent);
+
+ return ret;
}
/*
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index cc04dc1445d..6b988450783 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -412,12 +412,16 @@ static noinline int device_list_add(const char *path,
device->fs_devices = fs_devices;
fs_devices->num_devices++;
- } else if (strcmp(device->name, path)) {
+ } else if (!device->name || strcmp(device->name, path)) {
name = kstrdup(path, GFP_NOFS);
if (!name)
return -ENOMEM;
kfree(device->name);
device->name = name;
+ if (device->missing) {
+ fs_devices->missing_devices--;
+ device->missing = 0;
+ }
}
if (found_transid > fs_devices->latest_trans) {
@@ -1236,6 +1240,9 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
device->fs_devices->num_devices--;
+ if (device->missing)
+ root->fs_info->fs_devices->missing_devices--;
+
next_device = list_entry(root->fs_info->fs_devices->devices.next,
struct btrfs_device, dev_list);
if (device->bdev == root->fs_info->sb->s_bdev)
@@ -3080,7 +3087,9 @@ static struct btrfs_device *add_missing_dev(struct btrfs_root *root,
device->devid = devid;
device->work.func = pending_bios_fn;
device->fs_devices = fs_devices;
+ device->missing = 1;
fs_devices->num_devices++;
+ fs_devices->missing_devices++;
spin_lock_init(&device->io_lock);
INIT_LIST_HEAD(&device->dev_alloc_list);
memcpy(device->uuid, dev_uuid, BTRFS_UUID_SIZE);
@@ -3278,6 +3287,15 @@ static int read_one_dev(struct btrfs_root *root,
device = add_missing_dev(root, devid, dev_uuid);
if (!device)
return -ENOMEM;
+ } else if (!device->missing) {
+ /*
+ * this happens when a device that was properly setup
+ * in the device info lists suddenly goes bad.
+ * device->bdev is NULL, and so we have to set
+ * device->missing to one here
+ */
+ root->fs_info->fs_devices->missing_devices++;
+ device->missing = 1;
}
}
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 2b638b6e4ee..2740db49eb0 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -44,6 +44,7 @@ struct btrfs_device {
int writeable;
int in_fs_metadata;
+ int missing;
spinlock_t io_lock;
@@ -93,6 +94,7 @@ struct btrfs_fs_devices {
u64 num_devices;
u64 open_devices;
u64 rw_devices;
+ u64 missing_devices;
u64 total_rw_bytes;
struct block_device *latest_bdev;
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index e9c874abc9e..561438b6a50 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -204,7 +204,7 @@ static int readpage_nounlock(struct file *filp, struct page *page)
err = ceph_osdc_readpages(osdc, ceph_vino(inode), &ci->i_layout,
page->index << PAGE_CACHE_SHIFT, &len,
ci->i_truncate_seq, ci->i_truncate_size,
- &page, 1);
+ &page, 1, 0);
if (err == -ENOENT)
err = 0;
if (err < 0) {
@@ -287,7 +287,7 @@ static int ceph_readpages(struct file *file, struct address_space *mapping,
rc = ceph_osdc_readpages(osdc, ceph_vino(inode), &ci->i_layout,
offset, &len,
ci->i_truncate_seq, ci->i_truncate_size,
- pages, nr_pages);
+ pages, nr_pages, 0);
if (rc == -ENOENT)
rc = 0;
if (rc < 0)
@@ -774,7 +774,7 @@ get_more_pages:
snapc, do_sync,
ci->i_truncate_seq,
ci->i_truncate_size,
- &inode->i_mtime, true, 1);
+ &inode->i_mtime, true, 1, 0);
max_pages = req->r_num_pages;
alloc_page_vec(fsc, req);
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 98ab13e2b71..60d27bc9eb8 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -1430,8 +1430,8 @@ static int try_nonblocking_invalidate(struct inode *inode)
invalidating_gen == ci->i_rdcache_gen) {
/* success. */
dout("try_nonblocking_invalidate %p success\n", inode);
- ci->i_rdcache_gen = 0;
- ci->i_rdcache_revoking = 0;
+ /* save any racing async invalidate some trouble */
+ ci->i_rdcache_revoking = ci->i_rdcache_gen - 1;
return 0;
}
dout("try_nonblocking_invalidate %p failed\n", inode);
@@ -2273,8 +2273,7 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,
{
struct ceph_inode_info *ci = ceph_inode(inode);
int mds = session->s_mds;
- unsigned seq = le32_to_cpu(grant->seq);
- unsigned issue_seq = le32_to_cpu(grant->issue_seq);
+ int seq = le32_to_cpu(grant->seq);
int newcaps = le32_to_cpu(grant->caps);
int issued, implemented, used, wanted, dirty;
u64 size = le64_to_cpu(grant->size);
@@ -2286,8 +2285,8 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,
int revoked_rdcache = 0;
int queue_invalidate = 0;
- dout("handle_cap_grant inode %p cap %p mds%d seq %u/%u %s\n",
- inode, cap, mds, seq, issue_seq, ceph_cap_string(newcaps));
+ dout("handle_cap_grant inode %p cap %p mds%d seq %d %s\n",
+ inode, cap, mds, seq, ceph_cap_string(newcaps));
dout(" size %llu max_size %llu, i_size %llu\n", size, max_size,
inode->i_size);
@@ -2383,7 +2382,6 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,
}
cap->seq = seq;
- cap->issue_seq = issue_seq;
/* file layout may have changed */
ci->i_layout = grant->layout;
@@ -2691,6 +2689,11 @@ static void handle_cap_import(struct ceph_mds_client *mdsc,
NULL /* no caps context */);
try_flush_caps(inode, session, NULL);
up_read(&mdsc->snap_rwsem);
+
+ /* make sure we re-request max_size, if necessary */
+ spin_lock(&inode->i_lock);
+ ci->i_requested_max_size = 0;
+ spin_unlock(&inode->i_lock);
}
/*
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index e0a2dc6fcaf..d902948a90d 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -40,7 +40,8 @@ int ceph_init_dentry(struct dentry *dentry)
if (dentry->d_fsdata)
return 0;
- if (ceph_snap(dentry->d_parent->d_inode) == CEPH_NOSNAP)
+ if (dentry->d_parent == NULL || /* nfs fh_to_dentry */
+ ceph_snap(dentry->d_parent->d_inode) == CEPH_NOSNAP)
dentry->d_op = &ceph_dentry_ops;
else if (ceph_snap(dentry->d_parent->d_inode) == CEPH_SNAPDIR)
dentry->d_op = &ceph_snapdir_dentry_ops;
@@ -114,8 +115,8 @@ static int __dcache_readdir(struct file *filp,
spin_lock(&dcache_lock);
/* start at beginning? */
- if (filp->f_pos == 2 || (last &&
- filp->f_pos < ceph_dentry(last)->offset)) {
+ if (filp->f_pos == 2 || last == NULL ||
+ filp->f_pos < ceph_dentry(last)->offset) {
if (list_empty(&parent->d_subdirs))
goto out_unlock;
p = parent->d_subdirs.prev;
@@ -336,7 +337,10 @@ more:
if (req->r_reply_info.dir_end) {
kfree(fi->last_name);
fi->last_name = NULL;
- fi->next_offset = 2;
+ if (ceph_frag_is_rightmost(frag))
+ fi->next_offset = 2;
+ else
+ fi->next_offset = 0;
} else {
rinfo = &req->r_reply_info;
err = note_last_dentry(fi,
@@ -355,18 +359,22 @@ more:
u64 pos = ceph_make_fpos(frag, off);
struct ceph_mds_reply_inode *in =
rinfo->dir_in[off - fi->offset].in;
+ struct ceph_vino vino;
+ ino_t ino;
+
dout("readdir off %d (%d/%d) -> %lld '%.*s' %p\n",
off, off - fi->offset, rinfo->dir_nr, pos,
rinfo->dir_dname_len[off - fi->offset],
rinfo->dir_dname[off - fi->offset], in);
BUG_ON(!in);
ftype = le32_to_cpu(in->mode) >> 12;
+ vino.ino = le64_to_cpu(in->ino);
+ vino.snap = le64_to_cpu(in->snapid);
+ ino = ceph_vino_to_ino(vino);
if (filldir(dirent,
rinfo->dir_dname[off - fi->offset],
rinfo->dir_dname_len[off - fi->offset],
- pos,
- le64_to_cpu(in->ino),
- ftype) < 0) {
+ pos, ino, ftype) < 0) {
dout("filldir stopping us...\n");
return 0;
}
@@ -414,6 +422,7 @@ static void reset_readdir(struct ceph_file_info *fi)
fi->last_readdir = NULL;
}
kfree(fi->last_name);
+ fi->last_name = NULL;
fi->next_offset = 2; /* compensate for . and .. */
if (fi->dentry) {
dput(fi->dentry);
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index e77c28cf369..7d0e4a82d89 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -154,11 +154,13 @@ int ceph_open(struct inode *inode, struct file *file)
}
/*
- * No need to block if we have any caps. Update wanted set
+ * No need to block if we have caps on the auth MDS (for
+ * write) or any MDS (for read). Update wanted set
* asynchronously.
*/
spin_lock(&inode->i_lock);
- if (__ceph_is_any_real_caps(ci)) {
+ if (__ceph_is_any_real_caps(ci) &&
+ (((fmode & CEPH_FILE_MODE_WR) == 0) || ci->i_auth_cap)) {
int mds_wanted = __ceph_caps_mds_wanted(ci);
int issued = __ceph_caps_issued(ci, NULL);
@@ -280,11 +282,13 @@ int ceph_release(struct inode *inode, struct file *file)
static int striped_read(struct inode *inode,
u64 off, u64 len,
struct page **pages, int num_pages,
- int *checkeof)
+ int *checkeof, bool align_to_pages,
+ unsigned long buf_align)
{
struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
struct ceph_inode_info *ci = ceph_inode(inode);
u64 pos, this_len;
+ int io_align, page_align;
int page_off = off & ~PAGE_CACHE_MASK; /* first byte's offset in page */
int left, pages_left;
int read;
@@ -300,14 +304,19 @@ static int striped_read(struct inode *inode,
page_pos = pages;
pages_left = num_pages;
read = 0;
+ io_align = off & ~PAGE_MASK;
more:
+ if (align_to_pages)
+ page_align = (pos - io_align + buf_align) & ~PAGE_MASK;
+ else
+ page_align = pos & ~PAGE_MASK;
this_len = left;
ret = ceph_osdc_readpages(&fsc->client->osdc, ceph_vino(inode),
&ci->i_layout, pos, &this_len,
ci->i_truncate_seq,
ci->i_truncate_size,
- page_pos, pages_left);
+ page_pos, pages_left, page_align);
hit_stripe = this_len < left;
was_short = ret >= 0 && ret < this_len;
if (ret == -ENOENT)
@@ -368,32 +377,34 @@ static ssize_t ceph_sync_read(struct file *file, char __user *data,
struct inode *inode = file->f_dentry->d_inode;
struct page **pages;
u64 off = *poff;
- int num_pages = calc_pages_for(off, len);
- int ret;
+ int num_pages, ret;
dout("sync_read on file %p %llu~%u %s\n", file, off, len,
(file->f_flags & O_DIRECT) ? "O_DIRECT" : "");
if (file->f_flags & O_DIRECT) {
- pages = ceph_get_direct_page_vector(data, num_pages, off, len);
-
- /*
- * flush any page cache pages in this range. this
- * will make concurrent normal and O_DIRECT io slow,
- * but it will at least behave sensibly when they are
- * in sequence.
- */
+ num_pages = calc_pages_for((unsigned long)data, len);
+ pages = ceph_get_direct_page_vector(data, num_pages, true);
} else {
+ num_pages = calc_pages_for(off, len);
pages = ceph_alloc_page_vector(num_pages, GFP_NOFS);
}
if (IS_ERR(pages))
return PTR_ERR(pages);
+ /*
+ * flush any page cache pages in this range. this
+ * will make concurrent normal and sync io slow,
+ * but it will at least behave sensibly when they are
+ * in sequence.
+ */
ret = filemap_write_and_wait(inode->i_mapping);
if (ret < 0)
goto done;
- ret = striped_read(inode, off, len, pages, num_pages, checkeof);
+ ret = striped_read(inode, off, len, pages, num_pages, checkeof,
+ file->f_flags & O_DIRECT,
+ (unsigned long)data & ~PAGE_MASK);
if (ret >= 0 && (file->f_flags & O_DIRECT) == 0)
ret = ceph_copy_page_vector_to_user(pages, data, off, ret);
@@ -402,7 +413,7 @@ static ssize_t ceph_sync_read(struct file *file, char __user *data,
done:
if (file->f_flags & O_DIRECT)
- ceph_put_page_vector(pages, num_pages);
+ ceph_put_page_vector(pages, num_pages, true);
else
ceph_release_page_vector(pages, num_pages);
dout("sync_read result %d\n", ret);
@@ -448,6 +459,8 @@ static ssize_t ceph_sync_write(struct file *file, const char __user *data,
int flags;
int do_sync = 0;
int check_caps = 0;
+ int page_align, io_align;
+ unsigned long buf_align;
int ret;
struct timespec mtime = CURRENT_TIME;
@@ -462,6 +475,9 @@ static ssize_t ceph_sync_write(struct file *file, const char __user *data,
else
pos = *offset;
+ io_align = pos & ~PAGE_MASK;
+ buf_align = (unsigned long)data & ~PAGE_MASK;
+
ret = filemap_write_and_wait_range(inode->i_mapping, pos, pos + left);
if (ret < 0)
return ret;
@@ -486,20 +502,27 @@ static ssize_t ceph_sync_write(struct file *file, const char __user *data,
*/
more:
len = left;
+ if (file->f_flags & O_DIRECT) {
+ /* write from beginning of first page, regardless of
+ io alignment */
+ page_align = (pos - io_align + buf_align) & ~PAGE_MASK;
+ num_pages = calc_pages_for((unsigned long)data, len);
+ } else {
+ page_align = pos & ~PAGE_MASK;
+ num_pages = calc_pages_for(pos, len);
+ }
req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout,
ceph_vino(inode), pos, &len,
CEPH_OSD_OP_WRITE, flags,
ci->i_snap_realm->cached_context,
do_sync,
ci->i_truncate_seq, ci->i_truncate_size,
- &mtime, false, 2);
+ &mtime, false, 2, page_align);
if (!req)
return -ENOMEM;
- num_pages = calc_pages_for(pos, len);
-
if (file->f_flags & O_DIRECT) {
- pages = ceph_get_direct_page_vector(data, num_pages, pos, len);
+ pages = ceph_get_direct_page_vector(data, num_pages, false);
if (IS_ERR(pages)) {
ret = PTR_ERR(pages);
goto out;
@@ -549,7 +572,7 @@ more:
}
if (file->f_flags & O_DIRECT)
- ceph_put_page_vector(pages, num_pages);
+ ceph_put_page_vector(pages, num_pages, false);
else if (file->f_flags & O_SYNC)
ceph_release_page_vector(pages, num_pages);
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index 1d6a45b5a04..bf1286588f2 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -2,7 +2,6 @@
#include <linux/module.h>
#include <linux/fs.h>
-#include <linux/smp_lock.h>
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/uaccess.h>
@@ -471,7 +470,9 @@ void ceph_fill_file_time(struct inode *inode, int issued,
if (issued & (CEPH_CAP_FILE_EXCL|
CEPH_CAP_FILE_WR|
- CEPH_CAP_FILE_BUFFER)) {
+ CEPH_CAP_FILE_BUFFER|
+ CEPH_CAP_AUTH_EXCL|
+ CEPH_CAP_XATTR_EXCL)) {
if (timespec_compare(ctime, &inode->i_ctime) > 0) {
dout("ctime %ld.%09ld -> %ld.%09ld inc w/ cap\n",
inode->i_ctime.tv_sec, inode->i_ctime.tv_nsec,
@@ -511,7 +512,7 @@ void ceph_fill_file_time(struct inode *inode, int issued,
warn = 1;
}
} else {
- /* we have no write caps; whatever the MDS says is true */
+ /* we have no write|excl caps; whatever the MDS says is true */
if (ceph_seq_cmp(time_warp_seq, ci->i_time_warp_seq) >= 0) {
inode->i_ctime = *ctime;
inode->i_mtime = *mtime;
@@ -567,12 +568,17 @@ static int fill_inode(struct inode *inode,
/*
* provided version will be odd if inode value is projected,
- * even if stable. skip the update if we have a newer info
- * (e.g., due to inode info racing form multiple MDSs), or if
- * we are getting projected (unstable) inode info.
+ * even if stable. skip the update if we have newer stable
+ * info (ours>=theirs, e.g. due to racing mds replies), unless
+ * we are getting projected (unstable) info (in which case the
+ * version is odd, and we want ours>theirs).
+ * us them
+ * 2 2 skip
+ * 3 2 skip
+ * 3 3 update
*/
if (le64_to_cpu(info->version) > 0 &&
- (ci->i_version & ~1) > le64_to_cpu(info->version))
+ (ci->i_version & ~1) >= le64_to_cpu(info->version))
goto no_change;
issued = __ceph_caps_issued(ci, &implemented);
@@ -606,7 +612,14 @@ static int fill_inode(struct inode *inode,
le32_to_cpu(info->time_warp_seq),
&ctime, &mtime, &atime);
- ci->i_max_size = le64_to_cpu(info->max_size);
+ /* only update max_size on auth cap */
+ if ((info->cap.flags & CEPH_CAP_FLAG_AUTH) &&
+ ci->i_max_size != le64_to_cpu(info->max_size)) {
+ dout("max_size %lld -> %llu\n", ci->i_max_size,
+ le64_to_cpu(info->max_size));
+ ci->i_max_size = le64_to_cpu(info->max_size);
+ }
+
ci->i_layout = info->layout;
inode->i_blkbits = fls(le32_to_cpu(info->layout.fl_stripe_unit)) - 1;
@@ -1055,7 +1068,8 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
ininfo = rinfo->targeti.in;
vino.ino = le64_to_cpu(ininfo->ino);
vino.snap = le64_to_cpu(ininfo->snapid);
- if (!dn->d_inode) {
+ in = dn->d_inode;
+ if (!in) {
in = ceph_get_inode(sb, vino);
if (IS_ERR(in)) {
pr_err("fill_trace bad get_inode "
@@ -1386,11 +1400,8 @@ static void ceph_invalidate_work(struct work_struct *work)
spin_lock(&inode->i_lock);
dout("invalidate_pages %p gen %d revoking %d\n", inode,
ci->i_rdcache_gen, ci->i_rdcache_revoking);
- if (ci->i_rdcache_gen == 0 ||
- ci->i_rdcache_revoking != ci->i_rdcache_gen) {
- BUG_ON(ci->i_rdcache_revoking > ci->i_rdcache_gen);
+ if (ci->i_rdcache_revoking != ci->i_rdcache_gen) {
/* nevermind! */
- ci->i_rdcache_revoking = 0;
spin_unlock(&inode->i_lock);
goto out;
}
@@ -1400,15 +1411,16 @@ static void ceph_invalidate_work(struct work_struct *work)
ceph_invalidate_nondirty_pages(inode->i_mapping);
spin_lock(&inode->i_lock);
- if (orig_gen == ci->i_rdcache_gen) {
+ if (orig_gen == ci->i_rdcache_gen &&
+ orig_gen == ci->i_rdcache_revoking) {
dout("invalidate_pages %p gen %d successful\n", inode,
ci->i_rdcache_gen);
- ci->i_rdcache_gen = 0;
- ci->i_rdcache_revoking = 0;
+ ci->i_rdcache_revoking--;
check = 1;
} else {
- dout("invalidate_pages %p gen %d raced, gen now %d\n",
- inode, orig_gen, ci->i_rdcache_gen);
+ dout("invalidate_pages %p gen %d raced, now %d revoking %d\n",
+ inode, orig_gen, ci->i_rdcache_gen,
+ ci->i_rdcache_revoking);
}
spin_unlock(&inode->i_lock);
@@ -1739,7 +1751,7 @@ int ceph_do_getattr(struct inode *inode, int mask)
return 0;
}
- dout("do_getattr inode %p mask %s\n", inode, ceph_cap_string(mask));
+ dout("do_getattr inode %p mask %s mode 0%o\n", inode, ceph_cap_string(mask), inode->i_mode);
if (ceph_caps_issued_mask(ceph_inode(inode), mask, 1))
return 0;
diff --git a/fs/ceph/ioctl.h b/fs/ceph/ioctl.h
index a6ce54e94eb..52e8fd74d45 100644
--- a/fs/ceph/ioctl.h
+++ b/fs/ceph/ioctl.h
@@ -4,7 +4,7 @@
#include <linux/ioctl.h>
#include <linux/types.h>
-#define CEPH_IOCTL_MAGIC 0x98
+#define CEPH_IOCTL_MAGIC 0x97
/* just use u64 to align sanely on all archs */
struct ceph_ioctl_layout {
diff --git a/fs/ceph/locks.c b/fs/ceph/locks.c
index 40abde93c34..476b329867d 100644
--- a/fs/ceph/locks.c
+++ b/fs/ceph/locks.c
@@ -11,40 +11,68 @@
* Implement fcntl and flock locking functions.
*/
static int ceph_lock_message(u8 lock_type, u16 operation, struct file *file,
- u64 pid, u64 pid_ns,
- int cmd, u64 start, u64 length, u8 wait)
+ int cmd, u8 wait, struct file_lock *fl)
{
struct inode *inode = file->f_dentry->d_inode;
struct ceph_mds_client *mdsc =
ceph_sb_to_client(inode->i_sb)->mdsc;
struct ceph_mds_request *req;
int err;
+ u64 length = 0;
req = ceph_mdsc_create_request(mdsc, operation, USE_AUTH_MDS);
if (IS_ERR(req))
return PTR_ERR(req);
req->r_inode = igrab(inode);
+ /* mds requires start and length rather than start and end */
+ if (LLONG_MAX == fl->fl_end)
+ length = 0;
+ else
+ length = fl->fl_end - fl->fl_start + 1;
+
dout("ceph_lock_message: rule: %d, op: %d, pid: %llu, start: %llu, "
"length: %llu, wait: %d, type`: %d", (int)lock_type,
- (int)operation, pid, start, length, wait, cmd);
+ (int)operation, (u64)fl->fl_pid, fl->fl_start,
+ length, wait, fl->fl_type);
+
req->r_args.filelock_change.rule = lock_type;
req->r_args.filelock_change.type = cmd;
- req->r_args.filelock_change.pid = cpu_to_le64(pid);
+ req->r_args.filelock_change.pid = cpu_to_le64((u64)fl->fl_pid);
/* This should be adjusted, but I'm not sure if
namespaces actually get id numbers*/
req->r_args.filelock_change.pid_namespace =
- cpu_to_le64((u64)pid_ns);
- req->r_args.filelock_change.start = cpu_to_le64(start);
+ cpu_to_le64((u64)(unsigned long)fl->fl_nspid);
+ req->r_args.filelock_change.start = cpu_to_le64(fl->fl_start);
req->r_args.filelock_change.length = cpu_to_le64(length);
req->r_args.filelock_change.wait = wait;
err = ceph_mdsc_do_request(mdsc, inode, req);
+
+ if ( operation == CEPH_MDS_OP_GETFILELOCK){
+ fl->fl_pid = le64_to_cpu(req->r_reply_info.filelock_reply->pid);
+ if (CEPH_LOCK_SHARED == req->r_reply_info.filelock_reply->type)
+ fl->fl_type = F_RDLCK;
+ else if (CEPH_LOCK_EXCL == req->r_reply_info.filelock_reply->type)
+ fl->fl_type = F_WRLCK;
+ else
+ fl->fl_type = F_UNLCK;
+
+ fl->fl_start = le64_to_cpu(req->r_reply_info.filelock_reply->start);
+ length = le64_to_cpu(req->r_reply_info.filelock_reply->start) +
+ le64_to_cpu(req->r_reply_info.filelock_reply->length);
+ if (length >= 1)
+ fl->fl_end = length -1;
+ else
+ fl->fl_end = 0;
+
+ }
ceph_mdsc_put_request(req);
dout("ceph_lock_message: rule: %d, op: %d, pid: %llu, start: %llu, "
- "length: %llu, wait: %d, type`: %d err code %d", (int)lock_type,
- (int)operation, pid, start, length, wait, cmd, err);
+ "length: %llu, wait: %d, type`: %d, err code %d", (int)lock_type,
+ (int)operation, (u64)fl->fl_pid, fl->fl_start,
+ length, wait, fl->fl_type, err);
return err;
}
@@ -54,7 +82,6 @@ static int ceph_lock_message(u8 lock_type, u16 operation, struct file *file,
*/
int ceph_lock(struct file *file, int cmd, struct file_lock *fl)
{
- u64 length;
u8 lock_cmd;
int err;
u8 wait = 0;
@@ -76,29 +103,20 @@ int ceph_lock(struct file *file, int cmd, struct file_lock *fl)
else
lock_cmd = CEPH_LOCK_UNLOCK;
- if (LLONG_MAX == fl->fl_end)
- length = 0;
- else
- length = fl->fl_end - fl->fl_start + 1;
-
- err = ceph_lock_message(CEPH_LOCK_FCNTL, op, file,
- (u64)fl->fl_pid,
- (u64)(unsigned long)fl->fl_nspid,
- lock_cmd, fl->fl_start,
- length, wait);
+ err = ceph_lock_message(CEPH_LOCK_FCNTL, op, file, lock_cmd, wait, fl);
if (!err) {
- dout("mds locked, locking locally");
- err = posix_lock_file(file, fl, NULL);
- if (err && (CEPH_MDS_OP_SETFILELOCK == op)) {
- /* undo! This should only happen if the kernel detects
- * local deadlock. */
- ceph_lock_message(CEPH_LOCK_FCNTL, op, file,
- (u64)fl->fl_pid,
- (u64)(unsigned long)fl->fl_nspid,
- CEPH_LOCK_UNLOCK, fl->fl_start,
- length, 0);
- dout("got %d on posix_lock_file, undid lock", err);
+ if ( op != CEPH_MDS_OP_GETFILELOCK ){
+ dout("mds locked, locking locally");
+ err = posix_lock_file(file, fl, NULL);
+ if (err && (CEPH_MDS_OP_SETFILELOCK == op)) {
+ /* undo! This should only happen if the kernel detects
+ * local deadlock. */
+ ceph_lock_message(CEPH_LOCK_FCNTL, op, file,
+ CEPH_LOCK_UNLOCK, 0, fl);
+ dout("got %d on posix_lock_file, undid lock", err);
+ }
}
+
} else {
dout("mds returned error code %d", err);
}
@@ -107,7 +125,6 @@ int ceph_lock(struct file *file, int cmd, struct file_lock *fl)
int ceph_flock(struct file *file, int cmd, struct file_lock *fl)
{
- u64 length;
u8 lock_cmd;
int err;
u8 wait = 1;
@@ -127,26 +144,15 @@ int ceph_flock(struct file *file, int cmd, struct file_lock *fl)
lock_cmd = CEPH_LOCK_EXCL;
else
lock_cmd = CEPH_LOCK_UNLOCK;
- /* mds requires start and length rather than start and end */
- if (LLONG_MAX == fl->fl_end)
- length = 0;
- else
- length = fl->fl_end - fl->fl_start + 1;
err = ceph_lock_message(CEPH_LOCK_FLOCK, CEPH_MDS_OP_SETFILELOCK,
- file, (u64)fl->fl_pid,
- (u64)(unsigned long)fl->fl_nspid,
- lock_cmd, fl->fl_start,
- length, wait);
+ file, lock_cmd, wait, fl);
if (!err) {
err = flock_lock_file_wait(file, fl);
if (err) {
ceph_lock_message(CEPH_LOCK_FLOCK,
CEPH_MDS_OP_SETFILELOCK,
- file, (u64)fl->fl_pid,
- (u64)(unsigned long)fl->fl_nspid,
- CEPH_LOCK_UNLOCK, fl->fl_start,
- length, 0);
+ file, CEPH_LOCK_UNLOCK, 0, fl);
dout("got %d on flock_lock_file_wait, undid lock", err);
}
} else {
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 3142b15940c..38800eaa81d 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -6,7 +6,6 @@
#include <linux/sched.h>
#include <linux/debugfs.h>
#include <linux/seq_file.h>
-#include <linux/smp_lock.h>
#include "super.h"
#include "mds_client.h"
@@ -203,6 +202,38 @@ out_bad:
}
/*
+ * parse fcntl F_GETLK results
+ */
+static int parse_reply_info_filelock(void **p, void *end,
+ struct ceph_mds_reply_info_parsed *info)
+{
+ if (*p + sizeof(*info->filelock_reply) > end)
+ goto bad;
+
+ info->filelock_reply = *p;
+ *p += sizeof(*info->filelock_reply);
+
+ if (unlikely(*p != end))
+ goto bad;
+ return 0;
+
+bad:
+ return -EIO;
+}
+
+/*
+ * parse extra results
+ */
+static int parse_reply_info_extra(void **p, void *end,
+ struct ceph_mds_reply_info_parsed *info)
+{
+ if (info->head->op == CEPH_MDS_OP_GETFILELOCK)
+ return parse_reply_info_filelock(p, end, info);
+ else
+ return parse_reply_info_dir(p, end, info);
+}
+
+/*
* parse entire mds reply
*/
static int parse_reply_info(struct ceph_msg *msg,
@@ -224,10 +255,10 @@ static int parse_reply_info(struct ceph_msg *msg,
goto out_bad;
}
- /* dir content */
+ /* extra */
ceph_decode_32_safe(&p, end, len, bad);
if (len > 0) {
- err = parse_reply_info_dir(&p, p+len, info);
+ err = parse_reply_info_extra(&p, p+len, info);
if (err < 0)
goto out_bad;
}
@@ -529,6 +560,9 @@ static void __register_request(struct ceph_mds_client *mdsc,
ceph_mdsc_get_request(req);
__insert_request(mdsc, req);
+ req->r_uid = current_fsuid();
+ req->r_gid = current_fsgid();
+
if (dir) {
struct ceph_inode_info *ci = ceph_inode(dir);
@@ -1588,8 +1622,8 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc,
head->mdsmap_epoch = cpu_to_le32(mdsc->mdsmap->m_epoch);
head->op = cpu_to_le32(req->r_op);
- head->caller_uid = cpu_to_le32(current_fsuid());
- head->caller_gid = cpu_to_le32(current_fsgid());
+ head->caller_uid = cpu_to_le32(req->r_uid);
+ head->caller_gid = cpu_to_le32(req->r_gid);
head->args = req->r_args;
ceph_encode_filepath(&p, end, ino1, path1);
@@ -2072,7 +2106,7 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
mutex_lock(&session->s_mutex);
if (err < 0) {
- pr_err("mdsc_handle_reply got corrupt reply mds%d\n", mds);
+ pr_err("mdsc_handle_reply got corrupt reply mds%d(tid:%lld)\n", mds, tid);
ceph_msg_dump(msg);
goto out_err;
}
@@ -2092,7 +2126,8 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
mutex_lock(&req->r_fill_mutex);
err = ceph_fill_trace(mdsc->fsc->sb, req, req->r_session);
if (err == 0) {
- if (result == 0 && rinfo->dir_nr)
+ if (result == 0 && req->r_op != CEPH_MDS_OP_GETFILELOCK &&
+ rinfo->dir_nr)
ceph_readdir_prepopulate(req, req->r_session);
ceph_unreserve_caps(mdsc, &req->r_caps_reservation);
}
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index d66d63c7235..aabe563b54d 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -42,26 +42,37 @@ struct ceph_mds_reply_info_in {
};
/*
- * parsed info about an mds reply, including information about the
- * target inode and/or its parent directory and dentry, and directory
- * contents (for readdir results).
+ * parsed info about an mds reply, including information about
+ * either: 1) the target inode and/or its parent directory and dentry,
+ * and directory contents (for readdir results), or
+ * 2) the file range lock info (for fcntl F_GETLK results).
*/
struct ceph_mds_reply_info_parsed {
struct ceph_mds_reply_head *head;
+ /* trace */
struct ceph_mds_reply_info_in diri, targeti;
struct ceph_mds_reply_dirfrag *dirfrag;
char *dname;
u32 dname_len;
struct ceph_mds_reply_lease *dlease;
- struct ceph_mds_reply_dirfrag *dir_dir;
- int dir_nr;
- char **dir_dname;
- u32 *dir_dname_len;
- struct ceph_mds_reply_lease **dir_dlease;
- struct ceph_mds_reply_info_in *dir_in;
- u8 dir_complete, dir_end;
+ /* extra */
+ union {
+ /* for fcntl F_GETLK results */
+ struct ceph_filelock *filelock_reply;
+
+ /* for readdir results */
+ struct {
+ struct ceph_mds_reply_dirfrag *dir_dir;
+ int dir_nr;
+ char **dir_dname;
+ u32 *dir_dname_len;
+ struct ceph_mds_reply_lease **dir_dlease;
+ struct ceph_mds_reply_info_in *dir_in;
+ u8 dir_complete, dir_end;
+ };
+ };
/* encoded blob describing snapshot contexts for certain
operations (e.g., open) */
@@ -170,6 +181,8 @@ struct ceph_mds_request {
union ceph_mds_request_args r_args;
int r_fmode; /* file mode, if expecting cap */
+ uid_t r_uid;
+ gid_t r_gid;
/* for choosing which mds to send this request to */
int r_direct_mode;
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index 1886294e12f..7f01728a465 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -293,9 +293,7 @@ struct ceph_inode_info {
int i_rd_ref, i_rdcache_ref, i_wr_ref;
int i_wrbuffer_ref, i_wrbuffer_ref_head;
u32 i_shared_gen; /* increment each time we get FILE_SHARED */
- u32 i_rdcache_gen; /* we increment this each time we get
- FILE_CACHE. If it's non-zero, we
- _may_ have cached pages. */
+ u32 i_rdcache_gen; /* incremented each time we get FILE_CACHE. */
u32 i_rdcache_revoking; /* RDCACHE gen to async invalidate, if any */
struct list_head i_unsafe_writes; /* uncommitted sync writes */
diff --git a/fs/cifs/Kconfig b/fs/cifs/Kconfig
index 0ed213970ce..ee45648b0d1 100644
--- a/fs/cifs/Kconfig
+++ b/fs/cifs/Kconfig
@@ -4,6 +4,7 @@ config CIFS
select NLS
select CRYPTO
select CRYPTO_MD5
+ select CRYPTO_HMAC
select CRYPTO_ARC4
help
This is the client VFS module for the Common Internet File System
@@ -143,6 +144,13 @@ config CIFS_FSCACHE
to be cached locally on disk through the general filesystem cache
manager. If unsure, say N.
+config CIFS_ACL
+ bool "Provide CIFS ACL support (EXPERIMENTAL)"
+ depends on EXPERIMENTAL && CIFS_XATTR
+ help
+ Allows to fetch CIFS/NTFS ACL from the server. The DACL blob
+ is handed over to the application/caller.
+
config CIFS_EXPERIMENTAL
bool "CIFS Experimental Features (EXPERIMENTAL)"
depends on CIFS && EXPERIMENTAL
diff --git a/fs/cifs/Makefile b/fs/cifs/Makefile
index adefa60a9bd..43b19dd3919 100644
--- a/fs/cifs/Makefile
+++ b/fs/cifs/Makefile
@@ -6,7 +6,9 @@ obj-$(CONFIG_CIFS) += cifs.o
cifs-y := cifsfs.o cifssmb.o cifs_debug.o connect.o dir.o file.o inode.o \
link.o misc.o netmisc.o smbdes.o smbencrypt.o transport.o asn1.o \
md4.o md5.o cifs_unicode.o nterr.o xattr.o cifsencrypt.o \
- readdir.o ioctl.o sess.o export.o cifsacl.o
+ readdir.o ioctl.o sess.o export.o
+
+cifs-$(CONFIG_CIFS_ACL) += cifsacl.o
cifs-$(CONFIG_CIFS_UPCALL) += cifs_spnego.o
diff --git a/fs/cifs/README b/fs/cifs/README
index ee68d103654..46af99ab361 100644
--- a/fs/cifs/README
+++ b/fs/cifs/README
@@ -337,6 +337,15 @@ A partial list of the supported mount options follows:
wsize default write size (default 57344)
maximum wsize currently allowed by CIFS is 57344 (fourteen
4096 byte pages)
+ actimeo=n attribute cache timeout in seconds (default 1 second).
+ After this timeout, the cifs client requests fresh attribute
+ information from the server. This option allows to tune the
+ attribute cache timeout to suit the workload needs. Shorter
+ timeouts mean better the cache coherency, but increased number
+ of calls to the server. Longer timeouts mean reduced number
+ of calls to the server at the expense of less stricter cache
+ coherency checks (i.e. incorrect attribute cache for a short
+ period of time).
rw mount the network share read-write (note that the
server may still consider the share read-only)
ro mount network share read-only
diff --git a/fs/cifs/TODO b/fs/cifs/TODO
index 5aff46c61e5..355abcdcda9 100644
--- a/fs/cifs/TODO
+++ b/fs/cifs/TODO
@@ -81,7 +81,7 @@ u) DOS attrs - returned as pseudo-xattr in Samba format (check VFAT and NTFS for
v) mount check for unmatched uids
-w) Add support for new vfs entry points for setlease and fallocate
+w) Add support for new vfs entry point for fallocate
x) Fix Samba 3 server to handle Linux kernel aio so dbench with lots of
processes can proceed better in parallel (on the server)
diff --git a/fs/cifs/cifs_fs_sb.h b/fs/cifs/cifs_fs_sb.h
index 525ba59a410..7852cd67705 100644
--- a/fs/cifs/cifs_fs_sb.h
+++ b/fs/cifs/cifs_fs_sb.h
@@ -15,7 +15,7 @@
* the GNU Lesser General Public License for more details.
*
*/
-#include <linux/radix-tree.h>
+#include <linux/rbtree.h>
#ifndef _CIFS_FS_SB_H
#define _CIFS_FS_SB_H
@@ -42,12 +42,13 @@
#define CIFS_MOUNT_MULTIUSER 0x20000 /* multiuser mount */
struct cifs_sb_info {
- struct radix_tree_root tlink_tree;
-#define CIFS_TLINK_MASTER_TAG 0 /* is "master" (mount) tcon */
+ struct rb_root tlink_tree;
spinlock_t tlink_tree_lock;
+ struct tcon_link *master_tlink;
struct nls_table *local_nls;
unsigned int rsize;
unsigned int wsize;
+ unsigned long actimeo; /* attribute cache timeout (jiffies) */
atomic_t active;
uid_t mnt_uid;
gid_t mnt_gid;
diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c
index c9b4792ae82..a437ec391a0 100644
--- a/fs/cifs/cifsacl.c
+++ b/fs/cifs/cifsacl.c
@@ -30,8 +30,6 @@
#include "cifs_debug.h"
-#ifdef CONFIG_CIFS_EXPERIMENTAL
-
static struct cifs_wksid wksidarr[NUM_WK_SIDS] = {
{{1, 0, {0, 0, 0, 0, 0, 0}, {0, 0, 0, 0, 0} }, "null user"},
{{1, 1, {0, 0, 0, 0, 0, 1}, {0, 0, 0, 0, 0} }, "nobody"},
@@ -560,7 +558,7 @@ static struct cifs_ntsd *get_cifs_acl_by_fid(struct cifs_sb_info *cifs_sb,
struct tcon_link *tlink = cifs_sb_tlink(cifs_sb);
if (IS_ERR(tlink))
- return NULL;
+ return ERR_CAST(tlink);
xid = GetXid();
rc = CIFSSMBGetCIFSACL(xid, tlink_tcon(tlink), fid, &pntsd, pacllen);
@@ -568,7 +566,9 @@ static struct cifs_ntsd *get_cifs_acl_by_fid(struct cifs_sb_info *cifs_sb,
cifs_put_tlink(tlink);
- cFYI(1, "GetCIFSACL rc = %d ACL len %d", rc, *pacllen);
+ cFYI(1, "%s: rc = %d ACL len %d", __func__, rc, *pacllen);
+ if (rc)
+ return ERR_PTR(rc);
return pntsd;
}
@@ -583,7 +583,7 @@ static struct cifs_ntsd *get_cifs_acl_by_path(struct cifs_sb_info *cifs_sb,
struct tcon_link *tlink = cifs_sb_tlink(cifs_sb);
if (IS_ERR(tlink))
- return NULL;
+ return ERR_CAST(tlink);
tcon = tlink_tcon(tlink);
xid = GetXid();
@@ -591,23 +591,22 @@ static struct cifs_ntsd *get_cifs_acl_by_path(struct cifs_sb_info *cifs_sb,
rc = CIFSSMBOpen(xid, tcon, path, FILE_OPEN, READ_CONTROL, 0,
&fid, &oplock, NULL, cifs_sb->local_nls,
cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
- if (rc) {
- cERROR(1, "Unable to open file to get ACL");
- goto out;
+ if (!rc) {
+ rc = CIFSSMBGetCIFSACL(xid, tcon, fid, &pntsd, pacllen);
+ CIFSSMBClose(xid, tcon, fid);
}
- rc = CIFSSMBGetCIFSACL(xid, tcon, fid, &pntsd, pacllen);
- cFYI(1, "GetCIFSACL rc = %d ACL len %d", rc, *pacllen);
-
- CIFSSMBClose(xid, tcon, fid);
- out:
cifs_put_tlink(tlink);
FreeXid(xid);
+
+ cFYI(1, "%s: rc = %d ACL len %d", __func__, rc, *pacllen);
+ if (rc)
+ return ERR_PTR(rc);
return pntsd;
}
/* Retrieve an ACL from the server */
-static struct cifs_ntsd *get_cifs_acl(struct cifs_sb_info *cifs_sb,
+struct cifs_ntsd *get_cifs_acl(struct cifs_sb_info *cifs_sb,
struct inode *inode, const char *path,
u32 *pacllen)
{
@@ -695,7 +694,7 @@ static int set_cifs_acl(struct cifs_ntsd *pnntsd, __u32 acllen,
}
/* Translate the CIFS ACL (simlar to NTFS ACL) for a file into mode bits */
-void
+int
cifs_acl_to_fattr(struct cifs_sb_info *cifs_sb, struct cifs_fattr *fattr,
struct inode *inode, const char *path, const __u16 *pfid)
{
@@ -711,17 +710,21 @@ cifs_acl_to_fattr(struct cifs_sb_info *cifs_sb, struct cifs_fattr *fattr,
pntsd = get_cifs_acl(cifs_sb, inode, path, &acllen);
/* if we can retrieve the ACL, now parse Access Control Entries, ACEs */
- if (pntsd)
+ if (IS_ERR(pntsd)) {
+ rc = PTR_ERR(pntsd);
+ cERROR(1, "%s: error %d getting sec desc", __func__, rc);
+ } else {
rc = parse_sec_desc(pntsd, acllen, fattr);
- if (rc)
- cFYI(1, "parse sec desc failed rc = %d", rc);
+ kfree(pntsd);
+ if (rc)
+ cERROR(1, "parse sec desc failed rc = %d", rc);
+ }
- kfree(pntsd);
- return;
+ return rc;
}
/* Convert mode bits to an ACL so we can update the ACL on the server */
-int mode_to_acl(struct inode *inode, const char *path, __u64 nmode)
+int mode_to_cifs_acl(struct inode *inode, const char *path, __u64 nmode)
{
int rc = 0;
__u32 secdesclen = 0;
@@ -736,7 +739,10 @@ int mode_to_acl(struct inode *inode, const char *path, __u64 nmode)
/* Add three ACEs for owner, group, everyone getting rid of
other ACEs as chmod disables ACEs and set the security descriptor */
- if (pntsd) {
+ if (IS_ERR(pntsd)) {
+ rc = PTR_ERR(pntsd);
+ cERROR(1, "%s: error %d getting sec desc", __func__, rc);
+ } else {
/* allocate memory for the smb header,
set security descriptor request security descriptor
parameters, and secuirty descriptor itself */
@@ -766,4 +772,3 @@ int mode_to_acl(struct inode *inode, const char *path, __u64 nmode)
return rc;
}
-#endif /* CONFIG_CIFS_EXPERIMENTAL */
diff --git a/fs/cifs/cifsacl.h b/fs/cifs/cifsacl.h
index 6c8096cf515..c4ae7d03656 100644
--- a/fs/cifs/cifsacl.h
+++ b/fs/cifs/cifsacl.h
@@ -74,11 +74,7 @@ struct cifs_wksid {
char sidname[SIDNAMELENGTH];
} __attribute__((packed));
-#ifdef CONFIG_CIFS_EXPERIMENTAL
-
extern int match_sid(struct cifs_sid *);
extern int compare_sids(const struct cifs_sid *, const struct cifs_sid *);
-#endif /* CONFIG_CIFS_EXPERIMENTAL */
-
#endif /* _CIFSACL_H */
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 75c4eaa7958..3936aa7f2c2 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -116,7 +116,7 @@ cifs_read_super(struct super_block *sb, void *data,
return -ENOMEM;
spin_lock_init(&cifs_sb->tlink_tree_lock);
- INIT_RADIX_TREE(&cifs_sb->tlink_tree, GFP_KERNEL);
+ cifs_sb->tlink_tree = RB_ROOT;
rc = bdi_setup_and_register(&cifs_sb->bdi, "cifs", BDI_CAP_MAP_COPY);
if (rc) {
@@ -321,8 +321,7 @@ cifs_alloc_inode(struct super_block *sb)
/* Until the file is open and we have gotten oplock
info back from the server, can not assume caching of
file data or metadata */
- cifs_inode->clientCanCacheRead = false;
- cifs_inode->clientCanCacheAll = false;
+ cifs_set_oplock_level(cifs_inode, 0);
cifs_inode->delete_pending = false;
cifs_inode->invalid_mapping = false;
cifs_inode->vfs_inode.i_blkbits = 14; /* 2**14 = CIFS_MAX_MSGSIZE */
@@ -459,9 +458,13 @@ cifs_show_options(struct seq_file *s, struct vfsmount *m)
seq_printf(s, ",acl");
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MF_SYMLINKS)
seq_printf(s, ",mfsymlinks");
+ if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_FSCACHE)
+ seq_printf(s, ",fsc");
seq_printf(s, ",rsize=%d", cifs_sb->rsize);
seq_printf(s, ",wsize=%d", cifs_sb->wsize);
+ /* convert actimeo and display it in seconds */
+ seq_printf(s, ",actimeo=%lu", cifs_sb->actimeo / HZ);
return 0;
}
@@ -934,7 +937,6 @@ init_cifs(void)
GlobalCurrentXid = 0;
GlobalTotalActiveXid = 0;
GlobalMaxActiveXid = 0;
- memset(Local_System_Name, 0, 15);
spin_lock_init(&cifs_tcp_ses_lock);
spin_lock_init(&cifs_file_list_lock);
spin_lock_init(&GlobalMid_Lock);
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index f259e4d7612..7136c0c3e2f 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -45,6 +45,16 @@
#define CIFS_MIN_RCV_POOL 4
/*
+ * default attribute cache timeout (jiffies)
+ */
+#define CIFS_DEF_ACTIMEO (1 * HZ)
+
+/*
+ * max attribute cache timeout (jiffies) - 2^30
+ */
+#define CIFS_MAX_ACTIMEO (1 << 30)
+
+/*
* MAX_REQ is the maximum number of requests that WE will send
* on one socket concurrently. It also matches the most common
* value of max multiplex returned by servers. We may
@@ -336,7 +346,8 @@ struct cifsTconInfo {
* "get" on the container.
*/
struct tcon_link {
- unsigned long tl_index;
+ struct rb_node tl_rbnode;
+ uid_t tl_uid;
unsigned long tl_flags;
#define TCON_LINK_MASTER 0
#define TCON_LINK_PENDING 1
@@ -745,8 +756,6 @@ GLOBAL_EXTERN unsigned int GlobalTotalActiveXid; /* prot by GlobalMid_Sem */
GLOBAL_EXTERN unsigned int GlobalMaxActiveXid; /* prot by GlobalMid_Sem */
GLOBAL_EXTERN spinlock_t GlobalMid_Lock; /* protects above & list operations */
/* on midQ entries */
-GLOBAL_EXTERN char Local_System_Name[15];
-
/*
* Global counters, updated atomically
*/
diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index edb6d90efdf..e6d1481b16c 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -54,7 +54,8 @@ do { \
__func__, curr_xid, (int)rc); \
} while (0)
extern char *build_path_from_dentry(struct dentry *);
-extern char *cifs_build_path_to_root(struct cifs_sb_info *cifs_sb);
+extern char *cifs_build_path_to_root(struct cifs_sb_info *cifs_sb,
+ struct cifsTconInfo *tcon);
extern char *build_wildcard_path_from_dentry(struct dentry *direntry);
extern char *cifs_compose_mount_options(const char *sb_mountdata,
const char *fullpath, const struct dfs_info3_param *ref,
@@ -79,9 +80,7 @@ extern bool is_valid_oplock_break(struct smb_hdr *smb,
struct TCP_Server_Info *);
extern bool is_size_safe_to_change(struct cifsInodeInfo *, __u64 eof);
extern struct cifsFileInfo *find_writable_file(struct cifsInodeInfo *, bool);
-#ifdef CONFIG_CIFS_EXPERIMENTAL
extern struct cifsFileInfo *find_readable_file(struct cifsInodeInfo *, bool);
-#endif
extern unsigned int smbCalcSize(struct smb_hdr *ptr);
extern unsigned int smbCalcSize_LE(struct smb_hdr *ptr);
extern int decode_negTokenInit(unsigned char *security_blob, int length,
@@ -104,6 +103,7 @@ extern struct timespec cifs_NTtimeToUnix(__le64 utc_nanoseconds_since_1601);
extern u64 cifs_UnixTimeToNT(struct timespec);
extern struct timespec cnvrtDosUnixTm(__le16 le_date, __le16 le_time,
int offset);
+extern void cifs_set_oplock_level(struct cifsInodeInfo *cinode, __u32 oplock);
extern struct cifsFileInfo *cifs_new_fileinfo(__u16 fileHandle,
struct file *file, struct tcon_link *tlink,
@@ -129,10 +129,12 @@ extern int cifs_get_file_info_unix(struct file *filp);
extern int cifs_get_inode_info_unix(struct inode **pinode,
const unsigned char *search_path,
struct super_block *sb, int xid);
-extern void cifs_acl_to_fattr(struct cifs_sb_info *cifs_sb,
+extern int cifs_acl_to_fattr(struct cifs_sb_info *cifs_sb,
struct cifs_fattr *fattr, struct inode *inode,
const char *path, const __u16 *pfid);
-extern int mode_to_acl(struct inode *inode, const char *path, __u64);
+extern int mode_to_cifs_acl(struct inode *inode, const char *path, __u64);
+extern struct cifs_ntsd *get_cifs_acl(struct cifs_sb_info *, struct inode *,
+ const char *, u32 *);
extern int cifs_mount(struct super_block *, struct cifs_sb_info *, char *,
const char *);
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index 2f2632b6df5..67acfb3acad 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -2478,95 +2478,6 @@ querySymLinkRetry:
}
#ifdef CONFIG_CIFS_EXPERIMENTAL
-/* Initialize NT TRANSACT SMB into small smb request buffer.
- This assumes that all NT TRANSACTS that we init here have
- total parm and data under about 400 bytes (to fit in small cifs
- buffer size), which is the case so far, it easily fits. NB:
- Setup words themselves and ByteCount
- MaxSetupCount (size of returned setup area) and
- MaxParameterCount (returned parms size) must be set by caller */
-static int
-smb_init_nttransact(const __u16 sub_command, const int setup_count,
- const int parm_len, struct cifsTconInfo *tcon,
- void **ret_buf)
-{
- int rc;
- __u32 temp_offset;
- struct smb_com_ntransact_req *pSMB;
-
- rc = small_smb_init(SMB_COM_NT_TRANSACT, 19 + setup_count, tcon,
- (void **)&pSMB);
- if (rc)
- return rc;
- *ret_buf = (void *)pSMB;
- pSMB->Reserved = 0;
- pSMB->TotalParameterCount = cpu_to_le32(parm_len);
- pSMB->TotalDataCount = 0;
- pSMB->MaxDataCount = cpu_to_le32((tcon->ses->server->maxBuf -
- MAX_CIFS_HDR_SIZE) & 0xFFFFFF00);
- pSMB->ParameterCount = pSMB->TotalParameterCount;
- pSMB->DataCount = pSMB->TotalDataCount;
- temp_offset = offsetof(struct smb_com_ntransact_req, Parms) +
- (setup_count * 2) - 4 /* for rfc1001 length itself */;
- pSMB->ParameterOffset = cpu_to_le32(temp_offset);
- pSMB->DataOffset = cpu_to_le32(temp_offset + parm_len);
- pSMB->SetupCount = setup_count; /* no need to le convert byte fields */
- pSMB->SubCommand = cpu_to_le16(sub_command);
- return 0;
-}
-
-static int
-validate_ntransact(char *buf, char **ppparm, char **ppdata,
- __u32 *pparmlen, __u32 *pdatalen)
-{
- char *end_of_smb;
- __u32 data_count, data_offset, parm_count, parm_offset;
- struct smb_com_ntransact_rsp *pSMBr;
-
- *pdatalen = 0;
- *pparmlen = 0;
-
- if (buf == NULL)
- return -EINVAL;
-
- pSMBr = (struct smb_com_ntransact_rsp *)buf;
-
- /* ByteCount was converted from little endian in SendReceive */
- end_of_smb = 2 /* sizeof byte count */ + pSMBr->ByteCount +
- (char *)&pSMBr->ByteCount;
-
- data_offset = le32_to_cpu(pSMBr->DataOffset);
- data_count = le32_to_cpu(pSMBr->DataCount);
- parm_offset = le32_to_cpu(pSMBr->ParameterOffset);
- parm_count = le32_to_cpu(pSMBr->ParameterCount);
-
- *ppparm = (char *)&pSMBr->hdr.Protocol + parm_offset;
- *ppdata = (char *)&pSMBr->hdr.Protocol + data_offset;
-
- /* should we also check that parm and data areas do not overlap? */
- if (*ppparm > end_of_smb) {
- cFYI(1, "parms start after end of smb");
- return -EINVAL;
- } else if (parm_count + *ppparm > end_of_smb) {
- cFYI(1, "parm end after end of smb");
- return -EINVAL;
- } else if (*ppdata > end_of_smb) {
- cFYI(1, "data starts after end of smb");
- return -EINVAL;
- } else if (data_count + *ppdata > end_of_smb) {
- cFYI(1, "data %p + count %d (%p) past smb end %p start %p",
- *ppdata, data_count, (data_count + *ppdata),
- end_of_smb, pSMBr);
- return -EINVAL;
- } else if (parm_count + data_count > pSMBr->ByteCount) {
- cFYI(1, "parm count and data count larger than SMB");
- return -EINVAL;
- }
- *pdatalen = data_count;
- *pparmlen = parm_count;
- return 0;
-}
-
int
CIFSSMBQueryReparseLinkInfo(const int xid, struct cifsTconInfo *tcon,
const unsigned char *searchName,
@@ -3056,7 +2967,97 @@ GetExtAttrOut:
#endif /* CONFIG_POSIX */
-#ifdef CONFIG_CIFS_EXPERIMENTAL
+#ifdef CONFIG_CIFS_ACL
+/*
+ * Initialize NT TRANSACT SMB into small smb request buffer. This assumes that
+ * all NT TRANSACTS that we init here have total parm and data under about 400
+ * bytes (to fit in small cifs buffer size), which is the case so far, it
+ * easily fits. NB: Setup words themselves and ByteCount MaxSetupCount (size of
+ * returned setup area) and MaxParameterCount (returned parms size) must be set
+ * by caller
+ */
+static int
+smb_init_nttransact(const __u16 sub_command, const int setup_count,
+ const int parm_len, struct cifsTconInfo *tcon,
+ void **ret_buf)
+{
+ int rc;
+ __u32 temp_offset;
+ struct smb_com_ntransact_req *pSMB;
+
+ rc = small_smb_init(SMB_COM_NT_TRANSACT, 19 + setup_count, tcon,
+ (void **)&pSMB);
+ if (rc)
+ return rc;
+ *ret_buf = (void *)pSMB;
+ pSMB->Reserved = 0;
+ pSMB->TotalParameterCount = cpu_to_le32(parm_len);
+ pSMB->TotalDataCount = 0;
+ pSMB->MaxDataCount = cpu_to_le32((tcon->ses->server->maxBuf -
+ MAX_CIFS_HDR_SIZE) & 0xFFFFFF00);
+ pSMB->ParameterCount = pSMB->TotalParameterCount;
+ pSMB->DataCount = pSMB->TotalDataCount;
+ temp_offset = offsetof(struct smb_com_ntransact_req, Parms) +
+ (setup_count * 2) - 4 /* for rfc1001 length itself */;
+ pSMB->ParameterOffset = cpu_to_le32(temp_offset);
+ pSMB->DataOffset = cpu_to_le32(temp_offset + parm_len);
+ pSMB->SetupCount = setup_count; /* no need to le convert byte fields */
+ pSMB->SubCommand = cpu_to_le16(sub_command);
+ return 0;
+}
+
+static int
+validate_ntransact(char *buf, char **ppparm, char **ppdata,
+ __u32 *pparmlen, __u32 *pdatalen)
+{
+ char *end_of_smb;
+ __u32 data_count, data_offset, parm_count, parm_offset;
+ struct smb_com_ntransact_rsp *pSMBr;
+
+ *pdatalen = 0;
+ *pparmlen = 0;
+
+ if (buf == NULL)
+ return -EINVAL;
+
+ pSMBr = (struct smb_com_ntransact_rsp *)buf;
+
+ /* ByteCount was converted from little endian in SendReceive */
+ end_of_smb = 2 /* sizeof byte count */ + pSMBr->ByteCount +
+ (char *)&pSMBr->ByteCount;
+
+ data_offset = le32_to_cpu(pSMBr->DataOffset);
+ data_count = le32_to_cpu(pSMBr->DataCount);
+ parm_offset = le32_to_cpu(pSMBr->ParameterOffset);
+ parm_count = le32_to_cpu(pSMBr->ParameterCount);
+
+ *ppparm = (char *)&pSMBr->hdr.Protocol + parm_offset;
+ *ppdata = (char *)&pSMBr->hdr.Protocol + data_offset;
+
+ /* should we also check that parm and data areas do not overlap? */
+ if (*ppparm > end_of_smb) {
+ cFYI(1, "parms start after end of smb");
+ return -EINVAL;
+ } else if (parm_count + *ppparm > end_of_smb) {
+ cFYI(1, "parm end after end of smb");
+ return -EINVAL;
+ } else if (*ppdata > end_of_smb) {
+ cFYI(1, "data starts after end of smb");
+ return -EINVAL;
+ } else if (data_count + *ppdata > end_of_smb) {
+ cFYI(1, "data %p + count %d (%p) past smb end %p start %p",
+ *ppdata, data_count, (data_count + *ppdata),
+ end_of_smb, pSMBr);
+ return -EINVAL;
+ } else if (parm_count + data_count > pSMBr->ByteCount) {
+ cFYI(1, "parm count and data count larger than SMB");
+ return -EINVAL;
+ }
+ *pdatalen = data_count;
+ *pparmlen = parm_count;
+ return 0;
+}
+
/* Get Security Descriptor (by handle) from remote server for a file or dir */
int
CIFSSMBGetCIFSACL(const int xid, struct cifsTconInfo *tcon, __u16 fid,
@@ -3214,7 +3215,7 @@ setCifsAclRetry:
return (rc);
}
-#endif /* CONFIG_CIFS_EXPERIMENTAL */
+#endif /* CONFIG_CIFS_ACL */
/* Legacy Query Path Information call for lookup to old servers such
as Win9x/WinME */
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 9eb327defa1..cc1a8604a79 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -105,6 +105,7 @@ struct smb_vol {
unsigned int wsize;
bool sockopt_tcp_nodelay:1;
unsigned short int port;
+ unsigned long actimeo; /* attribute cache timeout (jiffies) */
char *prepath;
struct sockaddr_storage srcaddr; /* allow binding to a local IP */
struct nls_table *local_nls;
@@ -116,6 +117,7 @@ struct smb_vol {
static int ipv4_connect(struct TCP_Server_Info *server);
static int ipv6_connect(struct TCP_Server_Info *server);
+static void tlink_rb_insert(struct rb_root *root, struct tcon_link *new_tlink);
static void cifs_prune_tlinks(struct work_struct *work);
/*
@@ -805,23 +807,20 @@ cifs_parse_mount_options(char *options, const char *devname,
short int override_gid = -1;
bool uid_specified = false;
bool gid_specified = false;
+ char *nodename = utsname()->nodename;
separator[0] = ',';
separator[1] = 0;
- if (Local_System_Name[0] != 0)
- memcpy(vol->source_rfc1001_name, Local_System_Name, 15);
- else {
- char *nodename = utsname()->nodename;
- int n = strnlen(nodename, 15);
- memset(vol->source_rfc1001_name, 0x20, 15);
- for (i = 0; i < n; i++) {
- /* does not have to be perfect mapping since field is
- informational, only used for servers that do not support
- port 445 and it can be overridden at mount time */
- vol->source_rfc1001_name[i] = toupper(nodename[i]);
- }
- }
+ /*
+ * does not have to be perfect mapping since field is
+ * informational, only used for servers that do not support
+ * port 445 and it can be overridden at mount time
+ */
+ memset(vol->source_rfc1001_name, 0x20, 15);
+ for (i = 0; i < strnlen(nodename, 15); i++)
+ vol->source_rfc1001_name[i] = toupper(nodename[i]);
+
vol->source_rfc1001_name[15] = 0;
/* null target name indicates to use *SMBSERVR default called name
if we end up sending RFC1001 session initialize */
@@ -839,6 +838,8 @@ cifs_parse_mount_options(char *options, const char *devname,
/* default to using server inode numbers where available */
vol->server_ino = 1;
+ vol->actimeo = CIFS_DEF_ACTIMEO;
+
if (!options)
return 1;
@@ -1213,6 +1214,16 @@ cifs_parse_mount_options(char *options, const char *devname,
printk(KERN_WARNING "CIFS: server net"
"biosname longer than 15 truncated.\n");
}
+ } else if (strnicmp(data, "actimeo", 7) == 0) {
+ if (value && *value) {
+ vol->actimeo = HZ * simple_strtoul(value,
+ &value, 0);
+ if (vol->actimeo > CIFS_MAX_ACTIMEO) {
+ cERROR(1, "CIFS: attribute cache"
+ "timeout too large");
+ return 1;
+ }
+ }
} else if (strnicmp(data, "credentials", 4) == 0) {
/* ignore */
} else if (strnicmp(data, "version", 3) == 0) {
@@ -1351,6 +1362,11 @@ cifs_parse_mount_options(char *options, const char *devname,
"supported. Instead set "
"/proc/fs/cifs/LookupCacheEnabled to 0\n");
} else if (strnicmp(data, "fsc", 3) == 0) {
+#ifndef CONFIG_CIFS_FSCACHE
+ cERROR(1, "FS-Cache support needs CONFIG_CIFS_FSCACHE"
+ "kernel config option set");
+ return 1;
+#endif
vol->fsc = true;
} else if (strnicmp(data, "mfsymlinks", 10) == 0) {
vol->mfsymlinks = true;
@@ -2565,6 +2581,8 @@ static void setup_cifs_sb(struct smb_vol *pvolume_info,
cFYI(1, "file mode: 0x%x dir mode: 0x%x",
cifs_sb->mnt_file_mode, cifs_sb->mnt_dir_mode);
+ cifs_sb->actimeo = pvolume_info->actimeo;
+
if (pvolume_info->noperm)
cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_NO_PERM;
if (pvolume_info->setuids)
@@ -2815,13 +2833,13 @@ remote_path_check:
/* check if a whole path (including prepath) is not remote */
if (!rc && cifs_sb->prepathlen && tcon) {
/* build_path_to_root works only when we have a valid tcon */
- full_path = cifs_build_path_to_root(cifs_sb);
+ full_path = cifs_build_path_to_root(cifs_sb, tcon);
if (full_path == NULL) {
rc = -ENOMEM;
goto mount_fail_check;
}
rc = is_path_accessible(xid, tcon, cifs_sb, full_path);
- if (rc != -EREMOTE) {
+ if (rc != 0 && rc != -EREMOTE) {
kfree(full_path);
goto mount_fail_check;
}
@@ -2900,24 +2918,16 @@ remote_path_check:
goto mount_fail_check;
}
- tlink->tl_index = pSesInfo->linux_uid;
+ tlink->tl_uid = pSesInfo->linux_uid;
tlink->tl_tcon = tcon;
tlink->tl_time = jiffies;
set_bit(TCON_LINK_MASTER, &tlink->tl_flags);
set_bit(TCON_LINK_IN_TREE, &tlink->tl_flags);
- rc = radix_tree_preload(GFP_KERNEL);
- if (rc == -ENOMEM) {
- kfree(tlink);
- goto mount_fail_check;
- }
-
+ cifs_sb->master_tlink = tlink;
spin_lock(&cifs_sb->tlink_tree_lock);
- radix_tree_insert(&cifs_sb->tlink_tree, pSesInfo->linux_uid, tlink);
- radix_tree_tag_set(&cifs_sb->tlink_tree, pSesInfo->linux_uid,
- CIFS_TLINK_MASTER_TAG);
+ tlink_rb_insert(&cifs_sb->tlink_tree, tlink);
spin_unlock(&cifs_sb->tlink_tree_lock);
- radix_tree_preload_end();
queue_delayed_work(system_nrt_wq, &cifs_sb->prune_tlinks,
TLINK_IDLE_EXPIRE);
@@ -3107,32 +3117,25 @@ CIFSTCon(unsigned int xid, struct cifsSesInfo *ses,
int
cifs_umount(struct super_block *sb, struct cifs_sb_info *cifs_sb)
{
- int i, ret;
+ struct rb_root *root = &cifs_sb->tlink_tree;
+ struct rb_node *node;
+ struct tcon_link *tlink;
char *tmp;
- struct tcon_link *tlink[8];
- unsigned long index = 0;
cancel_delayed_work_sync(&cifs_sb->prune_tlinks);
- do {
- spin_lock(&cifs_sb->tlink_tree_lock);
- ret = radix_tree_gang_lookup(&cifs_sb->tlink_tree,
- (void **)tlink, index,
- ARRAY_SIZE(tlink));
- /* increment index for next pass */
- if (ret > 0)
- index = tlink[ret - 1]->tl_index + 1;
- for (i = 0; i < ret; i++) {
- cifs_get_tlink(tlink[i]);
- clear_bit(TCON_LINK_IN_TREE, &tlink[i]->tl_flags);
- radix_tree_delete(&cifs_sb->tlink_tree,
- tlink[i]->tl_index);
- }
- spin_unlock(&cifs_sb->tlink_tree_lock);
+ spin_lock(&cifs_sb->tlink_tree_lock);
+ while ((node = rb_first(root))) {
+ tlink = rb_entry(node, struct tcon_link, tl_rbnode);
+ cifs_get_tlink(tlink);
+ clear_bit(TCON_LINK_IN_TREE, &tlink->tl_flags);
+ rb_erase(node, root);
- for (i = 0; i < ret; i++)
- cifs_put_tlink(tlink[i]);
- } while (ret != 0);
+ spin_unlock(&cifs_sb->tlink_tree_lock);
+ cifs_put_tlink(tlink);
+ spin_lock(&cifs_sb->tlink_tree_lock);
+ }
+ spin_unlock(&cifs_sb->tlink_tree_lock);
tmp = cifs_sb->prepath;
cifs_sb->prepathlen = 0;
@@ -3271,22 +3274,10 @@ out:
return tcon;
}
-static struct tcon_link *
+static inline struct tcon_link *
cifs_sb_master_tlink(struct cifs_sb_info *cifs_sb)
{
- struct tcon_link *tlink;
- unsigned int ret;
-
- spin_lock(&cifs_sb->tlink_tree_lock);
- ret = radix_tree_gang_lookup_tag(&cifs_sb->tlink_tree, (void **)&tlink,
- 0, 1, CIFS_TLINK_MASTER_TAG);
- spin_unlock(&cifs_sb->tlink_tree_lock);
-
- /* the master tcon should always be present */
- if (ret == 0)
- BUG();
-
- return tlink;
+ return cifs_sb->master_tlink;
}
struct cifsTconInfo *
@@ -3302,6 +3293,47 @@ cifs_sb_tcon_pending_wait(void *unused)
return signal_pending(current) ? -ERESTARTSYS : 0;
}
+/* find and return a tlink with given uid */
+static struct tcon_link *
+tlink_rb_search(struct rb_root *root, uid_t uid)
+{
+ struct rb_node *node = root->rb_node;
+ struct tcon_link *tlink;
+
+ while (node) {
+ tlink = rb_entry(node, struct tcon_link, tl_rbnode);
+
+ if (tlink->tl_uid > uid)
+ node = node->rb_left;
+ else if (tlink->tl_uid < uid)
+ node = node->rb_right;
+ else
+ return tlink;
+ }
+ return NULL;
+}
+
+/* insert a tcon_link into the tree */
+static void
+tlink_rb_insert(struct rb_root *root, struct tcon_link *new_tlink)
+{
+ struct rb_node **new = &(root->rb_node), *parent = NULL;
+ struct tcon_link *tlink;
+
+ while (*new) {
+ tlink = rb_entry(*new, struct tcon_link, tl_rbnode);
+ parent = *new;
+
+ if (tlink->tl_uid > new_tlink->tl_uid)
+ new = &((*new)->rb_left);
+ else
+ new = &((*new)->rb_right);
+ }
+
+ rb_link_node(&new_tlink->tl_rbnode, parent, new);
+ rb_insert_color(&new_tlink->tl_rbnode, root);
+}
+
/*
* Find or construct an appropriate tcon given a cifs_sb and the fsuid of the
* current task.
@@ -3309,7 +3341,7 @@ cifs_sb_tcon_pending_wait(void *unused)
* If the superblock doesn't refer to a multiuser mount, then just return
* the master tcon for the mount.
*
- * First, search the radix tree for an existing tcon for this fsuid. If one
+ * First, search the rbtree for an existing tcon for this fsuid. If one
* exists, then check to see if it's pending construction. If it is then wait
* for construction to complete. Once it's no longer pending, check to see if
* it failed and either return an error or retry construction, depending on
@@ -3322,14 +3354,14 @@ struct tcon_link *
cifs_sb_tlink(struct cifs_sb_info *cifs_sb)
{
int ret;
- unsigned long fsuid = (unsigned long) current_fsuid();
+ uid_t fsuid = current_fsuid();
struct tcon_link *tlink, *newtlink;
if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MULTIUSER))
return cifs_get_tlink(cifs_sb_master_tlink(cifs_sb));
spin_lock(&cifs_sb->tlink_tree_lock);
- tlink = radix_tree_lookup(&cifs_sb->tlink_tree, fsuid);
+ tlink = tlink_rb_search(&cifs_sb->tlink_tree, fsuid);
if (tlink)
cifs_get_tlink(tlink);
spin_unlock(&cifs_sb->tlink_tree_lock);
@@ -3338,36 +3370,24 @@ cifs_sb_tlink(struct cifs_sb_info *cifs_sb)
newtlink = kzalloc(sizeof(*tlink), GFP_KERNEL);
if (newtlink == NULL)
return ERR_PTR(-ENOMEM);
- newtlink->tl_index = fsuid;
+ newtlink->tl_uid = fsuid;
newtlink->tl_tcon = ERR_PTR(-EACCES);
set_bit(TCON_LINK_PENDING, &newtlink->tl_flags);
set_bit(TCON_LINK_IN_TREE, &newtlink->tl_flags);
cifs_get_tlink(newtlink);
- ret = radix_tree_preload(GFP_KERNEL);
- if (ret != 0) {
- kfree(newtlink);
- return ERR_PTR(ret);
- }
-
spin_lock(&cifs_sb->tlink_tree_lock);
/* was one inserted after previous search? */
- tlink = radix_tree_lookup(&cifs_sb->tlink_tree, fsuid);
+ tlink = tlink_rb_search(&cifs_sb->tlink_tree, fsuid);
if (tlink) {
cifs_get_tlink(tlink);
spin_unlock(&cifs_sb->tlink_tree_lock);
- radix_tree_preload_end();
kfree(newtlink);
goto wait_for_construction;
}
- ret = radix_tree_insert(&cifs_sb->tlink_tree, fsuid, newtlink);
- spin_unlock(&cifs_sb->tlink_tree_lock);
- radix_tree_preload_end();
- if (ret) {
- kfree(newtlink);
- return ERR_PTR(ret);
- }
tlink = newtlink;
+ tlink_rb_insert(&cifs_sb->tlink_tree, tlink);
+ spin_unlock(&cifs_sb->tlink_tree_lock);
} else {
wait_for_construction:
ret = wait_on_bit(&tlink->tl_flags, TCON_LINK_PENDING,
@@ -3413,39 +3433,39 @@ cifs_prune_tlinks(struct work_struct *work)
{
struct cifs_sb_info *cifs_sb = container_of(work, struct cifs_sb_info,
prune_tlinks.work);
- struct tcon_link *tlink[8];
- unsigned long now = jiffies;
- unsigned long index = 0;
- int i, ret;
+ struct rb_root *root = &cifs_sb->tlink_tree;
+ struct rb_node *node = rb_first(root);
+ struct rb_node *tmp;
+ struct tcon_link *tlink;
- do {
- spin_lock(&cifs_sb->tlink_tree_lock);
- ret = radix_tree_gang_lookup(&cifs_sb->tlink_tree,
- (void **)tlink, index,
- ARRAY_SIZE(tlink));
- /* increment index for next pass */
- if (ret > 0)
- index = tlink[ret - 1]->tl_index + 1;
- for (i = 0; i < ret; i++) {
- if (test_bit(TCON_LINK_MASTER, &tlink[i]->tl_flags) ||
- atomic_read(&tlink[i]->tl_count) != 0 ||
- time_after(tlink[i]->tl_time + TLINK_IDLE_EXPIRE,
- now)) {
- tlink[i] = NULL;
- continue;
- }
- cifs_get_tlink(tlink[i]);
- clear_bit(TCON_LINK_IN_TREE, &tlink[i]->tl_flags);
- radix_tree_delete(&cifs_sb->tlink_tree,
- tlink[i]->tl_index);
- }
- spin_unlock(&cifs_sb->tlink_tree_lock);
+ /*
+ * Because we drop the spinlock in the loop in order to put the tlink
+ * it's not guarded against removal of links from the tree. The only
+ * places that remove entries from the tree are this function and
+ * umounts. Because this function is non-reentrant and is canceled
+ * before umount can proceed, this is safe.
+ */
+ spin_lock(&cifs_sb->tlink_tree_lock);
+ node = rb_first(root);
+ while (node != NULL) {
+ tmp = node;
+ node = rb_next(tmp);
+ tlink = rb_entry(tmp, struct tcon_link, tl_rbnode);
+
+ if (test_bit(TCON_LINK_MASTER, &tlink->tl_flags) ||
+ atomic_read(&tlink->tl_count) != 0 ||
+ time_after(tlink->tl_time + TLINK_IDLE_EXPIRE, jiffies))
+ continue;
- for (i = 0; i < ret; i++) {
- if (tlink[i] != NULL)
- cifs_put_tlink(tlink[i]);
- }
- } while (ret != 0);
+ cifs_get_tlink(tlink);
+ clear_bit(TCON_LINK_IN_TREE, &tlink->tl_flags);
+ rb_erase(tmp, root);
+
+ spin_unlock(&cifs_sb->tlink_tree_lock);
+ cifs_put_tlink(tlink);
+ spin_lock(&cifs_sb->tlink_tree_lock);
+ }
+ spin_unlock(&cifs_sb->tlink_tree_lock);
queue_delayed_work(system_nrt_wq, &cifs_sb->prune_tlinks,
TLINK_IDLE_EXPIRE);
diff --git a/fs/cifs/dns_resolve.c b/fs/cifs/dns_resolve.c
index 0eb87026cad..548f06230a6 100644
--- a/fs/cifs/dns_resolve.c
+++ b/fs/cifs/dns_resolve.c
@@ -66,7 +66,7 @@ dns_resolve_server_name_to_ip(const char *unc, char **ip_addr)
/* Search for server name delimiter */
sep = memchr(hostname, '\\', len);
if (sep)
- len = sep - unc;
+ len = sep - hostname;
else
cFYI(1, "%s: probably server name is whole unc: %s",
__func__, unc);
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index ae82159cf7f..5a28660ca2b 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -146,12 +146,7 @@ client_can_cache:
rc = cifs_get_inode_info(&inode, full_path, buf, inode->i_sb,
xid, NULL);
- if ((oplock & 0xF) == OPLOCK_EXCLUSIVE) {
- pCifsInode->clientCanCacheAll = true;
- pCifsInode->clientCanCacheRead = true;
- cFYI(1, "Exclusive Oplock granted on inode %p", inode);
- } else if ((oplock & 0xF) == OPLOCK_READ)
- pCifsInode->clientCanCacheRead = true;
+ cifs_set_oplock_level(pCifsInode, oplock);
return rc;
}
@@ -253,12 +248,7 @@ cifs_new_fileinfo(__u16 fileHandle, struct file *file,
list_add_tail(&pCifsFile->flist, &pCifsInode->openFileList);
spin_unlock(&cifs_file_list_lock);
- if ((oplock & 0xF) == OPLOCK_EXCLUSIVE) {
- pCifsInode->clientCanCacheAll = true;
- pCifsInode->clientCanCacheRead = true;
- cFYI(1, "Exclusive Oplock inode %p", inode);
- } else if ((oplock & 0xF) == OPLOCK_READ)
- pCifsInode->clientCanCacheRead = true;
+ cifs_set_oplock_level(pCifsInode, oplock);
file->private_data = pCifsFile;
return pCifsFile;
@@ -271,8 +261,9 @@ cifs_new_fileinfo(__u16 fileHandle, struct file *file,
*/
void cifsFileInfo_put(struct cifsFileInfo *cifs_file)
{
+ struct inode *inode = cifs_file->dentry->d_inode;
struct cifsTconInfo *tcon = tlink_tcon(cifs_file->tlink);
- struct cifsInodeInfo *cifsi = CIFS_I(cifs_file->dentry->d_inode);
+ struct cifsInodeInfo *cifsi = CIFS_I(inode);
struct cifsLockInfo *li, *tmp;
spin_lock(&cifs_file_list_lock);
@@ -288,8 +279,7 @@ void cifsFileInfo_put(struct cifsFileInfo *cifs_file)
if (list_empty(&cifsi->openFileList)) {
cFYI(1, "closing last open instance for inode %p",
cifs_file->dentry->d_inode);
- cifsi->clientCanCacheRead = false;
- cifsi->clientCanCacheAll = false;
+ cifs_set_oplock_level(cifsi, 0);
}
spin_unlock(&cifs_file_list_lock);
@@ -607,8 +597,6 @@ reopen_success:
rc = filemap_write_and_wait(inode->i_mapping);
mapping_set_error(inode->i_mapping, rc);
- pCifsInode->clientCanCacheAll = false;
- pCifsInode->clientCanCacheRead = false;
if (tcon->unix_ext)
rc = cifs_get_inode_info_unix(&inode,
full_path, inode->i_sb, xid);
@@ -622,18 +610,9 @@ reopen_success:
invalidate the current end of file on the server
we can not go to the server to get the new inod
info */
- if ((oplock & 0xF) == OPLOCK_EXCLUSIVE) {
- pCifsInode->clientCanCacheAll = true;
- pCifsInode->clientCanCacheRead = true;
- cFYI(1, "Exclusive Oplock granted on inode %p",
- pCifsFile->dentry->d_inode);
- } else if ((oplock & 0xF) == OPLOCK_READ) {
- pCifsInode->clientCanCacheRead = true;
- pCifsInode->clientCanCacheAll = false;
- } else {
- pCifsInode->clientCanCacheRead = false;
- pCifsInode->clientCanCacheAll = false;
- }
+
+ cifs_set_oplock_level(pCifsInode, oplock);
+
cifs_relock_file(pCifsFile);
reopen_error_exit:
@@ -775,12 +754,6 @@ int cifs_lock(struct file *file, int cmd, struct file_lock *pfLock)
cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
tcon = tlink_tcon(((struct cifsFileInfo *)file->private_data)->tlink);
-
- if (file->private_data == NULL) {
- rc = -EBADF;
- FreeXid(xid);
- return rc;
- }
netfid = ((struct cifsFileInfo *)file->private_data)->netfid;
if ((tcon->ses->capabilities & CAP_UNIX) &&
@@ -956,6 +929,7 @@ cifs_update_eof(struct cifsInodeInfo *cifsi, loff_t offset,
ssize_t cifs_user_write(struct file *file, const char __user *write_data,
size_t write_size, loff_t *poffset)
{
+ struct inode *inode = file->f_path.dentry->d_inode;
int rc = 0;
unsigned int bytes_written = 0;
unsigned int total_written;
@@ -963,7 +937,7 @@ ssize_t cifs_user_write(struct file *file, const char __user *write_data,
struct cifsTconInfo *pTcon;
int xid, long_op;
struct cifsFileInfo *open_file;
- struct cifsInodeInfo *cifsi = CIFS_I(file->f_path.dentry->d_inode);
+ struct cifsInodeInfo *cifsi = CIFS_I(inode);
cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
@@ -1029,21 +1003,17 @@ ssize_t cifs_user_write(struct file *file, const char __user *write_data,
cifs_stats_bytes_written(pTcon, total_written);
- /* since the write may have blocked check these pointers again */
- if ((file->f_path.dentry) && (file->f_path.dentry->d_inode)) {
- struct inode *inode = file->f_path.dentry->d_inode;
/* Do not update local mtime - server will set its actual value on write
- * inode->i_ctime = inode->i_mtime =
- * current_fs_time(inode->i_sb);*/
- if (total_written > 0) {
- spin_lock(&inode->i_lock);
- if (*poffset > file->f_path.dentry->d_inode->i_size)
- i_size_write(file->f_path.dentry->d_inode,
- *poffset);
- spin_unlock(&inode->i_lock);
- }
- mark_inode_dirty_sync(file->f_path.dentry->d_inode);
+ * inode->i_ctime = inode->i_mtime =
+ * current_fs_time(inode->i_sb);*/
+ if (total_written > 0) {
+ spin_lock(&inode->i_lock);
+ if (*poffset > inode->i_size)
+ i_size_write(inode, *poffset);
+ spin_unlock(&inode->i_lock);
}
+ mark_inode_dirty_sync(inode);
+
FreeXid(xid);
return total_written;
}
@@ -1138,7 +1108,6 @@ static ssize_t cifs_write(struct cifsFileInfo *open_file,
return total_written;
}
-#ifdef CONFIG_CIFS_EXPERIMENTAL
struct cifsFileInfo *find_readable_file(struct cifsInodeInfo *cifs_inode,
bool fsuid_only)
{
@@ -1172,13 +1141,12 @@ struct cifsFileInfo *find_readable_file(struct cifsInodeInfo *cifs_inode,
spin_unlock(&cifs_file_list_lock);
return NULL;
}
-#endif
struct cifsFileInfo *find_writable_file(struct cifsInodeInfo *cifs_inode,
bool fsuid_only)
{
struct cifsFileInfo *open_file;
- struct cifs_sb_info *cifs_sb = CIFS_SB(cifs_inode->vfs_inode.i_sb);
+ struct cifs_sb_info *cifs_sb;
bool any_available = false;
int rc;
@@ -1192,6 +1160,8 @@ struct cifsFileInfo *find_writable_file(struct cifsInodeInfo *cifs_inode,
return NULL;
}
+ cifs_sb = CIFS_SB(cifs_inode->vfs_inode.i_sb);
+
/* only filter by fsuid on multiuser mounts */
if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MULTIUSER))
fsuid_only = false;
@@ -2299,8 +2269,10 @@ void cifs_oplock_break_get(struct cifsFileInfo *cfile)
void cifs_oplock_break_put(struct cifsFileInfo *cfile)
{
+ struct super_block *sb = cfile->dentry->d_sb;
+
cifsFileInfo_put(cfile);
- cifs_sb_deactive(cfile->dentry->d_sb);
+ cifs_sb_deactive(sb);
}
const struct address_space_operations cifs_addr_ops = {
diff --git a/fs/cifs/fscache.c b/fs/cifs/fscache.c
index a2ad94efcfe..297a43d0ff7 100644
--- a/fs/cifs/fscache.c
+++ b/fs/cifs/fscache.c
@@ -2,7 +2,7 @@
* fs/cifs/fscache.c - CIFS filesystem cache interface
*
* Copyright (c) 2010 Novell, Inc.
- * Author(s): Suresh Jayaraman (sjayaraman@suse.de>
+ * Author(s): Suresh Jayaraman <sjayaraman@suse.de>
*
* This library is free software; you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published
@@ -67,10 +67,12 @@ static void cifs_fscache_enable_inode_cookie(struct inode *inode)
if (cifsi->fscache)
return;
- cifsi->fscache = fscache_acquire_cookie(tcon->fscache,
+ if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_FSCACHE) {
+ cifsi->fscache = fscache_acquire_cookie(tcon->fscache,
&cifs_fscache_inode_object_def, cifsi);
- cFYI(1, "CIFS: got FH cookie (0x%p/0x%p)", tcon->fscache,
+ cFYI(1, "CIFS: got FH cookie (0x%p/0x%p)", tcon->fscache,
cifsi->fscache);
+ }
}
void cifs_fscache_release_inode_cookie(struct inode *inode)
@@ -101,10 +103,8 @@ void cifs_fscache_set_inode_cookie(struct inode *inode, struct file *filp)
{
if ((filp->f_flags & O_ACCMODE) != O_RDONLY)
cifs_fscache_disable_inode_cookie(inode);
- else {
+ else
cifs_fscache_enable_inode_cookie(inode);
- cFYI(1, "CIFS: fscache inode cookie set");
- }
}
void cifs_fscache_reset_inode_cookie(struct inode *inode)
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index 39869c3c3ef..589f3e3f6e0 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -686,13 +686,18 @@ int cifs_get_inode_info(struct inode **pinode,
cFYI(1, "cifs_sfu_type failed: %d", tmprc);
}
-#ifdef CONFIG_CIFS_EXPERIMENTAL
+#ifdef CONFIG_CIFS_ACL
/* fill in 0777 bits from ACL */
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL) {
- cFYI(1, "Getting mode bits from ACL");
- cifs_acl_to_fattr(cifs_sb, &fattr, *pinode, full_path, pfid);
+ rc = cifs_acl_to_fattr(cifs_sb, &fattr, *pinode, full_path,
+ pfid);
+ if (rc) {
+ cFYI(1, "%s: Getting ACL failed with error: %d",
+ __func__, rc);
+ goto cgii_exit;
+ }
}
-#endif
+#endif /* CONFIG_CIFS_ACL */
/* fill in remaining high mode bits e.g. SUID, VTX */
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL)
@@ -723,12 +728,12 @@ static const struct inode_operations cifs_ipc_inode_ops = {
.lookup = cifs_lookup,
};
-char *cifs_build_path_to_root(struct cifs_sb_info *cifs_sb)
+char *cifs_build_path_to_root(struct cifs_sb_info *cifs_sb,
+ struct cifsTconInfo *tcon)
{
int pplen = cifs_sb->prepathlen;
int dfsplen;
char *full_path = NULL;
- struct cifsTconInfo *tcon = cifs_sb_master_tcon(cifs_sb);
/* if no prefix path, simply set path to the root of share to "" */
if (pplen == 0) {
@@ -870,7 +875,7 @@ struct inode *cifs_root_iget(struct super_block *sb, unsigned long ino)
char *full_path;
struct cifsTconInfo *tcon = cifs_sb_master_tcon(cifs_sb);
- full_path = cifs_build_path_to_root(cifs_sb);
+ full_path = cifs_build_path_to_root(cifs_sb, tcon);
if (full_path == NULL)
return ERR_PTR(-ENOMEM);
@@ -881,8 +886,10 @@ struct inode *cifs_root_iget(struct super_block *sb, unsigned long ino)
rc = cifs_get_inode_info(&inode, full_path, NULL, sb,
xid, NULL);
- if (!inode)
- return ERR_PTR(rc);
+ if (!inode) {
+ inode = ERR_PTR(rc);
+ goto out;
+ }
#ifdef CONFIG_CIFS_FSCACHE
/* populate tcon->resource_id */
@@ -898,13 +905,11 @@ struct inode *cifs_root_iget(struct super_block *sb, unsigned long ino)
inode->i_uid = cifs_sb->mnt_uid;
inode->i_gid = cifs_sb->mnt_gid;
} else if (rc) {
- kfree(full_path);
- _FreeXid(xid);
iget_failed(inode);
- return ERR_PTR(rc);
+ inode = ERR_PTR(rc);
}
-
+out:
kfree(full_path);
/* can not call macro FreeXid here since in a void func
* TODO: This is no longer true
@@ -1648,6 +1653,7 @@ static bool
cifs_inode_needs_reval(struct inode *inode)
{
struct cifsInodeInfo *cifs_i = CIFS_I(inode);
+ struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
if (cifs_i->clientCanCacheRead)
return false;
@@ -1658,19 +1664,21 @@ cifs_inode_needs_reval(struct inode *inode)
if (cifs_i->time == 0)
return true;
- /* FIXME: the actimeo should be tunable */
- if (time_after_eq(jiffies, cifs_i->time + HZ))
+ if (!time_in_range(jiffies, cifs_i->time,
+ cifs_i->time + cifs_sb->actimeo))
return true;
/* hardlinked files w/ noserverino get "special" treatment */
- if (!(CIFS_SB(inode->i_sb)->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) &&
+ if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) &&
S_ISREG(inode->i_mode) && inode->i_nlink != 1)
return true;
return false;
}
-/* check invalid_mapping flag and zap the cache if it's set */
+/*
+ * Zap the cache. Called when invalid_mapping flag is set.
+ */
static void
cifs_invalidate_mapping(struct inode *inode)
{
@@ -2114,11 +2122,16 @@ cifs_setattr_nounix(struct dentry *direntry, struct iattr *attrs)
if (attrs->ia_valid & ATTR_MODE) {
rc = 0;
-#ifdef CONFIG_CIFS_EXPERIMENTAL
- if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL)
- rc = mode_to_acl(inode, full_path, mode);
- else
-#endif
+#ifdef CONFIG_CIFS_ACL
+ if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL) {
+ rc = mode_to_cifs_acl(inode, full_path, mode);
+ if (rc) {
+ cFYI(1, "%s: Setting ACL failed with error: %d",
+ __func__, rc);
+ goto cifs_setattr_exit;
+ }
+ } else
+#endif /* CONFIG_CIFS_ACL */
if (((mode & S_IWUGO) == 0) &&
(cifsInode->cifsAttrs & ATTR_READONLY) == 0) {
@@ -2177,7 +2190,6 @@ cifs_setattr_nounix(struct dentry *direntry, struct iattr *attrs)
setattr_copy(inode, attrs);
mark_inode_dirty(inode);
- return 0;
cifs_setattr_exit:
kfree(full_path);
diff --git a/fs/cifs/ioctl.c b/fs/cifs/ioctl.c
index 077bf756f34..0c98672d012 100644
--- a/fs/cifs/ioctl.c
+++ b/fs/cifs/ioctl.c
@@ -38,10 +38,10 @@ long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg)
struct cifs_sb_info *cifs_sb;
#ifdef CONFIG_CIFS_POSIX
struct cifsFileInfo *pSMBFile = filep->private_data;
- struct cifsTconInfo *tcon = tlink_tcon(pSMBFile->tlink);
+ struct cifsTconInfo *tcon;
__u64 ExtAttrBits = 0;
__u64 ExtAttrMask = 0;
- __u64 caps = le64_to_cpu(tcon->fsUnixInfo.Capability);
+ __u64 caps;
#endif /* CONFIG_CIFS_POSIX */
xid = GetXid();
@@ -62,9 +62,11 @@ long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg)
break;
#ifdef CONFIG_CIFS_POSIX
case FS_IOC_GETFLAGS:
+ if (pSMBFile == NULL)
+ break;
+ tcon = tlink_tcon(pSMBFile->tlink);
+ caps = le64_to_cpu(tcon->fsUnixInfo.Capability);
if (CIFS_UNIX_EXTATTR_CAP & caps) {
- if (pSMBFile == NULL)
- break;
rc = CIFSGetExtAttr(xid, tcon, pSMBFile->netfid,
&ExtAttrBits, &ExtAttrMask);
if (rc == 0)
@@ -75,13 +77,15 @@ long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg)
break;
case FS_IOC_SETFLAGS:
+ if (pSMBFile == NULL)
+ break;
+ tcon = tlink_tcon(pSMBFile->tlink);
+ caps = le64_to_cpu(tcon->fsUnixInfo.Capability);
if (CIFS_UNIX_EXTATTR_CAP & caps) {
if (get_user(ExtAttrBits, (int __user *)arg)) {
rc = -EFAULT;
break;
}
- if (pSMBFile == NULL)
- break;
/* rc= CIFSGetExtAttr(xid,tcon,pSMBFile->netfid,
extAttrBits, &ExtAttrMask);*/
}
diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c
index c4e296fe351..43f10281bc1 100644
--- a/fs/cifs/misc.c
+++ b/fs/cifs/misc.c
@@ -569,10 +569,9 @@ is_valid_oplock_break(struct smb_hdr *buf, struct TCP_Server_Info *srv)
cFYI(1, "file id match, oplock break");
pCifsInode = CIFS_I(netfile->dentry->d_inode);
- pCifsInode->clientCanCacheAll = false;
- if (pSMB->OplockLevel == 0)
- pCifsInode->clientCanCacheRead = false;
+ cifs_set_oplock_level(pCifsInode,
+ pSMB->OplockLevel);
/*
* cifs_oplock_break_put() can't be called
* from here. Get reference after queueing
@@ -722,3 +721,23 @@ cifs_autodisable_serverino(struct cifs_sb_info *cifs_sb)
cifs_sb_master_tcon(cifs_sb)->treeName);
}
}
+
+void cifs_set_oplock_level(struct cifsInodeInfo *cinode, __u32 oplock)
+{
+ oplock &= 0xF;
+
+ if (oplock == OPLOCK_EXCLUSIVE) {
+ cinode->clientCanCacheAll = true;
+ cinode->clientCanCacheRead = true;
+ cFYI(1, "Exclusive Oplock granted on inode %p",
+ &cinode->vfs_inode);
+ } else if (oplock == OPLOCK_READ) {
+ cinode->clientCanCacheAll = false;
+ cinode->clientCanCacheRead = true;
+ cFYI(1, "Level II Oplock granted on inode %p",
+ &cinode->vfs_inode);
+ } else {
+ cinode->clientCanCacheAll = false;
+ cinode->clientCanCacheRead = false;
+ }
+}
diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c
index ef7bb7b50f5..a73eb9f4bda 100644
--- a/fs/cifs/readdir.c
+++ b/fs/cifs/readdir.c
@@ -226,26 +226,29 @@ static int initiate_cifs_search(const int xid, struct file *file)
char *full_path = NULL;
struct cifsFileInfo *cifsFile;
struct cifs_sb_info *cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
- struct tcon_link *tlink;
+ struct tcon_link *tlink = NULL;
struct cifsTconInfo *pTcon;
- tlink = cifs_sb_tlink(cifs_sb);
- if (IS_ERR(tlink))
- return PTR_ERR(tlink);
- pTcon = tlink_tcon(tlink);
-
- if (file->private_data == NULL)
- file->private_data =
- kzalloc(sizeof(struct cifsFileInfo), GFP_KERNEL);
if (file->private_data == NULL) {
- rc = -ENOMEM;
- goto error_exit;
+ tlink = cifs_sb_tlink(cifs_sb);
+ if (IS_ERR(tlink))
+ return PTR_ERR(tlink);
+
+ cifsFile = kzalloc(sizeof(struct cifsFileInfo), GFP_KERNEL);
+ if (cifsFile == NULL) {
+ rc = -ENOMEM;
+ goto error_exit;
+ }
+ file->private_data = cifsFile;
+ cifsFile->tlink = cifs_get_tlink(tlink);
+ pTcon = tlink_tcon(tlink);
+ } else {
+ cifsFile = file->private_data;
+ pTcon = tlink_tcon(cifsFile->tlink);
}
- cifsFile = file->private_data;
cifsFile->invalidHandle = true;
cifsFile->srch_inf.endOfSearch = false;
- cifsFile->tlink = cifs_get_tlink(tlink);
full_path = build_path_from_dentry(file->f_path.dentry);
if (full_path == NULL) {
@@ -756,18 +759,6 @@ static int cifs_filldir(char *pfindEntry, struct file *file, filldir_t filldir,
rc = filldir(direntry, qstring.name, qstring.len, file->f_pos,
ino, fattr.cf_dtype);
- /*
- * we can not return filldir errors to the caller since they are
- * "normal" when the stat blocksize is too small - we return remapped
- * error instead
- *
- * FIXME: This looks bogus. filldir returns -EOVERFLOW in the above
- * case already. Why should we be clobbering other errors from it?
- */
- if (rc) {
- cFYI(1, "filldir rc = %d", rc);
- rc = -EOVERFLOW;
- }
dput(tmp_dentry);
return rc;
}
diff --git a/fs/cifs/xattr.c b/fs/cifs/xattr.c
index a264b744bb4..eae2a149160 100644
--- a/fs/cifs/xattr.c
+++ b/fs/cifs/xattr.c
@@ -30,10 +30,11 @@
#define MAX_EA_VALUE_SIZE 65535
#define CIFS_XATTR_DOS_ATTRIB "user.DosAttrib"
+#define CIFS_XATTR_CIFS_ACL "system.cifs_acl"
#define CIFS_XATTR_USER_PREFIX "user."
#define CIFS_XATTR_SYSTEM_PREFIX "system."
#define CIFS_XATTR_OS2_PREFIX "os2."
-#define CIFS_XATTR_SECURITY_PREFIX ".security"
+#define CIFS_XATTR_SECURITY_PREFIX "security."
#define CIFS_XATTR_TRUSTED_PREFIX "trusted."
#define XATTR_TRUSTED_PREFIX_LEN 8
#define XATTR_SECURITY_PREFIX_LEN 9
@@ -277,29 +278,8 @@ ssize_t cifs_getxattr(struct dentry *direntry, const char *ea_name,
cifs_sb->local_nls,
cifs_sb->mnt_cifs_flags &
CIFS_MOUNT_MAP_SPECIAL_CHR);
-#ifdef CONFIG_CIFS_EXPERIMENTAL
- else if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL) {
- __u16 fid;
- int oplock = 0;
- struct cifs_ntsd *pacl = NULL;
- __u32 buflen = 0;
- if (experimEnabled)
- rc = CIFSSMBOpen(xid, pTcon, full_path,
- FILE_OPEN, GENERIC_READ, 0, &fid,
- &oplock, NULL, cifs_sb->local_nls,
- cifs_sb->mnt_cifs_flags &
- CIFS_MOUNT_MAP_SPECIAL_CHR);
- /* else rc is EOPNOTSUPP from above */
-
- if (rc == 0) {
- rc = CIFSSMBGetCIFSACL(xid, pTcon, fid, &pacl,
- &buflen);
- CIFSSMBClose(xid, pTcon, fid);
- }
- }
-#endif /* EXPERIMENTAL */
#else
- cFYI(1, "query POSIX ACL not supported yet");
+ cFYI(1, "Query POSIX ACL not supported yet");
#endif /* CONFIG_CIFS_POSIX */
} else if (strncmp(ea_name, POSIX_ACL_XATTR_DEFAULT,
strlen(POSIX_ACL_XATTR_DEFAULT)) == 0) {
@@ -311,8 +291,33 @@ ssize_t cifs_getxattr(struct dentry *direntry, const char *ea_name,
cifs_sb->mnt_cifs_flags &
CIFS_MOUNT_MAP_SPECIAL_CHR);
#else
- cFYI(1, "query POSIX default ACL not supported yet");
-#endif
+ cFYI(1, "Query POSIX default ACL not supported yet");
+#endif /* CONFIG_CIFS_POSIX */
+ } else if (strncmp(ea_name, CIFS_XATTR_CIFS_ACL,
+ strlen(CIFS_XATTR_CIFS_ACL)) == 0) {
+#ifdef CONFIG_CIFS_ACL
+ u32 acllen;
+ struct cifs_ntsd *pacl;
+
+ pacl = get_cifs_acl(cifs_sb, direntry->d_inode,
+ full_path, &acllen);
+ if (IS_ERR(pacl)) {
+ rc = PTR_ERR(pacl);
+ cERROR(1, "%s: error %zd getting sec desc",
+ __func__, rc);
+ } else {
+ if (ea_value) {
+ if (acllen > buf_size)
+ acllen = -ERANGE;
+ else
+ memcpy(ea_value, pacl, acllen);
+ }
+ rc = acllen;
+ kfree(pacl);
+ }
+#else
+ cFYI(1, "Query CIFS ACL not supported yet");
+#endif /* CONFIG_CIFS_ACL */
} else if (strncmp(ea_name,
CIFS_XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) == 0) {
cFYI(1, "Trusted xattr namespace not supported yet");
diff --git a/fs/compat.c b/fs/compat.c
index c580c322fa6..eb1740ac8c0 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -1350,6 +1350,10 @@ static int compat_count(compat_uptr_t __user *argv, int max)
argv++;
if (i++ >= max)
return -E2BIG;
+
+ if (fatal_signal_pending(current))
+ return -ERESTARTNOHAND;
+ cond_resched();
}
}
return i;
@@ -1391,6 +1395,12 @@ static int compat_copy_strings(int argc, compat_uptr_t __user *argv,
while (len > 0) {
int offset, bytes_to_copy;
+ if (fatal_signal_pending(current)) {
+ ret = -ERESTARTNOHAND;
+ goto out;
+ }
+ cond_resched();
+
offset = pos % PAGE_SIZE;
if (offset == 0)
offset = PAGE_SIZE;
@@ -1407,18 +1417,8 @@ static int compat_copy_strings(int argc, compat_uptr_t __user *argv,
if (!kmapped_page || kpos != (pos & PAGE_MASK)) {
struct page *page;
-#ifdef CONFIG_STACK_GROWSUP
- ret = expand_stack_downwards(bprm->vma, pos);
- if (ret < 0) {
- /* We've exceed the stack rlimit. */
- ret = -E2BIG;
- goto out;
- }
-#endif
- ret = get_user_pages(current, bprm->mm, pos,
- 1, 1, 1, &page, NULL);
- if (ret <= 0) {
- /* We've exceed the stack rlimit. */
+ page = get_arg_page(bprm, pos, 1);
+ if (!page) {
ret = -E2BIG;
goto out;
}
@@ -1539,8 +1539,10 @@ int compat_do_execve(char * filename,
return retval;
out:
- if (bprm->mm)
+ if (bprm->mm) {
+ acct_arg_size(bprm, 0);
mmput(bprm->mm);
+ }
out_file:
if (bprm->file) {
diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c
index 410ed188faa..a60579b007b 100644
--- a/fs/compat_ioctl.c
+++ b/fs/compat_ioctl.c
@@ -19,7 +19,6 @@
#include <linux/compiler.h>
#include <linux/sched.h>
#include <linux/smp.h>
-#include <linux/smp_lock.h>
#include <linux/ioctl.h>
#include <linux/if.h>
#include <linux/if_bridge.h>
diff --git a/fs/ecryptfs/super.c b/fs/ecryptfs/super.c
index 253732382d3..2720178b771 100644
--- a/fs/ecryptfs/super.c
+++ b/fs/ecryptfs/super.c
@@ -28,7 +28,6 @@
#include <linux/key.h>
#include <linux/slab.h>
#include <linux/seq_file.h>
-#include <linux/smp_lock.h>
#include <linux/file.h>
#include <linux/crypto.h>
#include "ecryptfs_kernel.h"
diff --git a/fs/exec.c b/fs/exec.c
index 99d33a1371e..c62efcb959c 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -164,7 +164,26 @@ out:
#ifdef CONFIG_MMU
-static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos,
+void acct_arg_size(struct linux_binprm *bprm, unsigned long pages)
+{
+ struct mm_struct *mm = current->mm;
+ long diff = (long)(pages - bprm->vma_pages);
+
+ if (!mm || !diff)
+ return;
+
+ bprm->vma_pages = pages;
+
+#ifdef SPLIT_RSS_COUNTING
+ add_mm_counter(mm, MM_ANONPAGES, diff);
+#else
+ spin_lock(&mm->page_table_lock);
+ add_mm_counter(mm, MM_ANONPAGES, diff);
+ spin_unlock(&mm->page_table_lock);
+#endif
+}
+
+struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos,
int write)
{
struct page *page;
@@ -186,6 +205,8 @@ static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos,
unsigned long size = bprm->vma->vm_end - bprm->vma->vm_start;
struct rlimit *rlim;
+ acct_arg_size(bprm, size / PAGE_SIZE);
+
/*
* We've historically supported up to 32 pages (ARG_MAX)
* of argument strings even with small stacks
@@ -254,6 +275,11 @@ static int __bprm_mm_init(struct linux_binprm *bprm)
vma->vm_flags = VM_STACK_FLAGS | VM_STACK_INCOMPLETE_SETUP;
vma->vm_page_prot = vm_get_page_prot(vma->vm_flags);
INIT_LIST_HEAD(&vma->anon_vma_chain);
+
+ err = security_file_mmap(NULL, 0, 0, 0, vma->vm_start, 1);
+ if (err)
+ goto err;
+
err = insert_vm_struct(mm, vma);
if (err)
goto err;
@@ -276,7 +302,11 @@ static bool valid_arg_len(struct linux_binprm *bprm, long len)
#else
-static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos,
+void acct_arg_size(struct linux_binprm *bprm, unsigned long pages)
+{
+}
+
+struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos,
int write)
{
struct page *page;
@@ -1003,6 +1033,7 @@ int flush_old_exec(struct linux_binprm * bprm)
/*
* Release all of the old mmap stuff
*/
+ acct_arg_size(bprm, 0);
retval = exec_mmap(bprm->mm);
if (retval)
goto out;
@@ -1426,8 +1457,10 @@ int do_execve(const char * filename,
return retval;
out:
- if (bprm->mm)
- mmput (bprm->mm);
+ if (bprm->mm) {
+ acct_arg_size(bprm, 0);
+ mmput(bprm->mm);
+ }
out_file:
if (bprm->file) {
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index 2fedaf8b501..acf8695fa8f 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -27,7 +27,6 @@
#include <linux/init.h>
#include <linux/blkdev.h>
#include <linux/parser.h>
-#include <linux/smp_lock.h>
#include <linux/buffer_head.h>
#include <linux/exportfs.h>
#include <linux/vfs.h>
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 8b5dd6369f8..94ce3d7a1c4 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -177,7 +177,7 @@ struct mpage_da_data {
struct ext4_io_page {
struct page *p_page;
- int p_count;
+ atomic_t p_count;
};
#define MAX_IO_PAGES 128
@@ -858,6 +858,7 @@ struct ext4_inode_info {
spinlock_t i_completed_io_lock;
/* current io_end structure for async DIO write*/
ext4_io_end_t *cur_aio_dio;
+ atomic_t i_ioend_count; /* Number of outstanding io_end structs */
/*
* Transactions that contain inode's metadata needed to complete
@@ -909,6 +910,7 @@ struct ext4_inode_info {
#define EXT4_MOUNT_JOURNAL_CHECKSUM 0x800000 /* Journal checksums */
#define EXT4_MOUNT_JOURNAL_ASYNC_COMMIT 0x1000000 /* Journal Async Commit */
#define EXT4_MOUNT_I_VERSION 0x2000000 /* i_version support */
+#define EXT4_MOUNT_MBLK_IO_SUBMIT 0x4000000 /* multi-block io submits */
#define EXT4_MOUNT_DELALLOC 0x8000000 /* Delalloc support */
#define EXT4_MOUNT_DATA_ERR_ABORT 0x10000000 /* Abort on file data write */
#define EXT4_MOUNT_BLOCK_VALIDITY 0x20000000 /* Block validity checking */
@@ -2060,6 +2062,7 @@ extern int ext4_move_extents(struct file *o_filp, struct file *d_filp,
/* page-io.c */
extern int __init ext4_init_pageio(void);
extern void ext4_exit_pageio(void);
+extern void ext4_ioend_wait(struct inode *);
extern void ext4_free_io_end(ext4_io_end_t *io);
extern ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags);
extern int ext4_end_io_nolock(ext4_io_end_t *io);
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 19161647046..e659597b690 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -53,6 +53,7 @@
static inline int ext4_begin_ordered_truncate(struct inode *inode,
loff_t new_size)
{
+ trace_ext4_begin_ordered_truncate(inode, new_size);
return jbd2_journal_begin_ordered_truncate(
EXT4_SB(inode->i_sb)->s_journal,
&EXT4_I(inode)->jinode,
@@ -178,6 +179,7 @@ void ext4_evict_inode(struct inode *inode)
handle_t *handle;
int err;
+ trace_ext4_evict_inode(inode);
if (inode->i_nlink) {
truncate_inode_pages(&inode->i_data, 0);
goto no_delete;
@@ -2123,9 +2125,12 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd,
*/
if (unlikely(journal_data && PageChecked(page)))
err = __ext4_journalled_writepage(page, len);
- else
+ else if (test_opt(inode->i_sb, MBLK_IO_SUBMIT))
err = ext4_bio_write_page(&io_submit, page,
len, mpd->wbc);
+ else
+ err = block_write_full_page(page,
+ noalloc_get_block_write, mpd->wbc);
if (!err)
mpd->pages_written++;
@@ -5410,9 +5415,7 @@ int ext4_getattr(struct vfsmount *mnt, struct dentry *dentry,
* will return the blocks that include the delayed allocation
* blocks for this file.
*/
- spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
delalloc_blocks = EXT4_I(inode)->i_reserved_data_blocks;
- spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
stat->blocks += (delalloc_blocks << inode->i_sb->s_blocksize_bits)>>9;
return 0;
@@ -5649,6 +5652,7 @@ int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode)
int err, ret;
might_sleep();
+ trace_ext4_mark_inode_dirty(inode, _RET_IP_);
err = ext4_reserve_inode_write(handle, inode, &iloc);
if (ext4_handle_valid(handle) &&
EXT4_I(inode)->i_extra_isize < sbi->s_want_extra_isize &&
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index bf5ae883b1b..eb3bc2fe647 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -331,6 +331,30 @@ mext_out:
return err;
}
+ case FITRIM:
+ {
+ struct super_block *sb = inode->i_sb;
+ struct fstrim_range range;
+ int ret = 0;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ if (copy_from_user(&range, (struct fstrim_range *)arg,
+ sizeof(range)))
+ return -EFAULT;
+
+ ret = ext4_trim_fs(sb, &range);
+ if (ret < 0)
+ return ret;
+
+ if (copy_to_user((struct fstrim_range *)arg, &range,
+ sizeof(range)))
+ return -EFAULT;
+
+ return 0;
+ }
+
default:
return -ENOTTY;
}
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index c58eba34724..5b4d4e3a4d5 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -4640,8 +4640,6 @@ do_more:
* with group lock held. generate_buddy look at
* them with group lock_held
*/
- if (test_opt(sb, DISCARD))
- ext4_issue_discard(sb, block_group, bit, count);
ext4_lock_group(sb, block_group);
mb_clear_bits(bitmap_bh->b_data, bit, count);
mb_free_blocks(inode, &e4b, bit, count);
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 92203b8a099..dc40e75cba8 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -872,7 +872,7 @@ static struct buffer_head * ext4_find_entry (struct inode *dir,
if (namelen > EXT4_NAME_LEN)
return NULL;
if ((namelen <= 2) && (name[0] == '.') &&
- (name[1] == '.' || name[1] == '0')) {
+ (name[1] == '.' || name[1] == '\0')) {
/*
* "." or ".." will only be in the first block
* NFS may look up ".."; "." should be handled by the VFS
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index 46a7d6a9d97..beacce11ac5 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -32,8 +32,14 @@
static struct kmem_cache *io_page_cachep, *io_end_cachep;
+#define WQ_HASH_SZ 37
+#define to_ioend_wq(v) (&ioend_wq[((unsigned long)v) % WQ_HASH_SZ])
+static wait_queue_head_t ioend_wq[WQ_HASH_SZ];
+
int __init ext4_init_pageio(void)
{
+ int i;
+
io_page_cachep = KMEM_CACHE(ext4_io_page, SLAB_RECLAIM_ACCOUNT);
if (io_page_cachep == NULL)
return -ENOMEM;
@@ -42,6 +48,8 @@ int __init ext4_init_pageio(void)
kmem_cache_destroy(io_page_cachep);
return -ENOMEM;
}
+ for (i = 0; i < WQ_HASH_SZ; i++)
+ init_waitqueue_head(&ioend_wq[i]);
return 0;
}
@@ -52,24 +60,37 @@ void ext4_exit_pageio(void)
kmem_cache_destroy(io_page_cachep);
}
+void ext4_ioend_wait(struct inode *inode)
+{
+ wait_queue_head_t *wq = to_ioend_wq(inode);
+
+ wait_event(*wq, (atomic_read(&EXT4_I(inode)->i_ioend_count) == 0));
+}
+
+static void put_io_page(struct ext4_io_page *io_page)
+{
+ if (atomic_dec_and_test(&io_page->p_count)) {
+ end_page_writeback(io_page->p_page);
+ put_page(io_page->p_page);
+ kmem_cache_free(io_page_cachep, io_page);
+ }
+}
+
void ext4_free_io_end(ext4_io_end_t *io)
{
int i;
+ wait_queue_head_t *wq;
BUG_ON(!io);
if (io->page)
put_page(io->page);
- for (i = 0; i < io->num_io_pages; i++) {
- if (--io->pages[i]->p_count == 0) {
- struct page *page = io->pages[i]->p_page;
-
- end_page_writeback(page);
- put_page(page);
- kmem_cache_free(io_page_cachep, io->pages[i]);
- }
- }
+ for (i = 0; i < io->num_io_pages; i++)
+ put_io_page(io->pages[i]);
io->num_io_pages = 0;
- iput(io->inode);
+ wq = to_ioend_wq(io->inode);
+ if (atomic_dec_and_test(&EXT4_I(io->inode)->i_ioend_count) &&
+ waitqueue_active(wq))
+ wake_up_all(wq);
kmem_cache_free(io_end_cachep, io);
}
@@ -142,8 +163,8 @@ ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags)
io = kmem_cache_alloc(io_end_cachep, flags);
if (io) {
memset(io, 0, sizeof(*io));
- io->inode = igrab(inode);
- BUG_ON(!io->inode);
+ atomic_inc(&EXT4_I(inode)->i_ioend_count);
+ io->inode = inode;
INIT_WORK(&io->work, ext4_end_io_work);
INIT_LIST_HEAD(&io->list);
}
@@ -171,35 +192,15 @@ static void ext4_end_bio(struct bio *bio, int error)
struct workqueue_struct *wq;
struct inode *inode;
unsigned long flags;
- ext4_fsblk_t err_block;
int i;
BUG_ON(!io_end);
- inode = io_end->inode;
bio->bi_private = NULL;
bio->bi_end_io = NULL;
if (test_bit(BIO_UPTODATE, &bio->bi_flags))
error = 0;
- err_block = bio->bi_sector >> (inode->i_blkbits - 9);
bio_put(bio);
- if (!(inode->i_sb->s_flags & MS_ACTIVE)) {
- pr_err("sb umounted, discard end_io request for inode %lu\n",
- io_end->inode->i_ino);
- ext4_free_io_end(io_end);
- return;
- }
-
- if (error) {
- io_end->flag |= EXT4_IO_END_ERROR;
- ext4_warning(inode->i_sb, "I/O error writing to inode %lu "
- "(offset %llu size %ld starting block %llu)",
- inode->i_ino,
- (unsigned long long) io_end->offset,
- (long) io_end->size,
- (unsigned long long) err_block);
- }
-
for (i = 0; i < io_end->num_io_pages; i++) {
struct page *page = io_end->pages[i]->p_page;
struct buffer_head *bh, *head;
@@ -236,14 +237,6 @@ static void ext4_end_bio(struct bio *bio, int error)
} while (bh != head);
}
- if (--io_end->pages[i]->p_count == 0) {
- struct page *page = io_end->pages[i]->p_page;
-
- end_page_writeback(page);
- put_page(page);
- kmem_cache_free(io_page_cachep, io_end->pages[i]);
- }
-
/*
* If this is a partial write which happened to make
* all buffers uptodate then we can optimize away a
@@ -253,9 +246,22 @@ static void ext4_end_bio(struct bio *bio, int error)
*/
if (!partial_write)
SetPageUptodate(page);
- }
+ put_io_page(io_end->pages[i]);
+ }
io_end->num_io_pages = 0;
+ inode = io_end->inode;
+
+ if (error) {
+ io_end->flag |= EXT4_IO_END_ERROR;
+ ext4_warning(inode->i_sb, "I/O error writing to inode %lu "
+ "(offset %llu size %ld starting block %llu)",
+ inode->i_ino,
+ (unsigned long long) io_end->offset,
+ (long) io_end->size,
+ (unsigned long long)
+ bio->bi_sector >> (inode->i_blkbits - 9));
+ }
/* Add the io_end to per-inode completed io list*/
spin_lock_irqsave(&EXT4_I(inode)->i_completed_io_lock, flags);
@@ -305,7 +311,6 @@ static int io_submit_init(struct ext4_io_submit *io,
bio->bi_private = io->io_end = io_end;
bio->bi_end_io = ext4_end_bio;
- io_end->inode = inode;
io_end->offset = (page->index << PAGE_CACHE_SHIFT) + bh_offset(bh);
io->io_bio = bio;
@@ -360,7 +365,7 @@ submit_and_retry:
if ((io_end->num_io_pages == 0) ||
(io_end->pages[io_end->num_io_pages-1] != io_page)) {
io_end->pages[io_end->num_io_pages++] = io_page;
- io_page->p_count++;
+ atomic_inc(&io_page->p_count);
}
return 0;
}
@@ -389,7 +394,7 @@ int ext4_bio_write_page(struct ext4_io_submit *io,
return -ENOMEM;
}
io_page->p_page = page;
- io_page->p_count = 0;
+ atomic_set(&io_page->p_count, 1);
get_page(page);
for (bh = head = page_buffers(page), block_start = 0;
@@ -421,10 +426,6 @@ int ext4_bio_write_page(struct ext4_io_submit *io,
* PageWriteback bit from the page to prevent the system from
* wedging later on.
*/
- if (io_page->p_count == 0) {
- put_page(page);
- end_page_writeback(page);
- kmem_cache_free(io_page_cachep, io_page);
- }
+ put_io_page(io_page);
return ret;
}
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index dc963929de6..981c8477ada 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -232,6 +232,8 @@ static int setup_new_group_blocks(struct super_block *sb,
GFP_NOFS);
if (err)
goto exit_bh;
+ for (i = 0, bit = gdblocks + 1; i < reserved_gdb; i++, bit++)
+ ext4_set_bit(bit, bh->b_data);
ext4_debug("mark block bitmap %#04llx (+%llu)\n", input->block_bitmap,
input->block_bitmap - start);
@@ -247,6 +249,9 @@ static int setup_new_group_blocks(struct super_block *sb,
err = sb_issue_zeroout(sb, block, sbi->s_itb_per_group, GFP_NOFS);
if (err)
goto exit_bh;
+ for (i = 0, bit = input->inode_table - start;
+ i < sbi->s_itb_per_group; i++, bit++)
+ ext4_set_bit(bit, bh->b_data);
if ((err = extend_or_restart_transaction(handle, 2, bh)))
goto exit_bh;
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 40131b777af..fb15c9c0be7 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -828,12 +828,22 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
ei->cur_aio_dio = NULL;
ei->i_sync_tid = 0;
ei->i_datasync_tid = 0;
+ atomic_set(&ei->i_ioend_count, 0);
return &ei->vfs_inode;
}
+static int ext4_drop_inode(struct inode *inode)
+{
+ int drop = generic_drop_inode(inode);
+
+ trace_ext4_drop_inode(inode, drop);
+ return drop;
+}
+
static void ext4_destroy_inode(struct inode *inode)
{
+ ext4_ioend_wait(inode);
if (!list_empty(&(EXT4_I(inode)->i_orphan))) {
ext4_msg(inode->i_sb, KERN_ERR,
"Inode %lu (%p): orphan list check failed!",
@@ -1016,6 +1026,8 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
!(def_mount_opts & EXT4_DEFM_NODELALLOC))
seq_puts(seq, ",nodelalloc");
+ if (test_opt(sb, MBLK_IO_SUBMIT))
+ seq_puts(seq, ",mblk_io_submit");
if (sbi->s_stripe)
seq_printf(seq, ",stripe=%lu", sbi->s_stripe);
/*
@@ -1173,6 +1185,7 @@ static const struct super_operations ext4_sops = {
.destroy_inode = ext4_destroy_inode,
.write_inode = ext4_write_inode,
.dirty_inode = ext4_dirty_inode,
+ .drop_inode = ext4_drop_inode,
.evict_inode = ext4_evict_inode,
.put_super = ext4_put_super,
.sync_fs = ext4_sync_fs,
@@ -1186,7 +1199,6 @@ static const struct super_operations ext4_sops = {
.quota_write = ext4_quota_write,
#endif
.bdev_try_to_free_page = bdev_try_to_free_page,
- .trim_fs = ext4_trim_fs
};
static const struct super_operations ext4_nojournal_sops = {
@@ -1194,6 +1206,7 @@ static const struct super_operations ext4_nojournal_sops = {
.destroy_inode = ext4_destroy_inode,
.write_inode = ext4_write_inode,
.dirty_inode = ext4_dirty_inode,
+ .drop_inode = ext4_drop_inode,
.evict_inode = ext4_evict_inode,
.write_super = ext4_write_super,
.put_super = ext4_put_super,
@@ -1228,8 +1241,8 @@ enum {
Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_jqfmt_vfsv1, Opt_quota,
Opt_noquota, Opt_ignore, Opt_barrier, Opt_nobarrier, Opt_err,
Opt_resize, Opt_usrquota, Opt_grpquota, Opt_i_version,
- Opt_stripe, Opt_delalloc, Opt_nodelalloc,
- Opt_block_validity, Opt_noblock_validity,
+ Opt_stripe, Opt_delalloc, Opt_nodelalloc, Opt_mblk_io_submit,
+ Opt_nomblk_io_submit, Opt_block_validity, Opt_noblock_validity,
Opt_inode_readahead_blks, Opt_journal_ioprio,
Opt_dioread_nolock, Opt_dioread_lock,
Opt_discard, Opt_nodiscard,
@@ -1293,6 +1306,8 @@ static const match_table_t tokens = {
{Opt_resize, "resize"},
{Opt_delalloc, "delalloc"},
{Opt_nodelalloc, "nodelalloc"},
+ {Opt_mblk_io_submit, "mblk_io_submit"},
+ {Opt_nomblk_io_submit, "nomblk_io_submit"},
{Opt_block_validity, "block_validity"},
{Opt_noblock_validity, "noblock_validity"},
{Opt_inode_readahead_blks, "inode_readahead_blks=%u"},
@@ -1714,6 +1729,12 @@ set_qf_format:
case Opt_nodelalloc:
clear_opt(sbi->s_mount_opt, DELALLOC);
break;
+ case Opt_mblk_io_submit:
+ set_opt(sbi->s_mount_opt, MBLK_IO_SUBMIT);
+ break;
+ case Opt_nomblk_io_submit:
+ clear_opt(sbi->s_mount_opt, MBLK_IO_SUBMIT);
+ break;
case Opt_stripe:
if (match_int(&args[0], &option))
return 0;
@@ -2699,7 +2720,6 @@ static int ext4_lazyinit_thread(void *arg)
struct ext4_li_request *elr;
unsigned long next_wakeup;
DEFINE_WAIT(wait);
- int ret;
BUG_ON(NULL == eli);
@@ -2723,13 +2743,12 @@ cont_thread:
elr = list_entry(pos, struct ext4_li_request,
lr_request);
- if (time_after_eq(jiffies, elr->lr_next_sched))
- ret = ext4_run_li_request(elr);
-
- if (ret) {
- ret = 0;
- ext4_remove_li_request(elr);
- continue;
+ if (time_after_eq(jiffies, elr->lr_next_sched)) {
+ if (ext4_run_li_request(elr) != 0) {
+ /* error, remove the lazy_init job */
+ ext4_remove_li_request(elr);
+ continue;
+ }
}
if (time_before(elr->lr_next_sched, next_wakeup))
@@ -2740,7 +2759,8 @@ cont_thread:
if (freezing(current))
refrigerator();
- if (time_after_eq(jiffies, next_wakeup)) {
+ if ((time_after_eq(jiffies, next_wakeup)) ||
+ (MAX_JIFFY_OFFSET == next_wakeup)) {
cond_resched();
continue;
}
@@ -2788,9 +2808,6 @@ static void ext4_clear_request_list(void)
struct ext4_li_request *elr;
mutex_lock(&ext4_li_info->li_list_mtx);
- if (list_empty(&ext4_li_info->li_request_list))
- return;
-
list_for_each_safe(pos, n, &ext4_li_info->li_request_list) {
elr = list_entry(pos, struct ext4_li_request,
lr_request);
@@ -3257,13 +3274,14 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
* Test whether we have more sectors than will fit in sector_t,
* and whether the max offset is addressable by the page cache.
*/
- ret = generic_check_addressable(sb->s_blocksize_bits,
+ err = generic_check_addressable(sb->s_blocksize_bits,
ext4_blocks_count(es));
- if (ret) {
+ if (err) {
ext4_msg(sb, KERN_ERR, "filesystem"
" too large to mount safely on this system");
if (sizeof(sector_t) < 8)
ext4_msg(sb, KERN_WARNING, "CONFIG_LBDAF not enabled");
+ ret = err;
goto failed_mount;
}
@@ -3348,6 +3366,24 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
get_random_bytes(&sbi->s_next_generation, sizeof(u32));
spin_lock_init(&sbi->s_next_gen_lock);
+ err = percpu_counter_init(&sbi->s_freeblocks_counter,
+ ext4_count_free_blocks(sb));
+ if (!err) {
+ err = percpu_counter_init(&sbi->s_freeinodes_counter,
+ ext4_count_free_inodes(sb));
+ }
+ if (!err) {
+ err = percpu_counter_init(&sbi->s_dirs_counter,
+ ext4_count_dirs(sb));
+ }
+ if (!err) {
+ err = percpu_counter_init(&sbi->s_dirtyblocks_counter, 0);
+ }
+ if (err) {
+ ext4_msg(sb, KERN_ERR, "insufficient memory");
+ goto failed_mount3;
+ }
+
sbi->s_stripe = ext4_get_stripe_size(sbi);
sbi->s_max_writeback_mb_bump = 128;
@@ -3446,22 +3482,19 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
}
set_task_ioprio(sbi->s_journal->j_task, journal_ioprio);
-no_journal:
- err = percpu_counter_init(&sbi->s_freeblocks_counter,
- ext4_count_free_blocks(sb));
- if (!err)
- err = percpu_counter_init(&sbi->s_freeinodes_counter,
- ext4_count_free_inodes(sb));
- if (!err)
- err = percpu_counter_init(&sbi->s_dirs_counter,
- ext4_count_dirs(sb));
- if (!err)
- err = percpu_counter_init(&sbi->s_dirtyblocks_counter, 0);
- if (err) {
- ext4_msg(sb, KERN_ERR, "insufficient memory");
- goto failed_mount_wq;
- }
+ /*
+ * The journal may have updated the bg summary counts, so we
+ * need to update the global counters.
+ */
+ percpu_counter_set(&sbi->s_freeblocks_counter,
+ ext4_count_free_blocks(sb));
+ percpu_counter_set(&sbi->s_freeinodes_counter,
+ ext4_count_free_inodes(sb));
+ percpu_counter_set(&sbi->s_dirs_counter,
+ ext4_count_dirs(sb));
+ percpu_counter_set(&sbi->s_dirtyblocks_counter, 0);
+no_journal:
EXT4_SB(sb)->dio_unwritten_wq = create_workqueue("ext4-dio-unwritten");
if (!EXT4_SB(sb)->dio_unwritten_wq) {
printk(KERN_ERR "EXT4-fs: failed to create DIO workqueue\n");
@@ -3611,10 +3644,6 @@ failed_mount_wq:
jbd2_journal_destroy(sbi->s_journal);
sbi->s_journal = NULL;
}
- percpu_counter_destroy(&sbi->s_freeblocks_counter);
- percpu_counter_destroy(&sbi->s_freeinodes_counter);
- percpu_counter_destroy(&sbi->s_dirs_counter);
- percpu_counter_destroy(&sbi->s_dirtyblocks_counter);
failed_mount3:
if (sbi->s_flex_groups) {
if (is_vmalloc_addr(sbi->s_flex_groups))
@@ -3622,6 +3651,10 @@ failed_mount3:
else
kfree(sbi->s_flex_groups);
}
+ percpu_counter_destroy(&sbi->s_freeblocks_counter);
+ percpu_counter_destroy(&sbi->s_freeinodes_counter);
+ percpu_counter_destroy(&sbi->s_dirs_counter);
+ percpu_counter_destroy(&sbi->s_dirtyblocks_counter);
failed_mount2:
for (i = 0; i < db_count; i++)
brelse(sbi->s_group_desc[i]);
@@ -3949,13 +3982,11 @@ static int ext4_commit_super(struct super_block *sb, int sync)
else
es->s_kbytes_written =
cpu_to_le64(EXT4_SB(sb)->s_kbytes_written);
- if (percpu_counter_initialized(&EXT4_SB(sb)->s_freeblocks_counter))
- ext4_free_blocks_count_set(es, percpu_counter_sum_positive(
- &EXT4_SB(sb)->s_freeblocks_counter));
- if (percpu_counter_initialized(&EXT4_SB(sb)->s_freeinodes_counter))
- es->s_free_inodes_count =
- cpu_to_le32(percpu_counter_sum_positive(
- &EXT4_SB(sb)->s_freeinodes_counter));
+ ext4_free_blocks_count_set(es, percpu_counter_sum_positive(
+ &EXT4_SB(sb)->s_freeblocks_counter));
+ es->s_free_inodes_count =
+ cpu_to_le32(percpu_counter_sum_positive(
+ &EXT4_SB(sb)->s_freeinodes_counter));
sb->s_dirt = 0;
BUFFER_TRACE(sbh, "marking dirty");
mark_buffer_dirty(sbh);
@@ -4556,12 +4587,10 @@ static int ext4_quota_on(struct super_block *sb, int type, int format_id,
static int ext4_quota_off(struct super_block *sb, int type)
{
- /* Force all delayed allocation blocks to be allocated */
- if (test_opt(sb, DELALLOC)) {
- down_read(&sb->s_umount);
+ /* Force all delayed allocation blocks to be allocated.
+ * Caller already holds s_umount sem */
+ if (test_opt(sb, DELALLOC))
sync_filesystem(sb);
- up_read(&sb->s_umount);
- }
return dquot_quota_off(sb, type);
}
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index c8224587123..8b984a2cebb 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -13,6 +13,7 @@
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/module.h>
+#include <linux/compat.h>
static const struct file_operations fuse_direct_io_file_operations;
@@ -134,6 +135,7 @@ EXPORT_SYMBOL_GPL(fuse_do_open);
void fuse_finish_open(struct inode *inode, struct file *file)
{
struct fuse_file *ff = file->private_data;
+ struct fuse_conn *fc = get_fuse_conn(inode);
if (ff->open_flags & FOPEN_DIRECT_IO)
file->f_op = &fuse_direct_io_file_operations;
@@ -141,6 +143,15 @@ void fuse_finish_open(struct inode *inode, struct file *file)
invalidate_inode_pages2(inode->i_mapping);
if (ff->open_flags & FOPEN_NONSEEKABLE)
nonseekable_open(inode, file);
+ if (fc->atomic_o_trunc && (file->f_flags & O_TRUNC)) {
+ struct fuse_inode *fi = get_fuse_inode(inode);
+
+ spin_lock(&fc->lock);
+ fi->attr_version = ++fc->attr_version;
+ i_size_write(inode, 0);
+ spin_unlock(&fc->lock);
+ fuse_invalidate_attr(inode);
+ }
}
int fuse_open_common(struct inode *inode, struct file *file, bool isdir)
@@ -1618,6 +1629,58 @@ static int fuse_ioctl_copy_user(struct page **pages, struct iovec *iov,
}
/*
+ * CUSE servers compiled on 32bit broke on 64bit kernels because the
+ * ABI was defined to be 'struct iovec' which is different on 32bit
+ * and 64bit. Fortunately we can determine which structure the server
+ * used from the size of the reply.
+ */
+static int fuse_copy_ioctl_iovec(struct iovec *dst, void *src,
+ size_t transferred, unsigned count,
+ bool is_compat)
+{
+#ifdef CONFIG_COMPAT
+ if (count * sizeof(struct compat_iovec) == transferred) {
+ struct compat_iovec *ciov = src;
+ unsigned i;
+
+ /*
+ * With this interface a 32bit server cannot support
+ * non-compat (i.e. ones coming from 64bit apps) ioctl
+ * requests
+ */
+ if (!is_compat)
+ return -EINVAL;
+
+ for (i = 0; i < count; i++) {
+ dst[i].iov_base = compat_ptr(ciov[i].iov_base);
+ dst[i].iov_len = ciov[i].iov_len;
+ }
+ return 0;
+ }
+#endif
+
+ if (count * sizeof(struct iovec) != transferred)
+ return -EIO;
+
+ memcpy(dst, src, transferred);
+ return 0;
+}
+
+/* Make sure iov_length() won't overflow */
+static int fuse_verify_ioctl_iov(struct iovec *iov, size_t count)
+{
+ size_t n;
+ u32 max = FUSE_MAX_PAGES_PER_REQ << PAGE_SHIFT;
+
+ for (n = 0; n < count; n++) {
+ if (iov->iov_len > (size_t) max)
+ return -ENOMEM;
+ max -= iov->iov_len;
+ }
+ return 0;
+}
+
+/*
* For ioctls, there is no generic way to determine how much memory
* needs to be read and/or written. Furthermore, ioctls are allowed
* to dereference the passed pointer, so the parameter requires deep
@@ -1798,18 +1861,25 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg,
in_iovs + out_iovs > FUSE_IOCTL_MAX_IOV)
goto out;
- err = -EIO;
- if ((in_iovs + out_iovs) * sizeof(struct iovec) != transferred)
- goto out;
-
- /* okay, copy in iovs and retry */
vaddr = kmap_atomic(pages[0], KM_USER0);
- memcpy(page_address(iov_page), vaddr, transferred);
+ err = fuse_copy_ioctl_iovec(page_address(iov_page), vaddr,
+ transferred, in_iovs + out_iovs,
+ (flags & FUSE_IOCTL_COMPAT) != 0);
kunmap_atomic(vaddr, KM_USER0);
+ if (err)
+ goto out;
in_iov = page_address(iov_page);
out_iov = in_iov + in_iovs;
+ err = fuse_verify_ioctl_iov(in_iov, in_iovs);
+ if (err)
+ goto out;
+
+ err = fuse_verify_ioctl_iov(out_iov, out_iovs);
+ if (err)
+ goto out;
+
goto retry;
}
diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index 5476c066d4e..3c4039d5eef 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -763,7 +763,7 @@ static int do_strip(struct gfs2_inode *ip, struct buffer_head *dibh,
int metadata;
unsigned int revokes = 0;
int x;
- int error;
+ int error = 0;
if (!*top)
sm->sm_first = 0;
@@ -780,7 +780,11 @@ static int do_strip(struct gfs2_inode *ip, struct buffer_head *dibh,
if (metadata)
revokes = (height) ? sdp->sd_inptrs : sdp->sd_diptrs;
- error = gfs2_rindex_hold(sdp, &ip->i_alloc->al_ri_gh);
+ if (ip != GFS2_I(sdp->sd_rindex))
+ error = gfs2_rindex_hold(sdp, &ip->i_alloc->al_ri_gh);
+ else if (!sdp->sd_rgrps)
+ error = gfs2_ri_update(ip);
+
if (error)
return error;
@@ -879,7 +883,8 @@ out_rg_gunlock:
out_rlist:
gfs2_rlist_free(&rlist);
out:
- gfs2_glock_dq_uninit(&ip->i_alloc->al_ri_gh);
+ if (ip != GFS2_I(sdp->sd_rindex))
+ gfs2_glock_dq_uninit(&ip->i_alloc->al_ri_gh);
return error;
}
diff --git a/fs/gfs2/export.c b/fs/gfs2/export.c
index 06d582732d3..5ab3839dfcb 100644
--- a/fs/gfs2/export.c
+++ b/fs/gfs2/export.c
@@ -138,10 +138,8 @@ static struct dentry *gfs2_get_dentry(struct super_block *sb,
struct gfs2_inum_host *inum)
{
struct gfs2_sbd *sdp = sb->s_fs_info;
- struct gfs2_holder i_gh;
struct inode *inode;
struct dentry *dentry;
- int error;
inode = gfs2_ilookup(sb, inum->no_addr);
if (inode) {
@@ -152,52 +150,16 @@ static struct dentry *gfs2_get_dentry(struct super_block *sb,
goto out_inode;
}
- error = gfs2_glock_nq_num(sdp, inum->no_addr, &gfs2_inode_glops,
- LM_ST_SHARED, LM_FLAG_ANY, &i_gh);
- if (error)
- return ERR_PTR(error);
-
- error = gfs2_check_blk_type(sdp, inum->no_addr, GFS2_BLKST_DINODE);
- if (error)
- goto fail;
-
- inode = gfs2_inode_lookup(sb, DT_UNKNOWN, inum->no_addr, 0);
- if (IS_ERR(inode)) {
- error = PTR_ERR(inode);
- goto fail;
- }
-
- error = gfs2_inode_refresh(GFS2_I(inode));
- if (error) {
- iput(inode);
- goto fail;
- }
-
- /* Pick up the works we bypass in gfs2_inode_lookup */
- if (inode->i_state & I_NEW)
- gfs2_set_iop(inode);
-
- if (GFS2_I(inode)->i_no_formal_ino != inum->no_formal_ino) {
- iput(inode);
- goto fail;
- }
-
- error = -EIO;
- if (GFS2_I(inode)->i_diskflags & GFS2_DIF_SYSTEM) {
- iput(inode);
- goto fail;
- }
-
- gfs2_glock_dq_uninit(&i_gh);
+ inode = gfs2_lookup_by_inum(sdp, inum->no_addr, &inum->no_formal_ino,
+ GFS2_BLKST_DINODE);
+ if (IS_ERR(inode))
+ return ERR_CAST(inode);
out_inode:
dentry = d_obtain_alias(inode);
if (!IS_ERR(dentry))
dentry->d_op = &gfs2_dops;
return dentry;
-fail:
- gfs2_glock_dq_uninit(&i_gh);
- return ERR_PTR(error);
}
static struct dentry *gfs2_fh_to_dentry(struct super_block *sb, struct fid *fid,
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index 87778857f09..08a8beb152e 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -541,21 +541,6 @@ out_locked:
spin_unlock(&gl->gl_spin);
}
-static unsigned int gfs2_lm_lock(struct gfs2_sbd *sdp, void *lock,
- unsigned int req_state,
- unsigned int flags)
-{
- int ret = LM_OUT_ERROR;
-
- if (!sdp->sd_lockstruct.ls_ops->lm_lock)
- return req_state == LM_ST_UNLOCKED ? 0 : req_state;
-
- if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
- ret = sdp->sd_lockstruct.ls_ops->lm_lock(lock,
- req_state, flags);
- return ret;
-}
-
/**
* do_xmote - Calls the DLM to change the state of a lock
* @gl: The lock state
@@ -575,13 +560,14 @@ __acquires(&gl->gl_spin)
lck_flags &= (LM_FLAG_TRY | LM_FLAG_TRY_1CB | LM_FLAG_NOEXP |
LM_FLAG_PRIORITY);
- BUG_ON(gl->gl_state == target);
- BUG_ON(gl->gl_state == gl->gl_target);
+ GLOCK_BUG_ON(gl, gl->gl_state == target);
+ GLOCK_BUG_ON(gl, gl->gl_state == gl->gl_target);
if ((target == LM_ST_UNLOCKED || target == LM_ST_DEFERRED) &&
glops->go_inval) {
set_bit(GLF_INVALIDATE_IN_PROGRESS, &gl->gl_flags);
do_error(gl, 0); /* Fail queued try locks */
}
+ gl->gl_req = target;
spin_unlock(&gl->gl_spin);
if (glops->go_xmote_th)
glops->go_xmote_th(gl);
@@ -594,15 +580,17 @@ __acquires(&gl->gl_spin)
gl->gl_state == LM_ST_DEFERRED) &&
!(lck_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB)))
lck_flags |= LM_FLAG_TRY_1CB;
- ret = gfs2_lm_lock(sdp, gl, target, lck_flags);
- if (!(ret & LM_OUT_ASYNC)) {
- finish_xmote(gl, ret);
+ if (sdp->sd_lockstruct.ls_ops->lm_lock) {
+ /* lock_dlm */
+ ret = sdp->sd_lockstruct.ls_ops->lm_lock(gl, target, lck_flags);
+ GLOCK_BUG_ON(gl, ret);
+ } else { /* lock_nolock */
+ finish_xmote(gl, target);
if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0)
gfs2_glock_put(gl);
- } else {
- GLOCK_BUG_ON(gl, ret != LM_OUT_ASYNC);
}
+
spin_lock(&gl->gl_spin);
}
@@ -686,21 +674,20 @@ static void delete_work_func(struct work_struct *work)
{
struct gfs2_glock *gl = container_of(work, struct gfs2_glock, gl_delete);
struct gfs2_sbd *sdp = gl->gl_sbd;
- struct gfs2_inode *ip = NULL;
+ struct gfs2_inode *ip;
struct inode *inode;
- u64 no_addr = 0;
+ u64 no_addr = gl->gl_name.ln_number;
+
+ ip = gl->gl_object;
+ /* Note: Unsafe to dereference ip as we don't hold right refs/locks */
- spin_lock(&gl->gl_spin);
- ip = (struct gfs2_inode *)gl->gl_object;
if (ip)
- no_addr = ip->i_no_addr;
- spin_unlock(&gl->gl_spin);
- if (ip) {
inode = gfs2_ilookup(sdp->sd_vfs, no_addr);
- if (inode) {
- d_prune_aliases(inode);
- iput(inode);
- }
+ else
+ inode = gfs2_lookup_by_inum(sdp, no_addr, NULL, GFS2_BLKST_UNLINKED);
+ if (inode && !IS_ERR(inode)) {
+ d_prune_aliases(inode);
+ iput(inode);
}
gfs2_glock_put(gl);
}
@@ -952,17 +939,22 @@ int gfs2_glock_wait(struct gfs2_holder *gh)
void gfs2_print_dbg(struct seq_file *seq, const char *fmt, ...)
{
+ struct va_format vaf;
va_list args;
va_start(args, fmt);
+
if (seq) {
struct gfs2_glock_iter *gi = seq->private;
vsprintf(gi->string, fmt, args);
seq_printf(seq, gi->string);
} else {
- printk(KERN_ERR " ");
- vprintk(fmt, args);
+ vaf.fmt = fmt;
+ vaf.va = &args;
+
+ printk(KERN_ERR " %pV", &vaf);
}
+
va_end(args);
}
@@ -1362,24 +1354,28 @@ static int gfs2_should_freeze(const struct gfs2_glock *gl)
* @gl: Pointer to the glock
* @ret: The return value from the dlm
*
+ * The gl_reply field is under the gl_spin lock so that it is ok
+ * to use a bitfield shared with other glock state fields.
*/
void gfs2_glock_complete(struct gfs2_glock *gl, int ret)
{
struct lm_lockstruct *ls = &gl->gl_sbd->sd_lockstruct;
+ spin_lock(&gl->gl_spin);
gl->gl_reply = ret;
if (unlikely(test_bit(DFL_BLOCK_LOCKS, &ls->ls_flags))) {
- spin_lock(&gl->gl_spin);
if (gfs2_should_freeze(gl)) {
set_bit(GLF_FROZEN, &gl->gl_flags);
spin_unlock(&gl->gl_spin);
return;
}
- spin_unlock(&gl->gl_spin);
}
+
+ spin_unlock(&gl->gl_spin);
set_bit(GLF_REPLY_PENDING, &gl->gl_flags);
+ smp_wmb();
gfs2_glock_hold(gl);
if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0)
gfs2_glock_put(gl);
@@ -1627,18 +1623,17 @@ static const char *hflags2str(char *buf, unsigned flags, unsigned long iflags)
static int dump_holder(struct seq_file *seq, const struct gfs2_holder *gh)
{
struct task_struct *gh_owner = NULL;
- char buffer[KSYM_SYMBOL_LEN];
char flags_buf[32];
- sprint_symbol(buffer, gh->gh_ip);
if (gh->gh_owner_pid)
gh_owner = pid_task(gh->gh_owner_pid, PIDTYPE_PID);
- gfs2_print_dbg(seq, " H: s:%s f:%s e:%d p:%ld [%s] %s\n",
- state2str(gh->gh_state),
- hflags2str(flags_buf, gh->gh_flags, gh->gh_iflags),
- gh->gh_error,
- gh->gh_owner_pid ? (long)pid_nr(gh->gh_owner_pid) : -1,
- gh_owner ? gh_owner->comm : "(ended)", buffer);
+ gfs2_print_dbg(seq, " H: s:%s f:%s e:%d p:%ld [%s] %pS\n",
+ state2str(gh->gh_state),
+ hflags2str(flags_buf, gh->gh_flags, gh->gh_iflags),
+ gh->gh_error,
+ gh->gh_owner_pid ? (long)pid_nr(gh->gh_owner_pid) : -1,
+ gh_owner ? gh_owner->comm : "(ended)",
+ (void *)gh->gh_ip);
return 0;
}
@@ -1783,12 +1778,13 @@ int __init gfs2_glock_init(void)
}
#endif
- glock_workqueue = alloc_workqueue("glock_workqueue", WQ_RESCUER |
+ glock_workqueue = alloc_workqueue("glock_workqueue", WQ_MEM_RECLAIM |
WQ_HIGHPRI | WQ_FREEZEABLE, 0);
if (IS_ERR(glock_workqueue))
return PTR_ERR(glock_workqueue);
- gfs2_delete_workqueue = alloc_workqueue("delete_workqueue", WQ_RESCUER |
- WQ_FREEZEABLE, 0);
+ gfs2_delete_workqueue = alloc_workqueue("delete_workqueue",
+ WQ_MEM_RECLAIM | WQ_FREEZEABLE,
+ 0);
if (IS_ERR(gfs2_delete_workqueue)) {
destroy_workqueue(glock_workqueue);
return PTR_ERR(gfs2_delete_workqueue);
diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h
index db1c26d6d22..691851ceb61 100644
--- a/fs/gfs2/glock.h
+++ b/fs/gfs2/glock.h
@@ -87,11 +87,10 @@ enum {
#define GL_ASYNC 0x00000040
#define GL_EXACT 0x00000080
#define GL_SKIP 0x00000100
-#define GL_ATIME 0x00000200
#define GL_NOCACHE 0x00000400
/*
- * lm_lock() and lm_async_cb return flags
+ * lm_async_cb return flags
*
* LM_OUT_ST_MASK
* Masks the lower two bits of lock state in the returned value.
@@ -99,15 +98,11 @@ enum {
* LM_OUT_CANCELED
* The lock request was canceled.
*
- * LM_OUT_ASYNC
- * The result of the request will be returned in an LM_CB_ASYNC callback.
- *
*/
#define LM_OUT_ST_MASK 0x00000003
#define LM_OUT_CANCELED 0x00000008
-#define LM_OUT_ASYNC 0x00000080
-#define LM_OUT_ERROR 0x00000100
+#define LM_OUT_ERROR 0x00000004
/*
* lm_recovery_done() messages
@@ -124,25 +119,12 @@ struct lm_lockops {
void (*lm_unmount) (struct gfs2_sbd *sdp);
void (*lm_withdraw) (struct gfs2_sbd *sdp);
void (*lm_put_lock) (struct kmem_cache *cachep, struct gfs2_glock *gl);
- unsigned int (*lm_lock) (struct gfs2_glock *gl,
- unsigned int req_state, unsigned int flags);
+ int (*lm_lock) (struct gfs2_glock *gl, unsigned int req_state,
+ unsigned int flags);
void (*lm_cancel) (struct gfs2_glock *gl);
const match_table_t *lm_tokens;
};
-#define LM_FLAG_TRY 0x00000001
-#define LM_FLAG_TRY_1CB 0x00000002
-#define LM_FLAG_NOEXP 0x00000004
-#define LM_FLAG_ANY 0x00000008
-#define LM_FLAG_PRIORITY 0x00000010
-
-#define GL_ASYNC 0x00000040
-#define GL_EXACT 0x00000080
-#define GL_SKIP 0x00000100
-#define GL_NOCACHE 0x00000400
-
-#define GLR_TRYFAILED 13
-
extern struct workqueue_struct *gfs2_delete_workqueue;
static inline struct gfs2_holder *gfs2_glock_is_locked_by_me(struct gfs2_glock *gl)
{
@@ -212,6 +194,8 @@ int gfs2_glock_nq_num(struct gfs2_sbd *sdp,
int gfs2_glock_nq_m(unsigned int num_gh, struct gfs2_holder *ghs);
void gfs2_glock_dq_m(unsigned int num_gh, struct gfs2_holder *ghs);
void gfs2_glock_dq_uninit_m(unsigned int num_gh, struct gfs2_holder *ghs);
+
+__attribute__ ((format(printf, 2, 3)))
void gfs2_print_dbg(struct seq_file *seq, const char *fmt, ...);
/**
diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c
index 0d149dcc04e..263561bf1a5 100644
--- a/fs/gfs2/glops.c
+++ b/fs/gfs2/glops.c
@@ -325,7 +325,6 @@ static void trans_go_sync(struct gfs2_glock *gl)
if (gl->gl_state != LM_ST_UNLOCKED &&
test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags)) {
- flush_workqueue(gfs2_delete_workqueue);
gfs2_meta_syncfs(sdp);
gfs2_log_shutdown(sdp);
}
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index 764fbb49efc..8d3d2b4a0a7 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -207,12 +207,14 @@ struct gfs2_glock {
spinlock_t gl_spin;
- unsigned int gl_state;
- unsigned int gl_target;
- unsigned int gl_reply;
+ /* State fields protected by gl_spin */
+ unsigned int gl_state:2, /* Current state */
+ gl_target:2, /* Target state */
+ gl_demote_state:2, /* State requested by remote node */
+ gl_req:2, /* State in last dlm request */
+ gl_reply:8; /* Last reply from the dlm */
+
unsigned int gl_hash;
- unsigned int gl_req;
- unsigned int gl_demote_state; /* state requested by remote node */
unsigned long gl_demote_time; /* time of first demote request */
struct list_head gl_holders;
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 06370f8bd8c..14e682dbe8b 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -73,49 +73,6 @@ static struct inode *gfs2_iget(struct super_block *sb, u64 no_addr)
return iget5_locked(sb, hash, iget_test, iget_set, &no_addr);
}
-struct gfs2_skip_data {
- u64 no_addr;
- int skipped;
-};
-
-static int iget_skip_test(struct inode *inode, void *opaque)
-{
- struct gfs2_inode *ip = GFS2_I(inode);
- struct gfs2_skip_data *data = opaque;
-
- if (ip->i_no_addr == data->no_addr) {
- if (inode->i_state & (I_FREEING|I_WILL_FREE)){
- data->skipped = 1;
- return 0;
- }
- return 1;
- }
- return 0;
-}
-
-static int iget_skip_set(struct inode *inode, void *opaque)
-{
- struct gfs2_inode *ip = GFS2_I(inode);
- struct gfs2_skip_data *data = opaque;
-
- if (data->skipped)
- return 1;
- inode->i_ino = (unsigned long)(data->no_addr);
- ip->i_no_addr = data->no_addr;
- return 0;
-}
-
-static struct inode *gfs2_iget_skip(struct super_block *sb,
- u64 no_addr)
-{
- struct gfs2_skip_data data;
- unsigned long hash = (unsigned long)no_addr;
-
- data.no_addr = no_addr;
- data.skipped = 0;
- return iget5_locked(sb, hash, iget_skip_test, iget_skip_set, &data);
-}
-
/**
* GFS2 lookup code fills in vfs inode contents based on info obtained
* from directory entry inside gfs2_inode_lookup(). This has caused issues
@@ -243,93 +200,54 @@ fail:
return ERR_PTR(error);
}
-/**
- * gfs2_process_unlinked_inode - Lookup an unlinked inode for reclamation
- * and try to reclaim it by doing iput.
- *
- * This function assumes no rgrp locks are currently held.
- *
- * @sb: The super block
- * no_addr: The inode number
- *
- */
-
-void gfs2_process_unlinked_inode(struct super_block *sb, u64 no_addr)
+struct inode *gfs2_lookup_by_inum(struct gfs2_sbd *sdp, u64 no_addr,
+ u64 *no_formal_ino, unsigned int blktype)
{
- struct gfs2_sbd *sdp;
- struct gfs2_inode *ip;
- struct gfs2_glock *io_gl = NULL;
- int error;
- struct gfs2_holder gh;
+ struct super_block *sb = sdp->sd_vfs;
+ struct gfs2_holder i_gh;
struct inode *inode;
+ int error;
- inode = gfs2_iget_skip(sb, no_addr);
-
- if (!inode)
- return;
-
- /* If it's not a new inode, someone's using it, so leave it alone. */
- if (!(inode->i_state & I_NEW)) {
- iput(inode);
- return;
- }
-
- ip = GFS2_I(inode);
- sdp = GFS2_SB(inode);
- ip->i_no_formal_ino = -1;
+ error = gfs2_glock_nq_num(sdp, no_addr, &gfs2_inode_glops,
+ LM_ST_SHARED, LM_FLAG_ANY, &i_gh);
+ if (error)
+ return ERR_PTR(error);
- error = gfs2_glock_get(sdp, no_addr, &gfs2_inode_glops, CREATE, &ip->i_gl);
- if (unlikely(error))
+ error = gfs2_check_blk_type(sdp, no_addr, blktype);
+ if (error)
goto fail;
- ip->i_gl->gl_object = ip;
-
- error = gfs2_glock_get(sdp, no_addr, &gfs2_iopen_glops, CREATE, &io_gl);
- if (unlikely(error))
- goto fail_put;
- set_bit(GIF_INVALID, &ip->i_flags);
- error = gfs2_glock_nq_init(io_gl, LM_ST_SHARED, LM_FLAG_TRY | GL_EXACT,
- &ip->i_iopen_gh);
- if (unlikely(error))
- goto fail_iopen;
+ inode = gfs2_inode_lookup(sb, DT_UNKNOWN, no_addr, 0);
+ if (IS_ERR(inode))
+ goto fail;
- ip->i_iopen_gh.gh_gl->gl_object = ip;
- gfs2_glock_put(io_gl);
- io_gl = NULL;
+ error = gfs2_inode_refresh(GFS2_I(inode));
+ if (error)
+ goto fail_iput;
- inode->i_mode = DT2IF(DT_UNKNOWN);
+ /* Pick up the works we bypass in gfs2_inode_lookup */
+ if (inode->i_state & I_NEW)
+ gfs2_set_iop(inode);
- /*
- * We must read the inode in order to work out its type in
- * this case. Note that this doesn't happen often as we normally
- * know the type beforehand. This code path only occurs during
- * unlinked inode recovery (where it is safe to do this glock,
- * which is not true in the general case).
- */
- error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, LM_FLAG_TRY,
- &gh);
- if (unlikely(error))
- goto fail_glock;
+ /* Two extra checks for NFS only */
+ if (no_formal_ino) {
+ error = -ESTALE;
+ if (GFS2_I(inode)->i_no_formal_ino != *no_formal_ino)
+ goto fail_iput;
- /* Inode is now uptodate */
- gfs2_glock_dq_uninit(&gh);
- gfs2_set_iop(inode);
+ error = -EIO;
+ if (GFS2_I(inode)->i_diskflags & GFS2_DIF_SYSTEM)
+ goto fail_iput;
- /* The iput will cause it to be deleted. */
- iput(inode);
- return;
+ error = 0;
+ }
-fail_glock:
- gfs2_glock_dq(&ip->i_iopen_gh);
-fail_iopen:
- if (io_gl)
- gfs2_glock_put(io_gl);
-fail_put:
- ip->i_gl->gl_object = NULL;
- gfs2_glock_put(ip->i_gl);
fail:
- iget_failed(inode);
- return;
+ gfs2_glock_dq_uninit(&i_gh);
+ return error ? ERR_PTR(error) : inode;
+fail_iput:
+ iput(inode);
+ goto fail;
}
static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf)
@@ -998,17 +916,8 @@ static int __gfs2_setattr_simple(struct gfs2_inode *ip, struct iattr *attr)
if (error)
return error;
- if ((attr->ia_valid & ATTR_SIZE) &&
- attr->ia_size != i_size_read(inode)) {
- error = vmtruncate(inode, attr->ia_size);
- if (error)
- return error;
- }
-
setattr_copy(inode, attr);
mark_inode_dirty(inode);
-
- gfs2_assert_warn(GFS2_SB(inode), !error);
gfs2_trans_add_bh(ip->i_gl, dibh, 1);
gfs2_dinode_out(ip, dibh->b_data);
brelse(dibh);
diff --git a/fs/gfs2/inode.h b/fs/gfs2/inode.h
index 6720d7d5fbc..d8499fadcc5 100644
--- a/fs/gfs2/inode.h
+++ b/fs/gfs2/inode.h
@@ -99,7 +99,9 @@ err:
extern void gfs2_set_iop(struct inode *inode);
extern struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned type,
u64 no_addr, u64 no_formal_ino);
-extern void gfs2_process_unlinked_inode(struct super_block *sb, u64 no_addr);
+extern struct inode *gfs2_lookup_by_inum(struct gfs2_sbd *sdp, u64 no_addr,
+ u64 *no_formal_ino,
+ unsigned int blktype);
extern struct inode *gfs2_ilookup(struct super_block *sb, u64 no_addr);
extern int gfs2_inode_refresh(struct gfs2_inode *ip);
diff --git a/fs/gfs2/lock_dlm.c b/fs/gfs2/lock_dlm.c
index 1c09425b45f..6e493aee28f 100644
--- a/fs/gfs2/lock_dlm.c
+++ b/fs/gfs2/lock_dlm.c
@@ -146,15 +146,13 @@ static u32 make_flags(const u32 lkid, const unsigned int gfs_flags,
return lkf;
}
-static unsigned int gdlm_lock(struct gfs2_glock *gl,
- unsigned int req_state, unsigned int flags)
+static int gdlm_lock(struct gfs2_glock *gl, unsigned int req_state,
+ unsigned int flags)
{
struct lm_lockstruct *ls = &gl->gl_sbd->sd_lockstruct;
- int error;
int req;
u32 lkf;
- gl->gl_req = req_state;
req = make_mode(req_state);
lkf = make_flags(gl->gl_lksb.sb_lkid, flags, req);
@@ -162,13 +160,8 @@ static unsigned int gdlm_lock(struct gfs2_glock *gl,
* Submit the actual lock request.
*/
- error = dlm_lock(ls->ls_dlm, req, &gl->gl_lksb, lkf, gl->gl_strname,
- GDLM_STRNAME_BYTES - 1, 0, gdlm_ast, gl, gdlm_bast);
- if (error == -EAGAIN)
- return 0;
- if (error)
- return LM_OUT_ERROR;
- return LM_OUT_ASYNC;
+ return dlm_lock(ls->ls_dlm, req, &gl->gl_lksb, lkf, gl->gl_strname,
+ GDLM_STRNAME_BYTES - 1, 0, gdlm_ast, gl, gdlm_bast);
}
static void gdlm_put_lock(struct kmem_cache *cachep, struct gfs2_glock *gl)
diff --git a/fs/gfs2/ops_inode.c b/fs/gfs2/ops_inode.c
index 12cbea7502c..1db6b734322 100644
--- a/fs/gfs2/ops_inode.c
+++ b/fs/gfs2/ops_inode.c
@@ -1069,7 +1069,6 @@ static int setattr_chown(struct inode *inode, struct iattr *attr)
{
struct gfs2_inode *ip = GFS2_I(inode);
struct gfs2_sbd *sdp = GFS2_SB(inode);
- struct buffer_head *dibh;
u32 ouid, ogid, nuid, ngid;
int error;
@@ -1100,25 +1099,10 @@ static int setattr_chown(struct inode *inode, struct iattr *attr)
if (error)
goto out_gunlock_q;
- error = gfs2_meta_inode_buffer(ip, &dibh);
+ error = gfs2_setattr_simple(ip, attr);
if (error)
goto out_end_trans;
- if ((attr->ia_valid & ATTR_SIZE) &&
- attr->ia_size != i_size_read(inode)) {
- int error;
-
- error = vmtruncate(inode, attr->ia_size);
- gfs2_assert_warn(sdp, !error);
- }
-
- setattr_copy(inode, attr);
- mark_inode_dirty(inode);
-
- gfs2_trans_add_bh(ip->i_gl, dibh, 1);
- gfs2_dinode_out(ip, dibh->b_data);
- brelse(dibh);
-
if (ouid != NO_QUOTA_CHANGE || ogid != NO_QUOTA_CHANGE) {
u64 blocks = gfs2_get_inode_blocks(&ip->i_inode);
gfs2_quota_change(ip, -blocks, ouid, ogid);
diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c
index 58a9b9998b4..a689901963d 100644
--- a/fs/gfs2/quota.c
+++ b/fs/gfs2/quota.c
@@ -631,6 +631,7 @@ static int gfs2_adjust_quota(struct gfs2_inode *ip, loff_t loc,
struct fs_disk_quota *fdq)
{
struct inode *inode = &ip->i_inode;
+ struct gfs2_sbd *sdp = GFS2_SB(inode);
struct address_space *mapping = inode->i_mapping;
unsigned long index = loc >> PAGE_CACHE_SHIFT;
unsigned offset = loc & (PAGE_CACHE_SIZE - 1);
@@ -658,13 +659,17 @@ static int gfs2_adjust_quota(struct gfs2_inode *ip, loff_t loc,
qd->qd_qb.qb_value = qp->qu_value;
if (fdq) {
if (fdq->d_fieldmask & FS_DQ_BSOFT) {
- qp->qu_warn = cpu_to_be64(fdq->d_blk_softlimit);
+ qp->qu_warn = cpu_to_be64(fdq->d_blk_softlimit >> sdp->sd_fsb2bb_shift);
qd->qd_qb.qb_warn = qp->qu_warn;
}
if (fdq->d_fieldmask & FS_DQ_BHARD) {
- qp->qu_limit = cpu_to_be64(fdq->d_blk_hardlimit);
+ qp->qu_limit = cpu_to_be64(fdq->d_blk_hardlimit >> sdp->sd_fsb2bb_shift);
qd->qd_qb.qb_limit = qp->qu_limit;
}
+ if (fdq->d_fieldmask & FS_DQ_BCOUNT) {
+ qp->qu_value = cpu_to_be64(fdq->d_bcount >> sdp->sd_fsb2bb_shift);
+ qd->qd_qb.qb_value = qp->qu_value;
+ }
}
/* Write the quota into the quota file on disk */
@@ -1497,9 +1502,9 @@ static int gfs2_get_dqblk(struct super_block *sb, int type, qid_t id,
fdq->d_version = FS_DQUOT_VERSION;
fdq->d_flags = (type == QUOTA_USER) ? FS_USER_QUOTA : FS_GROUP_QUOTA;
fdq->d_id = id;
- fdq->d_blk_hardlimit = be64_to_cpu(qlvb->qb_limit);
- fdq->d_blk_softlimit = be64_to_cpu(qlvb->qb_warn);
- fdq->d_bcount = be64_to_cpu(qlvb->qb_value);
+ fdq->d_blk_hardlimit = be64_to_cpu(qlvb->qb_limit) << sdp->sd_fsb2bb_shift;
+ fdq->d_blk_softlimit = be64_to_cpu(qlvb->qb_warn) << sdp->sd_fsb2bb_shift;
+ fdq->d_bcount = be64_to_cpu(qlvb->qb_value) << sdp->sd_fsb2bb_shift;
gfs2_glock_dq_uninit(&q_gh);
out:
@@ -1508,7 +1513,7 @@ out:
}
/* GFS2 only supports a subset of the XFS fields */
-#define GFS2_FIELDMASK (FS_DQ_BSOFT|FS_DQ_BHARD)
+#define GFS2_FIELDMASK (FS_DQ_BSOFT|FS_DQ_BHARD|FS_DQ_BCOUNT)
static int gfs2_set_dqblk(struct super_block *sb, int type, qid_t id,
struct fs_disk_quota *fdq)
@@ -1566,11 +1571,17 @@ static int gfs2_set_dqblk(struct super_block *sb, int type, qid_t id,
/* If nothing has changed, this is a no-op */
if ((fdq->d_fieldmask & FS_DQ_BSOFT) &&
- (fdq->d_blk_softlimit == be64_to_cpu(qd->qd_qb.qb_warn)))
+ ((fdq->d_blk_softlimit >> sdp->sd_fsb2bb_shift) == be64_to_cpu(qd->qd_qb.qb_warn)))
fdq->d_fieldmask ^= FS_DQ_BSOFT;
+
if ((fdq->d_fieldmask & FS_DQ_BHARD) &&
- (fdq->d_blk_hardlimit == be64_to_cpu(qd->qd_qb.qb_limit)))
+ ((fdq->d_blk_hardlimit >> sdp->sd_fsb2bb_shift) == be64_to_cpu(qd->qd_qb.qb_limit)))
fdq->d_fieldmask ^= FS_DQ_BHARD;
+
+ if ((fdq->d_fieldmask & FS_DQ_BCOUNT) &&
+ ((fdq->d_bcount >> sdp->sd_fsb2bb_shift) == be64_to_cpu(qd->qd_qb.qb_value)))
+ fdq->d_fieldmask ^= FS_DQ_BCOUNT;
+
if (fdq->d_fieldmask == 0)
goto out_i;
@@ -1619,4 +1630,3 @@ const struct quotactl_ops gfs2_quotactl_ops = {
.get_dqblk = gfs2_get_dqblk,
.set_dqblk = gfs2_set_dqblk,
};
-
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index bef3ab6cf5c..7293ea27020 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -500,7 +500,7 @@ u64 gfs2_ri_total(struct gfs2_sbd *sdp)
for (rgrps = 0;; rgrps++) {
loff_t pos = rgrps * sizeof(struct gfs2_rindex);
- if (pos + sizeof(struct gfs2_rindex) >= i_size_read(inode))
+ if (pos + sizeof(struct gfs2_rindex) > i_size_read(inode))
break;
error = gfs2_internal_read(ip, &ra_state, buf, &pos,
sizeof(struct gfs2_rindex));
@@ -583,7 +583,7 @@ static int read_rindex_entry(struct gfs2_inode *ip,
* Returns: 0 on successful update, error code otherwise
*/
-static int gfs2_ri_update(struct gfs2_inode *ip)
+int gfs2_ri_update(struct gfs2_inode *ip)
{
struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
struct inode *inode = &ip->i_inode;
@@ -614,46 +614,6 @@ static int gfs2_ri_update(struct gfs2_inode *ip)
}
/**
- * gfs2_ri_update_special - Pull in a new resource index from the disk
- *
- * This is a special version that's safe to call from gfs2_inplace_reserve_i.
- * In this case we know that we don't have any resource groups in memory yet.
- *
- * @ip: pointer to the rindex inode
- *
- * Returns: 0 on successful update, error code otherwise
- */
-static int gfs2_ri_update_special(struct gfs2_inode *ip)
-{
- struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
- struct inode *inode = &ip->i_inode;
- struct file_ra_state ra_state;
- struct gfs2_rgrpd *rgd;
- unsigned int max_data = 0;
- int error;
-
- file_ra_state_init(&ra_state, inode->i_mapping);
- for (sdp->sd_rgrps = 0;; sdp->sd_rgrps++) {
- /* Ignore partials */
- if ((sdp->sd_rgrps + 1) * sizeof(struct gfs2_rindex) >
- i_size_read(inode))
- break;
- error = read_rindex_entry(ip, &ra_state);
- if (error) {
- clear_rgrpdi(sdp);
- return error;
- }
- }
- list_for_each_entry(rgd, &sdp->sd_rindex_list, rd_list)
- if (rgd->rd_data > max_data)
- max_data = rgd->rd_data;
- sdp->sd_max_rg_data = max_data;
-
- sdp->sd_rindex_uptodate = 1;
- return 0;
-}
-
-/**
* gfs2_rindex_hold - Grab a lock on the rindex
* @sdp: The GFS2 superblock
* @ri_gh: the glock holder
@@ -963,17 +923,18 @@ static int try_rgrp_fit(struct gfs2_rgrpd *rgd, struct gfs2_alloc *al)
* The inode, if one has been found, in inode.
*/
-static u64 try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked,
- u64 skip)
+static void try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked, u64 skip)
{
u32 goal = 0, block;
u64 no_addr;
struct gfs2_sbd *sdp = rgd->rd_sbd;
unsigned int n;
+ struct gfs2_glock *gl;
+ struct gfs2_inode *ip;
+ int error;
+ int found = 0;
- for(;;) {
- if (goal >= rgd->rd_data)
- break;
+ while (goal < rgd->rd_data) {
down_write(&sdp->sd_log_flush_lock);
n = 1;
block = rgblk_search(rgd, goal, GFS2_BLKST_UNLINKED,
@@ -990,11 +951,32 @@ static u64 try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked,
if (no_addr == skip)
continue;
*last_unlinked = no_addr;
- return no_addr;
+
+ error = gfs2_glock_get(sdp, no_addr, &gfs2_inode_glops, CREATE, &gl);
+ if (error)
+ continue;
+
+ /* If the inode is already in cache, we can ignore it here
+ * because the existing inode disposal code will deal with
+ * it when all refs have gone away. Accessing gl_object like
+ * this is not safe in general. Here it is ok because we do
+ * not dereference the pointer, and we only need an approx
+ * answer to whether it is NULL or not.
+ */
+ ip = gl->gl_object;
+
+ if (ip || queue_work(gfs2_delete_workqueue, &gl->gl_delete) == 0)
+ gfs2_glock_put(gl);
+ else
+ found++;
+
+ /* Limit reclaim to sensible number of tasks */
+ if (found > 2*NR_CPUS)
+ return;
}
rgd->rd_flags &= ~GFS2_RDF_CHECK;
- return 0;
+ return;
}
/**
@@ -1075,11 +1057,9 @@ static void forward_rgrp_set(struct gfs2_sbd *sdp, struct gfs2_rgrpd *rgd)
* Try to acquire rgrp in way which avoids contending with others.
*
* Returns: errno
- * unlinked: the block address of an unlinked block to be reclaimed
*/
-static int get_local_rgrp(struct gfs2_inode *ip, u64 *unlinked,
- u64 *last_unlinked)
+static int get_local_rgrp(struct gfs2_inode *ip, u64 *last_unlinked)
{
struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
struct gfs2_rgrpd *rgd, *begin = NULL;
@@ -1089,7 +1069,6 @@ static int get_local_rgrp(struct gfs2_inode *ip, u64 *unlinked,
int loops = 0;
int error, rg_locked;
- *unlinked = 0;
rgd = gfs2_blk2rgrpd(sdp, ip->i_goal);
while (rgd) {
@@ -1106,17 +1085,10 @@ static int get_local_rgrp(struct gfs2_inode *ip, u64 *unlinked,
case 0:
if (try_rgrp_fit(rgd, al))
goto out;
- /* If the rg came in already locked, there's no
- way we can recover from a failed try_rgrp_unlink
- because that would require an iput which can only
- happen after the rgrp is unlocked. */
- if (!rg_locked && rgd->rd_flags & GFS2_RDF_CHECK)
- *unlinked = try_rgrp_unlink(rgd, last_unlinked,
- ip->i_no_addr);
+ if (rgd->rd_flags & GFS2_RDF_CHECK)
+ try_rgrp_unlink(rgd, last_unlinked, ip->i_no_addr);
if (!rg_locked)
gfs2_glock_dq_uninit(&al->al_rgd_gh);
- if (*unlinked)
- return -EAGAIN;
/* fall through */
case GLR_TRYFAILED:
rgd = recent_rgrp_next(rgd);
@@ -1145,13 +1117,10 @@ static int get_local_rgrp(struct gfs2_inode *ip, u64 *unlinked,
case 0:
if (try_rgrp_fit(rgd, al))
goto out;
- if (!rg_locked && rgd->rd_flags & GFS2_RDF_CHECK)
- *unlinked = try_rgrp_unlink(rgd, last_unlinked,
- ip->i_no_addr);
+ if (rgd->rd_flags & GFS2_RDF_CHECK)
+ try_rgrp_unlink(rgd, last_unlinked, ip->i_no_addr);
if (!rg_locked)
gfs2_glock_dq_uninit(&al->al_rgd_gh);
- if (*unlinked)
- return -EAGAIN;
break;
case GLR_TRYFAILED:
@@ -1204,12 +1173,12 @@ int gfs2_inplace_reserve_i(struct gfs2_inode *ip, int hold_rindex,
struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
struct gfs2_alloc *al = ip->i_alloc;
int error = 0;
- u64 last_unlinked = NO_BLOCK, unlinked;
+ u64 last_unlinked = NO_BLOCK;
+ int tries = 0;
if (gfs2_assert_warn(sdp, al->al_requested))
return -EINVAL;
-try_again:
if (hold_rindex) {
/* We need to hold the rindex unless the inode we're using is
the rindex itself, in which case it's already held. */
@@ -1217,32 +1186,33 @@ try_again:
error = gfs2_rindex_hold(sdp, &al->al_ri_gh);
else if (!sdp->sd_rgrps) /* We may not have the rindex read
in, so: */
- error = gfs2_ri_update_special(ip);
+ error = gfs2_ri_update(ip);
+ if (error)
+ return error;
}
- if (error)
- return error;
+try_again:
+ do {
+ error = get_local_rgrp(ip, &last_unlinked);
+ /* If there is no space, flushing the log may release some */
+ if (error) {
+ if (ip == GFS2_I(sdp->sd_rindex) &&
+ !sdp->sd_rindex_uptodate) {
+ error = gfs2_ri_update(ip);
+ if (error)
+ return error;
+ goto try_again;
+ }
+ gfs2_log_flush(sdp, NULL);
+ }
+ } while (error && tries++ < 3);
- /* Find an rgrp suitable for allocation. If it encounters any unlinked
- dinodes along the way, error will equal -EAGAIN and unlinked will
- contains it block address. We then need to look up that inode and
- try to free it, and try the allocation again. */
- error = get_local_rgrp(ip, &unlinked, &last_unlinked);
if (error) {
if (hold_rindex && ip != GFS2_I(sdp->sd_rindex))
gfs2_glock_dq_uninit(&al->al_ri_gh);
- if (error != -EAGAIN)
- return error;
-
- gfs2_process_unlinked_inode(ip->i_inode.i_sb, unlinked);
- /* regardless of whether or not gfs2_process_unlinked_inode
- was successful, we don't want to repeat it again. */
- last_unlinked = unlinked;
- gfs2_log_flush(sdp, NULL);
- error = 0;
-
- goto try_again;
+ return error;
}
+
/* no error, so we have the rgrp set in the inode's allocation. */
al->al_file = file;
al->al_line = line;
diff --git a/fs/gfs2/rgrp.h b/fs/gfs2/rgrp.h
index 0e35c0466f9..50c2bb04369 100644
--- a/fs/gfs2/rgrp.h
+++ b/fs/gfs2/rgrp.h
@@ -48,6 +48,7 @@ extern int gfs2_inplace_reserve_i(struct gfs2_inode *ip, int hold_rindex,
extern void gfs2_inplace_release(struct gfs2_inode *ip);
+extern int gfs2_ri_update(struct gfs2_inode *ip);
extern int gfs2_alloc_block(struct gfs2_inode *ip, u64 *bn, unsigned int *n);
extern int gfs2_alloc_di(struct gfs2_inode *ip, u64 *bn, u64 *generation);
diff --git a/fs/gfs2/xattr.c b/fs/gfs2/xattr.c
index 30b58f07c8a..439b61c0326 100644
--- a/fs/gfs2/xattr.c
+++ b/fs/gfs2/xattr.c
@@ -1296,10 +1296,8 @@ fail:
int gfs2_xattr_acl_chmod(struct gfs2_inode *ip, struct iattr *attr, char *data)
{
- struct inode *inode = &ip->i_inode;
struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
struct gfs2_ea_location el;
- struct buffer_head *dibh;
int error;
error = gfs2_ea_find(ip, GFS2_EATYPE_SYS, GFS2_POSIX_ACL_ACCESS, &el);
@@ -1321,26 +1319,7 @@ int gfs2_xattr_acl_chmod(struct gfs2_inode *ip, struct iattr *attr, char *data)
if (error)
return error;
- error = gfs2_meta_inode_buffer(ip, &dibh);
- if (error)
- goto out_trans_end;
-
- if ((attr->ia_valid & ATTR_SIZE) &&
- attr->ia_size != i_size_read(inode)) {
- int error;
-
- error = vmtruncate(inode, attr->ia_size);
- gfs2_assert_warn(GFS2_SB(inode), !error);
- }
-
- setattr_copy(inode, attr);
- mark_inode_dirty(inode);
-
- gfs2_trans_add_bh(ip->i_gl, dibh, 1);
- gfs2_dinode_out(ip, dibh->b_data);
- brelse(dibh);
-
-out_trans_end:
+ error = gfs2_setattr_simple(ip, attr);
gfs2_trans_end(sdp);
return error;
}
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index d6cfac1f0a4..a5fe68189ee 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -932,8 +932,7 @@ struct file *hugetlb_file_setup(const char *name, size_t size, int acctflag,
if (creat_flags == HUGETLB_SHMFS_INODE && !can_do_hugetlb_shm()) {
*user = current_user();
if (user_shm_lock(size, *user)) {
- WARN_ONCE(1,
- "Using mlock ulimits for SHM_HUGETLB deprecated\n");
+ printk_once(KERN_WARNING "Using mlock ulimits for SHM_HUGETLB is deprecated\n");
} else {
*user = NULL;
return ERR_PTR(-EPERM);
diff --git a/fs/ioctl.c b/fs/ioctl.c
index e92fdbb3bc3..d6cc1647662 100644
--- a/fs/ioctl.c
+++ b/fs/ioctl.c
@@ -6,7 +6,6 @@
#include <linux/syscalls.h>
#include <linux/mm.h>
-#include <linux/smp_lock.h>
#include <linux/capability.h>
#include <linux/file.h>
#include <linux/fs.h>
@@ -530,41 +529,6 @@ static int ioctl_fsthaw(struct file *filp)
return thaw_super(sb);
}
-static int ioctl_fstrim(struct file *filp, void __user *argp)
-{
- struct super_block *sb = filp->f_path.dentry->d_inode->i_sb;
- struct fstrim_range range;
- int ret = 0;
-
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
-
- /* If filesystem doesn't support trim feature, return. */
- if (sb->s_op->trim_fs == NULL)
- return -EOPNOTSUPP;
-
- /* If a blockdevice-backed filesystem isn't specified, return EINVAL. */
- if (sb->s_bdev == NULL)
- return -EINVAL;
-
- if (argp == NULL) {
- range.start = 0;
- range.len = ULLONG_MAX;
- range.minlen = 0;
- } else if (copy_from_user(&range, argp, sizeof(range)))
- return -EFAULT;
-
- ret = sb->s_op->trim_fs(sb, &range);
- if (ret < 0)
- return ret;
-
- if ((argp != NULL) &&
- (copy_to_user(argp, &range, sizeof(range))))
- return -EFAULT;
-
- return 0;
-}
-
/*
* When you add any new common ioctls to the switches above and below
* please update compat_sys_ioctl() too.
@@ -615,10 +579,6 @@ int do_vfs_ioctl(struct file *filp, unsigned int fd, unsigned int cmd,
error = ioctl_fsthaw(filp);
break;
- case FITRIM:
- error = ioctl_fstrim(filp, argp);
- break;
-
case FS_IOC_FIEMAP:
return ioctl_fiemap(filp, arg);
diff --git a/fs/ioprio.c b/fs/ioprio.c
index 748cfb92dcc..7da2a06508e 100644
--- a/fs/ioprio.c
+++ b/fs/ioprio.c
@@ -103,12 +103,7 @@ SYSCALL_DEFINE3(ioprio_set, int, which, int, who, int, ioprio)
}
ret = -ESRCH;
- /*
- * We want IOPRIO_WHO_PGRP/IOPRIO_WHO_USER to be "atomic",
- * so we can't use rcu_read_lock(). See re-copy of ->ioprio
- * in copy_process().
- */
- read_lock(&tasklist_lock);
+ rcu_read_lock();
switch (which) {
case IOPRIO_WHO_PROCESS:
if (!who)
@@ -153,7 +148,7 @@ free_uid:
ret = -EINVAL;
}
- read_unlock(&tasklist_lock);
+ rcu_read_unlock();
return ret;
}
@@ -197,7 +192,7 @@ SYSCALL_DEFINE2(ioprio_get, int, which, int, who)
int ret = -ESRCH;
int tmpio;
- read_lock(&tasklist_lock);
+ rcu_read_lock();
switch (which) {
case IOPRIO_WHO_PROCESS:
if (!who)
@@ -250,6 +245,6 @@ SYSCALL_DEFINE2(ioprio_get, int, which, int, who)
ret = -EINVAL;
}
- read_unlock(&tasklist_lock);
+ rcu_read_unlock();
return ret;
}
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index c590d155c09..f837ba95352 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -899,6 +899,14 @@ journal_t * jbd2_journal_init_dev(struct block_device *bdev,
/* journal descriptor can store up to n blocks -bzzz */
journal->j_blocksize = blocksize;
+ journal->j_dev = bdev;
+ journal->j_fs_dev = fs_dev;
+ journal->j_blk_offset = start;
+ journal->j_maxlen = len;
+ bdevname(journal->j_dev, journal->j_devname);
+ p = journal->j_devname;
+ while ((p = strchr(p, '/')))
+ *p = '!';
jbd2_stats_proc_init(journal);
n = journal->j_blocksize / sizeof(journal_block_tag_t);
journal->j_wbufsize = n;
@@ -908,14 +916,6 @@ journal_t * jbd2_journal_init_dev(struct block_device *bdev,
__func__);
goto out_err;
}
- journal->j_dev = bdev;
- journal->j_fs_dev = fs_dev;
- journal->j_blk_offset = start;
- journal->j_maxlen = len;
- bdevname(journal->j_dev, journal->j_devname);
- p = journal->j_devname;
- while ((p = strchr(p, '/')))
- *p = '!';
bh = __getblk(journal->j_dev, start, journal->j_blocksize);
if (!bh) {
diff --git a/fs/lockd/clntlock.c b/fs/lockd/clntlock.c
index d5bb86866e6..25509eb28fd 100644
--- a/fs/lockd/clntlock.c
+++ b/fs/lockd/clntlock.c
@@ -14,7 +14,6 @@
#include <linux/sunrpc/clnt.h>
#include <linux/sunrpc/svc.h>
#include <linux/lockd/lockd.h>
-#include <linux/smp_lock.h>
#include <linux/kthread.h>
#define NLMDBG_FACILITY NLMDBG_CLIENT
diff --git a/fs/lockd/clntproc.c b/fs/lockd/clntproc.c
index 47ea1e1925b..332c54cf75e 100644
--- a/fs/lockd/clntproc.c
+++ b/fs/lockd/clntproc.c
@@ -7,7 +7,6 @@
*/
#include <linux/module.h>
-#include <linux/smp_lock.h>
#include <linux/slab.h>
#include <linux/types.h>
#include <linux/errno.h>
diff --git a/fs/lockd/host.c b/fs/lockd/host.c
index 25e21e4023b..ed0c59fe23c 100644
--- a/fs/lockd/host.c
+++ b/fs/lockd/host.c
@@ -124,7 +124,7 @@ static struct nlm_host *nlm_lookup_host(struct nlm_lookup_host_info *ni)
continue;
if (host->h_server != ni->server)
continue;
- if (ni->server &&
+ if (ni->server && ni->src_len != 0 &&
!rpc_cmp_addr(nlm_srcaddr(host), ni->src_sap))
continue;
@@ -167,6 +167,7 @@ static struct nlm_host *nlm_lookup_host(struct nlm_lookup_host_info *ni)
host->h_addrlen = ni->salen;
rpc_set_port(nlm_addr(host), 0);
memcpy(nlm_srcaddr(host), ni->src_sap, ni->src_len);
+ host->h_srcaddrlen = ni->src_len;
host->h_version = ni->version;
host->h_proto = ni->protocol;
host->h_rpcclnt = NULL;
@@ -238,9 +239,6 @@ struct nlm_host *nlmclnt_lookup_host(const struct sockaddr *sap,
const char *hostname,
int noresvport)
{
- const struct sockaddr source = {
- .sa_family = AF_UNSPEC,
- };
struct nlm_lookup_host_info ni = {
.server = 0,
.sap = sap,
@@ -249,8 +247,6 @@ struct nlm_host *nlmclnt_lookup_host(const struct sockaddr *sap,
.version = version,
.hostname = hostname,
.hostname_len = strlen(hostname),
- .src_sap = &source,
- .src_len = sizeof(source),
.noresvport = noresvport,
};
@@ -357,7 +353,6 @@ nlm_bind_host(struct nlm_host *host)
.protocol = host->h_proto,
.address = nlm_addr(host),
.addrsize = host->h_addrlen,
- .saddress = nlm_srcaddr(host),
.timeout = &timeparms,
.servername = host->h_name,
.program = &nlm_program,
@@ -376,6 +371,8 @@ nlm_bind_host(struct nlm_host *host)
args.flags |= RPC_CLNT_CREATE_HARDRTRY;
if (host->h_noresvport)
args.flags |= RPC_CLNT_CREATE_NONPRIVPORT;
+ if (host->h_srcaddrlen)
+ args.saddress = nlm_srcaddr(host);
clnt = rpc_create(&args);
if (!IS_ERR(clnt))
diff --git a/fs/lockd/svc4proc.c b/fs/lockd/svc4proc.c
index a336e832475..38d26119245 100644
--- a/fs/lockd/svc4proc.c
+++ b/fs/lockd/svc4proc.c
@@ -9,7 +9,6 @@
#include <linux/types.h>
#include <linux/time.h>
-#include <linux/smp_lock.h>
#include <linux/lockd/lockd.h>
#include <linux/lockd/share.h>
diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c
index c462d346acb..ef5659b211e 100644
--- a/fs/lockd/svclock.c
+++ b/fs/lockd/svclock.c
@@ -25,7 +25,6 @@
#include <linux/errno.h>
#include <linux/kernel.h>
#include <linux/sched.h>
-#include <linux/smp_lock.h>
#include <linux/sunrpc/clnt.h>
#include <linux/sunrpc/svc.h>
#include <linux/lockd/nlm.h>
diff --git a/fs/lockd/svcproc.c b/fs/lockd/svcproc.c
index c3069f38d60..0caea5310ac 100644
--- a/fs/lockd/svcproc.c
+++ b/fs/lockd/svcproc.c
@@ -9,7 +9,6 @@
#include <linux/types.h>
#include <linux/time.h>
-#include <linux/smp_lock.h>
#include <linux/lockd/lockd.h>
#include <linux/lockd/share.h>
diff --git a/fs/locks.c b/fs/locks.c
index 65765cb6afe..8729347bcd1 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -122,7 +122,6 @@
#include <linux/module.h>
#include <linux/security.h>
#include <linux/slab.h>
-#include <linux/smp_lock.h>
#include <linux/syscalls.h>
#include <linux/time.h>
#include <linux/rcupdate.h>
@@ -1504,9 +1503,8 @@ static int do_fcntl_delete_lease(struct file *filp)
static int do_fcntl_add_lease(unsigned int fd, struct file *filp, long arg)
{
- struct file_lock *fl;
+ struct file_lock *fl, *ret;
struct fasync_struct *new;
- struct inode *inode = filp->f_path.dentry->d_inode;
int error;
fl = lease_alloc(filp, arg);
@@ -1518,13 +1516,16 @@ static int do_fcntl_add_lease(unsigned int fd, struct file *filp, long arg)
locks_free_lock(fl);
return -ENOMEM;
}
+ ret = fl;
lock_flocks();
- error = __vfs_setlease(filp, arg, &fl);
+ error = __vfs_setlease(filp, arg, &ret);
if (error) {
unlock_flocks();
locks_free_lock(fl);
goto out_free_fasync;
}
+ if (ret != fl)
+ locks_free_lock(fl);
/*
* fasync_insert_entry() returns the old entry if any.
@@ -1532,17 +1533,10 @@ static int do_fcntl_add_lease(unsigned int fd, struct file *filp, long arg)
* inserted it into the fasync list. Clear new so that
* we don't release it here.
*/
- if (!fasync_insert_entry(fd, filp, &fl->fl_fasync, new))
+ if (!fasync_insert_entry(fd, filp, &ret->fl_fasync, new))
new = NULL;
- if (error < 0) {
- /* remove lease just inserted by setlease */
- fl->fl_type = F_UNLCK | F_INPROGRESS;
- fl->fl_break_time = jiffies - 10;
- time_out_leases(inode);
- } else {
- error = __f_setown(filp, task_pid(current), PIDTYPE_PID, 0);
- }
+ error = __f_setown(filp, task_pid(current), PIDTYPE_PID, 0);
unlock_flocks();
out_free_fasync:
diff --git a/fs/logfs/journal.c b/fs/logfs/journal.c
index f46ee8b0e13..9da29706f91 100644
--- a/fs/logfs/journal.c
+++ b/fs/logfs/journal.c
@@ -828,7 +828,7 @@ void do_logfs_journal_wl_pass(struct super_block *sb)
super->s_journal_seg[i] = segno;
super->s_journal_ec[i] = ec;
logfs_set_segment_reserved(sb, segno);
- err = btree_insert32(head, segno, (void *)1, GFP_KERNEL);
+ err = btree_insert32(head, segno, (void *)1, GFP_NOFS);
BUG_ON(err); /* mempool should prevent this */
err = logfs_erase_segment(sb, segno, 1);
BUG_ON(err); /* FIXME: remount-ro would be nicer */
diff --git a/fs/logfs/logfs.h b/fs/logfs/logfs.h
index cd51a36b37f..57afd4a6fab 100644
--- a/fs/logfs/logfs.h
+++ b/fs/logfs/logfs.h
@@ -486,7 +486,7 @@ static inline int logfs_get_sb_bdev(struct logfs_super *s,
/* dev_mtd.c */
#ifdef CONFIG_MTD
-int logfs_get_sb_mtd(struct logfs_super *s, int mtdnr)
+int logfs_get_sb_mtd(struct logfs_super *s, int mtdnr);
#else
static inline int logfs_get_sb_mtd(struct logfs_super *s, int mtdnr)
{
diff --git a/fs/logfs/readwrite.c b/fs/logfs/readwrite.c
index 6127baf0e18..ee99a9f5dfd 100644
--- a/fs/logfs/readwrite.c
+++ b/fs/logfs/readwrite.c
@@ -1994,6 +1994,9 @@ static int do_write_inode(struct inode *inode)
/* FIXME: transaction is part of logfs_block now. Is that enough? */
err = logfs_write_buf(master_inode, page, 0);
+ if (err)
+ move_page_to_inode(inode, page);
+
logfs_put_write_page(page);
return err;
}
diff --git a/fs/namei.c b/fs/namei.c
index 5362af9b737..4ff7ca53053 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1748,6 +1748,9 @@ struct file *do_filp_open(int dfd, const char *pathname,
if (!(open_flag & O_CREAT))
mode = 0;
+ /* Must never be set by userspace */
+ open_flag &= ~FMODE_NONOTIFY;
+
/*
* O_SYNC is implemented as __O_SYNC|O_DSYNC. As many places only
* check for O_DSYNC if the need any syncing at all we enforce it's
diff --git a/fs/namespace.c b/fs/namespace.c
index 8a415c9c5e5..3dbfc072ec7 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -13,7 +13,6 @@
#include <linux/sched.h>
#include <linux/spinlock.h>
#include <linux/percpu.h>
-#include <linux/smp_lock.h>
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/acct.h>
diff --git a/fs/ncpfs/dir.c b/fs/ncpfs/dir.c
index aac8832e919..f22b12e7d33 100644
--- a/fs/ncpfs/dir.c
+++ b/fs/ncpfs/dir.c
@@ -19,7 +19,6 @@
#include <linux/mm.h>
#include <asm/uaccess.h>
#include <asm/byteorder.h>
-#include <linux/smp_lock.h>
#include <linux/ncp_fs.h>
diff --git a/fs/ncpfs/file.c b/fs/ncpfs/file.c
index 6c754f70c52..cb50aaf981d 100644
--- a/fs/ncpfs/file.c
+++ b/fs/ncpfs/file.c
@@ -17,7 +17,6 @@
#include <linux/mm.h>
#include <linux/vmalloc.h>
#include <linux/sched.h>
-#include <linux/smp_lock.h>
#include <linux/ncp_fs.h>
#include "ncplib_kernel.h"
diff --git a/fs/ncpfs/inode.c b/fs/ncpfs/inode.c
index d290545aa0c..8fb93b604e7 100644
--- a/fs/ncpfs/inode.c
+++ b/fs/ncpfs/inode.c
@@ -26,7 +26,6 @@
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <linux/init.h>
-#include <linux/smp_lock.h>
#include <linux/vfs.h>
#include <linux/mount.h>
#include <linux/seq_file.h>
diff --git a/fs/ncpfs/ioctl.c b/fs/ncpfs/ioctl.c
index c2a1f9a155c..d40a547e337 100644
--- a/fs/ncpfs/ioctl.c
+++ b/fs/ncpfs/ioctl.c
@@ -17,7 +17,6 @@
#include <linux/mount.h>
#include <linux/slab.h>
#include <linux/highuid.h>
-#include <linux/smp_lock.h>
#include <linux/vmalloc.h>
#include <linux/sched.h>
diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c
index aeec017fe81..93a8b3bd69e 100644
--- a/fs/nfs/callback.c
+++ b/fs/nfs/callback.c
@@ -9,7 +9,6 @@
#include <linux/completion.h>
#include <linux/ip.h>
#include <linux/module.h>
-#include <linux/smp_lock.h>
#include <linux/sunrpc/svc.h>
#include <linux/sunrpc/svcsock.h>
#include <linux/nfs_fs.h>
diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c
index 232a7eead33..1fd62fc49be 100644
--- a/fs/nfs/delegation.c
+++ b/fs/nfs/delegation.c
@@ -11,7 +11,6 @@
#include <linux/module.h>
#include <linux/sched.h>
#include <linux/slab.h>
-#include <linux/smp_lock.h>
#include <linux/spinlock.h>
#include <linux/nfs4.h>
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 07ac3847e56..996dd8989a9 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -34,6 +34,7 @@
#include <linux/mount.h>
#include <linux/sched.h>
#include <linux/vmalloc.h>
+#include <linux/kmemleak.h>
#include "delegation.h"
#include "iostat.h"
@@ -56,7 +57,7 @@ static int nfs_rename(struct inode *, struct dentry *,
struct inode *, struct dentry *);
static int nfs_fsync_dir(struct file *, int);
static loff_t nfs_llseek_dir(struct file *, loff_t, int);
-static int nfs_readdir_clear_array(struct page*, gfp_t);
+static void nfs_readdir_clear_array(struct page*);
const struct file_operations nfs_dir_operations = {
.llseek = nfs_llseek_dir,
@@ -82,8 +83,8 @@ const struct inode_operations nfs_dir_inode_operations = {
.setattr = nfs_setattr,
};
-const struct address_space_operations nfs_dir_addr_space_ops = {
- .releasepage = nfs_readdir_clear_array,
+const struct address_space_operations nfs_dir_aops = {
+ .freepage = nfs_readdir_clear_array,
};
#ifdef CONFIG_NFS_V3
@@ -161,6 +162,7 @@ struct nfs_cache_array_entry {
u64 cookie;
u64 ino;
struct qstr string;
+ unsigned char d_type;
};
struct nfs_cache_array {
@@ -170,14 +172,13 @@ struct nfs_cache_array {
struct nfs_cache_array_entry array[0];
};
-#define MAX_READDIR_ARRAY ((PAGE_SIZE - sizeof(struct nfs_cache_array)) / sizeof(struct nfs_cache_array_entry))
-
typedef __be32 * (*decode_dirent_t)(struct xdr_stream *, struct nfs_entry *, struct nfs_server *, int);
typedef struct {
struct file *file;
struct page *page;
unsigned long page_index;
u64 *dir_cookie;
+ u64 last_cookie;
loff_t current_index;
decode_dirent_t decode;
@@ -194,9 +195,13 @@ typedef struct {
static
struct nfs_cache_array *nfs_readdir_get_array(struct page *page)
{
+ void *ptr;
if (page == NULL)
return ERR_PTR(-EIO);
- return (struct nfs_cache_array *)kmap(page);
+ ptr = kmap(page);
+ if (ptr == NULL)
+ return ERR_PTR(-ENOMEM);
+ return ptr;
}
static
@@ -209,14 +214,15 @@ void nfs_readdir_release_array(struct page *page)
* we are freeing strings created by nfs_add_to_readdir_array()
*/
static
-int nfs_readdir_clear_array(struct page *page, gfp_t mask)
+void nfs_readdir_clear_array(struct page *page)
{
- struct nfs_cache_array *array = nfs_readdir_get_array(page);
+ struct nfs_cache_array *array;
int i;
+
+ array = kmap_atomic(page, KM_USER0);
for (i = 0; i < array->size; i++)
kfree(array->array[i].string.name);
- nfs_readdir_release_array(page);
- return 0;
+ kunmap_atomic(array, KM_USER0);
}
/*
@@ -231,6 +237,11 @@ int nfs_readdir_make_qstr(struct qstr *string, const char *name, unsigned int le
string->name = kmemdup(name, len, GFP_KERNEL);
if (string->name == NULL)
return -ENOMEM;
+ /*
+ * Avoid a kmemleak false positive. The pointer to the name is stored
+ * in a page cache page which kmemleak does not scan.
+ */
+ kmemleak_not_leak(string->name);
string->hash = full_name_hash(name, len);
return 0;
}
@@ -244,20 +255,24 @@ int nfs_readdir_add_to_array(struct nfs_entry *entry, struct page *page)
if (IS_ERR(array))
return PTR_ERR(array);
- ret = -EIO;
- if (array->size >= MAX_READDIR_ARRAY)
- goto out;
cache_entry = &array->array[array->size];
+
+ /* Check that this entry lies within the page bounds */
+ ret = -ENOSPC;
+ if ((char *)&cache_entry[1] - (char *)page_address(page) > PAGE_SIZE)
+ goto out;
+
cache_entry->cookie = entry->prev_cookie;
cache_entry->ino = entry->ino;
+ cache_entry->d_type = entry->d_type;
ret = nfs_readdir_make_qstr(&cache_entry->string, entry->name, entry->len);
if (ret)
goto out;
array->last_cookie = entry->cookie;
- if (entry->eof == 1)
- array->eof_index = array->size;
array->size++;
+ if (entry->eof != 0)
+ array->eof_index = array->size;
out:
nfs_readdir_release_array(page);
return ret;
@@ -272,7 +287,7 @@ int nfs_readdir_search_for_pos(struct nfs_cache_array *array, nfs_readdir_descri
if (diff < 0)
goto out_eof;
if (diff >= array->size) {
- if (array->eof_index > 0)
+ if (array->eof_index >= 0)
goto out_eof;
desc->current_index += array->size;
return -EAGAIN;
@@ -281,8 +296,6 @@ int nfs_readdir_search_for_pos(struct nfs_cache_array *array, nfs_readdir_descri
index = (unsigned int)diff;
*desc->dir_cookie = array->array[index].cookie;
desc->cache_entry_index = index;
- if (index == array->eof_index)
- desc->eof = 1;
return 0;
out_eof:
desc->eof = 1;
@@ -296,17 +309,16 @@ int nfs_readdir_search_for_cookie(struct nfs_cache_array *array, nfs_readdir_des
int status = -EAGAIN;
for (i = 0; i < array->size; i++) {
- if (i == array->eof_index) {
- desc->eof = 1;
- status = -EBADCOOKIE;
- }
if (array->array[i].cookie == *desc->dir_cookie) {
desc->cache_entry_index = i;
- status = 0;
- break;
+ return 0;
}
}
-
+ if (array->eof_index >= 0) {
+ status = -EBADCOOKIE;
+ if (*desc->dir_cookie == array->last_cookie)
+ desc->eof = 1;
+ }
return status;
}
@@ -314,10 +326,7 @@ static
int nfs_readdir_search_array(nfs_readdir_descriptor_t *desc)
{
struct nfs_cache_array *array;
- int status = -EBADCOOKIE;
-
- if (desc->dir_cookie == NULL)
- goto out;
+ int status;
array = nfs_readdir_get_array(desc->page);
if (IS_ERR(array)) {
@@ -330,6 +339,10 @@ int nfs_readdir_search_array(nfs_readdir_descriptor_t *desc)
else
status = nfs_readdir_search_for_cookie(array, desc);
+ if (status == -EAGAIN) {
+ desc->last_cookie = array->last_cookie;
+ desc->page_index++;
+ }
nfs_readdir_release_array(desc->page);
out:
return status;
@@ -381,13 +394,9 @@ int xdr_decode(nfs_readdir_descriptor_t *desc, struct nfs_entry *entry, struct x
static
int nfs_same_file(struct dentry *dentry, struct nfs_entry *entry)
{
- struct nfs_inode *node;
if (dentry->d_inode == NULL)
goto different;
- node = NFS_I(dentry->d_inode);
- if (node->fh.size != entry->fh->size)
- goto different;
- if (strncmp(node->fh.data, entry->fh->data, node->fh.size) != 0)
+ if (nfs_compare_fh(entry->fh, NFS_FH(dentry->d_inode)) != 0)
goto different;
return 1;
different:
@@ -449,14 +458,15 @@ out:
/* Perform conversion from xdr to cache array */
static
-void nfs_readdir_page_filler(nfs_readdir_descriptor_t *desc, struct nfs_entry *entry,
+int nfs_readdir_page_filler(nfs_readdir_descriptor_t *desc, struct nfs_entry *entry,
void *xdr_page, struct page *page, unsigned int buflen)
{
struct xdr_stream stream;
struct xdr_buf buf;
__be32 *ptr = xdr_page;
- int status;
struct nfs_cache_array *array;
+ unsigned int count = 0;
+ int status;
buf.head->iov_base = xdr_page;
buf.head->iov_len = buflen;
@@ -471,21 +481,32 @@ void nfs_readdir_page_filler(nfs_readdir_descriptor_t *desc, struct nfs_entry *e
do {
status = xdr_decode(desc, entry, &stream);
- if (status != 0)
+ if (status != 0) {
+ if (status == -EAGAIN)
+ status = 0;
break;
+ }
- if (nfs_readdir_add_to_array(entry, page) == -1)
- break;
- if (desc->plus == 1)
+ count++;
+
+ if (desc->plus != 0)
nfs_prime_dcache(desc->file->f_path.dentry, entry);
+
+ status = nfs_readdir_add_to_array(entry, page);
+ if (status != 0)
+ break;
} while (!entry->eof);
- if (status == -EBADCOOKIE && entry->eof) {
+ if (count == 0 || (status == -EBADCOOKIE && entry->eof != 0)) {
array = nfs_readdir_get_array(page);
- array->eof_index = array->size - 1;
- status = 0;
- nfs_readdir_release_array(page);
+ if (!IS_ERR(array)) {
+ array->eof_index = array->size;
+ status = 0;
+ nfs_readdir_release_array(page);
+ } else
+ status = PTR_ERR(array);
}
+ return status;
}
static
@@ -537,11 +558,11 @@ int nfs_readdir_xdr_to_array(nfs_readdir_descriptor_t *desc, struct page *page,
struct nfs_entry entry;
struct file *file = desc->file;
struct nfs_cache_array *array;
- int status = 0;
+ int status = -ENOMEM;
unsigned int array_size = ARRAY_SIZE(pages);
entry.prev_cookie = 0;
- entry.cookie = *desc->dir_cookie;
+ entry.cookie = desc->last_cookie;
entry.eof = 0;
entry.fh = nfs_alloc_fhandle();
entry.fattr = nfs_alloc_fattr();
@@ -549,6 +570,10 @@ int nfs_readdir_xdr_to_array(nfs_readdir_descriptor_t *desc, struct page *page,
goto out;
array = nfs_readdir_get_array(page);
+ if (IS_ERR(array)) {
+ status = PTR_ERR(array);
+ goto out;
+ }
memset(array, 0, sizeof(struct nfs_cache_array));
array->eof_index = -1;
@@ -556,12 +581,19 @@ int nfs_readdir_xdr_to_array(nfs_readdir_descriptor_t *desc, struct page *page,
if (!pages_ptr)
goto out_release_array;
do {
+ unsigned int pglen;
status = nfs_readdir_xdr_filler(pages, desc, &entry, file, inode);
if (status < 0)
break;
- nfs_readdir_page_filler(desc, &entry, pages_ptr, page, array_size * PAGE_SIZE);
- } while (array->eof_index < 0 && array->size < MAX_READDIR_ARRAY);
+ pglen = status;
+ status = nfs_readdir_page_filler(desc, &entry, pages_ptr, page, pglen);
+ if (status < 0) {
+ if (status == -ENOSPC)
+ status = 0;
+ break;
+ }
+ } while (array->eof_index < 0);
nfs_readdir_free_large_page(pages_ptr, pages, array_size);
out_release_array:
@@ -582,8 +614,10 @@ static
int nfs_readdir_filler(nfs_readdir_descriptor_t *desc, struct page* page)
{
struct inode *inode = desc->file->f_path.dentry->d_inode;
+ int ret;
- if (nfs_readdir_xdr_to_array(desc, page, inode) < 0)
+ ret = nfs_readdir_xdr_to_array(desc, page, inode);
+ if (ret < 0)
goto error;
SetPageUptodate(page);
@@ -595,12 +629,14 @@ int nfs_readdir_filler(nfs_readdir_descriptor_t *desc, struct page* page)
return 0;
error:
unlock_page(page);
- return -EIO;
+ return ret;
}
static
void cache_page_release(nfs_readdir_descriptor_t *desc)
{
+ if (!desc->page->mapping)
+ nfs_readdir_clear_array(desc->page);
page_cache_release(desc->page);
desc->page = NULL;
}
@@ -608,12 +644,8 @@ void cache_page_release(nfs_readdir_descriptor_t *desc)
static
struct page *get_cache_page(nfs_readdir_descriptor_t *desc)
{
- struct page *page;
- page = read_cache_page(desc->file->f_path.dentry->d_inode->i_mapping,
+ return read_cache_page(desc->file->f_path.dentry->d_inode->i_mapping,
desc->page_index, (filler_t *)nfs_readdir_filler, desc);
- if (IS_ERR(page))
- desc->eof = 1;
- return page;
}
/*
@@ -629,9 +661,8 @@ int find_cache_page(nfs_readdir_descriptor_t *desc)
return PTR_ERR(desc->page);
res = nfs_readdir_search_array(desc);
- if (res == 0)
- return 0;
- cache_page_release(desc);
+ if (res != 0)
+ cache_page_release(desc);
return res;
}
@@ -639,22 +670,18 @@ int find_cache_page(nfs_readdir_descriptor_t *desc)
static inline
int readdir_search_pagecache(nfs_readdir_descriptor_t *desc)
{
- int res = -EAGAIN;
+ int res;
- while (1) {
- res = find_cache_page(desc);
- if (res != -EAGAIN)
- break;
- desc->page_index++;
+ if (desc->page_index == 0) {
+ desc->current_index = 0;
+ desc->last_cookie = 0;
}
+ do {
+ res = find_cache_page(desc);
+ } while (res == -EAGAIN);
return res;
}
-static inline unsigned int dt_type(struct inode *inode)
-{
- return (inode->i_mode >> 12) & 15;
-}
-
/*
* Once we've found the start of the dirent within a page: fill 'er up...
*/
@@ -666,35 +693,35 @@ int nfs_do_filldir(nfs_readdir_descriptor_t *desc, void *dirent,
int i = 0;
int res = 0;
struct nfs_cache_array *array = NULL;
- unsigned int d_type = DT_UNKNOWN;
- struct dentry *dentry = NULL;
array = nfs_readdir_get_array(desc->page);
+ if (IS_ERR(array)) {
+ res = PTR_ERR(array);
+ goto out;
+ }
for (i = desc->cache_entry_index; i < array->size; i++) {
- d_type = DT_UNKNOWN;
+ struct nfs_cache_array_entry *ent;
- res = filldir(dirent, array->array[i].string.name,
- array->array[i].string.len, file->f_pos,
- nfs_compat_user_ino64(array->array[i].ino), d_type);
- if (res < 0)
+ ent = &array->array[i];
+ if (filldir(dirent, ent->string.name, ent->string.len,
+ file->f_pos, nfs_compat_user_ino64(ent->ino),
+ ent->d_type) < 0) {
+ desc->eof = 1;
break;
+ }
file->f_pos++;
- desc->cache_entry_index = i;
if (i < (array->size-1))
*desc->dir_cookie = array->array[i+1].cookie;
else
*desc->dir_cookie = array->last_cookie;
- if (i == array->eof_index) {
- desc->eof = 1;
- break;
- }
}
+ if (array->eof_index >= 0)
+ desc->eof = 1;
nfs_readdir_release_array(desc->page);
+out:
cache_page_release(desc);
- if (dentry != NULL)
- dput(dentry);
dfprintk(DIRCACHE, "NFS: nfs_do_filldir() filling ended @ cookie %Lu; returning = %d\n",
(unsigned long long)*desc->dir_cookie, res);
return res;
@@ -729,13 +756,14 @@ int uncached_readdir(nfs_readdir_descriptor_t *desc, void *dirent,
goto out;
}
- if (nfs_readdir_xdr_to_array(desc, page, inode) == -1) {
- status = -EIO;
- goto out_release;
- }
-
desc->page_index = 0;
+ desc->last_cookie = *desc->dir_cookie;
desc->page = page;
+
+ status = nfs_readdir_xdr_to_array(desc, page, inode);
+ if (status < 0)
+ goto out_release;
+
status = nfs_do_filldir(desc, dirent, filldir);
out:
@@ -757,7 +785,7 @@ static int nfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
struct inode *inode = dentry->d_inode;
nfs_readdir_descriptor_t my_desc,
*desc = &my_desc;
- int res = -ENOMEM;
+ int res;
dfprintk(FILE, "NFS: readdir(%s/%s) starting at cookie %llu\n",
dentry->d_parent->d_name.name, dentry->d_name.name,
@@ -782,18 +810,18 @@ static int nfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
if (res < 0)
goto out;
- while (desc->eof != 1) {
+ do {
res = readdir_search_pagecache(desc);
if (res == -EBADCOOKIE) {
+ res = 0;
/* This means either end of directory */
if (*desc->dir_cookie && desc->eof == 0) {
/* Or that the server has 'lost' a cookie */
res = uncached_readdir(desc, dirent, filldir);
- if (res >= 0)
+ if (res == 0)
continue;
}
- res = 0;
break;
}
if (res == -ETOOSMALL && desc->plus) {
@@ -808,11 +836,9 @@ static int nfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
break;
res = nfs_do_filldir(desc, dirent, filldir);
- if (res < 0) {
- res = 0;
+ if (res < 0)
break;
- }
- }
+ } while (!desc->eof);
out:
nfs_unblock_sillyrename(dentry);
if (res > 0)
@@ -1345,12 +1371,12 @@ static struct dentry *nfs_atomic_lookup(struct inode *dir, struct dentry *dentry
res = NULL;
goto out;
/* This turned out not to be a regular file */
- case -EISDIR:
case -ENOTDIR:
goto no_open;
case -ELOOP:
if (!(nd->intent.open.flags & O_NOFOLLOW))
goto no_open;
+ /* case -EISDIR: */
/* case -EINVAL: */
default:
res = ERR_CAST(inode);
diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index 84d3c8b9020..e6ace0d93c7 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -867,7 +867,7 @@ static ssize_t nfs_direct_write(struct kiocb *iocb, const struct iovec *iov,
goto out;
nfs_alloc_commit_data(dreq);
- if (dreq->commit_data == NULL || count < wsize)
+ if (dreq->commit_data == NULL || count <= wsize)
sync = NFS_FILE_SYNC;
dreq->inode = inode;
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index 60677f9f131..7bf029ef408 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -693,6 +693,7 @@ do_getlk(struct file *filp, int cmd, struct file_lock *fl, int is_local)
{
struct inode *inode = filp->f_mapping->host;
int status = 0;
+ unsigned int saved_type = fl->fl_type;
/* Try local locking first */
posix_test_lock(filp, fl);
@@ -700,6 +701,7 @@ do_getlk(struct file *filp, int cmd, struct file_lock *fl, int is_local)
/* found a conflict */
goto out;
}
+ fl->fl_type = saved_type;
if (nfs_have_delegation(inode, FMODE_READ))
goto out_noconflict;
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 314f5716460..e67e31c7341 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -289,6 +289,7 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr)
} else if (S_ISDIR(inode->i_mode)) {
inode->i_op = NFS_SB(sb)->nfs_client->rpc_ops->dir_inode_ops;
inode->i_fop = &nfs_dir_operations;
+ inode->i_data.a_ops = &nfs_dir_aops;
if (nfs_server_capable(inode, NFS_CAP_READDIRPLUS))
set_bit(NFS_INO_ADVISE_RDPLUS, &NFS_I(inode)->flags);
/* Deal with crossing mountpoints */
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index db08ff3ff45..e6356b750b7 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -362,6 +362,15 @@ unsigned int nfs_page_length(struct page *page)
}
/*
+ * Convert a umode to a dirent->d_type
+ */
+static inline
+unsigned char nfs_umode_to_dtype(umode_t mode)
+{
+ return (mode >> 12) & 15;
+}
+
+/*
* Determine the number of pages in an array of length 'len' and
* with a base offset of 'base'
*/
diff --git a/fs/nfs/mount_clnt.c b/fs/nfs/mount_clnt.c
index eceafe74f47..4f981f1f668 100644
--- a/fs/nfs/mount_clnt.c
+++ b/fs/nfs/mount_clnt.c
@@ -505,13 +505,13 @@ static struct rpc_procinfo mnt3_procedures[] = {
static struct rpc_version mnt_version1 = {
.number = 1,
- .nrprocs = 2,
+ .nrprocs = ARRAY_SIZE(mnt_procedures),
.procs = mnt_procedures,
};
static struct rpc_version mnt_version3 = {
.number = 3,
- .nrprocs = 2,
+ .nrprocs = ARRAY_SIZE(mnt3_procedures),
.procs = mnt3_procedures,
};
diff --git a/fs/nfs/nfs2xdr.c b/fs/nfs/nfs2xdr.c
index e6bf45710cc..5914a1911c9 100644
--- a/fs/nfs/nfs2xdr.c
+++ b/fs/nfs/nfs2xdr.c
@@ -423,7 +423,7 @@ nfs_xdr_readdirres(struct rpc_rqst *req, __be32 *p, void *dummy)
struct page **page;
size_t hdrlen;
unsigned int pglen, recvd;
- int status, nr = 0;
+ int status;
if ((status = ntohl(*p++)))
return nfs_stat_to_errno(status);
@@ -443,7 +443,7 @@ nfs_xdr_readdirres(struct rpc_rqst *req, __be32 *p, void *dummy)
if (pglen > recvd)
pglen = recvd;
page = rcvbuf->pages;
- return nr;
+ return pglen;
}
static void print_overflow_msg(const char *func, const struct xdr_stream *xdr)
@@ -485,6 +485,8 @@ nfs_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry, struct nfs_se
entry->prev_cookie = entry->cookie;
entry->cookie = ntohl(*p++);
+ entry->d_type = DT_UNKNOWN;
+
p = xdr_inline_peek(xdr, 8);
if (p != NULL)
entry->eof = !p[0] && p[1];
@@ -495,7 +497,7 @@ nfs_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry, struct nfs_se
out_overflow:
print_overflow_msg(__func__, xdr);
- return ERR_PTR(-EIO);
+ return ERR_PTR(-EAGAIN);
}
/*
diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c
index d9a5e832c25..f6cc60f06da 100644
--- a/fs/nfs/nfs3xdr.c
+++ b/fs/nfs/nfs3xdr.c
@@ -555,7 +555,7 @@ nfs3_xdr_readdirres(struct rpc_rqst *req, __be32 *p, struct nfs3_readdirres *res
struct page **page;
size_t hdrlen;
u32 recvd, pglen;
- int status, nr = 0;
+ int status;
status = ntohl(*p++);
/* Decode post_op_attrs */
@@ -586,7 +586,7 @@ nfs3_xdr_readdirres(struct rpc_rqst *req, __be32 *p, struct nfs3_readdirres *res
pglen = recvd;
page = rcvbuf->pages;
- return nr;
+ return pglen;
}
__be32 *
@@ -622,11 +622,13 @@ nfs3_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry, struct nfs_s
entry->prev_cookie = entry->cookie;
p = xdr_decode_hyper(p, &entry->cookie);
+ entry->d_type = DT_UNKNOWN;
if (plus) {
entry->fattr->valid = 0;
p = xdr_decode_post_op_attr_stream(xdr, entry->fattr);
if (IS_ERR(p))
goto out_overflow_exit;
+ entry->d_type = nfs_umode_to_dtype(entry->fattr->mode);
/* In fact, a post_op_fh3: */
p = xdr_inline_decode(xdr, 4);
if (unlikely(!p))
@@ -656,7 +658,7 @@ nfs3_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry, struct nfs_s
out_overflow:
print_overflow_msg(__func__, xdr);
out_overflow_exit:
- return ERR_PTR(-EIO);
+ return ERR_PTR(-EAGAIN);
}
/*
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 0f24cdf2cb1..4435e5e1f90 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -2852,8 +2852,10 @@ static int _nfs4_proc_readdir(struct dentry *dentry, struct rpc_cred *cred,
nfs4_setup_readdir(cookie, NFS_COOKIEVERF(dir), dentry, &args);
res.pgbase = args.pgbase;
status = nfs4_call_sync(NFS_SERVER(dir), &msg, &args, &res, 0);
- if (status == 0)
+ if (status >= 0) {
memcpy(NFS_COOKIEVERF(dir), res.verifier.data, NFS4_VERIFIER_SIZE);
+ status += args.pgbase;
+ }
nfs_invalidate_atime(dir);
@@ -3359,6 +3361,8 @@ static ssize_t nfs4_proc_get_acl(struct inode *inode, void *buf, size_t buflen)
ret = nfs_revalidate_inode(server, inode);
if (ret < 0)
return ret;
+ if (NFS_I(inode)->cache_validity & NFS_INO_INVALID_ACL)
+ nfs_zap_acl_cache(inode);
ret = nfs4_read_cached_acl(inode, buf, buflen);
if (ret != -ENOENT)
return ret;
@@ -3387,6 +3391,13 @@ static int __nfs4_proc_set_acl(struct inode *inode, const void *buf, size_t bufl
nfs_inode_return_delegation(inode);
buf_to_pages(buf, buflen, arg.acl_pages, &arg.acl_pgbase);
ret = nfs4_call_sync(server, &msg, &arg, &res, 1);
+ /*
+ * Acl update can result in inode attribute update.
+ * so mark the attribute cache invalid.
+ */
+ spin_lock(&inode->i_lock);
+ NFS_I(inode)->cache_validity |= NFS_INO_INVALID_ATTR;
+ spin_unlock(&inode->i_lock);
nfs_access_zap_cache(inode);
nfs_zap_acl_cache(inode);
return ret;
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index f313c4cce7e..9f1826b012e 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -4518,7 +4518,7 @@ static int decode_readdir(struct xdr_stream *xdr, struct rpc_rqst *req, struct n
xdr_read_pages(xdr, pglen);
- return 0;
+ return pglen;
}
static int decode_readlink(struct xdr_stream *xdr, struct rpc_rqst *req)
@@ -6208,6 +6208,10 @@ __be32 *nfs4_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry,
if (entry->fattr->valid & NFS_ATTR_FATTR_FILEID)
entry->ino = entry->fattr->fileid;
+ entry->d_type = DT_UNKNOWN;
+ if (entry->fattr->valid & NFS_ATTR_FATTR_TYPE)
+ entry->d_type = nfs_umode_to_dtype(entry->fattr->mode);
+
if (verify_attr_len(xdr, p, len) < 0)
goto out_overflow;
@@ -6221,7 +6225,7 @@ __be32 *nfs4_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry,
out_overflow:
print_overflow_msg(__func__, xdr);
- return ERR_PTR(-EIO);
+ return ERR_PTR(-EAGAIN);
}
/*
diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
index 137b549e63d..b68536cc904 100644
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -115,7 +115,7 @@ int nfs_set_page_tag_locked(struct nfs_page *req)
{
if (!nfs_lock_request_dontget(req))
return 0;
- if (req->wb_page != NULL)
+ if (test_bit(PG_MAPPED, &req->wb_flags))
radix_tree_tag_set(&NFS_I(req->wb_context->path.dentry->d_inode)->nfs_page_tree, req->wb_index, NFS_PAGE_TAG_LOCKED);
return 1;
}
@@ -125,7 +125,7 @@ int nfs_set_page_tag_locked(struct nfs_page *req)
*/
void nfs_clear_page_tag_locked(struct nfs_page *req)
{
- if (req->wb_page != NULL) {
+ if (test_bit(PG_MAPPED, &req->wb_flags)) {
struct inode *inode = req->wb_context->path.dentry->d_inode;
struct nfs_inode *nfsi = NFS_I(inode);
diff --git a/fs/nfs/read.c b/fs/nfs/read.c
index e4b62c6f5a6..aedcaa7f291 100644
--- a/fs/nfs/read.c
+++ b/fs/nfs/read.c
@@ -152,7 +152,6 @@ static void nfs_readpage_release(struct nfs_page *req)
(long long)NFS_FILEID(req->wb_context->path.dentry->d_inode),
req->wb_bytes,
(long long)req_offset(req));
- nfs_clear_request(req);
nfs_release_request(req);
}
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 0a42e8f4adc..4100630c9a5 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -39,7 +39,6 @@
#include <linux/nfs_mount.h>
#include <linux/nfs4_mount.h>
#include <linux/lockd/bind.h>
-#include <linux/smp_lock.h>
#include <linux/seq_file.h>
#include <linux/mount.h>
#include <linux/mnt_namespace.h>
@@ -67,6 +66,12 @@
#define NFSDBG_FACILITY NFSDBG_VFS
+#ifdef CONFIG_NFS_V3
+#define NFS_DEFAULT_VERSION 3
+#else
+#define NFS_DEFAULT_VERSION 2
+#endif
+
enum {
/* Mount options that take no arguments */
Opt_soft, Opt_hard,
@@ -1064,12 +1069,10 @@ static int nfs_parse_mount_options(char *raw,
mnt->flags |= NFS_MOUNT_VER3;
mnt->version = 3;
break;
-#ifdef CONFIG_NFS_V4
case Opt_v4:
mnt->flags &= ~NFS_MOUNT_VER3;
mnt->version = 4;
break;
-#endif
case Opt_udp:
mnt->flags &= ~NFS_MOUNT_TCP;
mnt->nfs_server.protocol = XPRT_TRANSPORT_UDP;
@@ -1281,12 +1284,10 @@ static int nfs_parse_mount_options(char *raw,
mnt->flags |= NFS_MOUNT_VER3;
mnt->version = 3;
break;
-#ifdef CONFIG_NFS_V4
case NFS4_VERSION:
mnt->flags &= ~NFS_MOUNT_VER3;
mnt->version = 4;
break;
-#endif
default:
goto out_invalid_value;
}
@@ -2277,7 +2278,7 @@ static int nfs_get_sb(struct file_system_type *fs_type,
};
int error = -ENOMEM;
- data = nfs_alloc_parsed_mount_data(3);
+ data = nfs_alloc_parsed_mount_data(NFS_DEFAULT_VERSION);
mntfh = nfs_alloc_fhandle();
if (data == NULL || mntfh == NULL)
goto out_free_fh;
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 4c14c17a527..10d648ea128 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -390,6 +390,7 @@ static int nfs_inode_add_request(struct inode *inode, struct nfs_page *req)
if (nfs_have_delegation(inode, FMODE_WRITE))
nfsi->change_attr++;
}
+ set_bit(PG_MAPPED, &req->wb_flags);
SetPagePrivate(req->wb_page);
set_page_private(req->wb_page, (unsigned long)req);
nfsi->npages++;
@@ -415,6 +416,7 @@ static void nfs_inode_remove_request(struct nfs_page *req)
spin_lock(&inode->i_lock);
set_page_private(req->wb_page, 0);
ClearPagePrivate(req->wb_page);
+ clear_bit(PG_MAPPED, &req->wb_flags);
radix_tree_delete(&nfsi->nfs_page_tree, req->wb_index);
nfsi->npages--;
if (!nfsi->npages) {
@@ -422,7 +424,6 @@ static void nfs_inode_remove_request(struct nfs_page *req)
iput(inode);
} else
spin_unlock(&inode->i_lock);
- nfs_clear_request(req);
nfs_release_request(req);
}
diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c
index 2a533a0af2a..7e84a852cda 100644
--- a/fs/nfsd/nfs3xdr.c
+++ b/fs/nfsd/nfs3xdr.c
@@ -260,9 +260,11 @@ void fill_post_wcc(struct svc_fh *fhp)
err = vfs_getattr(fhp->fh_export->ex_path.mnt, fhp->fh_dentry,
&fhp->fh_post_attr);
fhp->fh_post_change = fhp->fh_dentry->d_inode->i_version;
- if (err)
+ if (err) {
fhp->fh_post_saved = 0;
- else
+ /* Grab the ctime anyway - set_change_info might use it */
+ fhp->fh_post_attr.ctime = fhp->fh_dentry->d_inode->i_ctime;
+ } else
fhp->fh_post_saved = 1;
}
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index f1e5ec6b510..116cab970e0 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -673,16 +673,17 @@ static void nfsd4_hash_conn(struct nfsd4_conn *conn, struct nfsd4_session *ses)
spin_unlock(&clp->cl_lock);
}
-static void nfsd4_register_conn(struct nfsd4_conn *conn)
+static int nfsd4_register_conn(struct nfsd4_conn *conn)
{
conn->cn_xpt_user.callback = nfsd4_conn_lost;
- register_xpt_user(conn->cn_xprt, &conn->cn_xpt_user);
+ return register_xpt_user(conn->cn_xprt, &conn->cn_xpt_user);
}
static __be32 nfsd4_new_conn(struct svc_rqst *rqstp, struct nfsd4_session *ses)
{
struct nfsd4_conn *conn;
u32 flags = NFS4_CDFC4_FORE;
+ int ret;
if (ses->se_flags & SESSION4_BACK_CHAN)
flags |= NFS4_CDFC4_BACK;
@@ -690,7 +691,10 @@ static __be32 nfsd4_new_conn(struct svc_rqst *rqstp, struct nfsd4_session *ses)
if (!conn)
return nfserr_jukebox;
nfsd4_hash_conn(conn, ses);
- nfsd4_register_conn(conn);
+ ret = nfsd4_register_conn(conn);
+ if (ret)
+ /* oops; xprt is already down: */
+ nfsd4_conn_lost(&conn->cn_xpt_user);
return nfs_ok;
}
@@ -1644,6 +1648,7 @@ static void nfsd4_sequence_check_conn(struct nfsd4_conn *new, struct nfsd4_sessi
{
struct nfs4_client *clp = ses->se_client;
struct nfsd4_conn *c;
+ int ret;
spin_lock(&clp->cl_lock);
c = __nfsd4_find_conn(new->cn_xprt, ses);
@@ -1654,7 +1659,10 @@ static void nfsd4_sequence_check_conn(struct nfsd4_conn *new, struct nfsd4_sessi
}
__nfsd4_hash_conn(new, ses);
spin_unlock(&clp->cl_lock);
- nfsd4_register_conn(new);
+ ret = nfsd4_register_conn(new);
+ if (ret)
+ /* oops; xprt is already down: */
+ nfsd4_conn_lost(&new->cn_xpt_user);
return;
}
@@ -2254,7 +2262,7 @@ nfs4_file_downgrade(struct nfs4_file *fp, unsigned int share_access)
* Spawn a thread to perform a recall on the delegation represented
* by the lease (file_lock)
*
- * Called from break_lease() with lock_kernel() held.
+ * Called from break_lease() with lock_flocks() held.
* Note: we assume break_lease will only call this *once* for any given
* lease.
*/
@@ -2278,7 +2286,7 @@ void nfsd_break_deleg_cb(struct file_lock *fl)
list_add_tail(&dp->dl_recall_lru, &del_recall_lru);
spin_unlock(&recall_lock);
- /* only place dl_time is set. protected by lock_kernel*/
+ /* only place dl_time is set. protected by lock_flocks*/
dp->dl_time = get_seconds();
/*
@@ -2295,7 +2303,7 @@ void nfsd_break_deleg_cb(struct file_lock *fl)
/*
* The file_lock is being reapd.
*
- * Called by locks_free_lock() with lock_kernel() held.
+ * Called by locks_free_lock() with lock_flocks() held.
*/
static
void nfsd_release_deleg_cb(struct file_lock *fl)
@@ -2310,7 +2318,7 @@ void nfsd_release_deleg_cb(struct file_lock *fl)
}
/*
- * Called from setlease() with lock_kernel() held
+ * Called from setlease() with lock_flocks() held
*/
static
int nfsd_same_client_deleg_cb(struct file_lock *onlist, struct file_lock *try)
diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h
index 4d476ff08ae..60fce3dc5cb 100644
--- a/fs/nfsd/xdr4.h
+++ b/fs/nfsd/xdr4.h
@@ -484,18 +484,17 @@ static inline bool nfsd4_not_cached(struct nfsd4_compoundres *resp)
static inline void
set_change_info(struct nfsd4_change_info *cinfo, struct svc_fh *fhp)
{
- BUG_ON(!fhp->fh_pre_saved || !fhp->fh_post_saved);
- cinfo->atomic = 1;
+ BUG_ON(!fhp->fh_pre_saved);
+ cinfo->atomic = fhp->fh_post_saved;
cinfo->change_supported = IS_I_VERSION(fhp->fh_dentry->d_inode);
- if (cinfo->change_supported) {
- cinfo->before_change = fhp->fh_pre_change;
- cinfo->after_change = fhp->fh_post_change;
- } else {
- cinfo->before_ctime_sec = fhp->fh_pre_ctime.tv_sec;
- cinfo->before_ctime_nsec = fhp->fh_pre_ctime.tv_nsec;
- cinfo->after_ctime_sec = fhp->fh_post_attr.ctime.tv_sec;
- cinfo->after_ctime_nsec = fhp->fh_post_attr.ctime.tv_nsec;
- }
+
+ cinfo->before_change = fhp->fh_pre_change;
+ cinfo->after_change = fhp->fh_post_change;
+ cinfo->before_ctime_sec = fhp->fh_pre_ctime.tv_sec;
+ cinfo->before_ctime_nsec = fhp->fh_pre_ctime.tv_nsec;
+ cinfo->after_ctime_sec = fhp->fh_post_attr.ctime.tv_sec;
+ cinfo->after_ctime_nsec = fhp->fh_post_attr.ctime.tv_nsec;
+
}
int nfs4svc_encode_voidres(struct svc_rqst *, __be32 *, void *);
diff --git a/fs/nilfs2/dat.c b/fs/nilfs2/dat.c
index 49c844dab33..59e5fe742f7 100644
--- a/fs/nilfs2/dat.c
+++ b/fs/nilfs2/dat.c
@@ -335,7 +335,7 @@ int nilfs_dat_move(struct inode *dat, __u64 vblocknr, sector_t blocknr)
* the device at this point.
*
* To prevent nilfs_dat_translate() from returning the
- * uncommited block number, this makes a copy of the entry
+ * uncommitted block number, this makes a copy of the entry
* buffer and redirects nilfs_dat_translate() to the copy.
*/
if (!buffer_nilfs_redirected(entry_bh)) {
diff --git a/fs/nilfs2/gcinode.c b/fs/nilfs2/gcinode.c
index 33ad25ddd5c..caf9a6a3fb5 100644
--- a/fs/nilfs2/gcinode.c
+++ b/fs/nilfs2/gcinode.c
@@ -176,7 +176,6 @@ int nilfs_gccache_wait_and_mark_dirty(struct buffer_head *bh)
int nilfs_init_gcinode(struct inode *inode)
{
struct nilfs_inode_info *ii = NILFS_I(inode);
- struct the_nilfs *nilfs = NILFS_SB(inode->i_sb)->s_nilfs;
inode->i_mode = S_IFREG;
mapping_set_gfp_mask(inode->i_mapping, GFP_NOFS);
@@ -186,14 +185,6 @@ int nilfs_init_gcinode(struct inode *inode)
ii->i_flags = 0;
nilfs_bmap_init_gc(ii->i_bmap);
- /*
- * Add the inode to GC inode list. Garbage Collection
- * is serialized and no two processes manipulate the
- * list simultaneously.
- */
- igrab(inode);
- list_add(&NILFS_I(inode)->i_dirty, &nilfs->ns_gc_inodes);
-
return 0;
}
diff --git a/fs/nilfs2/ioctl.c b/fs/nilfs2/ioctl.c
index 3e90f86d5bf..b185e937a33 100644
--- a/fs/nilfs2/ioctl.c
+++ b/fs/nilfs2/ioctl.c
@@ -337,6 +337,7 @@ static int nilfs_ioctl_move_blocks(struct super_block *sb,
struct nilfs_argv *argv, void *buf)
{
size_t nmembs = argv->v_nmembs;
+ struct the_nilfs *nilfs = NILFS_SB(sb)->s_nilfs;
struct inode *inode;
struct nilfs_vdesc *vdesc;
struct buffer_head *bh, *n;
@@ -349,10 +350,21 @@ static int nilfs_ioctl_move_blocks(struct super_block *sb,
ino = vdesc->vd_ino;
cno = vdesc->vd_cno;
inode = nilfs_iget_for_gc(sb, ino, cno);
- if (unlikely(inode == NULL)) {
- ret = -ENOMEM;
+ if (IS_ERR(inode)) {
+ ret = PTR_ERR(inode);
goto failed;
}
+ if (list_empty(&NILFS_I(inode)->i_dirty)) {
+ /*
+ * Add the inode to GC inode list. Garbage Collection
+ * is serialized and no two processes manipulate the
+ * list simultaneously.
+ */
+ igrab(inode);
+ list_add(&NILFS_I(inode)->i_dirty,
+ &nilfs->ns_gc_inodes);
+ }
+
do {
ret = nilfs_ioctl_move_inode_block(inode, vdesc,
&buffers);
diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c
index b04f88eed09..f35794b97e8 100644
--- a/fs/notify/fanotify/fanotify.c
+++ b/fs/notify/fanotify/fanotify.c
@@ -92,7 +92,11 @@ static int fanotify_get_response_from_access(struct fsnotify_group *group,
pr_debug("%s: group=%p event=%p\n", __func__, group, event);
- wait_event(group->fanotify_data.access_waitq, event->response);
+ wait_event(group->fanotify_data.access_waitq, event->response ||
+ atomic_read(&group->fanotify_data.bypass_perm));
+
+ if (!event->response) /* bypass_perm set */
+ return 0;
/* userspace responded, convert to something usable */
spin_lock(&event->lock);
diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
index 063224812b7..8b61220cffc 100644
--- a/fs/notify/fanotify/fanotify_user.c
+++ b/fs/notify/fanotify/fanotify_user.c
@@ -106,20 +106,29 @@ static int create_fd(struct fsnotify_group *group, struct fsnotify_event *event)
return client_fd;
}
-static ssize_t fill_event_metadata(struct fsnotify_group *group,
+static int fill_event_metadata(struct fsnotify_group *group,
struct fanotify_event_metadata *metadata,
struct fsnotify_event *event)
{
+ int ret = 0;
+
pr_debug("%s: group=%p metadata=%p event=%p\n", __func__,
group, metadata, event);
metadata->event_len = FAN_EVENT_METADATA_LEN;
+ metadata->metadata_len = FAN_EVENT_METADATA_LEN;
metadata->vers = FANOTIFY_METADATA_VERSION;
metadata->mask = event->mask & FAN_ALL_OUTGOING_EVENTS;
metadata->pid = pid_vnr(event->tgid);
- metadata->fd = create_fd(group, event);
+ if (unlikely(event->mask & FAN_Q_OVERFLOW))
+ metadata->fd = FAN_NOFD;
+ else {
+ metadata->fd = create_fd(group, event);
+ if (metadata->fd < 0)
+ ret = metadata->fd;
+ }
- return metadata->fd;
+ return ret;
}
#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
@@ -200,7 +209,7 @@ static int prepare_for_access_response(struct fsnotify_group *group,
mutex_lock(&group->fanotify_data.access_mutex);
- if (group->fanotify_data.bypass_perm) {
+ if (atomic_read(&group->fanotify_data.bypass_perm)) {
mutex_unlock(&group->fanotify_data.access_mutex);
kmem_cache_free(fanotify_response_event_cache, re);
event->response = FAN_ALLOW;
@@ -257,24 +266,34 @@ static ssize_t copy_event_to_user(struct fsnotify_group *group,
pr_debug("%s: group=%p event=%p\n", __func__, group, event);
- fd = fill_event_metadata(group, &fanotify_event_metadata, event);
- if (fd < 0)
- return fd;
+ ret = fill_event_metadata(group, &fanotify_event_metadata, event);
+ if (ret < 0)
+ goto out;
+ fd = fanotify_event_metadata.fd;
ret = prepare_for_access_response(group, event, fd);
if (ret)
goto out_close_fd;
ret = -EFAULT;
- if (copy_to_user(buf, &fanotify_event_metadata, FAN_EVENT_METADATA_LEN))
+ if (copy_to_user(buf, &fanotify_event_metadata,
+ fanotify_event_metadata.event_len))
goto out_kill_access_response;
- return FAN_EVENT_METADATA_LEN;
+ return fanotify_event_metadata.event_len;
out_kill_access_response:
remove_access_response(group, event, fd);
out_close_fd:
- sys_close(fd);
+ if (fd != FAN_NOFD)
+ sys_close(fd);
+out:
+#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
+ if (event->mask & FAN_ALL_PERM_EVENTS) {
+ event->response = FAN_DENY;
+ wake_up(&group->fanotify_data.access_waitq);
+ }
+#endif
return ret;
}
@@ -382,7 +401,7 @@ static int fanotify_release(struct inode *ignored, struct file *file)
mutex_lock(&group->fanotify_data.access_mutex);
- group->fanotify_data.bypass_perm = true;
+ atomic_inc(&group->fanotify_data.bypass_perm);
list_for_each_entry_safe(re, lre, &group->fanotify_data.access_list, list) {
pr_debug("%s: found group=%p re=%p event=%p\n", __func__, group,
@@ -586,11 +605,10 @@ static int fanotify_add_vfsmount_mark(struct fsnotify_group *group,
{
struct fsnotify_mark *fsn_mark;
__u32 added;
+ int ret = 0;
fsn_mark = fsnotify_find_vfsmount_mark(group, mnt);
if (!fsn_mark) {
- int ret;
-
if (atomic_read(&group->num_marks) > group->fanotify_data.max_marks)
return -ENOSPC;
@@ -600,17 +618,16 @@ static int fanotify_add_vfsmount_mark(struct fsnotify_group *group,
fsnotify_init_mark(fsn_mark, fanotify_free_mark);
ret = fsnotify_add_mark(fsn_mark, group, NULL, mnt, 0);
- if (ret) {
- fanotify_free_mark(fsn_mark);
- return ret;
- }
+ if (ret)
+ goto err;
}
added = fanotify_mark_add_to_mask(fsn_mark, mask, flags);
- fsnotify_put_mark(fsn_mark);
+
if (added & ~mnt->mnt_fsnotify_mask)
fsnotify_recalc_vfsmount_mask(mnt);
-
- return 0;
+err:
+ fsnotify_put_mark(fsn_mark);
+ return ret;
}
static int fanotify_add_inode_mark(struct fsnotify_group *group,
@@ -619,6 +636,7 @@ static int fanotify_add_inode_mark(struct fsnotify_group *group,
{
struct fsnotify_mark *fsn_mark;
__u32 added;
+ int ret = 0;
pr_debug("%s: group=%p inode=%p\n", __func__, group, inode);
@@ -634,8 +652,6 @@ static int fanotify_add_inode_mark(struct fsnotify_group *group,
fsn_mark = fsnotify_find_inode_mark(group, inode);
if (!fsn_mark) {
- int ret;
-
if (atomic_read(&group->num_marks) > group->fanotify_data.max_marks)
return -ENOSPC;
@@ -645,16 +661,16 @@ static int fanotify_add_inode_mark(struct fsnotify_group *group,
fsnotify_init_mark(fsn_mark, fanotify_free_mark);
ret = fsnotify_add_mark(fsn_mark, group, inode, NULL, 0);
- if (ret) {
- fanotify_free_mark(fsn_mark);
- return ret;
- }
+ if (ret)
+ goto err;
}
added = fanotify_mark_add_to_mask(fsn_mark, mask, flags);
- fsnotify_put_mark(fsn_mark);
+
if (added & ~inode->i_fsnotify_mask)
fsnotify_recalc_inode_mask(inode);
- return 0;
+err:
+ fsnotify_put_mark(fsn_mark);
+ return ret;
}
/* fanotify syscalls */
@@ -687,8 +703,10 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags)
/* fsnotify_alloc_group takes a ref. Dropped in fanotify_release */
group = fsnotify_alloc_group(&fanotify_fsnotify_ops);
- if (IS_ERR(group))
+ if (IS_ERR(group)) {
+ free_uid(user);
return PTR_ERR(group);
+ }
group->fanotify_data.user = user;
atomic_inc(&user->fanotify_listeners);
@@ -698,6 +716,7 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags)
mutex_init(&group->fanotify_data.access_mutex);
init_waitqueue_head(&group->fanotify_data.access_waitq);
INIT_LIST_HEAD(&group->fanotify_data.access_list);
+ atomic_set(&group->fanotify_data.bypass_perm, 0);
#endif
switch (flags & FAN_ALL_CLASS_BITS) {
case FAN_CLASS_NOTIF:
@@ -764,8 +783,10 @@ SYSCALL_DEFINE(fanotify_mark)(int fanotify_fd, unsigned int flags,
if (flags & ~FAN_ALL_MARK_FLAGS)
return -EINVAL;
switch (flags & (FAN_MARK_ADD | FAN_MARK_REMOVE | FAN_MARK_FLUSH)) {
- case FAN_MARK_ADD:
+ case FAN_MARK_ADD: /* fallthrough */
case FAN_MARK_REMOVE:
+ if (!mask)
+ return -EINVAL;
case FAN_MARK_FLUSH:
break;
default:
diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c
index 444c305a468..4cd5d5d78f9 100644
--- a/fs/notify/inotify/inotify_user.c
+++ b/fs/notify/inotify/inotify_user.c
@@ -752,6 +752,7 @@ SYSCALL_DEFINE1(inotify_init1, int, flags)
if (ret >= 0)
return ret;
+ fsnotify_put_group(group);
atomic_dec(&user->inotify_devs);
out_free_uid:
free_uid(user);
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index f1e962cb3b7..0d7c5540ad6 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -573,11 +573,14 @@ static void ocfs2_dio_end_io(struct kiocb *iocb,
/* this io's submitter should not have unlocked this before we could */
BUG_ON(!ocfs2_iocb_is_rw_locked(iocb));
+ if (ocfs2_iocb_is_sem_locked(iocb)) {
+ up_read(&inode->i_alloc_sem);
+ ocfs2_iocb_clear_sem_locked(iocb);
+ }
+
ocfs2_iocb_clear_rw_locked(iocb);
level = ocfs2_iocb_rw_locked_level(iocb);
- if (!level)
- up_read(&inode->i_alloc_sem);
ocfs2_rw_unlock(inode, level);
if (is_async)
diff --git a/fs/ocfs2/aops.h b/fs/ocfs2/aops.h
index 76bfdfda691..eceb456037c 100644
--- a/fs/ocfs2/aops.h
+++ b/fs/ocfs2/aops.h
@@ -68,8 +68,27 @@ static inline void ocfs2_iocb_set_rw_locked(struct kiocb *iocb, int level)
else
clear_bit(1, (unsigned long *)&iocb->private);
}
+
+/*
+ * Using a named enum representing lock types in terms of #N bit stored in
+ * iocb->private, which is going to be used for communication bewteen
+ * ocfs2_dio_end_io() and ocfs2_file_aio_write/read().
+ */
+enum ocfs2_iocb_lock_bits {
+ OCFS2_IOCB_RW_LOCK = 0,
+ OCFS2_IOCB_RW_LOCK_LEVEL,
+ OCFS2_IOCB_SEM,
+ OCFS2_IOCB_NUM_LOCKS
+};
+
#define ocfs2_iocb_clear_rw_locked(iocb) \
- clear_bit(0, (unsigned long *)&iocb->private)
+ clear_bit(OCFS2_IOCB_RW_LOCK, (unsigned long *)&iocb->private)
#define ocfs2_iocb_rw_locked_level(iocb) \
- test_bit(1, (unsigned long *)&iocb->private)
+ test_bit(OCFS2_IOCB_RW_LOCK_LEVEL, (unsigned long *)&iocb->private)
+#define ocfs2_iocb_set_sem_locked(iocb) \
+ set_bit(OCFS2_IOCB_SEM, (unsigned long *)&iocb->private)
+#define ocfs2_iocb_clear_sem_locked(iocb) \
+ clear_bit(OCFS2_IOCB_SEM, (unsigned long *)&iocb->private)
+#define ocfs2_iocb_is_sem_locked(iocb) \
+ test_bit(OCFS2_IOCB_SEM, (unsigned long *)&iocb->private)
#endif /* OCFS2_FILE_H */
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index 52c7557f3e2..9f26ac9be2a 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -1964,8 +1964,10 @@ static struct config_item *o2hb_heartbeat_group_make_item(struct config_group *g
if (reg == NULL)
return ERR_PTR(-ENOMEM);
- if (strlen(name) > O2HB_MAX_REGION_NAME_LEN)
- return ERR_PTR(-ENAMETOOLONG);
+ if (strlen(name) > O2HB_MAX_REGION_NAME_LEN) {
+ ret = -ENAMETOOLONG;
+ goto free;
+ }
spin_lock(&o2hb_live_lock);
reg->hr_region_num = 0;
@@ -1974,7 +1976,8 @@ static struct config_item *o2hb_heartbeat_group_make_item(struct config_group *g
O2NM_MAX_REGIONS);
if (reg->hr_region_num >= O2NM_MAX_REGIONS) {
spin_unlock(&o2hb_live_lock);
- return ERR_PTR(-EFBIG);
+ ret = -EFBIG;
+ goto free;
}
set_bit(reg->hr_region_num, o2hb_region_bitmap);
}
@@ -1986,10 +1989,13 @@ static struct config_item *o2hb_heartbeat_group_make_item(struct config_group *g
ret = o2hb_debug_region_init(reg, o2hb_debug_dir);
if (ret) {
config_item_put(&reg->hr_item);
- return ERR_PTR(ret);
+ goto free;
}
return &reg->hr_item;
+free:
+ kfree(reg);
+ return ERR_PTR(ret);
}
static void o2hb_heartbeat_group_drop_item(struct config_group *group,
diff --git a/fs/ocfs2/cluster/masklog.c b/fs/ocfs2/cluster/masklog.c
index c7fba396392..6c61771469a 100644
--- a/fs/ocfs2/cluster/masklog.c
+++ b/fs/ocfs2/cluster/masklog.c
@@ -113,10 +113,11 @@ static struct mlog_attribute mlog_attrs[MLOG_MAX_BITS] = {
define_mask(QUOTA),
define_mask(REFCOUNT),
define_mask(BASTS),
+ define_mask(RESERVATIONS),
+ define_mask(CLUSTER),
define_mask(ERROR),
define_mask(NOTICE),
define_mask(KTHREAD),
- define_mask(RESERVATIONS),
};
static struct attribute *mlog_attr_ptrs[MLOG_MAX_BITS] = {NULL, };
diff --git a/fs/ocfs2/cluster/masklog.h b/fs/ocfs2/cluster/masklog.h
index ea2ed9f56c9..34d6544357d 100644
--- a/fs/ocfs2/cluster/masklog.h
+++ b/fs/ocfs2/cluster/masklog.h
@@ -81,7 +81,7 @@
#include <linux/sched.h>
/* bits that are frequently given and infrequently matched in the low word */
-/* NOTE: If you add a flag, you need to also update mlog.c! */
+/* NOTE: If you add a flag, you need to also update masklog.c! */
#define ML_ENTRY 0x0000000000000001ULL /* func call entry */
#define ML_EXIT 0x0000000000000002ULL /* func call exit */
#define ML_TCP 0x0000000000000004ULL /* net cluster/tcp.c */
@@ -114,13 +114,14 @@
#define ML_XATTR 0x0000000020000000ULL /* ocfs2 extended attributes */
#define ML_QUOTA 0x0000000040000000ULL /* ocfs2 quota operations */
#define ML_REFCOUNT 0x0000000080000000ULL /* refcount tree operations */
-#define ML_BASTS 0x0000001000000000ULL /* dlmglue asts and basts */
+#define ML_BASTS 0x0000000100000000ULL /* dlmglue asts and basts */
+#define ML_RESERVATIONS 0x0000000200000000ULL /* ocfs2 alloc reservations */
+#define ML_CLUSTER 0x0000000400000000ULL /* cluster stack */
+
/* bits that are infrequently given and frequently matched in the high word */
-#define ML_ERROR 0x0000000100000000ULL /* sent to KERN_ERR */
-#define ML_NOTICE 0x0000000200000000ULL /* setn to KERN_NOTICE */
-#define ML_KTHREAD 0x0000000400000000ULL /* kernel thread activity */
-#define ML_RESERVATIONS 0x0000000800000000ULL /* ocfs2 alloc reservations */
-#define ML_CLUSTER 0x0000001000000000ULL /* cluster stack */
+#define ML_ERROR 0x1000000000000000ULL /* sent to KERN_ERR */
+#define ML_NOTICE 0x2000000000000000ULL /* setn to KERN_NOTICE */
+#define ML_KTHREAD 0x4000000000000000ULL /* kernel thread activity */
#define MLOG_INITIAL_AND_MASK (ML_ERROR|ML_NOTICE)
#define MLOG_INITIAL_NOT_MASK (ML_ENTRY|ML_EXIT)
diff --git a/fs/ocfs2/dcache.c b/fs/ocfs2/dcache.c
index edaded48e7e..895532ac4d9 100644
--- a/fs/ocfs2/dcache.c
+++ b/fs/ocfs2/dcache.c
@@ -476,7 +476,6 @@ static void ocfs2_dentry_iput(struct dentry *dentry, struct inode *inode)
out:
iput(inode);
- ocfs2_dentry_attach_gen(dentry);
}
/*
diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c
index c49f6de0e7a..d417b3f9b0c 100644
--- a/fs/ocfs2/dir.c
+++ b/fs/ocfs2/dir.c
@@ -2461,8 +2461,10 @@ static int ocfs2_dx_dir_attach_index(struct ocfs2_super *osb,
di->i_dx_root = cpu_to_le64(dr_blkno);
+ spin_lock(&OCFS2_I(dir)->ip_lock);
OCFS2_I(dir)->ip_dyn_features |= OCFS2_INDEXED_DIR_FL;
di->i_dyn_features = cpu_to_le16(OCFS2_I(dir)->ip_dyn_features);
+ spin_unlock(&OCFS2_I(dir)->ip_lock);
ocfs2_journal_dirty(handle, di_bh);
@@ -4466,8 +4468,10 @@ static int ocfs2_dx_dir_remove_index(struct inode *dir,
goto out_commit;
}
+ spin_lock(&OCFS2_I(dir)->ip_lock);
OCFS2_I(dir)->ip_dyn_features &= ~OCFS2_INDEXED_DIR_FL;
di->i_dyn_features = cpu_to_le16(OCFS2_I(dir)->ip_dyn_features);
+ spin_unlock(&OCFS2_I(dir)->ip_lock);
di->i_dx_root = cpu_to_le64(0ULL);
ocfs2_journal_dirty(handle, di_bh);
diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c
index 58a93b95373..cc2aaa96cfe 100644
--- a/fs/ocfs2/dlm/dlmdomain.c
+++ b/fs/ocfs2/dlm/dlmdomain.c
@@ -959,7 +959,7 @@ static int dlm_match_regions(struct dlm_ctxt *dlm,
r += O2HB_MAX_REGION_NAME_LEN;
}
- local = kmalloc(sizeof(qr->qr_regions), GFP_KERNEL);
+ local = kmalloc(sizeof(qr->qr_regions), GFP_ATOMIC);
if (!local) {
status = -ENOMEM;
goto bail;
diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index f564b0e5f80..59f0f6bdfc6 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -2346,7 +2346,8 @@ static void dlm_deref_lockres_worker(struct dlm_work_item *item, void *data)
*/
static int dlm_is_lockres_migrateable(struct dlm_ctxt *dlm,
struct dlm_lock_resource *res,
- int *numlocks)
+ int *numlocks,
+ int *hasrefs)
{
int ret;
int i;
@@ -2356,6 +2357,9 @@ static int dlm_is_lockres_migrateable(struct dlm_ctxt *dlm,
assert_spin_locked(&res->spinlock);
+ *numlocks = 0;
+ *hasrefs = 0;
+
ret = -EINVAL;
if (res->owner == DLM_LOCK_RES_OWNER_UNKNOWN) {
mlog(0, "cannot migrate lockres with unknown owner!\n");
@@ -2386,7 +2390,13 @@ static int dlm_is_lockres_migrateable(struct dlm_ctxt *dlm,
}
*numlocks = count;
- mlog(0, "migrateable lockres having %d locks\n", *numlocks);
+
+ count = find_next_bit(res->refmap, O2NM_MAX_NODES, 0);
+ if (count < O2NM_MAX_NODES)
+ *hasrefs = 1;
+
+ mlog(0, "%s: res %.*s, Migrateable, locks %d, refs %d\n", dlm->name,
+ res->lockname.len, res->lockname.name, *numlocks, *hasrefs);
leave:
return ret;
@@ -2408,7 +2418,7 @@ static int dlm_migrate_lockres(struct dlm_ctxt *dlm,
const char *name;
unsigned int namelen;
int mle_added = 0;
- int numlocks;
+ int numlocks, hasrefs;
int wake = 0;
if (!dlm_grab(dlm))
@@ -2417,13 +2427,13 @@ static int dlm_migrate_lockres(struct dlm_ctxt *dlm,
name = res->lockname.name;
namelen = res->lockname.len;
- mlog(0, "migrating %.*s to %u\n", namelen, name, target);
+ mlog(0, "%s: Migrating %.*s to %u\n", dlm->name, namelen, name, target);
/*
* ensure this lockres is a proper candidate for migration
*/
spin_lock(&res->spinlock);
- ret = dlm_is_lockres_migrateable(dlm, res, &numlocks);
+ ret = dlm_is_lockres_migrateable(dlm, res, &numlocks, &hasrefs);
if (ret < 0) {
spin_unlock(&res->spinlock);
goto leave;
@@ -2431,10 +2441,8 @@ static int dlm_migrate_lockres(struct dlm_ctxt *dlm,
spin_unlock(&res->spinlock);
/* no work to do */
- if (numlocks == 0) {
- mlog(0, "no locks were found on this lockres! done!\n");
+ if (numlocks == 0 && !hasrefs)
goto leave;
- }
/*
* preallocate up front
@@ -2459,14 +2467,14 @@ static int dlm_migrate_lockres(struct dlm_ctxt *dlm,
* find a node to migrate the lockres to
*/
- mlog(0, "picking a migration node\n");
spin_lock(&dlm->spinlock);
/* pick a new node */
if (!test_bit(target, dlm->domain_map) ||
target >= O2NM_MAX_NODES) {
target = dlm_pick_migration_target(dlm, res);
}
- mlog(0, "node %u chosen for migration\n", target);
+ mlog(0, "%s: res %.*s, Node %u chosen for migration\n", dlm->name,
+ namelen, name, target);
if (target >= O2NM_MAX_NODES ||
!test_bit(target, dlm->domain_map)) {
@@ -2667,7 +2675,7 @@ int dlm_empty_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res)
{
int ret;
int lock_dropped = 0;
- int numlocks;
+ int numlocks, hasrefs;
spin_lock(&res->spinlock);
if (res->owner != dlm->node_num) {
@@ -2681,8 +2689,8 @@ int dlm_empty_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res)
}
/* No need to migrate a lockres having no locks */
- ret = dlm_is_lockres_migrateable(dlm, res, &numlocks);
- if (ret >= 0 && numlocks == 0) {
+ ret = dlm_is_lockres_migrateable(dlm, res, &numlocks, &hasrefs);
+ if (ret >= 0 && numlocks == 0 && !hasrefs) {
spin_unlock(&res->spinlock);
goto leave;
}
@@ -2915,6 +2923,12 @@ static u8 dlm_pick_migration_target(struct dlm_ctxt *dlm,
}
queue++;
}
+
+ nodenum = find_next_bit(res->refmap, O2NM_MAX_NODES, 0);
+ if (nodenum < O2NM_MAX_NODES) {
+ spin_unlock(&res->spinlock);
+ return nodenum;
+ }
spin_unlock(&res->spinlock);
mlog(0, "have not found a suitable target yet! checking domain map\n");
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 77b4c04a280..f6cba566429 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -2241,11 +2241,15 @@ static ssize_t ocfs2_file_aio_write(struct kiocb *iocb,
mutex_lock(&inode->i_mutex);
+ ocfs2_iocb_clear_sem_locked(iocb);
+
relock:
/* to match setattr's i_mutex -> i_alloc_sem -> rw_lock ordering */
if (direct_io) {
down_read(&inode->i_alloc_sem);
have_alloc_sem = 1;
+ /* communicate with ocfs2_dio_end_io */
+ ocfs2_iocb_set_sem_locked(iocb);
}
/*
@@ -2382,8 +2386,10 @@ out:
ocfs2_rw_unlock(inode, rw_level);
out_sems:
- if (have_alloc_sem)
+ if (have_alloc_sem) {
up_read(&inode->i_alloc_sem);
+ ocfs2_iocb_clear_sem_locked(iocb);
+ }
mutex_unlock(&inode->i_mutex);
@@ -2527,6 +2533,8 @@ static ssize_t ocfs2_file_aio_read(struct kiocb *iocb,
goto bail;
}
+ ocfs2_iocb_clear_sem_locked(iocb);
+
/*
* buffered reads protect themselves in ->readpage(). O_DIRECT reads
* need locks to protect pending reads from racing with truncate.
@@ -2534,6 +2542,7 @@ static ssize_t ocfs2_file_aio_read(struct kiocb *iocb,
if (filp->f_flags & O_DIRECT) {
down_read(&inode->i_alloc_sem);
have_alloc_sem = 1;
+ ocfs2_iocb_set_sem_locked(iocb);
ret = ocfs2_rw_lock(inode, 0);
if (ret < 0) {
@@ -2575,8 +2584,10 @@ static ssize_t ocfs2_file_aio_read(struct kiocb *iocb,
}
bail:
- if (have_alloc_sem)
+ if (have_alloc_sem) {
up_read(&inode->i_alloc_sem);
+ ocfs2_iocb_clear_sem_locked(iocb);
+ }
if (rw_level != -1)
ocfs2_rw_unlock(inode, rw_level);
mlog_exit(ret);
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index d8408217e3b..70dd3b1798f 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -159,7 +159,9 @@ struct ocfs2_lock_res {
char l_name[OCFS2_LOCK_ID_MAX_LEN];
unsigned int l_ro_holders;
unsigned int l_ex_holders;
- unsigned char l_level;
+ signed char l_level;
+ signed char l_requested;
+ signed char l_blocking;
/* Data packed - type enum ocfs2_lock_type */
unsigned char l_type;
@@ -169,8 +171,6 @@ struct ocfs2_lock_res {
unsigned char l_action;
/* Data packed - enum type ocfs2_unlock_action */
unsigned char l_unlock_action;
- unsigned char l_requested;
- unsigned char l_blocking;
unsigned int l_pending_gen;
spinlock_t l_lock;
diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h
index c2e4f8222e2..bf2e7764920 100644
--- a/fs/ocfs2/ocfs2_fs.h
+++ b/fs/ocfs2/ocfs2_fs.h
@@ -350,7 +350,7 @@ enum {
#define OCFS2_LAST_LOCAL_SYSTEM_INODE LOCAL_GROUP_QUOTA_SYSTEM_INODE
NUM_SYSTEM_INODES
};
-#define NUM_GLOBAL_SYSTEM_INODES OCFS2_LAST_GLOBAL_SYSTEM_INODE
+#define NUM_GLOBAL_SYSTEM_INODES OCFS2_FIRST_LOCAL_SYSTEM_INODE
#define NUM_LOCAL_SYSTEM_INODES \
(NUM_SYSTEM_INODES - OCFS2_FIRST_LOCAL_SYSTEM_INODE)
diff --git a/fs/ocfs2/stack_user.c b/fs/ocfs2/stack_user.c
index 252e7c82f92..a5ebe421195 100644
--- a/fs/ocfs2/stack_user.c
+++ b/fs/ocfs2/stack_user.c
@@ -190,7 +190,7 @@ static struct ocfs2_live_connection *ocfs2_connection_find(const char *name)
return c;
}
- return c;
+ return NULL;
}
/*
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index f02c0ef3157..cfeab7ce369 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -41,7 +41,6 @@
#include <linux/mount.h>
#include <linux/seq_file.h>
#include <linux/quotaops.h>
-#include <linux/smp_lock.h>
#define MLOG_MASK_PREFIX ML_SUPER
#include <cluster/masklog.h>
diff --git a/fs/openpromfs/inode.c b/fs/openpromfs/inode.c
index ddb1f41376e..911e61f348f 100644
--- a/fs/openpromfs/inode.c
+++ b/fs/openpromfs/inode.c
@@ -418,7 +418,7 @@ out_no_root:
static struct dentry *openprom_mount(struct file_system_type *fs_type,
int flags, const char *dev_name, void *data)
{
- return mount_single(fs_type, flags, data, openprom_fill_super)
+ return mount_single(fs_type, flags, data, openprom_fill_super);
}
static struct file_system_type openprom_fs_type = {
diff --git a/fs/pipe.c b/fs/pipe.c
index a8012a95572..04629f36e39 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -1199,12 +1199,24 @@ int pipe_proc_fn(struct ctl_table *table, int write, void __user *buf,
return ret;
}
+/*
+ * After the inode slimming patch, i_pipe/i_bdev/i_cdev share the same
+ * location, so checking ->i_pipe is not enough to verify that this is a
+ * pipe.
+ */
+struct pipe_inode_info *get_pipe_info(struct file *file)
+{
+ struct inode *i = file->f_path.dentry->d_inode;
+
+ return S_ISFIFO(i->i_mode) ? i->i_pipe : NULL;
+}
+
long pipe_fcntl(struct file *file, unsigned int cmd, unsigned long arg)
{
struct pipe_inode_info *pipe;
long ret;
- pipe = file->f_path.dentry->d_inode->i_pipe;
+ pipe = get_pipe_info(file);
if (!pipe)
return -EBADF;
diff --git a/fs/proc/base.c b/fs/proc/base.c
index f3d02ca461e..08cba2c3b61 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -1407,6 +1407,82 @@ static const struct file_operations proc_pid_sched_operations = {
#endif
+#ifdef CONFIG_SCHED_AUTOGROUP
+/*
+ * Print out autogroup related information:
+ */
+static int sched_autogroup_show(struct seq_file *m, void *v)
+{
+ struct inode *inode = m->private;
+ struct task_struct *p;
+
+ p = get_proc_task(inode);
+ if (!p)
+ return -ESRCH;
+ proc_sched_autogroup_show_task(p, m);
+
+ put_task_struct(p);
+
+ return 0;
+}
+
+static ssize_t
+sched_autogroup_write(struct file *file, const char __user *buf,
+ size_t count, loff_t *offset)
+{
+ struct inode *inode = file->f_path.dentry->d_inode;
+ struct task_struct *p;
+ char buffer[PROC_NUMBUF];
+ long nice;
+ int err;
+
+ memset(buffer, 0, sizeof(buffer));
+ if (count > sizeof(buffer) - 1)
+ count = sizeof(buffer) - 1;
+ if (copy_from_user(buffer, buf, count))
+ return -EFAULT;
+
+ err = strict_strtol(strstrip(buffer), 0, &nice);
+ if (err)
+ return -EINVAL;
+
+ p = get_proc_task(inode);
+ if (!p)
+ return -ESRCH;
+
+ err = nice;
+ err = proc_sched_autogroup_set_nice(p, &err);
+ if (err)
+ count = err;
+
+ put_task_struct(p);
+
+ return count;
+}
+
+static int sched_autogroup_open(struct inode *inode, struct file *filp)
+{
+ int ret;
+
+ ret = single_open(filp, sched_autogroup_show, NULL);
+ if (!ret) {
+ struct seq_file *m = filp->private_data;
+
+ m->private = inode;
+ }
+ return ret;
+}
+
+static const struct file_operations proc_pid_sched_autogroup_operations = {
+ .open = sched_autogroup_open,
+ .read = seq_read,
+ .write = sched_autogroup_write,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+#endif /* CONFIG_SCHED_AUTOGROUP */
+
static ssize_t comm_write(struct file *file, const char __user *buf,
size_t count, loff_t *offset)
{
@@ -1574,7 +1650,7 @@ static int do_proc_readlink(struct path *path, char __user *buffer, int buflen)
if (!tmp)
return -ENOMEM;
- pathname = d_path_with_unreachable(path, tmp, PAGE_SIZE);
+ pathname = d_path(path, tmp, PAGE_SIZE);
len = PTR_ERR(pathname);
if (IS_ERR(pathname))
goto out;
@@ -2733,6 +2809,9 @@ static const struct pid_entry tgid_base_stuff[] = {
#ifdef CONFIG_SCHED_DEBUG
REG("sched", S_IRUGO|S_IWUSR, proc_pid_sched_operations),
#endif
+#ifdef CONFIG_SCHED_AUTOGROUP
+ REG("autogroup", S_IRUGO|S_IWUSR, proc_pid_sched_autogroup_operations),
+#endif
REG("comm", S_IRUGO|S_IWUSR, proc_pid_set_comm_operations),
#ifdef CONFIG_HAVE_ARCH_TRACEHOOK
INF("syscall", S_IRUSR, proc_pid_syscall),
diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index 9c2b5f48487..3ddb6068177 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -16,7 +16,6 @@
#include <linux/limits.h>
#include <linux/init.h>
#include <linux/module.h>
-#include <linux/smp_lock.h>
#include <linux/sysctl.h>
#include <linux/slab.h>
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index da6b01d70f0..c126c83b9a4 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -706,6 +706,7 @@ static int pagemap_hugetlb_range(pte_t *pte, unsigned long hmask,
* skip over unmapped regions.
*/
#define PAGEMAP_WALK_SIZE (PMD_SIZE)
+#define PAGEMAP_WALK_MASK (PMD_MASK)
static ssize_t pagemap_read(struct file *file, char __user *buf,
size_t count, loff_t *ppos)
{
@@ -776,7 +777,7 @@ static ssize_t pagemap_read(struct file *file, char __user *buf,
unsigned long end;
pm.pos = 0;
- end = start_vaddr + PAGEMAP_WALK_SIZE;
+ end = (start_vaddr + PAGEMAP_WALK_SIZE) & PAGEMAP_WALK_MASK;
/* overflow ? */
if (end < start_vaddr || end > end_vaddr)
end = end_vaddr;
diff --git a/fs/read_write.c b/fs/read_write.c
index 431a0ed610c..5d431bacbea 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -9,7 +9,6 @@
#include <linux/fcntl.h>
#include <linux/file.h>
#include <linux/uio.h>
-#include <linux/smp_lock.h>
#include <linux/fsnotify.h>
#include <linux/security.h>
#include <linux/module.h>
diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
index 41656d40dc5..0bae036831e 100644
--- a/fs/reiserfs/inode.c
+++ b/fs/reiserfs/inode.c
@@ -8,7 +8,6 @@
#include <linux/reiserfs_acl.h>
#include <linux/reiserfs_xattr.h>
#include <linux/exportfs.h>
-#include <linux/smp_lock.h>
#include <linux/pagemap.h>
#include <linux/highmem.h>
#include <linux/slab.h>
diff --git a/fs/reiserfs/ioctl.c b/fs/reiserfs/ioctl.c
index adf22b485ce..79265fdc317 100644
--- a/fs/reiserfs/ioctl.c
+++ b/fs/reiserfs/ioctl.c
@@ -9,7 +9,6 @@
#include <linux/time.h>
#include <asm/uaccess.h>
#include <linux/pagemap.h>
-#include <linux/smp_lock.h>
#include <linux/compat.h>
/*
@@ -184,12 +183,11 @@ int reiserfs_unpack(struct inode *inode, struct file *filp)
return 0;
}
- /* we need to make sure nobody is changing the file size beneath
- ** us
- */
- reiserfs_mutex_lock_safe(&inode->i_mutex, inode->i_sb);
depth = reiserfs_write_lock_once(inode->i_sb);
+ /* we need to make sure nobody is changing the file size beneath us */
+ reiserfs_mutex_lock_safe(&inode->i_mutex, inode->i_sb);
+
write_from = inode->i_size & (blocksize - 1);
/* if we are on a block boundary, we are already unpacked. */
if (write_from == 0) {
diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c
index 076c8b19468..d31bce1a9f9 100644
--- a/fs/reiserfs/journal.c
+++ b/fs/reiserfs/journal.c
@@ -43,7 +43,6 @@
#include <linux/fcntl.h>
#include <linux/stat.h>
#include <linux/string.h>
-#include <linux/smp_lock.h>
#include <linux/buffer_head.h>
#include <linux/workqueue.h>
#include <linux/writeback.h>
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
index 3bf7a6457f4..b243117b875 100644
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -28,7 +28,6 @@
#include <linux/mount.h>
#include <linux/namei.h>
#include <linux/crc32.h>
-#include <linux/smp_lock.h>
struct file_system_type reiserfs_fs_type;
diff --git a/fs/reiserfs/xattr_acl.c b/fs/reiserfs/xattr_acl.c
index 536d697a8a2..90d2fcb67a3 100644
--- a/fs/reiserfs/xattr_acl.c
+++ b/fs/reiserfs/xattr_acl.c
@@ -472,7 +472,9 @@ int reiserfs_acl_chmod(struct inode *inode)
struct reiserfs_transaction_handle th;
size_t size = reiserfs_xattr_nblocks(inode,
reiserfs_acl_size(clone->a_count));
- reiserfs_write_lock(inode->i_sb);
+ int depth;
+
+ depth = reiserfs_write_lock_once(inode->i_sb);
error = journal_begin(&th, inode->i_sb, size * 2);
if (!error) {
int error2;
@@ -482,7 +484,7 @@ int reiserfs_acl_chmod(struct inode *inode)
if (error2)
error = error2;
}
- reiserfs_write_unlock(inode->i_sb);
+ reiserfs_write_unlock_once(inode->i_sb, depth);
}
posix_acl_release(clone);
return error;
diff --git a/fs/splice.c b/fs/splice.c
index 8f1dfaecc8f..ce2f02579e3 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -1311,18 +1311,6 @@ long do_splice_direct(struct file *in, loff_t *ppos, struct file *out,
static int splice_pipe_to_pipe(struct pipe_inode_info *ipipe,
struct pipe_inode_info *opipe,
size_t len, unsigned int flags);
-/*
- * After the inode slimming patch, i_pipe/i_bdev/i_cdev share the same
- * location, so checking ->i_pipe is not enough to verify that this is a
- * pipe.
- */
-static inline struct pipe_inode_info *pipe_info(struct inode *inode)
-{
- if (S_ISFIFO(inode->i_mode))
- return inode->i_pipe;
-
- return NULL;
-}
/*
* Determine where to splice to/from.
@@ -1336,8 +1324,8 @@ static long do_splice(struct file *in, loff_t __user *off_in,
loff_t offset, *off;
long ret;
- ipipe = pipe_info(in->f_path.dentry->d_inode);
- opipe = pipe_info(out->f_path.dentry->d_inode);
+ ipipe = get_pipe_info(in);
+ opipe = get_pipe_info(out);
if (ipipe && opipe) {
if (off_in || off_out)
@@ -1555,7 +1543,7 @@ static long vmsplice_to_user(struct file *file, const struct iovec __user *iov,
int error;
long ret;
- pipe = pipe_info(file->f_path.dentry->d_inode);
+ pipe = get_pipe_info(file);
if (!pipe)
return -EBADF;
@@ -1642,7 +1630,7 @@ static long vmsplice_to_pipe(struct file *file, const struct iovec __user *iov,
};
long ret;
- pipe = pipe_info(file->f_path.dentry->d_inode);
+ pipe = get_pipe_info(file);
if (!pipe)
return -EBADF;
@@ -2022,8 +2010,8 @@ static int link_pipe(struct pipe_inode_info *ipipe,
static long do_tee(struct file *in, struct file *out, size_t len,
unsigned int flags)
{
- struct pipe_inode_info *ipipe = pipe_info(in->f_path.dentry->d_inode);
- struct pipe_inode_info *opipe = pipe_info(out->f_path.dentry->d_inode);
+ struct pipe_inode_info *ipipe = get_pipe_info(in);
+ struct pipe_inode_info *opipe = get_pipe_info(out);
int ret = -EINVAL;
/*
diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c
index c9af48fffcd..691f61223ed 100644
--- a/fs/xfs/linux-2.6/xfs_aops.c
+++ b/fs/xfs/linux-2.6/xfs_aops.c
@@ -934,7 +934,6 @@ xfs_aops_discard_page(
struct xfs_inode *ip = XFS_I(inode);
struct buffer_head *bh, *head;
loff_t offset = page_offset(page);
- ssize_t len = 1 << inode->i_blkbits;
if (!xfs_is_delayed_page(page, IO_DELAY))
goto out_invalidate;
@@ -949,58 +948,14 @@ xfs_aops_discard_page(
xfs_ilock(ip, XFS_ILOCK_EXCL);
bh = head = page_buffers(page);
do {
- int done;
- xfs_fileoff_t offset_fsb;
- xfs_bmbt_irec_t imap;
- int nimaps = 1;
int error;
- xfs_fsblock_t firstblock;
- xfs_bmap_free_t flist;
+ xfs_fileoff_t start_fsb;
if (!buffer_delay(bh))
goto next_buffer;
- offset_fsb = XFS_B_TO_FSBT(ip->i_mount, offset);
-
- /*
- * Map the range first and check that it is a delalloc extent
- * before trying to unmap the range. Otherwise we will be
- * trying to remove a real extent (which requires a
- * transaction) or a hole, which is probably a bad idea...
- */
- error = xfs_bmapi(NULL, ip, offset_fsb, 1,
- XFS_BMAPI_ENTIRE, NULL, 0, &imap,
- &nimaps, NULL);
-
- if (error) {
- /* something screwed, just bail */
- if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) {
- xfs_fs_cmn_err(CE_ALERT, ip->i_mount,
- "page discard failed delalloc mapping lookup.");
- }
- break;
- }
- if (!nimaps) {
- /* nothing there */
- goto next_buffer;
- }
- if (imap.br_startblock != DELAYSTARTBLOCK) {
- /* been converted, ignore */
- goto next_buffer;
- }
- WARN_ON(imap.br_blockcount == 0);
-
- /*
- * Note: while we initialise the firstblock/flist pair, they
- * should never be used because blocks should never be
- * allocated or freed for a delalloc extent and hence we need
- * don't cancel or finish them after the xfs_bunmapi() call.
- */
- xfs_bmap_init(&flist, &firstblock);
- error = xfs_bunmapi(NULL, ip, offset_fsb, 1, 0, 1, &firstblock,
- &flist, &done);
-
- ASSERT(!flist.xbf_count && !flist.xbf_first);
+ start_fsb = XFS_B_TO_FSBT(ip->i_mount, offset);
+ error = xfs_bmap_punch_delalloc_range(ip, start_fsb, 1);
if (error) {
/* something screwed, just bail */
if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) {
@@ -1010,7 +965,7 @@ xfs_aops_discard_page(
break;
}
next_buffer:
- offset += len;
+ offset += 1 << inode->i_blkbits;
} while ((bh = bh->b_this_page) != head);
@@ -1111,11 +1066,12 @@ xfs_vm_writepage(
uptodate = 0;
/*
- * A hole may still be marked uptodate because discard_buffer
- * leaves the flag set.
+ * set_page_dirty dirties all buffers in a page, independent
+ * of their state. The dirty state however is entirely
+ * meaningless for holes (!mapped && uptodate), so skip
+ * buffers covering holes here.
*/
if (!buffer_mapped(bh) && buffer_uptodate(bh)) {
- ASSERT(!buffer_dirty(bh));
imap_valid = 0;
continue;
}
@@ -1504,11 +1460,42 @@ xfs_vm_write_failed(
struct inode *inode = mapping->host;
if (to > inode->i_size) {
- struct iattr ia = {
- .ia_valid = ATTR_SIZE | ATTR_FORCE,
- .ia_size = inode->i_size,
- };
- xfs_setattr(XFS_I(inode), &ia, XFS_ATTR_NOLOCK);
+ /*
+ * punch out the delalloc blocks we have already allocated. We
+ * don't call xfs_setattr() to do this as we may be in the
+ * middle of a multi-iovec write and so the vfs inode->i_size
+ * will not match the xfs ip->i_size and so it will zero too
+ * much. Hence we jus truncate the page cache to zero what is
+ * necessary and punch the delalloc blocks directly.
+ */
+ struct xfs_inode *ip = XFS_I(inode);
+ xfs_fileoff_t start_fsb;
+ xfs_fileoff_t end_fsb;
+ int error;
+
+ truncate_pagecache(inode, to, inode->i_size);
+
+ /*
+ * Check if there are any blocks that are outside of i_size
+ * that need to be trimmed back.
+ */
+ start_fsb = XFS_B_TO_FSB(ip->i_mount, inode->i_size) + 1;
+ end_fsb = XFS_B_TO_FSB(ip->i_mount, to);
+ if (end_fsb <= start_fsb)
+ return;
+
+ xfs_ilock(ip, XFS_ILOCK_EXCL);
+ error = xfs_bmap_punch_delalloc_range(ip, start_fsb,
+ end_fsb - start_fsb);
+ if (error) {
+ /* something screwed, just bail */
+ if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) {
+ xfs_fs_cmn_err(CE_ALERT, ip->i_mount,
+ "xfs_vm_write_failed: unable to clean up ino %lld",
+ ip->i_ino);
+ }
+ }
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
}
}
diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c
index 63fd2c07cb5..4c5deb6e9e3 100644
--- a/fs/xfs/linux-2.6/xfs_buf.c
+++ b/fs/xfs/linux-2.6/xfs_buf.c
@@ -488,29 +488,16 @@ found:
spin_unlock(&pag->pag_buf_lock);
xfs_perag_put(pag);
- /* Attempt to get the semaphore without sleeping,
- * if this does not work then we need to drop the
- * spinlock and do a hard attempt on the semaphore.
- */
- if (down_trylock(&bp->b_sema)) {
+ if (xfs_buf_cond_lock(bp)) {
+ /* failed, so wait for the lock if requested. */
if (!(flags & XBF_TRYLOCK)) {
- /* wait for buffer ownership */
xfs_buf_lock(bp);
XFS_STATS_INC(xb_get_locked_waited);
} else {
- /* We asked for a trylock and failed, no need
- * to look at file offset and length here, we
- * know that this buffer at least overlaps our
- * buffer and is locked, therefore our buffer
- * either does not exist, or is this buffer.
- */
xfs_buf_rele(bp);
XFS_STATS_INC(xb_busy_locked);
return NULL;
}
- } else {
- /* trylock worked */
- XB_SET_OWNER(bp);
}
if (bp->b_flags & XBF_STALE) {
@@ -876,10 +863,18 @@ xfs_buf_rele(
*/
/*
- * Locks a buffer object, if it is not already locked.
- * Note that this in no way locks the underlying pages, so it is only
- * useful for synchronizing concurrent use of buffer objects, not for
- * synchronizing independent access to the underlying pages.
+ * Locks a buffer object, if it is not already locked. Note that this in
+ * no way locks the underlying pages, so it is only useful for
+ * synchronizing concurrent use of buffer objects, not for synchronizing
+ * independent access to the underlying pages.
+ *
+ * If we come across a stale, pinned, locked buffer, we know that we are
+ * being asked to lock a buffer that has been reallocated. Because it is
+ * pinned, we know that the log has not been pushed to disk and hence it
+ * will still be locked. Rather than continuing to have trylock attempts
+ * fail until someone else pushes the log, push it ourselves before
+ * returning. This means that the xfsaild will not get stuck trying
+ * to push on stale inode buffers.
*/
int
xfs_buf_cond_lock(
@@ -890,6 +885,8 @@ xfs_buf_cond_lock(
locked = down_trylock(&bp->b_sema) == 0;
if (locked)
XB_SET_OWNER(bp);
+ else if (atomic_read(&bp->b_pin_count) && (bp->b_flags & XBF_STALE))
+ xfs_log_force(bp->b_target->bt_mount, 0);
trace_xfs_buf_cond_lock(bp, _RET_IP_);
return locked ? 0 : -EBUSY;
@@ -1781,7 +1778,6 @@ xfs_buf_delwri_split(
INIT_LIST_HEAD(list);
spin_lock(dwlk);
list_for_each_entry_safe(bp, n, dwq, b_list) {
- trace_xfs_buf_delwri_split(bp, _RET_IP_);
ASSERT(bp->b_flags & XBF_DELWRI);
if (!XFS_BUF_ISPINNED(bp) && !xfs_buf_cond_lock(bp)) {
@@ -1795,6 +1791,7 @@ xfs_buf_delwri_split(
_XBF_RUN_QUEUES);
bp->b_flags |= XBF_WRITE;
list_move_tail(&bp->b_list, list);
+ trace_xfs_buf_delwri_split(bp, _RET_IP_);
} else
skipped++;
}
diff --git a/fs/xfs/linux-2.6/xfs_ioctl.c b/fs/xfs/linux-2.6/xfs_ioctl.c
index 2ea238f6d38..ad442d9e392 100644
--- a/fs/xfs/linux-2.6/xfs_ioctl.c
+++ b/fs/xfs/linux-2.6/xfs_ioctl.c
@@ -416,7 +416,7 @@ xfs_attrlist_by_handle(
if (IS_ERR(dentry))
return PTR_ERR(dentry);
- kbuf = kmalloc(al_hreq.buflen, GFP_KERNEL);
+ kbuf = kzalloc(al_hreq.buflen, GFP_KERNEL);
if (!kbuf)
goto out_dput;
diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c
index 96107efc0c6..94d5fd6a297 100644
--- a/fs/xfs/linux-2.6/xfs_iops.c
+++ b/fs/xfs/linux-2.6/xfs_iops.c
@@ -762,7 +762,8 @@ xfs_setup_inode(
inode->i_state = I_NEW;
inode_sb_list_add(inode);
- insert_inode_hash(inode);
+ /* make the inode look hashed for the writeback code */
+ hlist_add_fake(&inode->i_hash);
inode->i_mode = ip->i_d.di_mode;
inode->i_nlink = ip->i_d.di_nlink;
diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
index 9f3a78fe6ae..064f964d4f3 100644
--- a/fs/xfs/linux-2.6/xfs_super.c
+++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -353,9 +353,6 @@ xfs_parseargs(
mp->m_qflags &= ~XFS_OQUOTA_ENFD;
} else if (!strcmp(this_char, MNTOPT_DELAYLOG)) {
mp->m_flags |= XFS_MOUNT_DELAYLOG;
- cmn_err(CE_WARN,
- "Enabling EXPERIMENTAL delayed logging feature "
- "- use at your own risk.\n");
} else if (!strcmp(this_char, MNTOPT_NODELAYLOG)) {
mp->m_flags &= ~XFS_MOUNT_DELAYLOG;
} else if (!strcmp(this_char, "ihashsize")) {
diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c
index 37d33254981..afb0d7cfad1 100644
--- a/fs/xfs/linux-2.6/xfs_sync.c
+++ b/fs/xfs/linux-2.6/xfs_sync.c
@@ -853,6 +853,7 @@ restart:
if (trylock) {
if (!mutex_trylock(&pag->pag_ici_reclaim_lock)) {
skipped++;
+ xfs_perag_put(pag);
continue;
}
first_index = pag->pag_ici_reclaim_cursor;
diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c
index 8abd12e32e1..4111cd3966c 100644
--- a/fs/xfs/xfs_bmap.c
+++ b/fs/xfs/xfs_bmap.c
@@ -5471,8 +5471,13 @@ xfs_getbmap(
if (error)
goto out_unlock_iolock;
}
-
- ASSERT(ip->i_delayed_blks == 0);
+ /*
+ * even after flushing the inode, there can still be delalloc
+ * blocks on the inode beyond EOF due to speculative
+ * preallocation. These are not removed until the release
+ * function is called or the inode is inactivated. Hence we
+ * cannot assert here that ip->i_delayed_blks == 0.
+ */
}
lock = xfs_ilock_map_shared(ip);
@@ -6070,3 +6075,79 @@ xfs_bmap_disk_count_leaves(
*count += xfs_bmbt_disk_get_blockcount(frp);
}
}
+
+/*
+ * dead simple method of punching delalyed allocation blocks from a range in
+ * the inode. Walks a block at a time so will be slow, but is only executed in
+ * rare error cases so the overhead is not critical. This will alays punch out
+ * both the start and end blocks, even if the ranges only partially overlap
+ * them, so it is up to the caller to ensure that partial blocks are not
+ * passed in.
+ */
+int
+xfs_bmap_punch_delalloc_range(
+ struct xfs_inode *ip,
+ xfs_fileoff_t start_fsb,
+ xfs_fileoff_t length)
+{
+ xfs_fileoff_t remaining = length;
+ int error = 0;
+
+ ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+
+ do {
+ int done;
+ xfs_bmbt_irec_t imap;
+ int nimaps = 1;
+ xfs_fsblock_t firstblock;
+ xfs_bmap_free_t flist;
+
+ /*
+ * Map the range first and check that it is a delalloc extent
+ * before trying to unmap the range. Otherwise we will be
+ * trying to remove a real extent (which requires a
+ * transaction) or a hole, which is probably a bad idea...
+ */
+ error = xfs_bmapi(NULL, ip, start_fsb, 1,
+ XFS_BMAPI_ENTIRE, NULL, 0, &imap,
+ &nimaps, NULL);
+
+ if (error) {
+ /* something screwed, just bail */
+ if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) {
+ xfs_fs_cmn_err(CE_ALERT, ip->i_mount,
+ "Failed delalloc mapping lookup ino %lld fsb %lld.",
+ ip->i_ino, start_fsb);
+ }
+ break;
+ }
+ if (!nimaps) {
+ /* nothing there */
+ goto next_block;
+ }
+ if (imap.br_startblock != DELAYSTARTBLOCK) {
+ /* been converted, ignore */
+ goto next_block;
+ }
+ WARN_ON(imap.br_blockcount == 0);
+
+ /*
+ * Note: while we initialise the firstblock/flist pair, they
+ * should never be used because blocks should never be
+ * allocated or freed for a delalloc extent and hence we need
+ * don't cancel or finish them after the xfs_bunmapi() call.
+ */
+ xfs_bmap_init(&flist, &firstblock);
+ error = xfs_bunmapi(NULL, ip, start_fsb, 1, 0, 1, &firstblock,
+ &flist, &done);
+ if (error)
+ break;
+
+ ASSERT(!flist.xbf_count && !flist.xbf_first);
+next_block:
+ start_fsb++;
+ remaining--;
+ } while(remaining > 0);
+
+ return error;
+}
diff --git a/fs/xfs/xfs_bmap.h b/fs/xfs/xfs_bmap.h
index 71ec9b6ecdf..3651191daea 100644
--- a/fs/xfs/xfs_bmap.h
+++ b/fs/xfs/xfs_bmap.h
@@ -394,6 +394,11 @@ xfs_bmap_count_blocks(
int whichfork,
int *count);
+int
+xfs_bmap_punch_delalloc_range(
+ struct xfs_inode *ip,
+ xfs_fileoff_t start_fsb,
+ xfs_fileoff_t length);
#endif /* __KERNEL__ */
#endif /* __XFS_BMAP_H__ */
diff --git a/fs/xfs/xfs_dfrag.c b/fs/xfs/xfs_dfrag.c
index 3b9582c60a2..e60490bc00a 100644
--- a/fs/xfs/xfs_dfrag.c
+++ b/fs/xfs/xfs_dfrag.c
@@ -377,6 +377,19 @@ xfs_swap_extents(
ip->i_d.di_format = tip->i_d.di_format;
tip->i_d.di_format = tmp;
+ /*
+ * The extents in the source inode could still contain speculative
+ * preallocation beyond EOF (e.g. the file is open but not modified
+ * while defrag is in progress). In that case, we need to copy over the
+ * number of delalloc blocks the data fork in the source inode is
+ * tracking beyond EOF so that when the fork is truncated away when the
+ * temporary inode is unlinked we don't underrun the i_delayed_blks
+ * counter on that inode.
+ */
+ ASSERT(tip->i_delayed_blks == 0);
+ tip->i_delayed_blks = ip->i_delayed_blks;
+ ip->i_delayed_blks = 0;
+
ilf_fields = XFS_ILOG_CORE;
switch(ip->i_d.di_format) {
diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c
index ed999026766..c78cc6a3d87 100644
--- a/fs/xfs/xfs_error.c
+++ b/fs/xfs/xfs_error.c
@@ -58,6 +58,7 @@ xfs_error_trap(int e)
int xfs_etest[XFS_NUM_INJECT_ERROR];
int64_t xfs_etest_fsid[XFS_NUM_INJECT_ERROR];
char * xfs_etest_fsname[XFS_NUM_INJECT_ERROR];
+int xfs_error_test_active;
int
xfs_error_test(int error_tag, int *fsidp, char *expression,
@@ -108,6 +109,7 @@ xfs_errortag_add(int error_tag, xfs_mount_t *mp)
len = strlen(mp->m_fsname);
xfs_etest_fsname[i] = kmem_alloc(len + 1, KM_SLEEP);
strcpy(xfs_etest_fsname[i], mp->m_fsname);
+ xfs_error_test_active++;
return 0;
}
}
@@ -137,6 +139,7 @@ xfs_errortag_clearall(xfs_mount_t *mp, int loud)
xfs_etest_fsid[i] = 0LL;
kmem_free(xfs_etest_fsname[i]);
xfs_etest_fsname[i] = NULL;
+ xfs_error_test_active--;
}
}
diff --git a/fs/xfs/xfs_error.h b/fs/xfs/xfs_error.h
index c2c1a072bb8..f338847f80b 100644
--- a/fs/xfs/xfs_error.h
+++ b/fs/xfs/xfs_error.h
@@ -127,13 +127,14 @@ extern void xfs_corruption_error(const char *tag, int level,
#define XFS_RANDOM_BMAPIFORMAT XFS_RANDOM_DEFAULT
#ifdef DEBUG
+extern int xfs_error_test_active;
extern int xfs_error_test(int, int *, char *, int, char *, unsigned long);
#define XFS_NUM_INJECT_ERROR 10
#define XFS_TEST_ERROR(expr, mp, tag, rf) \
- ((expr) || \
+ ((expr) || (xfs_error_test_active && \
xfs_error_test((tag), (mp)->m_fixedfsid, "expr", __LINE__, __FILE__, \
- (rf)))
+ (rf))))
extern int xfs_errortag_add(int error_tag, xfs_mount_t *mp);
extern int xfs_errortag_clearall(xfs_mount_t *mp, int loud);
diff --git a/fs/xfs/xfs_filestream.c b/fs/xfs/xfs_filestream.c
index 9b715dce569..9124425b7f2 100644
--- a/fs/xfs/xfs_filestream.c
+++ b/fs/xfs/xfs_filestream.c
@@ -744,9 +744,15 @@ xfs_filestream_new_ag(
* If the file's parent directory is known, take its iolock in exclusive
* mode to prevent two sibling files from racing each other to migrate
* themselves and their parent to different AGs.
+ *
+ * Note that we lock the parent directory iolock inside the child
+ * iolock here. That's fine as we never hold both parent and child
+ * iolock in any other place. This is different from the ilock,
+ * which requires locking of the child after the parent for namespace
+ * operations.
*/
if (pip)
- xfs_ilock(pip, XFS_IOLOCK_EXCL);
+ xfs_ilock(pip, XFS_IOLOCK_EXCL | XFS_IOLOCK_PARENT);
/*
* A new AG needs to be found for the file. If the file's parent
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index c7ac020705d..7c8d30c453c 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -657,18 +657,37 @@ xfs_inode_item_unlock(
}
/*
- * This is called to find out where the oldest active copy of the
- * inode log item in the on disk log resides now that the last log
- * write of it completed at the given lsn. Since we always re-log
- * all dirty data in an inode, the latest copy in the on disk log
- * is the only one that matters. Therefore, simply return the
- * given lsn.
+ * This is called to find out where the oldest active copy of the inode log
+ * item in the on disk log resides now that the last log write of it completed
+ * at the given lsn. Since we always re-log all dirty data in an inode, the
+ * latest copy in the on disk log is the only one that matters. Therefore,
+ * simply return the given lsn.
+ *
+ * If the inode has been marked stale because the cluster is being freed, we
+ * don't want to (re-)insert this inode into the AIL. There is a race condition
+ * where the cluster buffer may be unpinned before the inode is inserted into
+ * the AIL during transaction committed processing. If the buffer is unpinned
+ * before the inode item has been committed and inserted, then it is possible
+ * for the buffer to be written and IO completions before the inode is inserted
+ * into the AIL. In that case, we'd be inserting a clean, stale inode into the
+ * AIL which will never get removed. It will, however, get reclaimed which
+ * triggers an assert in xfs_inode_free() complaining about freein an inode
+ * still in the AIL.
+ *
+ * To avoid this, return a lower LSN than the one passed in so that the
+ * transaction committed code will not move the inode forward in the AIL but
+ * will still unpin it properly.
*/
STATIC xfs_lsn_t
xfs_inode_item_committed(
struct xfs_log_item *lip,
xfs_lsn_t lsn)
{
+ struct xfs_inode_log_item *iip = INODE_ITEM(lip);
+ struct xfs_inode *ip = iip->ili_inode;
+
+ if (xfs_iflags_test(ip, XFS_ISTALE))
+ return lsn - 1;
return lsn;
}
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index b1498ab5a39..19e9dfa1c25 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -275,6 +275,7 @@ xfs_free_perag(
pag = radix_tree_delete(&mp->m_perag_tree, agno);
spin_unlock(&mp->m_perag_lock);
ASSERT(pag);
+ ASSERT(atomic_read(&pag->pag_ref) == 0);
call_rcu(&pag->rcu_head, __xfs_free_perag);
}
}
diff --git a/fs/xfs/xfs_quota.h b/fs/xfs/xfs_quota.h
index e0e64b113bd..9bb6eda4cd2 100644
--- a/fs/xfs/xfs_quota.h
+++ b/fs/xfs/xfs_quota.h
@@ -346,8 +346,17 @@ xfs_qm_vop_dqalloc(struct xfs_inode *ip, uid_t uid, gid_t gid, prid_t prid,
#define xfs_trans_mod_dquot_byino(tp, ip, fields, delta)
#define xfs_trans_apply_dquot_deltas(tp)
#define xfs_trans_unreserve_and_mod_dquots(tp)
-#define xfs_trans_reserve_quota_nblks(tp, ip, nblks, ninos, flags) (0)
-#define xfs_trans_reserve_quota_bydquots(tp, mp, u, g, nb, ni, fl) (0)
+static inline int xfs_trans_reserve_quota_nblks(struct xfs_trans *tp,
+ struct xfs_inode *ip, long nblks, long ninos, uint flags)
+{
+ return 0;
+}
+static inline int xfs_trans_reserve_quota_bydquots(struct xfs_trans *tp,
+ struct xfs_mount *mp, struct xfs_dquot *udqp,
+ struct xfs_dquot *gdqp, long nblks, long nions, uint flags)
+{
+ return 0;
+}
#define xfs_qm_vop_create_dqattach(tp, ip, u, g)
#define xfs_qm_vop_rename_dqattach(it) (0)
#define xfs_qm_vop_chown(tp, ip, old, new) (NULL)
@@ -357,11 +366,14 @@ xfs_qm_vop_dqalloc(struct xfs_inode *ip, uid_t uid, gid_t gid, prid_t prid,
#define xfs_qm_dqdetach(ip)
#define xfs_qm_dqrele(d)
#define xfs_qm_statvfs(ip, s)
-#define xfs_qm_sync(mp, fl) (0)
+static inline int xfs_qm_sync(struct xfs_mount *mp, int flags)
+{
+ return 0;
+}
#define xfs_qm_newmount(mp, a, b) (0)
#define xfs_qm_mount_quotas(mp)
#define xfs_qm_unmount(mp)
-#define xfs_qm_unmount_quotas(mp) (0)
+#define xfs_qm_unmount_quotas(mp)
#endif /* CONFIG_XFS_QUOTA */
#define xfs_trans_unreserve_quota_nblks(tp, ip, nblks, ninos, flags) \
diff --git a/fs/xfs/xfs_rename.c b/fs/xfs/xfs_rename.c
index d2af0a8381a..77a59891734 100644
--- a/fs/xfs/xfs_rename.c
+++ b/fs/xfs/xfs_rename.c
@@ -297,6 +297,7 @@ xfs_rename(
* it and some incremental backup programs won't work without it.
*/
xfs_trans_ichgtime(tp, src_ip, XFS_ICHGTIME_CHG);
+ xfs_trans_log_inode(tp, src_ip, XFS_ILOG_CORE);
/*
* Adjust the link count on src_dp. This is necessary when