summaryrefslogtreecommitdiffstats
path: root/fs
diff options
context:
space:
mode:
Diffstat (limited to 'fs')
-rw-r--r--fs/bio.c7
-rw-r--r--fs/btrfs/backref.c2
-rw-r--r--fs/btrfs/btrfs_inode.h4
-rw-r--r--fs/btrfs/ctree.c17
-rw-r--r--fs/btrfs/ctree.h8
-rw-r--r--fs/btrfs/delayed-inode.c58
-rw-r--r--fs/btrfs/disk-io.c189
-rw-r--r--fs/btrfs/extent-tree.c295
-rw-r--r--fs/btrfs/extent_io.c60
-rw-r--r--fs/btrfs/extent_io.h2
-rw-r--r--fs/btrfs/free-space-cache.c82
-rw-r--r--fs/btrfs/inode-map.c28
-rw-r--r--fs/btrfs/inode.c92
-rw-r--r--fs/btrfs/ioctl.c17
-rw-r--r--fs/btrfs/relocation.c2
-rw-r--r--fs/btrfs/scrub.c71
-rw-r--r--fs/btrfs/super.c93
-rw-r--r--fs/btrfs/transaction.c12
-rw-r--r--fs/btrfs/volumes.c7
-rw-r--r--fs/btrfs/volumes.h6
-rw-r--r--fs/ceph/dir.c2
-rw-r--r--fs/ceph/inode.c9
-rw-r--r--fs/ceph/super.c6
-rw-r--r--fs/cifs/connect.c2
-rw-r--r--fs/cifs/file.c26
-rw-r--r--fs/cifs/readdir.c10
-rw-r--r--fs/cifs/smbencrypt.c6
-rw-r--r--fs/dcache.c82
-rw-r--r--fs/ecryptfs/crypto.c26
-rw-r--r--fs/ecryptfs/ecryptfs_kernel.h5
-rw-r--r--fs/ecryptfs/file.c23
-rw-r--r--fs/ecryptfs/inode.c52
-rw-r--r--fs/ext4/balloc.c2
-rw-r--r--fs/ext4/inode.c3
-rw-r--r--fs/ext4/super.c6
-rw-r--r--fs/hfs/trans.c2
-rw-r--r--fs/minix/bitmap.c55
-rw-r--r--fs/minix/inode.c25
-rw-r--r--fs/minix/minix.h11
-rw-r--r--fs/namespace.c52
-rw-r--r--fs/nfs/dir.c2
-rw-r--r--fs/nfs/file.c91
-rw-r--r--fs/nfs/inode.c2
-rw-r--r--fs/nfs/internal.h2
-rw-r--r--fs/nfs/nfs3proc.c1
-rw-r--r--fs/nfs/nfs4proc.c4
-rw-r--r--fs/nfs/pnfs.c26
-rw-r--r--fs/nfs/proc.c1
-rw-r--r--fs/nfs/read.c14
-rw-r--r--fs/nfs/super.c37
-rw-r--r--fs/ocfs2/alloc.c2
-rw-r--r--fs/ocfs2/aops.c69
-rw-r--r--fs/ocfs2/aops.h14
-rw-r--r--fs/ocfs2/cluster/heartbeat.c194
-rw-r--r--fs/ocfs2/cluster/netdebug.c102
-rw-r--r--fs/ocfs2/cluster/tcp.c138
-rw-r--r--fs/ocfs2/cluster/tcp.h2
-rw-r--r--fs/ocfs2/dir.c3
-rw-r--r--fs/ocfs2/dlm/dlmcommon.h56
-rw-r--r--fs/ocfs2/dlm/dlmdomain.c44
-rw-r--r--fs/ocfs2/dlm/dlmlock.c54
-rw-r--r--fs/ocfs2/dlm/dlmmaster.c175
-rw-r--r--fs/ocfs2/dlm/dlmrecovery.c164
-rw-r--r--fs/ocfs2/dlm/dlmthread.c16
-rw-r--r--fs/ocfs2/dlmglue.c21
-rw-r--r--fs/ocfs2/extent_map.c96
-rw-r--r--fs/ocfs2/extent_map.h2
-rw-r--r--fs/ocfs2/file.c96
-rw-r--r--fs/ocfs2/inode.c2
-rw-r--r--fs/ocfs2/inode.h3
-rw-r--r--fs/ocfs2/ioctl.c11
-rw-r--r--fs/ocfs2/journal.c23
-rw-r--r--fs/ocfs2/journal.h5
-rw-r--r--fs/ocfs2/mmap.c53
-rw-r--r--fs/ocfs2/move_extents.c2
-rw-r--r--fs/ocfs2/ocfs2.h51
-rw-r--r--fs/ocfs2/quota_local.c23
-rw-r--r--fs/ocfs2/slot_map.c4
-rw-r--r--fs/ocfs2/stack_o2cb.c71
-rw-r--r--fs/ocfs2/super.c25
-rw-r--r--fs/ocfs2/xattr.c10
-rw-r--r--fs/proc/base.c146
-rw-r--r--fs/proc/meminfo.c7
-rw-r--r--fs/proc/stat.c4
-rw-r--r--fs/pstore/platform.c13
-rw-r--r--fs/seq_file.c6
-rw-r--r--fs/xfs/xfs_acl.c2
-rw-r--r--fs/xfs/xfs_aops.c2
-rw-r--r--fs/xfs/xfs_attr_leaf.c64
-rw-r--r--fs/xfs/xfs_bmap.c20
-rw-r--r--fs/xfs/xfs_buf_item.c2
-rw-r--r--fs/xfs/xfs_dquot_item.c6
-rw-r--r--fs/xfs/xfs_export.c8
-rw-r--r--fs/xfs/xfs_extfree_item.c4
-rw-r--r--fs/xfs/xfs_inode.c21
-rw-r--r--fs/xfs/xfs_inode.h1
-rw-r--r--fs/xfs/xfs_inode_item.c2
-rw-r--r--fs/xfs/xfs_log.c350
-rw-r--r--fs/xfs/xfs_log.h2
-rw-r--r--fs/xfs/xfs_qm.c3
-rw-r--r--fs/xfs/xfs_sync.c11
-rw-r--r--fs/xfs/xfs_trace.h12
-rw-r--r--fs/xfs/xfs_trans.h6
-rw-r--r--fs/xfs/xfs_vnodeops.c14
104 files changed, 2404 insertions, 1470 deletions
diff --git a/fs/bio.c b/fs/bio.c
index 41c93c72224..b1fe82cf88c 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -337,7 +337,7 @@ static void bio_fs_destructor(struct bio *bio)
* RETURNS:
* Pointer to new bio on success, NULL on failure.
*/
-struct bio *bio_alloc(gfp_t gfp_mask, int nr_iovecs)
+struct bio *bio_alloc(gfp_t gfp_mask, unsigned int nr_iovecs)
{
struct bio *bio = bio_alloc_bioset(gfp_mask, nr_iovecs, fs_bio_set);
@@ -365,7 +365,7 @@ static void bio_kmalloc_destructor(struct bio *bio)
* %__GFP_WAIT, the allocation is guaranteed to succeed.
*
**/
-struct bio *bio_kmalloc(gfp_t gfp_mask, int nr_iovecs)
+struct bio *bio_kmalloc(gfp_t gfp_mask, unsigned int nr_iovecs)
{
struct bio *bio;
@@ -696,7 +696,8 @@ static void bio_free_map_data(struct bio_map_data *bmd)
kfree(bmd);
}
-static struct bio_map_data *bio_alloc_map_data(int nr_segs, int iov_count,
+static struct bio_map_data *bio_alloc_map_data(int nr_segs,
+ unsigned int iov_count,
gfp_t gfp_mask)
{
struct bio_map_data *bmd;
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index 8855aad3929..22c64fff1bd 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -683,7 +683,7 @@ static int inode_to_path(u64 inum, struct btrfs_inode_ref *iref,
return PTR_ERR(fspath);
if (fspath > fspath_min) {
- ipath->fspath->val[i] = (u64)fspath;
+ ipath->fspath->val[i] = (u64)(unsigned long)fspath;
++ipath->fspath->elem_cnt;
ipath->fspath->bytes_left = fspath - fspath_min;
} else {
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 5a5d325a393..634608d2a6d 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -147,14 +147,12 @@ struct btrfs_inode {
* the btrfs file release call will add this inode to the
* ordered operations list so that we make sure to flush out any
* new data the application may have written before commit.
- *
- * yes, its silly to have a single bitflag, but we might grow more
- * of these.
*/
unsigned ordered_data_close:1;
unsigned orphan_meta_reserved:1;
unsigned dummy_inode:1;
unsigned in_defrag:1;
+ unsigned delalloc_meta_reserved:1;
/*
* always compress this one file
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 0fe615e4ea3..dede441bdee 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -514,10 +514,25 @@ static inline int should_cow_block(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct extent_buffer *buf)
{
+ /* ensure we can see the force_cow */
+ smp_rmb();
+
+ /*
+ * We do not need to cow a block if
+ * 1) this block is not created or changed in this transaction;
+ * 2) this block does not belong to TREE_RELOC tree;
+ * 3) the root is not forced COW.
+ *
+ * What is forced COW:
+ * when we create snapshot during commiting the transaction,
+ * after we've finished coping src root, we must COW the shared
+ * block to ensure the metadata consistency.
+ */
if (btrfs_header_generation(buf) == trans->transid &&
!btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN) &&
!(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID &&
- btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)))
+ btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)) &&
+ !root->force_cow)
return 0;
return 1;
}
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index b9ba59ff929..50634abef9b 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -848,7 +848,8 @@ struct btrfs_free_cluster {
enum btrfs_caching_type {
BTRFS_CACHE_NO = 0,
BTRFS_CACHE_STARTED = 1,
- BTRFS_CACHE_FINISHED = 2,
+ BTRFS_CACHE_FAST = 2,
+ BTRFS_CACHE_FINISHED = 3,
};
enum btrfs_disk_cache_state {
@@ -1271,6 +1272,8 @@ struct btrfs_root {
* for stat. It may be used for more later
*/
dev_t anon_dev;
+
+ int force_cow;
};
struct btrfs_ioctl_defrag_range_args {
@@ -2366,6 +2369,9 @@ int btrfs_block_rsv_check(struct btrfs_root *root,
int btrfs_block_rsv_refill(struct btrfs_root *root,
struct btrfs_block_rsv *block_rsv,
u64 min_reserved);
+int btrfs_block_rsv_refill_noflush(struct btrfs_root *root,
+ struct btrfs_block_rsv *block_rsv,
+ u64 min_reserved);
int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv,
struct btrfs_block_rsv *dst_rsv,
u64 num_bytes);
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index 3a1b939c9ae..5b163572e0c 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -617,12 +617,14 @@ static void btrfs_delayed_item_release_metadata(struct btrfs_root *root,
static int btrfs_delayed_inode_reserve_metadata(
struct btrfs_trans_handle *trans,
struct btrfs_root *root,
+ struct inode *inode,
struct btrfs_delayed_node *node)
{
struct btrfs_block_rsv *src_rsv;
struct btrfs_block_rsv *dst_rsv;
u64 num_bytes;
int ret;
+ int release = false;
src_rsv = trans->block_rsv;
dst_rsv = &root->fs_info->delayed_block_rsv;
@@ -652,12 +654,65 @@ static int btrfs_delayed_inode_reserve_metadata(
if (!ret)
node->bytes_reserved = num_bytes;
return ret;
+ } else if (src_rsv == &root->fs_info->delalloc_block_rsv) {
+ spin_lock(&BTRFS_I(inode)->lock);
+ if (BTRFS_I(inode)->delalloc_meta_reserved) {
+ BTRFS_I(inode)->delalloc_meta_reserved = 0;
+ spin_unlock(&BTRFS_I(inode)->lock);
+ release = true;
+ goto migrate;
+ }
+ spin_unlock(&BTRFS_I(inode)->lock);
+
+ /* Ok we didn't have space pre-reserved. This shouldn't happen
+ * too often but it can happen if we do delalloc to an existing
+ * inode which gets dirtied because of the time update, and then
+ * isn't touched again until after the transaction commits and
+ * then we try to write out the data. First try to be nice and
+ * reserve something strictly for us. If not be a pain and try
+ * to steal from the delalloc block rsv.
+ */
+ ret = btrfs_block_rsv_add_noflush(root, dst_rsv, num_bytes);
+ if (!ret)
+ goto out;
+
+ ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes);
+ if (!ret)
+ goto out;
+
+ /*
+ * Ok this is a problem, let's just steal from the global rsv
+ * since this really shouldn't happen that often.
+ */
+ WARN_ON(1);
+ ret = btrfs_block_rsv_migrate(&root->fs_info->global_block_rsv,
+ dst_rsv, num_bytes);
+ goto out;
}
+migrate:
ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes);
+
+out:
+ /*
+ * Migrate only takes a reservation, it doesn't touch the size of the
+ * block_rsv. This is to simplify people who don't normally have things
+ * migrated from their block rsv. If they go to release their
+ * reservation, that will decrease the size as well, so if migrate
+ * reduced size we'd end up with a negative size. But for the
+ * delalloc_meta_reserved stuff we will only know to drop 1 reservation,
+ * but we could in fact do this reserve/migrate dance several times
+ * between the time we did the original reservation and we'd clean it
+ * up. So to take care of this, release the space for the meta
+ * reservation here. I think it may be time for a documentation page on
+ * how block rsvs. work.
+ */
if (!ret)
node->bytes_reserved = num_bytes;
+ if (release)
+ btrfs_block_rsv_release(root, src_rsv, num_bytes);
+
return ret;
}
@@ -1708,7 +1763,8 @@ int btrfs_delayed_update_inode(struct btrfs_trans_handle *trans,
goto release_node;
}
- ret = btrfs_delayed_inode_reserve_metadata(trans, root, delayed_node);
+ ret = btrfs_delayed_inode_reserve_metadata(trans, root, inode,
+ delayed_node);
if (ret)
goto release_node;
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 102c176fc29..632f8f3cc9d 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -620,7 +620,7 @@ out:
static int btree_io_failed_hook(struct bio *failed_bio,
struct page *page, u64 start, u64 end,
- u64 mirror_num, struct extent_state *state)
+ int mirror_num, struct extent_state *state)
{
struct extent_io_tree *tree;
unsigned long len;
@@ -1890,31 +1890,32 @@ struct btrfs_root *open_ctree(struct super_block *sb,
u64 features;
struct btrfs_key location;
struct buffer_head *bh;
- struct btrfs_root *extent_root = kzalloc(sizeof(struct btrfs_root),
- GFP_NOFS);
- struct btrfs_root *csum_root = kzalloc(sizeof(struct btrfs_root),
- GFP_NOFS);
+ struct btrfs_super_block *disk_super;
struct btrfs_root *tree_root = btrfs_sb(sb);
- struct btrfs_fs_info *fs_info = NULL;
- struct btrfs_root *chunk_root = kzalloc(sizeof(struct btrfs_root),
- GFP_NOFS);
- struct btrfs_root *dev_root = kzalloc(sizeof(struct btrfs_root),
- GFP_NOFS);
+ struct btrfs_fs_info *fs_info = tree_root->fs_info;
+ struct btrfs_root *extent_root;
+ struct btrfs_root *csum_root;
+ struct btrfs_root *chunk_root;
+ struct btrfs_root *dev_root;
struct btrfs_root *log_tree_root;
-
int ret;
int err = -EINVAL;
int num_backups_tried = 0;
int backup_index = 0;
- struct btrfs_super_block *disk_super;
+ extent_root = fs_info->extent_root =
+ kzalloc(sizeof(struct btrfs_root), GFP_NOFS);
+ csum_root = fs_info->csum_root =
+ kzalloc(sizeof(struct btrfs_root), GFP_NOFS);
+ chunk_root = fs_info->chunk_root =
+ kzalloc(sizeof(struct btrfs_root), GFP_NOFS);
+ dev_root = fs_info->dev_root =
+ kzalloc(sizeof(struct btrfs_root), GFP_NOFS);
- if (!extent_root || !tree_root || !tree_root->fs_info ||
- !chunk_root || !dev_root || !csum_root) {
+ if (!extent_root || !csum_root || !chunk_root || !dev_root) {
err = -ENOMEM;
goto fail;
}
- fs_info = tree_root->fs_info;
ret = init_srcu_struct(&fs_info->subvol_srcu);
if (ret) {
@@ -1954,12 +1955,6 @@ struct btrfs_root *open_ctree(struct super_block *sb,
mutex_init(&fs_info->reloc_mutex);
init_completion(&fs_info->kobj_unregister);
- fs_info->tree_root = tree_root;
- fs_info->extent_root = extent_root;
- fs_info->csum_root = csum_root;
- fs_info->chunk_root = chunk_root;
- fs_info->dev_root = dev_root;
- fs_info->fs_devices = fs_devices;
INIT_LIST_HEAD(&fs_info->dirty_cowonly_roots);
INIT_LIST_HEAD(&fs_info->space_info);
btrfs_mapping_init(&fs_info->mapping_tree);
@@ -2465,21 +2460,20 @@ fail_sb_buffer:
btrfs_stop_workers(&fs_info->caching_workers);
fail_alloc:
fail_iput:
+ btrfs_mapping_tree_free(&fs_info->mapping_tree);
+
invalidate_inode_pages2(fs_info->btree_inode->i_mapping);
iput(fs_info->btree_inode);
-
- btrfs_close_devices(fs_info->fs_devices);
- btrfs_mapping_tree_free(&fs_info->mapping_tree);
fail_bdi:
bdi_destroy(&fs_info->bdi);
fail_srcu:
cleanup_srcu_struct(&fs_info->subvol_srcu);
fail:
+ btrfs_close_devices(fs_info->fs_devices);
free_fs_info(fs_info);
return ERR_PTR(err);
recovery_tree_root:
-
if (!btrfs_test_opt(tree_root, RECOVERY))
goto fail_tree_roots;
@@ -2579,22 +2573,10 @@ static int write_dev_supers(struct btrfs_device *device,
int errors = 0;
u32 crc;
u64 bytenr;
- int last_barrier = 0;
if (max_mirrors == 0)
max_mirrors = BTRFS_SUPER_MIRROR_MAX;
- /* make sure only the last submit_bh does a barrier */
- if (do_barriers) {
- for (i = 0; i < max_mirrors; i++) {
- bytenr = btrfs_sb_offset(i);
- if (bytenr + BTRFS_SUPER_INFO_SIZE >=
- device->total_bytes)
- break;
- last_barrier = i;
- }
- }
-
for (i = 0; i < max_mirrors; i++) {
bytenr = btrfs_sb_offset(i);
if (bytenr + BTRFS_SUPER_INFO_SIZE >= device->total_bytes)
@@ -2640,17 +2622,136 @@ static int write_dev_supers(struct btrfs_device *device,
bh->b_end_io = btrfs_end_buffer_write_sync;
}
- if (i == last_barrier && do_barriers)
- ret = submit_bh(WRITE_FLUSH_FUA, bh);
- else
- ret = submit_bh(WRITE_SYNC, bh);
-
+ /*
+ * we fua the first super. The others we allow
+ * to go down lazy.
+ */
+ ret = submit_bh(WRITE_FUA, bh);
if (ret)
errors++;
}
return errors < i ? 0 : -1;
}
+/*
+ * endio for the write_dev_flush, this will wake anyone waiting
+ * for the barrier when it is done
+ */
+static void btrfs_end_empty_barrier(struct bio *bio, int err)
+{
+ if (err) {
+ if (err == -EOPNOTSUPP)
+ set_bit(BIO_EOPNOTSUPP, &bio->bi_flags);
+ clear_bit(BIO_UPTODATE, &bio->bi_flags);
+ }
+ if (bio->bi_private)
+ complete(bio->bi_private);
+ bio_put(bio);
+}
+
+/*
+ * trigger flushes for one the devices. If you pass wait == 0, the flushes are
+ * sent down. With wait == 1, it waits for the previous flush.
+ *
+ * any device where the flush fails with eopnotsupp are flagged as not-barrier
+ * capable
+ */
+static int write_dev_flush(struct btrfs_device *device, int wait)
+{
+ struct bio *bio;
+ int ret = 0;
+
+ if (device->nobarriers)
+ return 0;
+
+ if (wait) {
+ bio = device->flush_bio;
+ if (!bio)
+ return 0;
+
+ wait_for_completion(&device->flush_wait);
+
+ if (bio_flagged(bio, BIO_EOPNOTSUPP)) {
+ printk("btrfs: disabling barriers on dev %s\n",
+ device->name);
+ device->nobarriers = 1;
+ }
+ if (!bio_flagged(bio, BIO_UPTODATE)) {
+ ret = -EIO;
+ }
+
+ /* drop the reference from the wait == 0 run */
+ bio_put(bio);
+ device->flush_bio = NULL;
+
+ return ret;
+ }
+
+ /*
+ * one reference for us, and we leave it for the
+ * caller
+ */
+ device->flush_bio = NULL;;
+ bio = bio_alloc(GFP_NOFS, 0);
+ if (!bio)
+ return -ENOMEM;
+
+ bio->bi_end_io = btrfs_end_empty_barrier;
+ bio->bi_bdev = device->bdev;
+ init_completion(&device->flush_wait);
+ bio->bi_private = &device->flush_wait;
+ device->flush_bio = bio;
+
+ bio_get(bio);
+ submit_bio(WRITE_FLUSH, bio);
+
+ return 0;
+}
+
+/*
+ * send an empty flush down to each device in parallel,
+ * then wait for them
+ */
+static int barrier_all_devices(struct btrfs_fs_info *info)
+{
+ struct list_head *head;
+ struct btrfs_device *dev;
+ int errors = 0;
+ int ret;
+
+ /* send down all the barriers */
+ head = &info->fs_devices->devices;
+ list_for_each_entry_rcu(dev, head, dev_list) {
+ if (!dev->bdev) {
+ errors++;
+ continue;
+ }
+ if (!dev->in_fs_metadata || !dev->writeable)
+ continue;
+
+ ret = write_dev_flush(dev, 0);
+ if (ret)
+ errors++;
+ }
+
+ /* wait for all the barriers */
+ list_for_each_entry_rcu(dev, head, dev_list) {
+ if (!dev->bdev) {
+ errors++;
+ continue;
+ }
+ if (!dev->in_fs_metadata || !dev->writeable)
+ continue;
+
+ ret = write_dev_flush(dev, 1);
+ if (ret)
+ errors++;
+ }
+ if (errors)
+ return -EIO;
+ return 0;
+}
+
int write_all_supers(struct btrfs_root *root, int max_mirrors)
{
struct list_head *head;
@@ -2672,6 +2773,10 @@ int write_all_supers(struct btrfs_root *root, int max_mirrors)
mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
head = &root->fs_info->fs_devices->devices;
+
+ if (do_barriers)
+ barrier_all_devices(root->fs_info);
+
list_for_each_entry_rcu(dev, head, dev_list) {
if (!dev->bdev) {
total_errors++;
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 9879bd47463..2ad813674d7 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -467,13 +467,59 @@ static int cache_block_group(struct btrfs_block_group_cache *cache,
struct btrfs_root *root,
int load_cache_only)
{
+ DEFINE_WAIT(wait);
struct btrfs_fs_info *fs_info = cache->fs_info;
struct btrfs_caching_control *caching_ctl;
int ret = 0;
- smp_mb();
- if (cache->cached != BTRFS_CACHE_NO)
+ caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_NOFS);
+ BUG_ON(!caching_ctl);
+
+ INIT_LIST_HEAD(&caching_ctl->list);
+ mutex_init(&caching_ctl->mutex);
+ init_waitqueue_head(&caching_ctl->wait);
+ caching_ctl->block_group = cache;
+ caching_ctl->progress = cache->key.objectid;
+ atomic_set(&caching_ctl->count, 1);
+ caching_ctl->work.func = caching_thread;
+
+ spin_lock(&cache->lock);
+ /*
+ * This should be a rare occasion, but this could happen I think in the
+ * case where one thread starts to load the space cache info, and then
+ * some other thread starts a transaction commit which tries to do an
+ * allocation while the other thread is still loading the space cache
+ * info. The previous loop should have kept us from choosing this block
+ * group, but if we've moved to the state where we will wait on caching
+ * block groups we need to first check if we're doing a fast load here,
+ * so we can wait for it to finish, otherwise we could end up allocating
+ * from a block group who's cache gets evicted for one reason or
+ * another.
+ */
+ while (cache->cached == BTRFS_CACHE_FAST) {
+ struct btrfs_caching_control *ctl;
+
+ ctl = cache->caching_ctl;
+ atomic_inc(&ctl->count);
+ prepare_to_wait(&ctl->wait, &wait, TASK_UNINTERRUPTIBLE);
+ spin_unlock(&cache->lock);
+
+ schedule();
+
+ finish_wait(&ctl->wait, &wait);
+ put_caching_control(ctl);
+ spin_lock(&cache->lock);
+ }
+
+ if (cache->cached != BTRFS_CACHE_NO) {
+ spin_unlock(&cache->lock);
+ kfree(caching_ctl);
return 0;
+ }
+ WARN_ON(cache->caching_ctl);
+ cache->caching_ctl = caching_ctl;
+ cache->cached = BTRFS_CACHE_FAST;
+ spin_unlock(&cache->lock);
/*
* We can't do the read from on-disk cache during a commit since we need
@@ -484,56 +530,51 @@ static int cache_block_group(struct btrfs_block_group_cache *cache,
if (trans && (!trans->transaction->in_commit) &&
(root && root != root->fs_info->tree_root) &&
btrfs_test_opt(root, SPACE_CACHE)) {
- spin_lock(&cache->lock);
- if (cache->cached != BTRFS_CACHE_NO) {
- spin_unlock(&cache->lock);
- return 0;
- }
- cache->cached = BTRFS_CACHE_STARTED;
- spin_unlock(&cache->lock);
-
ret = load_free_space_cache(fs_info, cache);
spin_lock(&cache->lock);
if (ret == 1) {
+ cache->caching_ctl = NULL;
cache->cached = BTRFS_CACHE_FINISHED;
cache->last_byte_to_unpin = (u64)-1;
} else {
- cache->cached = BTRFS_CACHE_NO;
+ if (load_cache_only) {
+ cache->caching_ctl = NULL;
+ cache->cached = BTRFS_CACHE_NO;
+ } else {
+ cache->cached = BTRFS_CACHE_STARTED;
+ }
}
spin_unlock(&cache->lock);
+ wake_up(&caching_ctl->wait);
if (ret == 1) {
+ put_caching_control(caching_ctl);
free_excluded_extents(fs_info->extent_root, cache);
return 0;
}
+ } else {
+ /*
+ * We are not going to do the fast caching, set cached to the
+ * appropriate value and wakeup any waiters.
+ */
+ spin_lock(&cache->lock);
+ if (load_cache_only) {
+ cache->caching_ctl = NULL;
+ cache->cached = BTRFS_CACHE_NO;
+ } else {
+ cache->cached = BTRFS_CACHE_STARTED;
+ }
+ spin_unlock(&cache->lock);
+ wake_up(&caching_ctl->wait);
}
- if (load_cache_only)
- return 0;
-
- caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_NOFS);
- BUG_ON(!caching_ctl);
-
- INIT_LIST_HEAD(&caching_ctl->list);
- mutex_init(&caching_ctl->mutex);
- init_waitqueue_head(&caching_ctl->wait);
- caching_ctl->block_group = cache;
- caching_ctl->progress = cache->key.objectid;
- /* one for caching kthread, one for caching block group list */
- atomic_set(&caching_ctl->count, 2);
- caching_ctl->work.func = caching_thread;
-
- spin_lock(&cache->lock);
- if (cache->cached != BTRFS_CACHE_NO) {
- spin_unlock(&cache->lock);
- kfree(caching_ctl);
+ if (load_cache_only) {
+ put_caching_control(caching_ctl);
return 0;
}
- cache->caching_ctl = caching_ctl;
- cache->cached = BTRFS_CACHE_STARTED;
- spin_unlock(&cache->lock);
down_write(&fs_info->extent_commit_sem);
+ atomic_inc(&caching_ctl->count);
list_add_tail(&caching_ctl->list, &fs_info->caching_block_groups);
up_write(&fs_info->extent_commit_sem);
@@ -3797,16 +3838,16 @@ void btrfs_free_block_rsv(struct btrfs_root *root,
kfree(rsv);
}
-int btrfs_block_rsv_add(struct btrfs_root *root,
- struct btrfs_block_rsv *block_rsv,
- u64 num_bytes)
+static inline int __block_rsv_add(struct btrfs_root *root,
+ struct btrfs_block_rsv *block_rsv,
+ u64 num_bytes, int flush)
{
int ret;
if (num_bytes == 0)
return 0;
- ret = reserve_metadata_bytes(root, block_rsv, num_bytes, 1);
+ ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush);
if (!ret) {
block_rsv_add_bytes(block_rsv, num_bytes, 1);
return 0;
@@ -3815,22 +3856,18 @@ int btrfs_block_rsv_add(struct btrfs_root *root,
return ret;
}
+int btrfs_block_rsv_add(struct btrfs_root *root,
+ struct btrfs_block_rsv *block_rsv,
+ u64 num_bytes)
+{
+ return __block_rsv_add(root, block_rsv, num_bytes, 1);
+}
+
int btrfs_block_rsv_add_noflush(struct btrfs_root *root,
struct btrfs_block_rsv *block_rsv,
u64 num_bytes)
{
- int ret;
-
- if (num_bytes == 0)
- return 0;
-
- ret = reserve_metadata_bytes(root, block_rsv, num_bytes, 0);
- if (!ret) {
- block_rsv_add_bytes(block_rsv, num_bytes, 1);
- return 0;
- }
-
- return ret;
+ return __block_rsv_add(root, block_rsv, num_bytes, 0);
}
int btrfs_block_rsv_check(struct btrfs_root *root,
@@ -3851,9 +3888,9 @@ int btrfs_block_rsv_check(struct btrfs_root *root,
return ret;
}
-int btrfs_block_rsv_refill(struct btrfs_root *root,
- struct btrfs_block_rsv *block_rsv,
- u64 min_reserved)
+static inline int __btrfs_block_rsv_refill(struct btrfs_root *root,
+ struct btrfs_block_rsv *block_rsv,
+ u64 min_reserved, int flush)
{
u64 num_bytes = 0;
int ret = -ENOSPC;
@@ -3872,7 +3909,7 @@ int btrfs_block_rsv_refill(struct btrfs_root *root,
if (!ret)
return 0;
- ret = reserve_metadata_bytes(root, block_rsv, num_bytes, 1);
+ ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush);
if (!ret) {
block_rsv_add_bytes(block_rsv, num_bytes, 0);
return 0;
@@ -3881,6 +3918,20 @@ int btrfs_block_rsv_refill(struct btrfs_root *root,
return ret;
}
+int btrfs_block_rsv_refill(struct btrfs_root *root,
+ struct btrfs_block_rsv *block_rsv,
+ u64 min_reserved)
+{
+ return __btrfs_block_rsv_refill(root, block_rsv, min_reserved, 1);
+}
+
+int btrfs_block_rsv_refill_noflush(struct btrfs_root *root,
+ struct btrfs_block_rsv *block_rsv,
+ u64 min_reserved)
+{
+ return __btrfs_block_rsv_refill(root, block_rsv, min_reserved, 0);
+}
+
int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv,
struct btrfs_block_rsv *dst_rsv,
u64 num_bytes)
@@ -4064,23 +4115,30 @@ int btrfs_snap_reserve_metadata(struct btrfs_trans_handle *trans,
*/
static unsigned drop_outstanding_extent(struct inode *inode)
{
+ unsigned drop_inode_space = 0;
unsigned dropped_extents = 0;
BUG_ON(!BTRFS_I(inode)->outstanding_extents);
BTRFS_I(inode)->outstanding_extents--;
+ if (BTRFS_I(inode)->outstanding_extents == 0 &&
+ BTRFS_I(inode)->delalloc_meta_reserved) {
+ drop_inode_space = 1;
+ BTRFS_I(inode)->delalloc_meta_reserved = 0;
+ }
+
/*
* If we have more or the same amount of outsanding extents than we have
* reserved then we need to leave the reserved extents count alone.
*/
if (BTRFS_I(inode)->outstanding_extents >=
BTRFS_I(inode)->reserved_extents)
- return 0;
+ return drop_inode_space;
dropped_extents = BTRFS_I(inode)->reserved_extents -
BTRFS_I(inode)->outstanding_extents;
BTRFS_I(inode)->reserved_extents -= dropped_extents;
- return dropped_extents;
+ return dropped_extents + drop_inode_space;
}
/**
@@ -4166,9 +4224,18 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
nr_extents = BTRFS_I(inode)->outstanding_extents -
BTRFS_I(inode)->reserved_extents;
BTRFS_I(inode)->reserved_extents += nr_extents;
+ }
- to_reserve = btrfs_calc_trans_metadata_size(root, nr_extents);
+ /*
+ * Add an item to reserve for updating the inode when we complete the
+ * delalloc io.
+ */
+ if (!BTRFS_I(inode)->delalloc_meta_reserved) {
+ nr_extents++;
+ BTRFS_I(inode)->delalloc_meta_reserved = 1;
}
+
+ to_reserve = btrfs_calc_trans_metadata_size(root, nr_extents);
to_reserve += calc_csum_metadata_size(inode, num_bytes, 1);
spin_unlock(&BTRFS_I(inode)->lock);
@@ -5040,11 +5107,11 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
struct btrfs_root *root = orig_root->fs_info->extent_root;
struct btrfs_free_cluster *last_ptr = NULL;
struct btrfs_block_group_cache *block_group = NULL;
+ struct btrfs_block_group_cache *used_block_group;
int empty_cluster = 2 * 1024 * 1024;
int allowed_chunk_alloc = 0;
int done_chunk_alloc = 0;
struct btrfs_space_info *space_info;
- int last_ptr_loop = 0;
int loop = 0;
int index = 0;
int alloc_type = (data & BTRFS_BLOCK_GROUP_DATA) ?
@@ -5106,6 +5173,7 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
ideal_cache:
block_group = btrfs_lookup_block_group(root->fs_info,
search_start);
+ used_block_group = block_group;
/*
* we don't want to use the block group if it doesn't match our
* allocation bits, or if its not cached.
@@ -5143,6 +5211,7 @@ search:
u64 offset;
int cached;
+ used_block_group = block_group;
btrfs_get_block_group(block_group);
search_start = block_group->key.objectid;
@@ -5166,13 +5235,15 @@ search:
}
have_block_group:
- if (unlikely(block_group->cached == BTRFS_CACHE_NO)) {
+ cached = block_group_cache_done(block_group);
+ if (unlikely(!cached)) {
u64 free_percent;
+ found_uncached_bg = true;
ret = cache_block_group(block_group, trans,
orig_root, 1);
if (block_group->cached == BTRFS_CACHE_FINISHED)
- goto have_block_group;
+ goto alloc;
free_percent = btrfs_block_group_used(&block_group->item);
free_percent *= 100;
@@ -5194,7 +5265,6 @@ have_block_group:
orig_root, 0);
BUG_ON(ret);
}
- found_uncached_bg = true;
/*
* If loop is set for cached only, try the next block
@@ -5204,94 +5274,80 @@ have_block_group:
goto loop;
}
- cached = block_group_cache_done(block_group);
- if (unlikely(!cached))
- found_uncached_bg = true;
-
+alloc:
if (unlikely(block_group->ro))
goto loop;
spin_lock(&block_group->free_space_ctl->tree_lock);
if (cached &&
block_group->free_space_ctl->free_space <
- num_bytes + empty_size) {
+ num_bytes + empty_cluster + empty_size) {
spin_unlock(&block_group->free_space_ctl->tree_lock);
goto loop;
}
spin_unlock(&block_group->free_space_ctl->tree_lock);
/*
- * Ok we want to try and use the cluster allocator, so lets look
- * there, unless we are on LOOP_NO_EMPTY_SIZE, since we will
- * have tried the cluster allocator plenty of times at this
- * point and not have found anything, so we are likely way too
- * fragmented for the clustering stuff to find anything, so lets
- * just skip it and let the allocator find whatever block it can
- * find
+ * Ok we want to try and use the cluster allocator, so
+ * lets look there
*/
- if (last_ptr && loop < LOOP_NO_EMPTY_SIZE) {
+ if (last_ptr) {
/*
* the refill lock keeps out other
* people trying to start a new cluster
*/
spin_lock(&last_ptr->refill_lock);
- if (last_ptr->block_group &&
- (last_ptr->block_group->ro ||
- !block_group_bits(last_ptr->block_group, data))) {
- offset = 0;
+ used_block_group = last_ptr->block_group;
+ if (used_block_group != block_group &&
+ (!used_block_group ||
+ used_block_group->ro ||
+ !block_group_bits(used_block_group, data))) {
+ used_block_group = block_group;
goto refill_cluster;
}
- offset = btrfs_alloc_from_cluster(block_group, last_ptr,
- num_bytes, search_start);
+ if (used_block_group != block_group)
+ btrfs_get_block_group(used_block_group);
+
+ offset = btrfs_alloc_from_cluster(used_block_group,
+ last_ptr, num_bytes, used_block_group->key.objectid);
if (offset) {
/* we have a block, we're done */
spin_unlock(&last_ptr->refill_lock);
goto checks;
}
- spin_lock(&last_ptr->lock);
- /*
- * whoops, this cluster doesn't actually point to
- * this block group. Get a ref on the block
- * group is does point to and try again
- */
- if (!last_ptr_loop && last_ptr->block_group &&
- last_ptr->block_group != block_group &&
- index <=
- get_block_group_index(last_ptr->block_group)) {
-
- btrfs_put_block_group(block_group);
- block_group = last_ptr->block_group;
- btrfs_get_block_group(block_group);
- spin_unlock(&last_ptr->lock);
- spin_unlock(&last_ptr->refill_lock);
-
- last_ptr_loop = 1;
- search_start = block_group->key.objectid;
- /*
- * we know this block group is properly
- * in the list because
- * btrfs_remove_block_group, drops the
- * cluster before it removes the block
- * group from the list
- */
- goto have_block_group;
+ WARN_ON(last_ptr->block_group != used_block_group);
+ if (used_block_group != block_group) {
+ btrfs_put_block_group(used_block_group);
+ used_block_group = block_group;
}
- spin_unlock(&last_ptr->lock);
refill_cluster:
+ BUG_ON(used_block_group != block_group);
+ /* If we are on LOOP_NO_EMPTY_SIZE, we can't
+ * set up a new clusters, so lets just skip it
+ * and let the allocator find whatever block
+ * it can find. If we reach this point, we
+ * will have tried the cluster allocator
+ * plenty of times and not have found
+ * anything, so we are likely way too
+ * fragmented for the clustering stuff to find
+ * anything. */
+ if (loop >= LOOP_NO_EMPTY_SIZE) {
+ spin_unlock(&last_ptr->refill_lock);
+ goto unclustered_alloc;
+ }
+
/*
* this cluster didn't work out, free it and
* start over
*/
btrfs_return_cluster_to_free_space(NULL, last_ptr);
- last_ptr_loop = 0;
-
/* allocate a cluster in this block group */
ret = btrfs_find_space_cluster(trans, root,
block_group, last_ptr,
- offset, num_bytes,
+ search_start, num_bytes,
empty_cluster + empty_size);
if (ret == 0) {
/*
@@ -5327,6 +5383,7 @@ refill_cluster:
goto loop;
}
+unclustered_alloc:
offset = btrfs_find_space_for_alloc(block_group, search_start,
num_bytes, empty_size);
/*
@@ -5353,14 +5410,14 @@ checks:
search_start = stripe_align(root, offset);
/* move on to the next group */
if (search_start + num_bytes >= search_end) {
- btrfs_add_free_space(block_group, offset, num_bytes);
+ btrfs_add_free_space(used_block_group, offset, num_bytes);
goto loop;
}
/* move on to the next group */
if (search_start + num_bytes >
- block_group->key.objectid + block_group->key.offset) {
- btrfs_add_free_space(block_group, offset, num_bytes);
+ used_block_group->key.objectid + used_block_group->key.offset) {
+ btrfs_add_free_space(used_block_group, offset, num_bytes);
goto loop;
}
@@ -5368,14 +5425,14 @@ checks:
ins->offset = num_bytes;
if (offset < search_start)
- btrfs_add_free_space(block_group, offset,
+ btrfs_add_free_space(used_block_group, offset,
search_start - offset);
BUG_ON(offset > search_start);
- ret = btrfs_update_reserved_bytes(block_group, num_bytes,
+ ret = btrfs_update_reserved_bytes(used_block_group, num_bytes,
alloc_type);
if (ret == -EAGAIN) {
- btrfs_add_free_space(block_group, offset, num_bytes);
+ btrfs_add_free_space(used_block_group, offset, num_bytes);
goto loop;
}
@@ -5384,15 +5441,19 @@ checks:
ins->offset = num_bytes;
if (offset < search_start)
- btrfs_add_free_space(block_group, offset,
+ btrfs_add_free_space(used_block_group, offset,
search_start - offset);
BUG_ON(offset > search_start);
+ if (used_block_group != block_group)
+ btrfs_put_block_group(used_block_group);
btrfs_put_block_group(block_group);
break;
loop:
failed_cluster_refill = false;
failed_alloc = false;
BUG_ON(index != get_block_group_index(block_group));
+ if (used_block_group != block_group)
+ btrfs_put_block_group(used_block_group);
btrfs_put_block_group(block_group);
}
up_read(&space_info->groups_sem);
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 1f87c4d0e7a..49f3c9dc09f 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -935,8 +935,10 @@ again:
node = tree_search(tree, start);
if (!node) {
prealloc = alloc_extent_state_atomic(prealloc);
- if (!prealloc)
- return -ENOMEM;
+ if (!prealloc) {
+ err = -ENOMEM;
+ goto out;
+ }
err = insert_state(tree, prealloc, start, end, &bits);
prealloc = NULL;
BUG_ON(err == -EEXIST);
@@ -992,8 +994,10 @@ hit_next:
*/
if (state->start < start) {
prealloc = alloc_extent_state_atomic(prealloc);
- if (!prealloc)
- return -ENOMEM;
+ if (!prealloc) {
+ err = -ENOMEM;
+ goto out;
+ }
err = split_state(tree, state, prealloc, start);
BUG_ON(err == -EEXIST);
prealloc = NULL;
@@ -1024,8 +1028,10 @@ hit_next:
this_end = last_start - 1;
prealloc = alloc_extent_state_atomic(prealloc);
- if (!prealloc)
- return -ENOMEM;
+ if (!prealloc) {
+ err = -ENOMEM;
+ goto out;
+ }
/*
* Avoid to free 'prealloc' if it can be merged with
@@ -1051,8 +1057,10 @@ hit_next:
*/
if (state->start <= end && state->end > end) {
prealloc = alloc_extent_state_atomic(prealloc);
- if (!prealloc)
- return -ENOMEM;
+ if (!prealloc) {
+ err = -ENOMEM;
+ goto out;
+ }
err = split_state(tree, state, prealloc, end + 1);
BUG_ON(err == -EEXIST);
@@ -2285,16 +2293,22 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
clean_io_failure(start, page);
}
if (!uptodate) {
- u64 failed_mirror;
- failed_mirror = (u64)bio->bi_bdev;
- if (tree->ops && tree->ops->readpage_io_failed_hook)
- ret = tree->ops->readpage_io_failed_hook(
- bio, page, start, end,
- failed_mirror, state);
- else
- ret = bio_readpage_error(bio, page, start, end,
- failed_mirror, NULL);
+ int failed_mirror;
+ failed_mirror = (int)(unsigned long)bio->bi_bdev;
+ /*
+ * The generic bio_readpage_error handles errors the
+ * following way: If possible, new read requests are
+ * created and submitted and will end up in
+ * end_bio_extent_readpage as well (if we're lucky, not
+ * in the !uptodate case). In that case it returns 0 and
+ * we just go on with the next page in our bio. If it
+ * can't handle the error it will return -EIO and we
+ * remain responsible for that page.
+ */
+ ret = bio_readpage_error(bio, page, start, end,
+ failed_mirror, NULL);
if (ret == 0) {
+error_handled:
uptodate =
test_bit(BIO_UPTODATE, &bio->bi_flags);
if (err)
@@ -2302,6 +2316,13 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
uncache_state(&cached);
continue;
}
+ if (tree->ops && tree->ops->readpage_io_failed_hook) {
+ ret = tree->ops->readpage_io_failed_hook(
+ bio, page, start, end,
+ failed_mirror, state);
+ if (ret == 0)
+ goto error_handled;
+ }
}
if (uptodate) {
@@ -3366,6 +3387,9 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
return -ENOMEM;
path->leave_spinning = 1;
+ start = ALIGN(start, BTRFS_I(inode)->root->sectorsize);
+ len = ALIGN(len, BTRFS_I(inode)->root->sectorsize);
+
/*
* lookup the last file extent. We're not using i_size here
* because there might be preallocation past i_size
@@ -3413,7 +3437,7 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
lock_extent_bits(&BTRFS_I(inode)->io_tree, start, start + len, 0,
&cached_state, GFP_NOFS);
- em = get_extent_skip_holes(inode, off, last_for_get_extent,
+ em = get_extent_skip_holes(inode, start, last_for_get_extent,
get_extent);
if (!em)
goto out;
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index feb9be0e23b..7604c300132 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -70,7 +70,7 @@ struct extent_io_ops {
unsigned long bio_flags);
int (*readpage_io_hook)(struct page *page, u64 start, u64 end);
int (*readpage_io_failed_hook)(struct bio *bio, struct page *page,
- u64 start, u64 end, u64 failed_mirror,
+ u64 start, u64 end, int failed_mirror,
struct extent_state *state);
int (*writepage_io_failed_hook)(struct bio *bio, struct page *page,
u64 start, u64 end,
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 7a15fcfb3e1..ec23d43d0c3 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -351,6 +351,11 @@ static int io_ctl_prepare_pages(struct io_ctl *io_ctl, struct inode *inode,
}
}
+ for (i = 0; i < io_ctl->num_pages; i++) {
+ clear_page_dirty_for_io(io_ctl->pages[i]);
+ set_page_extent_mapped(io_ctl->pages[i]);
+ }
+
return 0;
}
@@ -537,6 +542,13 @@ static int io_ctl_read_entry(struct io_ctl *io_ctl,
struct btrfs_free_space *entry, u8 *type)
{
struct btrfs_free_space_entry *e;
+ int ret;
+
+ if (!io_ctl->cur) {
+ ret = io_ctl_check_crc(io_ctl, io_ctl->index);
+ if (ret)
+ return ret;
+ }
e = io_ctl->cur;
entry->offset = le64_to_cpu(e->offset);
@@ -550,10 +562,7 @@ static int io_ctl_read_entry(struct io_ctl *io_ctl,
io_ctl_unmap_page(io_ctl);
- if (io_ctl->index >= io_ctl->num_pages)
- return 0;
-
- return io_ctl_check_crc(io_ctl, io_ctl->index);
+ return 0;
}
static int io_ctl_read_bitmap(struct io_ctl *io_ctl,
@@ -561,9 +570,6 @@ static int io_ctl_read_bitmap(struct io_ctl *io_ctl,
{
int ret;
- if (io_ctl->cur && io_ctl->cur != io_ctl->orig)
- io_ctl_unmap_page(io_ctl);
-
ret = io_ctl_check_crc(io_ctl, io_ctl->index);
if (ret)
return ret;
@@ -699,6 +705,8 @@ int __load_free_space_cache(struct btrfs_root *root, struct inode *inode,
num_entries--;
}
+ io_ctl_unmap_page(&io_ctl);
+
/*
* We add the bitmaps at the end of the entries in order that
* the bitmap entries are added to the cache.
@@ -1462,6 +1470,7 @@ static void add_new_bitmap(struct btrfs_free_space_ctl *ctl,
{
info->offset = offset_to_bitmap(ctl, offset);
info->bytes = 0;
+ INIT_LIST_HEAD(&info->list);
link_free_space(ctl, info);
ctl->total_bitmaps++;
@@ -1841,7 +1850,13 @@ again:
info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset),
1, 0);
if (!info) {
- WARN_ON(1);
+ /* the tree logging code might be calling us before we
+ * have fully loaded the free space rbtree for this
+ * block group. So it is possible the entry won't
+ * be in the rbtree yet at all. The caching code
+ * will make sure not to put it in the rbtree if
+ * the logging code has pinned it.
+ */
goto out_lock;
}
}
@@ -2305,6 +2320,7 @@ again:
if (!found) {
start = i;
+ cluster->max_size = 0;
found = true;
}
@@ -2448,16 +2464,23 @@ setup_cluster_bitmap(struct btrfs_block_group_cache *block_group,
{
struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
struct btrfs_free_space *entry;
- struct rb_node *node;
int ret = -ENOSPC;
+ u64 bitmap_offset = offset_to_bitmap(ctl, offset);
if (ctl->total_bitmaps == 0)
return -ENOSPC;
/*
- * First check our cached list of bitmaps and see if there is an entry
- * here that will work.
+ * The bitmap that covers offset won't be in the list unless offset
+ * is just its start offset.
*/
+ entry = list_first_entry(bitmaps, struct btrfs_free_space, list);
+ if (entry->offset != bitmap_offset) {
+ entry = tree_search_offset(ctl, bitmap_offset, 1, 0);
+ if (entry && list_empty(&entry->list))
+ list_add(&entry->list, bitmaps);
+ }
+
list_for_each_entry(entry, bitmaps, list) {
if (entry->bytes < min_bytes)
continue;
@@ -2468,38 +2491,10 @@ setup_cluster_bitmap(struct btrfs_block_group_cache *block_group,
}
/*
- * If we do have entries on our list and we are here then we didn't find
- * anything, so go ahead and get the next entry after the last entry in
- * this list and start the search from there.
+ * The bitmaps list has all the bitmaps that record free space
+ * starting after offset, so no more search is required.
*/
- if (!list_empty(bitmaps)) {
- entry = list_entry(bitmaps->prev, struct btrfs_free_space,
- list);
- node = rb_next(&entry->offset_index);
- if (!node)
- return -ENOSPC;
- entry = rb_entry(node, struct btrfs_free_space, offset_index);
- goto search;
- }
-
- entry = tree_search_offset(ctl, offset_to_bitmap(ctl, offset), 0, 1);
- if (!entry)
- return -ENOSPC;
-
-search:
- node = &entry->offset_index;
- do {
- entry = rb_entry(node, struct btrfs_free_space, offset_index);
- node = rb_next(&entry->offset_index);
- if (!entry->bitmap)
- continue;
- if (entry->bytes < min_bytes)
- continue;
- ret = btrfs_bitmap_cluster(block_group, entry, cluster, offset,
- bytes, min_bytes);
- } while (ret && node);
-
- return ret;
+ return -ENOSPC;
}
/*
@@ -2517,8 +2512,8 @@ int btrfs_find_space_cluster(struct btrfs_trans_handle *trans,
u64 offset, u64 bytes, u64 empty_size)
{
struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
- struct list_head bitmaps;
struct btrfs_free_space *entry, *tmp;
+ LIST_HEAD(bitmaps);
u64 min_bytes;
int ret;
@@ -2557,7 +2552,6 @@ int btrfs_find_space_cluster(struct btrfs_trans_handle *trans,
goto out;
}
- INIT_LIST_HEAD(&bitmaps);
ret = setup_cluster_no_bitmap(block_group, cluster, &bitmaps, offset,
bytes, min_bytes);
if (ret)
diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c
index 53dcbdf446c..f8962a957d6 100644
--- a/fs/btrfs/inode-map.c
+++ b/fs/btrfs/inode-map.c
@@ -398,6 +398,8 @@ int btrfs_save_ino_cache(struct btrfs_root *root,
struct btrfs_free_space_ctl *ctl = root->free_ino_ctl;
struct btrfs_path *path;
struct inode *inode;
+ struct btrfs_block_rsv *rsv;
+ u64 num_bytes;
u64 alloc_hint = 0;
int ret;
int prealloc;
@@ -421,11 +423,26 @@ int btrfs_save_ino_cache(struct btrfs_root *root,
if (!path)
return -ENOMEM;
+ rsv = trans->block_rsv;
+ trans->block_rsv = &root->fs_info->trans_block_rsv;
+
+ num_bytes = trans->bytes_reserved;
+ /*
+ * 1 item for inode item insertion if need
+ * 3 items for inode item update (in the worst case)
+ * 1 item for free space object
+ * 3 items for pre-allocation
+ */
+ trans->bytes_reserved = btrfs_calc_trans_metadata_size(root, 8);
+ ret = btrfs_block_rsv_add_noflush(root, trans->block_rsv,
+ trans->bytes_reserved);
+ if (ret)
+ goto out;
again:
inode = lookup_free_ino_inode(root, path);
if (IS_ERR(inode) && PTR_ERR(inode) != -ENOENT) {
ret = PTR_ERR(inode);
- goto out;
+ goto out_release;
}
if (IS_ERR(inode)) {
@@ -434,7 +451,7 @@ again:
ret = create_free_ino_inode(root, trans, path);
if (ret)
- goto out;
+ goto out_release;
goto again;
}
@@ -477,11 +494,14 @@ again:
}
btrfs_free_reserved_data_space(inode, prealloc);
+ ret = btrfs_write_out_ino_cache(root, trans, path);
out_put:
iput(inode);
+out_release:
+ btrfs_block_rsv_release(root, trans->block_rsv, trans->bytes_reserved);
out:
- if (ret == 0)
- ret = btrfs_write_out_ino_cache(root, trans, path);
+ trans->block_rsv = rsv;
+ trans->bytes_reserved = num_bytes;
btrfs_free_path(path);
return ret;
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 966ddcc4c63..2c984f7d4c2 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -93,6 +93,8 @@ static noinline int cow_file_range(struct inode *inode,
struct page *locked_page,
u64 start, u64 end, int *page_started,
unsigned long *nr_written, int unlock);
+static noinline int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, struct inode *inode);
static int btrfs_init_inode_security(struct btrfs_trans_handle *trans,
struct inode *inode, struct inode *dir,
@@ -1741,7 +1743,7 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
trans = btrfs_join_transaction(root);
BUG_ON(IS_ERR(trans));
trans->block_rsv = &root->fs_info->delalloc_block_rsv;
- ret = btrfs_update_inode(trans, root, inode);
+ ret = btrfs_update_inode_fallback(trans, root, inode);
BUG_ON(ret);
}
goto out;
@@ -1791,7 +1793,7 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
ret = btrfs_ordered_update_i_size(inode, 0, ordered_extent);
if (!ret || !test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) {
- ret = btrfs_update_inode(trans, root, inode);
+ ret = btrfs_update_inode_fallback(trans, root, inode);
BUG_ON(ret);
}
ret = 0;
@@ -2199,6 +2201,9 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
if (ret)
goto out;
}
+ /* release the path since we're done with it */
+ btrfs_release_path(path);
+
root->orphan_cleanup_state = ORPHAN_CLEANUP_DONE;
if (root->orphan_block_rsv)
@@ -2426,7 +2431,7 @@ static void fill_inode_item(struct btrfs_trans_handle *trans,
/*
* copy everything in the in-memory inode into the btree.
*/
-noinline int btrfs_update_inode(struct btrfs_trans_handle *trans,
+static noinline int btrfs_update_inode_item(struct btrfs_trans_handle *trans,
struct btrfs_root *root, struct inode *inode)
{
struct btrfs_inode_item *inode_item;
@@ -2434,21 +2439,6 @@ noinline int btrfs_update_inode(struct btrfs_trans_handle *trans,
struct extent_buffer *leaf;
int ret;
- /*
- * If the inode is a free space inode, we can deadlock during commit
- * if we put it into the delayed code.
- *
- * The data relocation inode should also be directly updated
- * without delay
- */
- if (!btrfs_is_free_space_inode(root, inode)
- && root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID) {
- ret = btrfs_delayed_update_inode(trans, root, inode);
- if (!ret)
- btrfs_set_inode_last_trans(trans, inode);
- return ret;
- }
-
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
@@ -2477,6 +2467,43 @@ failed:
}
/*
+ * copy everything in the in-memory inode into the btree.
+ */
+noinline int btrfs_update_inode(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, struct inode *inode)
+{
+ int ret;
+
+ /*
+ * If the inode is a free space inode, we can deadlock during commit
+ * if we put it into the delayed code.
+ *
+ * The data relocation inode should also be directly updated
+ * without delay
+ */
+ if (!btrfs_is_free_space_inode(root, inode)
+ && root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID) {
+ ret = btrfs_delayed_update_inode(trans, root, inode);
+ if (!ret)
+ btrfs_set_inode_last_trans(trans, inode);
+ return ret;
+ }
+
+ return btrfs_update_inode_item(trans, root, inode);
+}
+
+static noinline int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, struct inode *inode)
+{
+ int ret;
+
+ ret = btrfs_update_inode(trans, root, inode);
+ if (ret == -ENOSPC)
+ return btrfs_update_inode_item(trans, root, inode);
+ return ret;
+}
+
+/*
* unlink helper that gets used here in inode.c and in the tree logging
* recovery code. It remove a link in a directory with a given name, and
* also drops the back refs in the inode to the directory
@@ -3463,7 +3490,7 @@ void btrfs_evict_inode(struct inode *inode)
* doing the truncate.
*/
while (1) {
- ret = btrfs_block_rsv_refill(root, rsv, min_size);
+ ret = btrfs_block_rsv_refill_noflush(root, rsv, min_size);
/*
* Try and steal from the global reserve since we will
@@ -5632,7 +5659,7 @@ again:
if (test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags)) {
ret = btrfs_ordered_update_i_size(inode, 0, ordered);
if (!ret)
- err = btrfs_update_inode(trans, root, inode);
+ err = btrfs_update_inode_fallback(trans, root, inode);
goto out;
}
@@ -5670,7 +5697,7 @@ again:
add_pending_csums(trans, inode, ordered->file_offset, &ordered->list);
ret = btrfs_ordered_update_i_size(inode, 0, ordered);
if (!ret || !test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags))
- btrfs_update_inode(trans, root, inode);
+ btrfs_update_inode_fallback(trans, root, inode);
ret = 0;
out_unlock:
unlock_extent_cached(&BTRFS_I(inode)->io_tree, ordered->file_offset,
@@ -6529,14 +6556,16 @@ end_trans:
ret = btrfs_orphan_del(NULL, inode);
}
- trans->block_rsv = &root->fs_info->trans_block_rsv;
- ret = btrfs_update_inode(trans, root, inode);
- if (ret && !err)
- err = ret;
+ if (trans) {
+ trans->block_rsv = &root->fs_info->trans_block_rsv;
+ ret = btrfs_update_inode(trans, root, inode);
+ if (ret && !err)
+ err = ret;
- nr = trans->blocks_used;
- ret = btrfs_end_transaction_throttle(trans, root);
- btrfs_btree_balance_dirty(root, nr);
+ nr = trans->blocks_used;
+ ret = btrfs_end_transaction_throttle(trans, root);
+ btrfs_btree_balance_dirty(root, nr);
+ }
out:
btrfs_free_block_rsv(root, rsv);
@@ -6605,6 +6634,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
ei->orphan_meta_reserved = 0;
ei->dummy_inode = 0;
ei->in_defrag = 0;
+ ei->delalloc_meta_reserved = 0;
ei->force_compress = BTRFS_COMPRESS_NONE;
ei->delayed_node = NULL;
@@ -6764,11 +6794,13 @@ static int btrfs_getattr(struct vfsmount *mnt,
struct dentry *dentry, struct kstat *stat)
{
struct inode *inode = dentry->d_inode;
+ u32 blocksize = inode->i_sb->s_blocksize;
+
generic_fillattr(inode, stat);
stat->dev = BTRFS_I(inode)->root->anon_dev;
stat->blksize = PAGE_CACHE_SIZE;
- stat->blocks = (inode_get_bytes(inode) +
- BTRFS_I(inode)->delalloc_bytes) >> 9;
+ stat->blocks = (ALIGN(inode_get_bytes(inode), blocksize) +
+ ALIGN(BTRFS_I(inode)->delalloc_bytes, blocksize)) >> 9;
return 0;
}
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 4a34c472f12..72d461656f6 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -1216,12 +1216,12 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
*devstr = '\0';
devstr = vol_args->name;
devid = simple_strtoull(devstr, &end, 10);
- printk(KERN_INFO "resizing devid %llu\n",
+ printk(KERN_INFO "btrfs: resizing devid %llu\n",
(unsigned long long)devid);
}
device = btrfs_find_device(root, devid, NULL, NULL);
if (!device) {
- printk(KERN_INFO "resizer unable to find device %llu\n",
+ printk(KERN_INFO "btrfs: resizer unable to find device %llu\n",
(unsigned long long)devid);
ret = -EINVAL;
goto out_unlock;
@@ -1267,7 +1267,7 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
do_div(new_size, root->sectorsize);
new_size *= root->sectorsize;
- printk(KERN_INFO "new size for %s is %llu\n",
+ printk(KERN_INFO "btrfs: new size for %s is %llu\n",
device->name, (unsigned long long)new_size);
if (new_size > old_size) {
@@ -1278,7 +1278,7 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
}
ret = btrfs_grow_device(trans, device, new_size);
btrfs_commit_transaction(trans, root);
- } else {
+ } else if (new_size < old_size) {
ret = btrfs_shrink_device(device, new_size);
}
@@ -2930,11 +2930,13 @@ static long btrfs_ioctl_ino_to_path(struct btrfs_root *root, void __user *arg)
goto out;
for (i = 0; i < ipath->fspath->elem_cnt; ++i) {
- rel_ptr = ipath->fspath->val[i] - (u64)ipath->fspath->val;
+ rel_ptr = ipath->fspath->val[i] -
+ (u64)(unsigned long)ipath->fspath->val;
ipath->fspath->val[i] = rel_ptr;
}
- ret = copy_to_user((void *)ipa->fspath, (void *)ipath->fspath, size);
+ ret = copy_to_user((void *)(unsigned long)ipa->fspath,
+ (void *)(unsigned long)ipath->fspath, size);
if (ret) {
ret = -EFAULT;
goto out;
@@ -3017,7 +3019,8 @@ static long btrfs_ioctl_logical_to_ino(struct btrfs_root *root,
if (ret < 0)
goto out;
- ret = copy_to_user((void *)loi->inodes, (void *)inodes, size);
+ ret = copy_to_user((void *)(unsigned long)loi->inodes,
+ (void *)(unsigned long)inodes, size);
if (ret)
ret = -EFAULT;
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 24d654ce7a0..dff29d5e151 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -1174,6 +1174,8 @@ static int clone_backref_node(struct btrfs_trans_handle *trans,
list_add_tail(&new_edge->list[UPPER],
&new_node->lower);
}
+ } else {
+ list_add_tail(&new_node->lower, &cache->leaves);
}
rb_node = tree_insert(&cache->rb_root, new_node->bytenr,
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index ed11d3866af..c27bcb67f33 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -256,6 +256,11 @@ static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root, void *ctx)
btrfs_release_path(swarn->path);
ipath = init_ipath(4096, local_root, swarn->path);
+ if (IS_ERR(ipath)) {
+ ret = PTR_ERR(ipath);
+ ipath = NULL;
+ goto err;
+ }
ret = paths_from_inode(inum, ipath);
if (ret < 0)
@@ -272,7 +277,7 @@ static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root, void *ctx)
swarn->logical, swarn->dev->name,
(unsigned long long)swarn->sector, root, inum, offset,
min(isize - offset, (u64)PAGE_SIZE), nlink,
- (char *)ipath->fspath->val[i]);
+ (char *)(unsigned long)ipath->fspath->val[i]);
free_ipath(ipath);
return 0;
@@ -944,50 +949,18 @@ static int scrub_checksum_super(struct scrub_bio *sbio, void *buffer)
static int scrub_submit(struct scrub_dev *sdev)
{
struct scrub_bio *sbio;
- struct bio *bio;
- int i;
if (sdev->curr == -1)
return 0;
sbio = sdev->bios[sdev->curr];
-
- bio = bio_alloc(GFP_NOFS, sbio->count);
- if (!bio)
- goto nomem;
-
- bio->bi_private = sbio;
- bio->bi_end_io = scrub_bio_end_io;
- bio->bi_bdev = sdev->dev->bdev;
- bio->bi_sector = sbio->physical >> 9;
-
- for (i = 0; i < sbio->count; ++i) {
- struct page *page;
- int ret;
-
- page = alloc_page(GFP_NOFS);
- if (!page)
- goto nomem;
-
- ret = bio_add_page(bio, page, PAGE_SIZE, 0);
- if (!ret) {
- __free_page(page);
- goto nomem;
- }
- }
-
sbio->err = 0;
sdev->curr = -1;
atomic_inc(&sdev->in_flight);
- submit_bio(READ, bio);
+ submit_bio(READ, sbio->bio);
return 0;
-
-nomem:
- scrub_free_bio(bio);
-
- return -ENOMEM;
}
static int scrub_page(struct scrub_dev *sdev, u64 logical, u64 len,
@@ -995,6 +968,8 @@ static int scrub_page(struct scrub_dev *sdev, u64 logical, u64 len,
u8 *csum, int force)
{
struct scrub_bio *sbio;
+ struct page *page;
+ int ret;
again:
/*
@@ -1015,12 +990,22 @@ again:
}
sbio = sdev->bios[sdev->curr];
if (sbio->count == 0) {
+ struct bio *bio;
+
sbio->physical = physical;
sbio->logical = logical;
+ bio = bio_alloc(GFP_NOFS, SCRUB_PAGES_PER_BIO);
+ if (!bio)
+ return -ENOMEM;
+
+ bio->bi_private = sbio;
+ bio->bi_end_io = scrub_bio_end_io;
+ bio->bi_bdev = sdev->dev->bdev;
+ bio->bi_sector = sbio->physical >> 9;
+ sbio->err = 0;
+ sbio->bio = bio;
} else if (sbio->physical + sbio->count * PAGE_SIZE != physical ||
sbio->logical + sbio->count * PAGE_SIZE != logical) {
- int ret;
-
ret = scrub_submit(sdev);
if (ret)
return ret;
@@ -1030,6 +1015,20 @@ again:
sbio->spag[sbio->count].generation = gen;
sbio->spag[sbio->count].have_csum = 0;
sbio->spag[sbio->count].mirror_num = mirror_num;
+
+ page = alloc_page(GFP_NOFS);
+ if (!page)
+ return -ENOMEM;
+
+ ret = bio_add_page(sbio->bio, page, PAGE_SIZE, 0);
+ if (!ret) {
+ __free_page(page);
+ ret = scrub_submit(sdev);
+ if (ret)
+ return ret;
+ goto again;
+ }
+
if (csum) {
sbio->spag[sbio->count].have_csum = 1;
memcpy(sbio->spag[sbio->count].csum, csum, sdev->csum_size);
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 57080dffdfc..e28ad4baf48 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -197,7 +197,7 @@ static match_table_t tokens = {
{Opt_subvolrootid, "subvolrootid=%d"},
{Opt_defrag, "autodefrag"},
{Opt_inode_cache, "inode_cache"},
- {Opt_no_space_cache, "no_space_cache"},
+ {Opt_no_space_cache, "nospace_cache"},
{Opt_recovery, "recovery"},
{Opt_err, NULL},
};
@@ -448,6 +448,7 @@ static int btrfs_parse_early_options(const char *options, fmode_t flags,
token = match_token(p, tokens, args);
switch (token) {
case Opt_subvol:
+ kfree(*subvol_name);
*subvol_name = match_strdup(&args[0]);
break;
case Opt_subvolid:
@@ -710,7 +711,7 @@ static int btrfs_show_options(struct seq_file *seq, struct vfsmount *vfs)
if (btrfs_test_opt(root, SPACE_CACHE))
seq_puts(seq, ",space_cache");
else
- seq_puts(seq, ",no_space_cache");
+ seq_puts(seq, ",nospace_cache");
if (btrfs_test_opt(root, CLEAR_CACHE))
seq_puts(seq, ",clear_cache");
if (btrfs_test_opt(root, USER_SUBVOL_RM_ALLOWED))
@@ -824,13 +825,9 @@ static char *setup_root_args(char *args)
static struct dentry *mount_subvol(const char *subvol_name, int flags,
const char *device_name, char *data)
{
- struct super_block *s;
struct dentry *root;
struct vfsmount *mnt;
- struct mnt_namespace *ns_private;
char *newargs;
- struct path path;
- int error;
newargs = setup_root_args(data);
if (!newargs)
@@ -841,39 +838,17 @@ static struct dentry *mount_subvol(const char *subvol_name, int flags,
if (IS_ERR(mnt))
return ERR_CAST(mnt);
- ns_private = create_mnt_ns(mnt);
- if (IS_ERR(ns_private)) {
- mntput(mnt);
- return ERR_CAST(ns_private);
- }
-
- /*
- * This will trigger the automount of the subvol so we can just
- * drop the mnt we have here and return the dentry that we
- * found.
- */
- error = vfs_path_lookup(mnt->mnt_root, mnt, subvol_name,
- LOOKUP_FOLLOW, &path);
- put_mnt_ns(ns_private);
- if (error)
- return ERR_PTR(error);
+ root = mount_subtree(mnt, subvol_name);
- if (!is_subvolume_inode(path.dentry->d_inode)) {
- path_put(&path);
- mntput(mnt);
- error = -EINVAL;
+ if (!IS_ERR(root) && !is_subvolume_inode(root->d_inode)) {
+ struct super_block *s = root->d_sb;
+ dput(root);
+ root = ERR_PTR(-EINVAL);
+ deactivate_locked_super(s);
printk(KERN_ERR "btrfs: '%s' is not a valid subvolume\n",
subvol_name);
- return ERR_PTR(-EINVAL);
}
- /* Get a ref to the sb and the dentry we found and return it */
- s = path.mnt->mnt_sb;
- atomic_inc(&s->s_active);
- root = dget(path.dentry);
- path_put(&path);
- down_write(&s->s_umount);
-
return root;
}
@@ -890,7 +865,6 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
struct super_block *s;
struct dentry *root;
struct btrfs_fs_devices *fs_devices = NULL;
- struct btrfs_root *tree_root = NULL;
struct btrfs_fs_info *fs_info = NULL;
fmode_t mode = FMODE_READ;
char *subvol_name = NULL;
@@ -904,8 +878,10 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
error = btrfs_parse_early_options(data, mode, fs_type,
&subvol_name, &subvol_objectid,
&subvol_rootid, &fs_devices);
- if (error)
+ if (error) {
+ kfree(subvol_name);
return ERR_PTR(error);
+ }
if (subvol_name) {
root = mount_subvol(subvol_name, flags, device_name, data);
@@ -917,15 +893,6 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
if (error)
return ERR_PTR(error);
- error = btrfs_open_devices(fs_devices, mode, fs_type);
- if (error)
- return ERR_PTR(error);
-
- if (!(flags & MS_RDONLY) && fs_devices->rw_devices == 0) {
- error = -EACCES;
- goto error_close_devices;
- }
-
/*
* Setup a dummy root and fs_info for test/set super. This is because
* we don't actually fill this stuff out until open_ctree, but we need
@@ -933,24 +900,36 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
* then open_ctree will properly initialize everything later.
*/
fs_info = kzalloc(sizeof(struct btrfs_fs_info), GFP_NOFS);
- tree_root = kzalloc(sizeof(struct btrfs_root), GFP_NOFS);
- if (!fs_info || !tree_root) {
+ if (!fs_info)
+ return ERR_PTR(-ENOMEM);
+
+ fs_info->tree_root = kzalloc(sizeof(struct btrfs_root), GFP_NOFS);
+ if (!fs_info->tree_root) {
error = -ENOMEM;
- goto error_close_devices;
+ goto error_fs_info;
}
- fs_info->tree_root = tree_root;
+ fs_info->tree_root->fs_info = fs_info;
fs_info->fs_devices = fs_devices;
- tree_root->fs_info = fs_info;
fs_info->super_copy = kzalloc(BTRFS_SUPER_INFO_SIZE, GFP_NOFS);
fs_info->super_for_commit = kzalloc(BTRFS_SUPER_INFO_SIZE, GFP_NOFS);
if (!fs_info->super_copy || !fs_info->super_for_commit) {
error = -ENOMEM;
+ goto error_fs_info;
+ }
+
+ error = btrfs_open_devices(fs_devices, mode, fs_type);
+ if (error)
+ goto error_fs_info;
+
+ if (!(flags & MS_RDONLY) && fs_devices->rw_devices == 0) {
+ error = -EACCES;
goto error_close_devices;
}
bdev = fs_devices->latest_bdev;
- s = sget(fs_type, btrfs_test_super, btrfs_set_super, tree_root);
+ s = sget(fs_type, btrfs_test_super, btrfs_set_super,
+ fs_info->tree_root);
if (IS_ERR(s)) {
error = PTR_ERR(s);
goto error_close_devices;
@@ -959,12 +938,12 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
if (s->s_root) {
if ((flags ^ s->s_flags) & MS_RDONLY) {
deactivate_locked_super(s);
- return ERR_PTR(-EBUSY);
+ error = -EBUSY;
+ goto error_close_devices;
}
btrfs_close_devices(fs_devices);
free_fs_info(fs_info);
- kfree(tree_root);
} else {
char b[BDEVNAME_SIZE];
@@ -991,8 +970,8 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
error_close_devices:
btrfs_close_devices(fs_devices);
+error_fs_info:
free_fs_info(fs_info);
- kfree(tree_root);
return ERR_PTR(error);
}
@@ -1078,7 +1057,7 @@ static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes)
int i = 0, nr_devices;
int ret;
- nr_devices = fs_info->fs_devices->rw_devices;
+ nr_devices = fs_info->fs_devices->open_devices;
BUG_ON(!nr_devices);
devices_info = kmalloc(sizeof(*devices_info) * nr_devices,
@@ -1100,8 +1079,8 @@ static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes)
else
min_stripe_size = BTRFS_STRIPE_LEN;
- list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) {
- if (!device->in_fs_metadata)
+ list_for_each_entry(device, &fs_devices->devices, dev_list) {
+ if (!device->in_fs_metadata || !device->bdev)
continue;
avail_space = device->total_bytes - device->bytes_used;
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 960835eaf4d..81376d94cd3 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -785,6 +785,10 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans,
btrfs_save_ino_cache(root, trans);
+ /* see comments in should_cow_block() */
+ root->force_cow = 0;
+ smp_wmb();
+
if (root->commit_root != root->node) {
mutex_lock(&root->fs_commit_mutex);
switch_commit_root(root);
@@ -882,8 +886,8 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
btrfs_reloc_pre_snapshot(trans, pending, &to_reserve);
if (to_reserve > 0) {
- ret = btrfs_block_rsv_add(root, &pending->block_rsv,
- to_reserve);
+ ret = btrfs_block_rsv_add_noflush(root, &pending->block_rsv,
+ to_reserve);
if (ret) {
pending->error = ret;
goto fail;
@@ -947,6 +951,10 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
btrfs_tree_unlock(old);
free_extent_buffer(old);
+ /* see comments in should_cow_block() */
+ root->force_cow = 1;
+ smp_wmb();
+
btrfs_set_root_node(new_root_item, tmp);
/* record when the snapshot was created in key.offset */
key.offset = trans->transid;
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index f8e2943101a..0a8c8f8304b 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -999,7 +999,7 @@ static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans,
key.objectid = device->devid;
key.offset = start;
key.type = BTRFS_DEV_EXTENT_KEY;
-
+again:
ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
if (ret > 0) {
ret = btrfs_previous_item(root, path, key.objectid,
@@ -1012,6 +1012,9 @@ static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans,
struct btrfs_dev_extent);
BUG_ON(found_key.offset > start || found_key.offset +
btrfs_dev_extent_length(leaf, extent) < start);
+ key = found_key;
+ btrfs_release_path(path);
+ goto again;
} else if (ret == 0) {
leaf = path->nodes[0];
extent = btrfs_item_ptr(leaf, path->slots[0],
@@ -1608,7 +1611,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
if ((sb->s_flags & MS_RDONLY) && !root->fs_info->fs_devices->seeding)
return -EINVAL;
- bdev = blkdev_get_by_path(device_path, FMODE_EXCL,
+ bdev = blkdev_get_by_path(device_path, FMODE_WRITE | FMODE_EXCL,
root->fs_info->bdev_holder);
if (IS_ERR(bdev))
return PTR_ERR(bdev);
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index ab5b1c49f35..78f2d4d4f37 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -100,6 +100,12 @@ struct btrfs_device {
struct reada_zone *reada_curr_zone;
struct radix_tree_root reada_zones;
struct radix_tree_root reada_extents;
+
+ /* for sending down flush barriers */
+ struct bio *flush_bio;
+ struct completion flush_wait;
+ int nobarriers;
+
};
struct btrfs_fs_devices {
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index 2abd0dfad7f..bca3948e9db 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -1143,7 +1143,7 @@ static void ceph_d_prune(struct dentry *dentry)
{
struct ceph_dentry_info *di;
- dout("d_release %p\n", dentry);
+ dout("ceph_d_prune %p\n", dentry);
/* do we have a valid parent? */
if (!dentry->d_parent || IS_ROOT(dentry))
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index e392bfce84a..116f36502f1 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -1328,12 +1328,13 @@ int ceph_inode_set_size(struct inode *inode, loff_t size)
*/
void ceph_queue_writeback(struct inode *inode)
{
+ ihold(inode);
if (queue_work(ceph_inode_to_client(inode)->wb_wq,
&ceph_inode(inode)->i_wb_work)) {
dout("ceph_queue_writeback %p\n", inode);
- ihold(inode);
} else {
dout("ceph_queue_writeback %p failed\n", inode);
+ iput(inode);
}
}
@@ -1353,12 +1354,13 @@ static void ceph_writeback_work(struct work_struct *work)
*/
void ceph_queue_invalidate(struct inode *inode)
{
+ ihold(inode);
if (queue_work(ceph_inode_to_client(inode)->pg_inv_wq,
&ceph_inode(inode)->i_pg_inv_work)) {
dout("ceph_queue_invalidate %p\n", inode);
- ihold(inode);
} else {
dout("ceph_queue_invalidate %p failed\n", inode);
+ iput(inode);
}
}
@@ -1434,13 +1436,14 @@ void ceph_queue_vmtruncate(struct inode *inode)
{
struct ceph_inode_info *ci = ceph_inode(inode);
+ ihold(inode);
if (queue_work(ceph_sb_to_client(inode->i_sb)->trunc_wq,
&ci->i_vmtruncate_work)) {
dout("ceph_queue_vmtruncate %p\n", inode);
- ihold(inode);
} else {
dout("ceph_queue_vmtruncate %p failed, pending=%d\n",
inode, ci->i_truncate_pending);
+ iput(inode);
}
}
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index a90846fac75..8dc73a594a9 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -638,10 +638,12 @@ static struct dentry *open_root_dentry(struct ceph_fs_client *fsc,
if (err == 0) {
dout("open_root_inode success\n");
if (ceph_ino(req->r_target_inode) == CEPH_INO_ROOT &&
- fsc->sb->s_root == NULL)
+ fsc->sb->s_root == NULL) {
root = d_alloc_root(req->r_target_inode);
- else
+ ceph_init_dentry(root);
+ } else {
root = d_obtain_alias(req->r_target_inode);
+ }
req->r_target_inode = NULL;
dout("open_root_inode success, root dentry is %p\n", root);
} else {
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index d6a972df033..8cd4b52d421 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -441,6 +441,8 @@ cifs_readv_from_socket(struct TCP_Server_Info *server, struct kvec *iov_orig,
smb_msg.msg_controllen = 0;
for (total_read = 0; to_read; total_read += length, to_read -= length) {
+ try_to_freeze();
+
if (server_unresponsive(server)) {
total_read = -EAGAIN;
break;
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index cf0b1539b32..4dd9283885e 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -702,6 +702,13 @@ cifs_find_lock_conflict(struct cifsInodeInfo *cinode, struct cifsLockInfo *lock,
lock->type, lock->netfid, conf_lock);
}
+/*
+ * Check if there is another lock that prevents us to set the lock (mandatory
+ * style). If such a lock exists, update the flock structure with its
+ * properties. Otherwise, set the flock type to F_UNLCK if we can cache brlocks
+ * or leave it the same if we can't. Returns 0 if we don't need to request to
+ * the server or 1 otherwise.
+ */
static int
cifs_lock_test(struct cifsInodeInfo *cinode, __u64 offset, __u64 length,
__u8 type, __u16 netfid, struct file_lock *flock)
@@ -739,6 +746,12 @@ cifs_lock_add(struct cifsInodeInfo *cinode, struct cifsLockInfo *lock)
mutex_unlock(&cinode->lock_mutex);
}
+/*
+ * Set the byte-range lock (mandatory style). Returns:
+ * 1) 0, if we set the lock and don't need to request to the server;
+ * 2) 1, if no locks prevent us but we need to request to the server;
+ * 3) -EACCESS, if there is a lock that prevents us and wait is false.
+ */
static int
cifs_lock_add_if(struct cifsInodeInfo *cinode, struct cifsLockInfo *lock,
bool wait)
@@ -778,6 +791,13 @@ try_again:
return rc;
}
+/*
+ * Check if there is another lock that prevents us to set the lock (posix
+ * style). If such a lock exists, update the flock structure with its
+ * properties. Otherwise, set the flock type to F_UNLCK if we can cache brlocks
+ * or leave it the same if we can't. Returns 0 if we don't need to request to
+ * the server or 1 otherwise.
+ */
static int
cifs_posix_lock_test(struct file *file, struct file_lock *flock)
{
@@ -800,6 +820,12 @@ cifs_posix_lock_test(struct file *file, struct file_lock *flock)
return rc;
}
+/*
+ * Set the byte-range lock (posix style). Returns:
+ * 1) 0, if we set the lock and don't need to request to the server;
+ * 2) 1, if we need to request to the server;
+ * 3) <0, if the error occurs while setting the lock.
+ */
static int
cifs_posix_lock_set(struct file *file, struct file_lock *flock)
{
diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c
index 5de03ec2014..a090bbe6ee2 100644
--- a/fs/cifs/readdir.c
+++ b/fs/cifs/readdir.c
@@ -554,7 +554,10 @@ static int find_cifs_entry(const int xid, struct cifs_tcon *pTcon,
rc);
return rc;
}
- cifs_save_resume_key(cifsFile->srch_inf.last_entry, cifsFile);
+ /* FindFirst/Next set last_entry to NULL on malformed reply */
+ if (cifsFile->srch_inf.last_entry)
+ cifs_save_resume_key(cifsFile->srch_inf.last_entry,
+ cifsFile);
}
while ((index_to_find >= cifsFile->srch_inf.index_of_last_entry) &&
@@ -562,7 +565,10 @@ static int find_cifs_entry(const int xid, struct cifs_tcon *pTcon,
cFYI(1, "calling findnext2");
rc = CIFSFindNext(xid, pTcon, cifsFile->netfid,
&cifsFile->srch_inf);
- cifs_save_resume_key(cifsFile->srch_inf.last_entry, cifsFile);
+ /* FindFirst/Next set last_entry to NULL on malformed reply */
+ if (cifsFile->srch_inf.last_entry)
+ cifs_save_resume_key(cifsFile->srch_inf.last_entry,
+ cifsFile);
if (rc)
return -ENOENT;
}
diff --git a/fs/cifs/smbencrypt.c b/fs/cifs/smbencrypt.c
index 7cacba12b8f..80d85088193 100644
--- a/fs/cifs/smbencrypt.c
+++ b/fs/cifs/smbencrypt.c
@@ -209,7 +209,7 @@ E_md4hash(const unsigned char *passwd, unsigned char *p16,
{
int rc;
int len;
- __u16 wpwd[129];
+ __le16 wpwd[129];
/* Password cannot be longer than 128 characters */
if (passwd) /* Password must be converted to NT unicode */
@@ -219,8 +219,8 @@ E_md4hash(const unsigned char *passwd, unsigned char *p16,
*wpwd = 0; /* Ensure string is null terminated */
}
- rc = mdfour(p16, (unsigned char *) wpwd, len * sizeof(__u16));
- memset(wpwd, 0, 129 * sizeof(__u16));
+ rc = mdfour(p16, (unsigned char *) wpwd, len * sizeof(__le16));
+ memset(wpwd, 0, 129 * sizeof(__le16));
return rc;
}
diff --git a/fs/dcache.c b/fs/dcache.c
index a901c6901bc..89509b5a090 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -36,6 +36,7 @@
#include <linux/bit_spinlock.h>
#include <linux/rculist_bl.h>
#include <linux/prefetch.h>
+#include <linux/ratelimit.h>
#include "internal.h"
/*
@@ -2383,8 +2384,16 @@ struct dentry *d_materialise_unique(struct dentry *dentry, struct inode *inode)
actual = __d_unalias(inode, dentry, alias);
}
write_sequnlock(&rename_lock);
- if (IS_ERR(actual))
+ if (IS_ERR(actual)) {
+ if (PTR_ERR(actual) == -ELOOP)
+ pr_warn_ratelimited(
+ "VFS: Lookup of '%s' in %s %s"
+ " would have caused loop\n",
+ dentry->d_name.name,
+ inode->i_sb->s_type->name,
+ inode->i_sb->s_id);
dput(alias);
+ }
goto out_nolock;
}
}
@@ -2430,16 +2439,14 @@ static int prepend_name(char **buffer, int *buflen, struct qstr *name)
/**
* prepend_path - Prepend path string to a buffer
* @path: the dentry/vfsmount to report
- * @root: root vfsmnt/dentry (may be modified by this function)
+ * @root: root vfsmnt/dentry
* @buffer: pointer to the end of the buffer
* @buflen: pointer to buffer length
*
* Caller holds the rename_lock.
- *
- * If path is not reachable from the supplied root, then the value of
- * root is changed (without modifying refcounts).
*/
-static int prepend_path(const struct path *path, struct path *root,
+static int prepend_path(const struct path *path,
+ const struct path *root,
char **buffer, int *buflen)
{
struct dentry *dentry = path->dentry;
@@ -2474,10 +2481,10 @@ static int prepend_path(const struct path *path, struct path *root,
dentry = parent;
}
-out:
if (!error && !slash)
error = prepend(buffer, buflen, "/", 1);
+out:
br_read_unlock(vfsmount_lock);
return error;
@@ -2491,15 +2498,17 @@ global_root:
WARN(1, "Root dentry has weird name <%.*s>\n",
(int) dentry->d_name.len, dentry->d_name.name);
}
- root->mnt = vfsmnt;
- root->dentry = dentry;
+ if (!slash)
+ error = prepend(buffer, buflen, "/", 1);
+ if (!error)
+ error = vfsmnt->mnt_ns ? 1 : 2;
goto out;
}
/**
* __d_path - return the path of a dentry
* @path: the dentry/vfsmount to report
- * @root: root vfsmnt/dentry (may be modified by this function)
+ * @root: root vfsmnt/dentry
* @buf: buffer to return value in
* @buflen: buffer length
*
@@ -2510,10 +2519,10 @@ global_root:
*
* "buflen" should be positive.
*
- * If path is not reachable from the supplied root, then the value of
- * root is changed (without modifying refcounts).
+ * If the path is not reachable from the supplied root, return %NULL.
*/
-char *__d_path(const struct path *path, struct path *root,
+char *__d_path(const struct path *path,
+ const struct path *root,
char *buf, int buflen)
{
char *res = buf + buflen;
@@ -2524,7 +2533,28 @@ char *__d_path(const struct path *path, struct path *root,
error = prepend_path(path, root, &res, &buflen);
write_sequnlock(&rename_lock);
- if (error)
+ if (error < 0)
+ return ERR_PTR(error);
+ if (error > 0)
+ return NULL;
+ return res;
+}
+
+char *d_absolute_path(const struct path *path,
+ char *buf, int buflen)
+{
+ struct path root = {};
+ char *res = buf + buflen;
+ int error;
+
+ prepend(&res, &buflen, "\0", 1);
+ write_seqlock(&rename_lock);
+ error = prepend_path(path, &root, &res, &buflen);
+ write_sequnlock(&rename_lock);
+
+ if (error > 1)
+ error = -EINVAL;
+ if (error < 0)
return ERR_PTR(error);
return res;
}
@@ -2532,8 +2562,9 @@ char *__d_path(const struct path *path, struct path *root,
/*
* same as __d_path but appends "(deleted)" for unlinked files.
*/
-static int path_with_deleted(const struct path *path, struct path *root,
- char **buf, int *buflen)
+static int path_with_deleted(const struct path *path,
+ const struct path *root,
+ char **buf, int *buflen)
{
prepend(buf, buflen, "\0", 1);
if (d_unlinked(path->dentry)) {
@@ -2570,7 +2601,6 @@ char *d_path(const struct path *path, char *buf, int buflen)
{
char *res = buf + buflen;
struct path root;
- struct path tmp;
int error;
/*
@@ -2585,9 +2615,8 @@ char *d_path(const struct path *path, char *buf, int buflen)
get_fs_root(current->fs, &root);
write_seqlock(&rename_lock);
- tmp = root;
- error = path_with_deleted(path, &tmp, &res, &buflen);
- if (error)
+ error = path_with_deleted(path, &root, &res, &buflen);
+ if (error < 0)
res = ERR_PTR(error);
write_sequnlock(&rename_lock);
path_put(&root);
@@ -2608,7 +2637,6 @@ char *d_path_with_unreachable(const struct path *path, char *buf, int buflen)
{
char *res = buf + buflen;
struct path root;
- struct path tmp;
int error;
if (path->dentry->d_op && path->dentry->d_op->d_dname)
@@ -2616,9 +2644,8 @@ char *d_path_with_unreachable(const struct path *path, char *buf, int buflen)
get_fs_root(current->fs, &root);
write_seqlock(&rename_lock);
- tmp = root;
- error = path_with_deleted(path, &tmp, &res, &buflen);
- if (!error && !path_equal(&tmp, &root))
+ error = path_with_deleted(path, &root, &res, &buflen);
+ if (error > 0)
error = prepend_unreachable(&res, &buflen);
write_sequnlock(&rename_lock);
path_put(&root);
@@ -2749,19 +2776,18 @@ SYSCALL_DEFINE2(getcwd, char __user *, buf, unsigned long, size)
write_seqlock(&rename_lock);
if (!d_unlinked(pwd.dentry)) {
unsigned long len;
- struct path tmp = root;
char *cwd = page + PAGE_SIZE;
int buflen = PAGE_SIZE;
prepend(&cwd, &buflen, "\0", 1);
- error = prepend_path(&pwd, &tmp, &cwd, &buflen);
+ error = prepend_path(&pwd, &root, &cwd, &buflen);
write_sequnlock(&rename_lock);
- if (error)
+ if (error < 0)
goto out;
/* Unreachable from current root */
- if (!path_equal(&tmp, &root)) {
+ if (error > 0) {
error = prepend_unreachable(&cwd, &buflen);
if (error)
goto out;
diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c
index 58609bde3b9..2a834255c75 100644
--- a/fs/ecryptfs/crypto.c
+++ b/fs/ecryptfs/crypto.c
@@ -967,7 +967,7 @@ static void ecryptfs_set_default_crypt_stat_vals(
/**
* ecryptfs_new_file_context
- * @ecryptfs_dentry: The eCryptfs dentry
+ * @ecryptfs_inode: The eCryptfs inode
*
* If the crypto context for the file has not yet been established,
* this is where we do that. Establishing a new crypto context
@@ -984,13 +984,13 @@ static void ecryptfs_set_default_crypt_stat_vals(
*
* Returns zero on success; non-zero otherwise
*/
-int ecryptfs_new_file_context(struct dentry *ecryptfs_dentry)
+int ecryptfs_new_file_context(struct inode *ecryptfs_inode)
{
struct ecryptfs_crypt_stat *crypt_stat =
- &ecryptfs_inode_to_private(ecryptfs_dentry->d_inode)->crypt_stat;
+ &ecryptfs_inode_to_private(ecryptfs_inode)->crypt_stat;
struct ecryptfs_mount_crypt_stat *mount_crypt_stat =
&ecryptfs_superblock_to_private(
- ecryptfs_dentry->d_sb)->mount_crypt_stat;
+ ecryptfs_inode->i_sb)->mount_crypt_stat;
int cipher_name_len;
int rc = 0;
@@ -1299,12 +1299,12 @@ static int ecryptfs_write_headers_virt(char *page_virt, size_t max,
}
static int
-ecryptfs_write_metadata_to_contents(struct dentry *ecryptfs_dentry,
+ecryptfs_write_metadata_to_contents(struct inode *ecryptfs_inode,
char *virt, size_t virt_len)
{
int rc;
- rc = ecryptfs_write_lower(ecryptfs_dentry->d_inode, virt,
+ rc = ecryptfs_write_lower(ecryptfs_inode, virt,
0, virt_len);
if (rc < 0)
printk(KERN_ERR "%s: Error attempting to write header "
@@ -1338,7 +1338,8 @@ static unsigned long ecryptfs_get_zeroed_pages(gfp_t gfp_mask,
/**
* ecryptfs_write_metadata
- * @ecryptfs_dentry: The eCryptfs dentry
+ * @ecryptfs_dentry: The eCryptfs dentry, which should be negative
+ * @ecryptfs_inode: The newly created eCryptfs inode
*
* Write the file headers out. This will likely involve a userspace
* callout, in which the session key is encrypted with one or more
@@ -1348,10 +1349,11 @@ static unsigned long ecryptfs_get_zeroed_pages(gfp_t gfp_mask,
*
* Returns zero on success; non-zero on error
*/
-int ecryptfs_write_metadata(struct dentry *ecryptfs_dentry)
+int ecryptfs_write_metadata(struct dentry *ecryptfs_dentry,
+ struct inode *ecryptfs_inode)
{
struct ecryptfs_crypt_stat *crypt_stat =
- &ecryptfs_inode_to_private(ecryptfs_dentry->d_inode)->crypt_stat;
+ &ecryptfs_inode_to_private(ecryptfs_inode)->crypt_stat;
unsigned int order;
char *virt;
size_t virt_len;
@@ -1391,7 +1393,7 @@ int ecryptfs_write_metadata(struct dentry *ecryptfs_dentry)
rc = ecryptfs_write_metadata_to_xattr(ecryptfs_dentry, virt,
size);
else
- rc = ecryptfs_write_metadata_to_contents(ecryptfs_dentry, virt,
+ rc = ecryptfs_write_metadata_to_contents(ecryptfs_inode, virt,
virt_len);
if (rc) {
printk(KERN_ERR "%s: Error writing metadata out to lower file; "
@@ -1943,7 +1945,7 @@ static unsigned char *portable_filename_chars = ("-.0123456789ABCD"
/* We could either offset on every reverse map or just pad some 0x00's
* at the front here */
-static const unsigned char filename_rev_map[] = {
+static const unsigned char filename_rev_map[256] = {
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 7 */
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 15 */
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 23 */
@@ -1959,7 +1961,7 @@ static const unsigned char filename_rev_map[] = {
0x00, 0x26, 0x27, 0x28, 0x29, 0x2A, 0x2B, 0x2C, /* 103 */
0x2D, 0x2E, 0x2F, 0x30, 0x31, 0x32, 0x33, 0x34, /* 111 */
0x35, 0x36, 0x37, 0x38, 0x39, 0x3A, 0x3B, 0x3C, /* 119 */
- 0x3D, 0x3E, 0x3F
+ 0x3D, 0x3E, 0x3F /* 123 - 255 initialized to 0x00 */
};
/**
diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h
index 54481a3b2c7..a9f29b12fbf 100644
--- a/fs/ecryptfs/ecryptfs_kernel.h
+++ b/fs/ecryptfs/ecryptfs_kernel.h
@@ -584,9 +584,10 @@ int ecryptfs_init_crypt_ctx(struct ecryptfs_crypt_stat *crypt_stat);
int ecryptfs_write_inode_size_to_metadata(struct inode *ecryptfs_inode);
int ecryptfs_encrypt_page(struct page *page);
int ecryptfs_decrypt_page(struct page *page);
-int ecryptfs_write_metadata(struct dentry *ecryptfs_dentry);
+int ecryptfs_write_metadata(struct dentry *ecryptfs_dentry,
+ struct inode *ecryptfs_inode);
int ecryptfs_read_metadata(struct dentry *ecryptfs_dentry);
-int ecryptfs_new_file_context(struct dentry *ecryptfs_dentry);
+int ecryptfs_new_file_context(struct inode *ecryptfs_inode);
void ecryptfs_write_crypt_stat_flags(char *page_virt,
struct ecryptfs_crypt_stat *crypt_stat,
size_t *written);
diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c
index c6ac98cf9ba..d3f95f941c4 100644
--- a/fs/ecryptfs/file.c
+++ b/fs/ecryptfs/file.c
@@ -139,6 +139,27 @@ out:
return rc;
}
+static void ecryptfs_vma_close(struct vm_area_struct *vma)
+{
+ filemap_write_and_wait(vma->vm_file->f_mapping);
+}
+
+static const struct vm_operations_struct ecryptfs_file_vm_ops = {
+ .close = ecryptfs_vma_close,
+ .fault = filemap_fault,
+};
+
+static int ecryptfs_file_mmap(struct file *file, struct vm_area_struct *vma)
+{
+ int rc;
+
+ rc = generic_file_mmap(file, vma);
+ if (!rc)
+ vma->vm_ops = &ecryptfs_file_vm_ops;
+
+ return rc;
+}
+
struct kmem_cache *ecryptfs_file_info_cache;
/**
@@ -349,7 +370,7 @@ const struct file_operations ecryptfs_main_fops = {
#ifdef CONFIG_COMPAT
.compat_ioctl = ecryptfs_compat_ioctl,
#endif
- .mmap = generic_file_mmap,
+ .mmap = ecryptfs_file_mmap,
.open = ecryptfs_open,
.flush = ecryptfs_flush,
.release = ecryptfs_release,
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index a36d327f152..32f90a3ae63 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -172,22 +172,23 @@ ecryptfs_create_underlying_file(struct inode *lower_dir_inode,
* it. It will also update the eCryptfs directory inode to mimic the
* stat of the lower directory inode.
*
- * Returns zero on success; non-zero on error condition
+ * Returns the new eCryptfs inode on success; an ERR_PTR on error condition
*/
-static int
+static struct inode *
ecryptfs_do_create(struct inode *directory_inode,
struct dentry *ecryptfs_dentry, int mode)
{
int rc;
struct dentry *lower_dentry;
struct dentry *lower_dir_dentry;
+ struct inode *inode;
lower_dentry = ecryptfs_dentry_to_lower(ecryptfs_dentry);
lower_dir_dentry = lock_parent(lower_dentry);
if (IS_ERR(lower_dir_dentry)) {
ecryptfs_printk(KERN_ERR, "Error locking directory of "
"dentry\n");
- rc = PTR_ERR(lower_dir_dentry);
+ inode = ERR_CAST(lower_dir_dentry);
goto out;
}
rc = ecryptfs_create_underlying_file(lower_dir_dentry->d_inode,
@@ -195,20 +196,19 @@ ecryptfs_do_create(struct inode *directory_inode,
if (rc) {
printk(KERN_ERR "%s: Failure to create dentry in lower fs; "
"rc = [%d]\n", __func__, rc);
+ inode = ERR_PTR(rc);
goto out_lock;
}
- rc = ecryptfs_interpose(lower_dentry, ecryptfs_dentry,
- directory_inode->i_sb);
- if (rc) {
- ecryptfs_printk(KERN_ERR, "Failure in ecryptfs_interpose\n");
+ inode = __ecryptfs_get_inode(lower_dentry->d_inode,
+ directory_inode->i_sb);
+ if (IS_ERR(inode))
goto out_lock;
- }
fsstack_copy_attr_times(directory_inode, lower_dir_dentry->d_inode);
fsstack_copy_inode_size(directory_inode, lower_dir_dentry->d_inode);
out_lock:
unlock_dir(lower_dir_dentry);
out:
- return rc;
+ return inode;
}
/**
@@ -219,26 +219,26 @@ out:
*
* Returns zero on success
*/
-static int ecryptfs_initialize_file(struct dentry *ecryptfs_dentry)
+static int ecryptfs_initialize_file(struct dentry *ecryptfs_dentry,
+ struct inode *ecryptfs_inode)
{
struct ecryptfs_crypt_stat *crypt_stat =
- &ecryptfs_inode_to_private(ecryptfs_dentry->d_inode)->crypt_stat;
+ &ecryptfs_inode_to_private(ecryptfs_inode)->crypt_stat;
int rc = 0;
- if (S_ISDIR(ecryptfs_dentry->d_inode->i_mode)) {
+ if (S_ISDIR(ecryptfs_inode->i_mode)) {
ecryptfs_printk(KERN_DEBUG, "This is a directory\n");
crypt_stat->flags &= ~(ECRYPTFS_ENCRYPTED);
goto out;
}
ecryptfs_printk(KERN_DEBUG, "Initializing crypto context\n");
- rc = ecryptfs_new_file_context(ecryptfs_dentry);
+ rc = ecryptfs_new_file_context(ecryptfs_inode);
if (rc) {
ecryptfs_printk(KERN_ERR, "Error creating new file "
"context; rc = [%d]\n", rc);
goto out;
}
- rc = ecryptfs_get_lower_file(ecryptfs_dentry,
- ecryptfs_dentry->d_inode);
+ rc = ecryptfs_get_lower_file(ecryptfs_dentry, ecryptfs_inode);
if (rc) {
printk(KERN_ERR "%s: Error attempting to initialize "
"the lower file for the dentry with name "
@@ -246,10 +246,10 @@ static int ecryptfs_initialize_file(struct dentry *ecryptfs_dentry)
ecryptfs_dentry->d_name.name, rc);
goto out;
}
- rc = ecryptfs_write_metadata(ecryptfs_dentry);
+ rc = ecryptfs_write_metadata(ecryptfs_dentry, ecryptfs_inode);
if (rc)
printk(KERN_ERR "Error writing headers; rc = [%d]\n", rc);
- ecryptfs_put_lower_file(ecryptfs_dentry->d_inode);
+ ecryptfs_put_lower_file(ecryptfs_inode);
out:
return rc;
}
@@ -269,18 +269,28 @@ static int
ecryptfs_create(struct inode *directory_inode, struct dentry *ecryptfs_dentry,
int mode, struct nameidata *nd)
{
+ struct inode *ecryptfs_inode;
int rc;
- /* ecryptfs_do_create() calls ecryptfs_interpose() */
- rc = ecryptfs_do_create(directory_inode, ecryptfs_dentry, mode);
- if (unlikely(rc)) {
+ ecryptfs_inode = ecryptfs_do_create(directory_inode, ecryptfs_dentry,
+ mode);
+ if (unlikely(IS_ERR(ecryptfs_inode))) {
ecryptfs_printk(KERN_WARNING, "Failed to create file in"
"lower filesystem\n");
+ rc = PTR_ERR(ecryptfs_inode);
goto out;
}
/* At this point, a file exists on "disk"; we need to make sure
* that this on disk file is prepared to be an ecryptfs file */
- rc = ecryptfs_initialize_file(ecryptfs_dentry);
+ rc = ecryptfs_initialize_file(ecryptfs_dentry, ecryptfs_inode);
+ if (rc) {
+ drop_nlink(ecryptfs_inode);
+ unlock_new_inode(ecryptfs_inode);
+ iput(ecryptfs_inode);
+ goto out;
+ }
+ d_instantiate(ecryptfs_dentry, ecryptfs_inode);
+ unlock_new_inode(ecryptfs_inode);
out:
return rc;
}
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index f6dba4505f1..12ccacda44e 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -565,7 +565,7 @@ ext4_fsblk_t ext4_count_free_clusters(struct super_block *sb)
brelse(bitmap_bh);
printk(KERN_DEBUG "ext4_count_free_clusters: stored = %llu"
", computed = %llu, %llu\n",
- EXT4_B2C(sbi, ext4_free_blocks_count(es)),
+ EXT4_B2C(EXT4_SB(sb), ext4_free_blocks_count(es)),
desc_count, bitmap_count);
return bitmap_count;
#else
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 240f6e2dc7e..848f436df29 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -2270,6 +2270,7 @@ retry:
ext4_msg(inode->i_sb, KERN_CRIT, "%s: jbd2_start: "
"%ld pages, ino %lu; err %d", __func__,
wbc->nr_to_write, inode->i_ino, ret);
+ blk_finish_plug(&plug);
goto out_writepages;
}
@@ -2806,8 +2807,8 @@ out:
spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
/* queue the work to convert unwritten extents to written */
- queue_work(wq, &io_end->work);
iocb->private = NULL;
+ queue_work(wq, &io_end->work);
/* XXX: probably should move into the real I/O completion handler */
inode_dio_done(inode);
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 9953d80145a..3858767ec67 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -1683,7 +1683,9 @@ static int parse_options(char *options, struct super_block *sb,
data_opt = EXT4_MOUNT_WRITEBACK_DATA;
datacheck:
if (is_remount) {
- if (test_opt(sb, DATA_FLAGS) != data_opt) {
+ if (!sbi->s_journal)
+ ext4_msg(sb, KERN_WARNING, "Remounting file system with no journal so ignoring journalled data option");
+ else if (test_opt(sb, DATA_FLAGS) != data_opt) {
ext4_msg(sb, KERN_ERR,
"Cannot change data mode on remount");
return 0;
@@ -3099,8 +3101,6 @@ static void ext4_destroy_lazyinit_thread(void)
}
static int ext4_fill_super(struct super_block *sb, void *data, int silent)
- __releases(kernel_lock)
- __acquires(kernel_lock)
{
char *orig_data = kstrdup(data, GFP_KERNEL);
struct buffer_head *bh;
diff --git a/fs/hfs/trans.c b/fs/hfs/trans.c
index e673a88b8ae..b1ce4c7ad3f 100644
--- a/fs/hfs/trans.c
+++ b/fs/hfs/trans.c
@@ -40,6 +40,8 @@ int hfs_mac2asc(struct super_block *sb, char *out, const struct hfs_name *in)
src = in->name;
srclen = in->len;
+ if (srclen > HFS_NAMELEN)
+ srclen = HFS_NAMELEN;
dst = out;
dstlen = HFS_MAX_NAMELEN;
if (nls_io) {
diff --git a/fs/minix/bitmap.c b/fs/minix/bitmap.c
index 3f32bcb0d9b..ef175cb8cfd 100644
--- a/fs/minix/bitmap.c
+++ b/fs/minix/bitmap.c
@@ -16,38 +16,26 @@
#include <linux/bitops.h>
#include <linux/sched.h>
-static const int nibblemap[] = { 4,3,3,2,3,2,2,1,3,2,2,1,2,1,1,0 };
-
static DEFINE_SPINLOCK(bitmap_lock);
-static unsigned long count_free(struct buffer_head *map[], unsigned numblocks, __u32 numbits)
+/*
+ * bitmap consists of blocks filled with 16bit words
+ * bit set == busy, bit clear == free
+ * endianness is a mess, but for counting zero bits it really doesn't matter...
+ */
+static __u32 count_free(struct buffer_head *map[], unsigned blocksize, __u32 numbits)
{
- unsigned i, j, sum = 0;
- struct buffer_head *bh;
-
- for (i=0; i<numblocks-1; i++) {
- if (!(bh=map[i]))
- return(0);
- for (j=0; j<bh->b_size; j++)
- sum += nibblemap[bh->b_data[j] & 0xf]
- + nibblemap[(bh->b_data[j]>>4) & 0xf];
- }
+ __u32 sum = 0;
+ unsigned blocks = DIV_ROUND_UP(numbits, blocksize * 8);
- if (numblocks==0 || !(bh=map[numblocks-1]))
- return(0);
- i = ((numbits - (numblocks-1) * bh->b_size * 8) / 16) * 2;
- for (j=0; j<i; j++) {
- sum += nibblemap[bh->b_data[j] & 0xf]
- + nibblemap[(bh->b_data[j]>>4) & 0xf];
+ while (blocks--) {
+ unsigned words = blocksize / 2;
+ __u16 *p = (__u16 *)(*map++)->b_data;
+ while (words--)
+ sum += 16 - hweight16(*p++);
}
- i = numbits%16;
- if (i!=0) {
- i = *(__u16 *)(&bh->b_data[j]) | ~((1<<i) - 1);
- sum += nibblemap[i & 0xf] + nibblemap[(i>>4) & 0xf];
- sum += nibblemap[(i>>8) & 0xf] + nibblemap[(i>>12) & 0xf];
- }
- return(sum);
+ return sum;
}
void minix_free_block(struct inode *inode, unsigned long block)
@@ -105,10 +93,12 @@ int minix_new_block(struct inode * inode)
return 0;
}
-unsigned long minix_count_free_blocks(struct minix_sb_info *sbi)
+unsigned long minix_count_free_blocks(struct super_block *sb)
{
- return (count_free(sbi->s_zmap, sbi->s_zmap_blocks,
- sbi->s_nzones - sbi->s_firstdatazone + 1)
+ struct minix_sb_info *sbi = minix_sb(sb);
+ u32 bits = sbi->s_nzones - (sbi->s_firstdatazone + 1);
+
+ return (count_free(sbi->s_zmap, sb->s_blocksize, bits)
<< sbi->s_log_zone_size);
}
@@ -273,7 +263,10 @@ struct inode *minix_new_inode(const struct inode *dir, int mode, int *error)
return inode;
}
-unsigned long minix_count_free_inodes(struct minix_sb_info *sbi)
+unsigned long minix_count_free_inodes(struct super_block *sb)
{
- return count_free(sbi->s_imap, sbi->s_imap_blocks, sbi->s_ninodes + 1);
+ struct minix_sb_info *sbi = minix_sb(sb);
+ u32 bits = sbi->s_ninodes + 1;
+
+ return count_free(sbi->s_imap, sb->s_blocksize, bits);
}
diff --git a/fs/minix/inode.c b/fs/minix/inode.c
index 64cdcd662ff..1d9e33966db 100644
--- a/fs/minix/inode.c
+++ b/fs/minix/inode.c
@@ -279,6 +279,27 @@ static int minix_fill_super(struct super_block *s, void *data, int silent)
else if (sbi->s_mount_state & MINIX_ERROR_FS)
printk("MINIX-fs: mounting file system with errors, "
"running fsck is recommended\n");
+
+ /* Apparently minix can create filesystems that allocate more blocks for
+ * the bitmaps than needed. We simply ignore that, but verify it didn't
+ * create one with not enough blocks and bail out if so.
+ */
+ block = minix_blocks_needed(sbi->s_ninodes, s->s_blocksize);
+ if (sbi->s_imap_blocks < block) {
+ printk("MINIX-fs: file system does not have enough "
+ "imap blocks allocated. Refusing to mount\n");
+ goto out_iput;
+ }
+
+ block = minix_blocks_needed(
+ (sbi->s_nzones - (sbi->s_firstdatazone + 1)),
+ s->s_blocksize);
+ if (sbi->s_zmap_blocks < block) {
+ printk("MINIX-fs: file system does not have enough "
+ "zmap blocks allocated. Refusing to mount.\n");
+ goto out_iput;
+ }
+
return 0;
out_iput:
@@ -339,10 +360,10 @@ static int minix_statfs(struct dentry *dentry, struct kstatfs *buf)
buf->f_type = sb->s_magic;
buf->f_bsize = sb->s_blocksize;
buf->f_blocks = (sbi->s_nzones - sbi->s_firstdatazone) << sbi->s_log_zone_size;
- buf->f_bfree = minix_count_free_blocks(sbi);
+ buf->f_bfree = minix_count_free_blocks(sb);
buf->f_bavail = buf->f_bfree;
buf->f_files = sbi->s_ninodes;
- buf->f_ffree = minix_count_free_inodes(sbi);
+ buf->f_ffree = minix_count_free_inodes(sb);
buf->f_namelen = sbi->s_namelen;
buf->f_fsid.val[0] = (u32)id;
buf->f_fsid.val[1] = (u32)(id >> 32);
diff --git a/fs/minix/minix.h b/fs/minix/minix.h
index 341e2122879..26bbd55e82e 100644
--- a/fs/minix/minix.h
+++ b/fs/minix/minix.h
@@ -48,10 +48,10 @@ extern struct minix_inode * minix_V1_raw_inode(struct super_block *, ino_t, stru
extern struct minix2_inode * minix_V2_raw_inode(struct super_block *, ino_t, struct buffer_head **);
extern struct inode * minix_new_inode(const struct inode *, int, int *);
extern void minix_free_inode(struct inode * inode);
-extern unsigned long minix_count_free_inodes(struct minix_sb_info *sbi);
+extern unsigned long minix_count_free_inodes(struct super_block *sb);
extern int minix_new_block(struct inode * inode);
extern void minix_free_block(struct inode *inode, unsigned long block);
-extern unsigned long minix_count_free_blocks(struct minix_sb_info *sbi);
+extern unsigned long minix_count_free_blocks(struct super_block *sb);
extern int minix_getattr(struct vfsmount *, struct dentry *, struct kstat *);
extern int minix_prepare_chunk(struct page *page, loff_t pos, unsigned len);
@@ -88,6 +88,11 @@ static inline struct minix_inode_info *minix_i(struct inode *inode)
return list_entry(inode, struct minix_inode_info, vfs_inode);
}
+static inline unsigned minix_blocks_needed(unsigned bits, unsigned blocksize)
+{
+ return DIV_ROUND_UP(bits, blocksize * 8);
+}
+
#if defined(CONFIG_MINIX_FS_NATIVE_ENDIAN) && \
defined(CONFIG_MINIX_FS_BIG_ENDIAN_16BIT_INDEXED)
@@ -125,7 +130,7 @@ static inline int minix_find_first_zero_bit(const void *vaddr, unsigned size)
if (!size)
return 0;
- size = (size >> 4) + ((size & 15) > 0);
+ size >>= 4;
while (*p++ == 0xffff) {
if (--size == 0)
return (p - addr) << 4;
diff --git a/fs/namespace.c b/fs/namespace.c
index e5e1c7d1839..cfc6d4448aa 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -1048,15 +1048,12 @@ static int show_mountinfo(struct seq_file *m, void *v)
if (err)
goto out;
seq_putc(m, ' ');
- seq_path_root(m, &mnt_path, &root, " \t\n\\");
- if (root.mnt != p->root.mnt || root.dentry != p->root.dentry) {
- /*
- * Mountpoint is outside root, discard that one. Ugly,
- * but less so than trying to do that in iterator in a
- * race-free way (due to renames).
- */
- return SEQ_SKIP;
- }
+
+ /* mountpoints outside of chroot jail will give SEQ_SKIP on this */
+ err = seq_path_root(m, &mnt_path, &root, " \t\n\\");
+ if (err)
+ goto out;
+
seq_puts(m, mnt->mnt_flags & MNT_READONLY ? " ro" : " rw");
show_mnt_opts(m, mnt);
@@ -2483,11 +2480,43 @@ struct mnt_namespace *create_mnt_ns(struct vfsmount *mnt)
__mnt_make_longterm(mnt);
new_ns->root = mnt;
list_add(&new_ns->list, &new_ns->root->mnt_list);
+ } else {
+ mntput(mnt);
}
return new_ns;
}
EXPORT_SYMBOL(create_mnt_ns);
+struct dentry *mount_subtree(struct vfsmount *mnt, const char *name)
+{
+ struct mnt_namespace *ns;
+ struct super_block *s;
+ struct path path;
+ int err;
+
+ ns = create_mnt_ns(mnt);
+ if (IS_ERR(ns))
+ return ERR_CAST(ns);
+
+ err = vfs_path_lookup(mnt->mnt_root, mnt,
+ name, LOOKUP_FOLLOW|LOOKUP_AUTOMOUNT, &path);
+
+ put_mnt_ns(ns);
+
+ if (err)
+ return ERR_PTR(err);
+
+ /* trade a vfsmount reference for active sb one */
+ s = path.mnt->mnt_sb;
+ atomic_inc(&s->s_active);
+ mntput(path.mnt);
+ /* lock the sucker */
+ down_write(&s->s_umount);
+ /* ... and return the root of (sub)tree on it */
+ return path.dentry;
+}
+EXPORT_SYMBOL(mount_subtree);
+
SYSCALL_DEFINE5(mount, char __user *, dev_name, char __user *, dir_name,
char __user *, type, unsigned long, flags, void __user *, data)
{
@@ -2744,3 +2773,8 @@ void kern_unmount(struct vfsmount *mnt)
}
}
EXPORT_SYMBOL(kern_unmount);
+
+bool our_mnt(struct vfsmount *mnt)
+{
+ return check_mnt(mnt);
+}
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index b238d95ac48..ac289909814 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -1468,12 +1468,12 @@ static struct dentry *nfs_atomic_lookup(struct inode *dir, struct dentry *dentry
res = NULL;
goto out;
/* This turned out not to be a regular file */
+ case -EISDIR:
case -ENOTDIR:
goto no_open;
case -ELOOP:
if (!(nd->intent.open.flags & O_NOFOLLOW))
goto no_open;
- /* case -EISDIR: */
/* case -EINVAL: */
default:
res = ERR_CAST(inode);
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index 0a1f8312b4d..eca56d4b39c 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -40,48 +40,8 @@
#define NFSDBG_FACILITY NFSDBG_FILE
-static int nfs_file_open(struct inode *, struct file *);
-static int nfs_file_release(struct inode *, struct file *);
-static loff_t nfs_file_llseek(struct file *file, loff_t offset, int origin);
-static int nfs_file_mmap(struct file *, struct vm_area_struct *);
-static ssize_t nfs_file_splice_read(struct file *filp, loff_t *ppos,
- struct pipe_inode_info *pipe,
- size_t count, unsigned int flags);
-static ssize_t nfs_file_read(struct kiocb *, const struct iovec *iov,
- unsigned long nr_segs, loff_t pos);
-static ssize_t nfs_file_splice_write(struct pipe_inode_info *pipe,
- struct file *filp, loff_t *ppos,
- size_t count, unsigned int flags);
-static ssize_t nfs_file_write(struct kiocb *, const struct iovec *iov,
- unsigned long nr_segs, loff_t pos);
-static int nfs_file_flush(struct file *, fl_owner_t id);
-static int nfs_file_fsync(struct file *, loff_t, loff_t, int datasync);
-static int nfs_check_flags(int flags);
-static int nfs_lock(struct file *filp, int cmd, struct file_lock *fl);
-static int nfs_flock(struct file *filp, int cmd, struct file_lock *fl);
-static int nfs_setlease(struct file *file, long arg, struct file_lock **fl);
-
static const struct vm_operations_struct nfs_file_vm_ops;
-const struct file_operations nfs_file_operations = {
- .llseek = nfs_file_llseek,
- .read = do_sync_read,
- .write = do_sync_write,
- .aio_read = nfs_file_read,
- .aio_write = nfs_file_write,
- .mmap = nfs_file_mmap,
- .open = nfs_file_open,
- .flush = nfs_file_flush,
- .release = nfs_file_release,
- .fsync = nfs_file_fsync,
- .lock = nfs_lock,
- .flock = nfs_flock,
- .splice_read = nfs_file_splice_read,
- .splice_write = nfs_file_splice_write,
- .check_flags = nfs_check_flags,
- .setlease = nfs_setlease,
-};
-
const struct inode_operations nfs_file_inode_operations = {
.permission = nfs_permission,
.getattr = nfs_getattr,
@@ -886,3 +846,54 @@ static int nfs_setlease(struct file *file, long arg, struct file_lock **fl)
file->f_path.dentry->d_name.name, arg);
return -EINVAL;
}
+
+const struct file_operations nfs_file_operations = {
+ .llseek = nfs_file_llseek,
+ .read = do_sync_read,
+ .write = do_sync_write,
+ .aio_read = nfs_file_read,
+ .aio_write = nfs_file_write,
+ .mmap = nfs_file_mmap,
+ .open = nfs_file_open,
+ .flush = nfs_file_flush,
+ .release = nfs_file_release,
+ .fsync = nfs_file_fsync,
+ .lock = nfs_lock,
+ .flock = nfs_flock,
+ .splice_read = nfs_file_splice_read,
+ .splice_write = nfs_file_splice_write,
+ .check_flags = nfs_check_flags,
+ .setlease = nfs_setlease,
+};
+
+#ifdef CONFIG_NFS_V4
+static int
+nfs4_file_open(struct inode *inode, struct file *filp)
+{
+ /*
+ * NFSv4 opens are handled in d_lookup and d_revalidate. If we get to
+ * this point, then something is very wrong
+ */
+ dprintk("NFS: %s called! inode=%p filp=%p\n", __func__, inode, filp);
+ return -ENOTDIR;
+}
+
+const struct file_operations nfs4_file_operations = {
+ .llseek = nfs_file_llseek,
+ .read = do_sync_read,
+ .write = do_sync_write,
+ .aio_read = nfs_file_read,
+ .aio_write = nfs_file_write,
+ .mmap = nfs_file_mmap,
+ .open = nfs4_file_open,
+ .flush = nfs_file_flush,
+ .release = nfs_file_release,
+ .fsync = nfs_file_fsync,
+ .lock = nfs_lock,
+ .flock = nfs_flock,
+ .splice_read = nfs_file_splice_read,
+ .splice_write = nfs_file_splice_write,
+ .check_flags = nfs_check_flags,
+ .setlease = nfs_setlease,
+};
+#endif /* CONFIG_NFS_V4 */
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index c07a55aec83..50a15fa8cf9 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -291,7 +291,7 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr)
*/
inode->i_op = NFS_SB(sb)->nfs_client->rpc_ops->file_inode_ops;
if (S_ISREG(inode->i_mode)) {
- inode->i_fop = &nfs_file_operations;
+ inode->i_fop = NFS_SB(sb)->nfs_client->rpc_ops->file_ops;
inode->i_data.a_ops = &nfs_file_aops;
inode->i_data.backing_dev_info = &NFS_SB(sb)->backing_dev_info;
} else if (S_ISDIR(inode->i_mode)) {
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index c1a1bd8ddf1..3f4d95751d5 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -299,6 +299,8 @@ extern void nfs_read_prepare(struct rpc_task *task, void *calldata);
extern int nfs_generic_pagein(struct nfs_pageio_descriptor *desc,
struct list_head *head);
+extern void nfs_pageio_init_read_mds(struct nfs_pageio_descriptor *pgio,
+ struct inode *inode);
extern void nfs_pageio_reset_read_mds(struct nfs_pageio_descriptor *pgio);
extern void nfs_readdata_release(struct nfs_read_data *rdata);
diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c
index 85f1690ca08..d4bc9ed9174 100644
--- a/fs/nfs/nfs3proc.c
+++ b/fs/nfs/nfs3proc.c
@@ -853,6 +853,7 @@ const struct nfs_rpc_ops nfs_v3_clientops = {
.dentry_ops = &nfs_dentry_operations,
.dir_inode_ops = &nfs3_dir_inode_operations,
.file_inode_ops = &nfs3_file_inode_operations,
+ .file_ops = &nfs_file_operations,
.getroot = nfs3_proc_get_root,
.getattr = nfs3_proc_getattr,
.setattr = nfs3_proc_setattr,
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index b60fddf606f..be2bbac1381 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -2464,8 +2464,7 @@ static int nfs4_proc_lookup(struct rpc_clnt *clnt, struct inode *dir, struct qst
case -NFS4ERR_BADNAME:
return -ENOENT;
case -NFS4ERR_MOVED:
- err = nfs4_get_referral(dir, name, fattr, fhandle);
- break;
+ return nfs4_get_referral(dir, name, fattr, fhandle);
case -NFS4ERR_WRONGSEC:
nfs_fixup_secinfo_attributes(fattr, fhandle);
}
@@ -6253,6 +6252,7 @@ const struct nfs_rpc_ops nfs_v4_clientops = {
.dentry_ops = &nfs4_dentry_operations,
.dir_inode_ops = &nfs4_dir_inode_operations,
.file_inode_ops = &nfs4_file_inode_operations,
+ .file_ops = &nfs4_file_operations,
.getroot = nfs4_proc_get_root,
.getattr = nfs4_proc_getattr,
.setattr = nfs4_proc_setattr,
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index baf73536bc0..8e672a2b2d6 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -1260,6 +1260,25 @@ pnfs_generic_pg_writepages(struct nfs_pageio_descriptor *desc)
}
EXPORT_SYMBOL_GPL(pnfs_generic_pg_writepages);
+static void pnfs_ld_handle_read_error(struct nfs_read_data *data)
+{
+ struct nfs_pageio_descriptor pgio;
+
+ put_lseg(data->lseg);
+ data->lseg = NULL;
+ dprintk("pnfs write error = %d\n", data->pnfs_error);
+
+ nfs_pageio_init_read_mds(&pgio, data->inode);
+
+ while (!list_empty(&data->pages)) {
+ struct nfs_page *req = nfs_list_entry(data->pages.next);
+
+ nfs_list_remove_request(req);
+ nfs_pageio_add_request(&pgio, req);
+ }
+ nfs_pageio_complete(&pgio);
+}
+
/*
* Called by non rpc-based layout drivers
*/
@@ -1268,11 +1287,8 @@ void pnfs_ld_read_done(struct nfs_read_data *data)
if (likely(!data->pnfs_error)) {
__nfs4_read_done_cb(data);
data->mds_ops->rpc_call_done(&data->task, data);
- } else {
- put_lseg(data->lseg);
- data->lseg = NULL;
- dprintk("pnfs write error = %d\n", data->pnfs_error);
- }
+ } else
+ pnfs_ld_handle_read_error(data);
data->mds_ops->rpc_release(data);
}
EXPORT_SYMBOL_GPL(pnfs_ld_read_done);
diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c
index ac40b8535d7..f48125da198 100644
--- a/fs/nfs/proc.c
+++ b/fs/nfs/proc.c
@@ -710,6 +710,7 @@ const struct nfs_rpc_ops nfs_v2_clientops = {
.dentry_ops = &nfs_dentry_operations,
.dir_inode_ops = &nfs_dir_inode_operations,
.file_inode_ops = &nfs_file_inode_operations,
+ .file_ops = &nfs_file_operations,
.getroot = nfs_proc_get_root,
.getattr = nfs_proc_getattr,
.setattr = nfs_proc_setattr,
diff --git a/fs/nfs/read.c b/fs/nfs/read.c
index 8b48ec63f72..cfa175c223d 100644
--- a/fs/nfs/read.c
+++ b/fs/nfs/read.c
@@ -109,7 +109,7 @@ static void nfs_readpage_truncate_uninitialised_page(struct nfs_read_data *data)
}
}
-static void nfs_pageio_init_read_mds(struct nfs_pageio_descriptor *pgio,
+void nfs_pageio_init_read_mds(struct nfs_pageio_descriptor *pgio,
struct inode *inode)
{
nfs_pageio_init(pgio, inode, &nfs_pageio_read_ops,
@@ -534,23 +534,13 @@ static void nfs_readpage_result_full(struct rpc_task *task, void *calldata)
static void nfs_readpage_release_full(void *calldata)
{
struct nfs_read_data *data = calldata;
- struct nfs_pageio_descriptor pgio;
- if (data->pnfs_error) {
- nfs_pageio_init_read_mds(&pgio, data->inode);
- pgio.pg_recoalesce = 1;
- }
while (!list_empty(&data->pages)) {
struct nfs_page *req = nfs_list_entry(data->pages.next);
nfs_list_remove_request(req);
- if (!data->pnfs_error)
- nfs_readpage_release(req);
- else
- nfs_pageio_add_request(&pgio, req);
+ nfs_readpage_release(req);
}
- if (data->pnfs_error)
- nfs_pageio_complete(&pgio);
nfs_readdata_release(calldata);
}
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 480b3b6bf71..134777406ee 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -2787,43 +2787,18 @@ static void nfs_referral_loop_unprotect(void)
static struct dentry *nfs_follow_remote_path(struct vfsmount *root_mnt,
const char *export_path)
{
- struct mnt_namespace *ns_private;
- struct super_block *s;
struct dentry *dentry;
- struct path path;
- int ret;
-
- ns_private = create_mnt_ns(root_mnt);
- ret = PTR_ERR(ns_private);
- if (IS_ERR(ns_private))
- goto out_mntput;
-
- ret = nfs_referral_loop_protect();
- if (ret != 0)
- goto out_put_mnt_ns;
+ int ret = nfs_referral_loop_protect();
- ret = vfs_path_lookup(root_mnt->mnt_root, root_mnt,
- export_path, LOOKUP_FOLLOW|LOOKUP_AUTOMOUNT, &path);
+ if (ret) {
+ mntput(root_mnt);
+ return ERR_PTR(ret);
+ }
+ dentry = mount_subtree(root_mnt, export_path);
nfs_referral_loop_unprotect();
- put_mnt_ns(ns_private);
-
- if (ret != 0)
- goto out_err;
-
- s = path.mnt->mnt_sb;
- atomic_inc(&s->s_active);
- dentry = dget(path.dentry);
- path_put(&path);
- down_write(&s->s_umount);
return dentry;
-out_put_mnt_ns:
- put_mnt_ns(ns_private);
-out_mntput:
- mntput(root_mnt);
-out_err:
- return ERR_PTR(ret);
}
static struct dentry *nfs4_try_mount(int flags, const char *dev_name,
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index ed553c60de8..3165aebb43c 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -5699,7 +5699,7 @@ int ocfs2_remove_btree_range(struct inode *inode,
OCFS2_JOURNAL_ACCESS_WRITE);
if (ret) {
mlog_errno(ret);
- goto out;
+ goto out_commit;
}
dquot_free_space_nodirty(inode,
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index c1efe939c77..78b68af3b0e 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -290,7 +290,15 @@ static int ocfs2_readpage(struct file *file, struct page *page)
}
if (down_read_trylock(&oi->ip_alloc_sem) == 0) {
+ /*
+ * Unlock the page and cycle ip_alloc_sem so that we don't
+ * busyloop waiting for ip_alloc_sem to unlock
+ */
ret = AOP_TRUNCATED_PAGE;
+ unlock_page(page);
+ unlock = 0;
+ down_read(&oi->ip_alloc_sem);
+ up_read(&oi->ip_alloc_sem);
goto out_inode_unlock;
}
@@ -563,6 +571,7 @@ static void ocfs2_dio_end_io(struct kiocb *iocb,
{
struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode;
int level;
+ wait_queue_head_t *wq = ocfs2_ioend_wq(inode);
/* this io's submitter should not have unlocked this before we could */
BUG_ON(!ocfs2_iocb_is_rw_locked(iocb));
@@ -570,6 +579,15 @@ static void ocfs2_dio_end_io(struct kiocb *iocb,
if (ocfs2_iocb_is_sem_locked(iocb))
ocfs2_iocb_clear_sem_locked(iocb);
+ if (ocfs2_iocb_is_unaligned_aio(iocb)) {
+ ocfs2_iocb_clear_unaligned_aio(iocb);
+
+ if (atomic_dec_and_test(&OCFS2_I(inode)->ip_unaligned_aio) &&
+ waitqueue_active(wq)) {
+ wake_up_all(wq);
+ }
+ }
+
ocfs2_iocb_clear_rw_locked(iocb);
level = ocfs2_iocb_rw_locked_level(iocb);
@@ -863,6 +881,12 @@ struct ocfs2_write_ctxt {
struct page *w_target_page;
/*
+ * w_target_locked is used for page_mkwrite path indicating no unlocking
+ * against w_target_page in ocfs2_write_end_nolock.
+ */
+ unsigned int w_target_locked:1;
+
+ /*
* ocfs2_write_end() uses this to know what the real range to
* write in the target should be.
*/
@@ -895,6 +919,24 @@ void ocfs2_unlock_and_free_pages(struct page **pages, int num_pages)
static void ocfs2_free_write_ctxt(struct ocfs2_write_ctxt *wc)
{
+ int i;
+
+ /*
+ * w_target_locked is only set to true in the page_mkwrite() case.
+ * The intent is to allow us to lock the target page from write_begin()
+ * to write_end(). The caller must hold a ref on w_target_page.
+ */
+ if (wc->w_target_locked) {
+ BUG_ON(!wc->w_target_page);
+ for (i = 0; i < wc->w_num_pages; i++) {
+ if (wc->w_target_page == wc->w_pages[i]) {
+ wc->w_pages[i] = NULL;
+ break;
+ }
+ }
+ mark_page_accessed(wc->w_target_page);
+ page_cache_release(wc->w_target_page);
+ }
ocfs2_unlock_and_free_pages(wc->w_pages, wc->w_num_pages);
brelse(wc->w_di_bh);
@@ -1132,20 +1174,17 @@ static int ocfs2_grab_pages_for_write(struct address_space *mapping,
*/
lock_page(mmap_page);
+ /* Exit and let the caller retry */
if (mmap_page->mapping != mapping) {
+ WARN_ON(mmap_page->mapping);
unlock_page(mmap_page);
- /*
- * Sanity check - the locking in
- * ocfs2_pagemkwrite() should ensure
- * that this code doesn't trigger.
- */
- ret = -EINVAL;
- mlog_errno(ret);
+ ret = -EAGAIN;
goto out;
}
page_cache_get(mmap_page);
wc->w_pages[i] = mmap_page;
+ wc->w_target_locked = true;
} else {
wc->w_pages[i] = find_or_create_page(mapping, index,
GFP_NOFS);
@@ -1160,6 +1199,8 @@ static int ocfs2_grab_pages_for_write(struct address_space *mapping,
wc->w_target_page = wc->w_pages[i];
}
out:
+ if (ret)
+ wc->w_target_locked = false;
return ret;
}
@@ -1817,11 +1858,23 @@ try_again:
*/
ret = ocfs2_grab_pages_for_write(mapping, wc, wc->w_cpos, pos, len,
cluster_of_pages, mmap_page);
- if (ret) {
+ if (ret && ret != -EAGAIN) {
mlog_errno(ret);
goto out_quota;
}
+ /*
+ * ocfs2_grab_pages_for_write() returns -EAGAIN if it could not lock
+ * the target page. In this case, we exit with no error and no target
+ * page. This will trigger the caller, page_mkwrite(), to re-try
+ * the operation.
+ */
+ if (ret == -EAGAIN) {
+ BUG_ON(wc->w_target_page);
+ ret = 0;
+ goto out_quota;
+ }
+
ret = ocfs2_write_cluster_by_desc(mapping, data_ac, meta_ac, wc, pos,
len);
if (ret) {
diff --git a/fs/ocfs2/aops.h b/fs/ocfs2/aops.h
index 75cf3ad987a..ffb2da370a9 100644
--- a/fs/ocfs2/aops.h
+++ b/fs/ocfs2/aops.h
@@ -78,6 +78,7 @@ enum ocfs2_iocb_lock_bits {
OCFS2_IOCB_RW_LOCK = 0,
OCFS2_IOCB_RW_LOCK_LEVEL,
OCFS2_IOCB_SEM,
+ OCFS2_IOCB_UNALIGNED_IO,
OCFS2_IOCB_NUM_LOCKS
};
@@ -91,4 +92,17 @@ enum ocfs2_iocb_lock_bits {
clear_bit(OCFS2_IOCB_SEM, (unsigned long *)&iocb->private)
#define ocfs2_iocb_is_sem_locked(iocb) \
test_bit(OCFS2_IOCB_SEM, (unsigned long *)&iocb->private)
+
+#define ocfs2_iocb_set_unaligned_aio(iocb) \
+ set_bit(OCFS2_IOCB_UNALIGNED_IO, (unsigned long *)&iocb->private)
+#define ocfs2_iocb_clear_unaligned_aio(iocb) \
+ clear_bit(OCFS2_IOCB_UNALIGNED_IO, (unsigned long *)&iocb->private)
+#define ocfs2_iocb_is_unaligned_aio(iocb) \
+ test_bit(OCFS2_IOCB_UNALIGNED_IO, (unsigned long *)&iocb->private)
+
+#define OCFS2_IOEND_WQ_HASH_SZ 37
+#define ocfs2_ioend_wq(v) (&ocfs2__ioend_wq[((unsigned long)(v)) %\
+ OCFS2_IOEND_WQ_HASH_SZ])
+extern wait_queue_head_t ocfs2__ioend_wq[OCFS2_IOEND_WQ_HASH_SZ];
+
#endif /* OCFS2_FILE_H */
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index 9a3e6bbff27..a4e855e3690 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -216,6 +216,7 @@ struct o2hb_region {
struct list_head hr_all_item;
unsigned hr_unclean_stop:1,
+ hr_aborted_start:1,
hr_item_pinned:1,
hr_item_dropped:1;
@@ -254,6 +255,10 @@ struct o2hb_region {
* a more complete api that doesn't lead to this sort of fragility. */
atomic_t hr_steady_iterations;
+ /* terminate o2hb thread if it does not reach steady state
+ * (hr_steady_iterations == 0) within hr_unsteady_iterations */
+ atomic_t hr_unsteady_iterations;
+
char hr_dev_name[BDEVNAME_SIZE];
unsigned int hr_timeout_ms;
@@ -324,6 +329,10 @@ static void o2hb_write_timeout(struct work_struct *work)
static void o2hb_arm_write_timeout(struct o2hb_region *reg)
{
+ /* Arm writeout only after thread reaches steady state */
+ if (atomic_read(&reg->hr_steady_iterations) != 0)
+ return;
+
mlog(ML_HEARTBEAT, "Queue write timeout for %u ms\n",
O2HB_MAX_WRITE_TIMEOUT_MS);
@@ -537,9 +546,14 @@ static int o2hb_verify_crc(struct o2hb_region *reg,
return read == computed;
}
-/* We want to make sure that nobody is heartbeating on top of us --
- * this will help detect an invalid configuration. */
-static void o2hb_check_last_timestamp(struct o2hb_region *reg)
+/*
+ * Compare the slot data with what we wrote in the last iteration.
+ * If the match fails, print an appropriate error message. This is to
+ * detect errors like... another node hearting on the same slot,
+ * flaky device that is losing writes, etc.
+ * Returns 1 if check succeeds, 0 otherwise.
+ */
+static int o2hb_check_own_slot(struct o2hb_region *reg)
{
struct o2hb_disk_slot *slot;
struct o2hb_disk_heartbeat_block *hb_block;
@@ -548,13 +562,13 @@ static void o2hb_check_last_timestamp(struct o2hb_region *reg)
slot = &reg->hr_slots[o2nm_this_node()];
/* Don't check on our 1st timestamp */
if (!slot->ds_last_time)
- return;
+ return 0;
hb_block = slot->ds_raw_block;
if (le64_to_cpu(hb_block->hb_seq) == slot->ds_last_time &&
le64_to_cpu(hb_block->hb_generation) == slot->ds_last_generation &&
hb_block->hb_node == slot->ds_node_num)
- return;
+ return 1;
#define ERRSTR1 "Another node is heartbeating on device"
#define ERRSTR2 "Heartbeat generation mismatch on device"
@@ -574,6 +588,8 @@ static void o2hb_check_last_timestamp(struct o2hb_region *reg)
(unsigned long long)slot->ds_last_time, hb_block->hb_node,
(unsigned long long)le64_to_cpu(hb_block->hb_generation),
(unsigned long long)le64_to_cpu(hb_block->hb_seq));
+
+ return 0;
}
static inline void o2hb_prepare_block(struct o2hb_region *reg,
@@ -719,17 +735,24 @@ static void o2hb_shutdown_slot(struct o2hb_disk_slot *slot)
o2nm_node_put(node);
}
-static void o2hb_set_quorum_device(struct o2hb_region *reg,
- struct o2hb_disk_slot *slot)
+static void o2hb_set_quorum_device(struct o2hb_region *reg)
{
- assert_spin_locked(&o2hb_live_lock);
-
if (!o2hb_global_heartbeat_active())
return;
- if (test_bit(reg->hr_region_num, o2hb_quorum_region_bitmap))
+ /* Prevent race with o2hb_heartbeat_group_drop_item() */
+ if (kthread_should_stop())
+ return;
+
+ /* Tag region as quorum only after thread reaches steady state */
+ if (atomic_read(&reg->hr_steady_iterations) != 0)
return;
+ spin_lock(&o2hb_live_lock);
+
+ if (test_bit(reg->hr_region_num, o2hb_quorum_region_bitmap))
+ goto unlock;
+
/*
* A region can be added to the quorum only when it sees all
* live nodes heartbeat on it. In other words, the region has been
@@ -737,13 +760,10 @@ static void o2hb_set_quorum_device(struct o2hb_region *reg,
*/
if (memcmp(reg->hr_live_node_bitmap, o2hb_live_node_bitmap,
sizeof(o2hb_live_node_bitmap)))
- return;
-
- if (slot->ds_changed_samples < O2HB_LIVE_THRESHOLD)
- return;
+ goto unlock;
- printk(KERN_NOTICE "o2hb: Region %s is now a quorum device\n",
- config_item_name(&reg->hr_item));
+ printk(KERN_NOTICE "o2hb: Region %s (%s) is now a quorum device\n",
+ config_item_name(&reg->hr_item), reg->hr_dev_name);
set_bit(reg->hr_region_num, o2hb_quorum_region_bitmap);
@@ -754,6 +774,8 @@ static void o2hb_set_quorum_device(struct o2hb_region *reg,
if (o2hb_pop_count(&o2hb_quorum_region_bitmap,
O2NM_MAX_REGIONS) > O2HB_PIN_CUT_OFF)
o2hb_region_unpin(NULL);
+unlock:
+ spin_unlock(&o2hb_live_lock);
}
static int o2hb_check_slot(struct o2hb_region *reg,
@@ -925,8 +947,6 @@ fire_callbacks:
slot->ds_equal_samples = 0;
}
out:
- o2hb_set_quorum_device(reg, slot);
-
spin_unlock(&o2hb_live_lock);
o2hb_run_event_list(&event);
@@ -957,7 +977,8 @@ static int o2hb_highest_node(unsigned long *nodes,
static int o2hb_do_disk_heartbeat(struct o2hb_region *reg)
{
- int i, ret, highest_node, change = 0;
+ int i, ret, highest_node;
+ int membership_change = 0, own_slot_ok = 0;
unsigned long configured_nodes[BITS_TO_LONGS(O2NM_MAX_NODES)];
unsigned long live_node_bitmap[BITS_TO_LONGS(O2NM_MAX_NODES)];
struct o2hb_bio_wait_ctxt write_wc;
@@ -966,7 +987,7 @@ static int o2hb_do_disk_heartbeat(struct o2hb_region *reg)
sizeof(configured_nodes));
if (ret) {
mlog_errno(ret);
- return ret;
+ goto bail;
}
/*
@@ -982,8 +1003,9 @@ static int o2hb_do_disk_heartbeat(struct o2hb_region *reg)
highest_node = o2hb_highest_node(configured_nodes, O2NM_MAX_NODES);
if (highest_node >= O2NM_MAX_NODES) {
- mlog(ML_NOTICE, "ocfs2_heartbeat: no configured nodes found!\n");
- return -EINVAL;
+ mlog(ML_NOTICE, "o2hb: No configured nodes found!\n");
+ ret = -EINVAL;
+ goto bail;
}
/* No sense in reading the slots of nodes that don't exist
@@ -993,29 +1015,27 @@ static int o2hb_do_disk_heartbeat(struct o2hb_region *reg)
ret = o2hb_read_slots(reg, highest_node + 1);
if (ret < 0) {
mlog_errno(ret);
- return ret;
+ goto bail;
}
/* With an up to date view of the slots, we can check that no
* other node has been improperly configured to heartbeat in
* our slot. */
- o2hb_check_last_timestamp(reg);
+ own_slot_ok = o2hb_check_own_slot(reg);
/* fill in the proper info for our next heartbeat */
o2hb_prepare_block(reg, reg->hr_generation);
- /* And fire off the write. Note that we don't wait on this I/O
- * until later. */
ret = o2hb_issue_node_write(reg, &write_wc);
if (ret < 0) {
mlog_errno(ret);
- return ret;
+ goto bail;
}
i = -1;
while((i = find_next_bit(configured_nodes,
O2NM_MAX_NODES, i + 1)) < O2NM_MAX_NODES) {
- change |= o2hb_check_slot(reg, &reg->hr_slots[i]);
+ membership_change |= o2hb_check_slot(reg, &reg->hr_slots[i]);
}
/*
@@ -1030,18 +1050,39 @@ static int o2hb_do_disk_heartbeat(struct o2hb_region *reg)
* disk */
mlog(ML_ERROR, "Write error %d on device \"%s\"\n",
write_wc.wc_error, reg->hr_dev_name);
- return write_wc.wc_error;
+ ret = write_wc.wc_error;
+ goto bail;
}
- o2hb_arm_write_timeout(reg);
+ /* Skip disarming the timeout if own slot has stale/bad data */
+ if (own_slot_ok) {
+ o2hb_set_quorum_device(reg);
+ o2hb_arm_write_timeout(reg);
+ }
+bail:
/* let the person who launched us know when things are steady */
- if (!change && (atomic_read(&reg->hr_steady_iterations) != 0)) {
- if (atomic_dec_and_test(&reg->hr_steady_iterations))
+ if (atomic_read(&reg->hr_steady_iterations) != 0) {
+ if (!ret && own_slot_ok && !membership_change) {
+ if (atomic_dec_and_test(&reg->hr_steady_iterations))
+ wake_up(&o2hb_steady_queue);
+ }
+ }
+
+ if (atomic_read(&reg->hr_steady_iterations) != 0) {
+ if (atomic_dec_and_test(&reg->hr_unsteady_iterations)) {
+ printk(KERN_NOTICE "o2hb: Unable to stabilize "
+ "heartbeart on region %s (%s)\n",
+ config_item_name(&reg->hr_item),
+ reg->hr_dev_name);
+ atomic_set(&reg->hr_steady_iterations, 0);
+ reg->hr_aborted_start = 1;
wake_up(&o2hb_steady_queue);
+ ret = -EIO;
+ }
}
- return 0;
+ return ret;
}
/* Subtract b from a, storing the result in a. a *must* have a larger
@@ -1095,7 +1136,8 @@ static int o2hb_thread(void *data)
/* Pin node */
o2nm_depend_this_node();
- while (!kthread_should_stop() && !reg->hr_unclean_stop) {
+ while (!kthread_should_stop() &&
+ !reg->hr_unclean_stop && !reg->hr_aborted_start) {
/* We track the time spent inside
* o2hb_do_disk_heartbeat so that we avoid more than
* hr_timeout_ms between disk writes. On busy systems
@@ -1103,10 +1145,7 @@ static int o2hb_thread(void *data)
* likely to time itself out. */
do_gettimeofday(&before_hb);
- i = 0;
- do {
- ret = o2hb_do_disk_heartbeat(reg);
- } while (ret && ++i < 2);
+ ret = o2hb_do_disk_heartbeat(reg);
do_gettimeofday(&after_hb);
elapsed_msec = o2hb_elapsed_msecs(&before_hb, &after_hb);
@@ -1117,7 +1156,8 @@ static int o2hb_thread(void *data)
after_hb.tv_sec, (unsigned long) after_hb.tv_usec,
elapsed_msec);
- if (elapsed_msec < reg->hr_timeout_ms) {
+ if (!kthread_should_stop() &&
+ elapsed_msec < reg->hr_timeout_ms) {
/* the kthread api has blocked signals for us so no
* need to record the return value. */
msleep_interruptible(reg->hr_timeout_ms - elapsed_msec);
@@ -1134,20 +1174,20 @@ static int o2hb_thread(void *data)
* to timeout on this region when we could just as easily
* write a clear generation - thus indicating to them that
* this node has left this region.
- *
- * XXX: Should we skip this on unclean_stop? */
- o2hb_prepare_block(reg, 0);
- ret = o2hb_issue_node_write(reg, &write_wc);
- if (ret == 0) {
- o2hb_wait_on_io(reg, &write_wc);
- } else {
- mlog_errno(ret);
+ */
+ if (!reg->hr_unclean_stop && !reg->hr_aborted_start) {
+ o2hb_prepare_block(reg, 0);
+ ret = o2hb_issue_node_write(reg, &write_wc);
+ if (ret == 0)
+ o2hb_wait_on_io(reg, &write_wc);
+ else
+ mlog_errno(ret);
}
/* Unpin node */
o2nm_undepend_this_node();
- mlog(ML_HEARTBEAT|ML_KTHREAD, "hb thread exiting\n");
+ mlog(ML_HEARTBEAT|ML_KTHREAD, "o2hb thread exiting\n");
return 0;
}
@@ -1158,6 +1198,7 @@ static int o2hb_debug_open(struct inode *inode, struct file *file)
struct o2hb_debug_buf *db = inode->i_private;
struct o2hb_region *reg;
unsigned long map[BITS_TO_LONGS(O2NM_MAX_NODES)];
+ unsigned long lts;
char *buf = NULL;
int i = -1;
int out = 0;
@@ -1194,9 +1235,11 @@ static int o2hb_debug_open(struct inode *inode, struct file *file)
case O2HB_DB_TYPE_REGION_ELAPSED_TIME:
reg = (struct o2hb_region *)db->db_data;
- out += snprintf(buf + out, PAGE_SIZE - out, "%u\n",
- jiffies_to_msecs(jiffies -
- reg->hr_last_timeout_start));
+ lts = reg->hr_last_timeout_start;
+ /* If 0, it has never been set before */
+ if (lts)
+ lts = jiffies_to_msecs(jiffies - lts);
+ out += snprintf(buf + out, PAGE_SIZE - out, "%lu\n", lts);
goto done;
case O2HB_DB_TYPE_REGION_PINNED:
@@ -1426,6 +1469,8 @@ static void o2hb_region_release(struct config_item *item)
struct page *page;
struct o2hb_region *reg = to_o2hb_region(item);
+ mlog(ML_HEARTBEAT, "hb region release (%s)\n", reg->hr_dev_name);
+
if (reg->hr_tmp_block)
kfree(reg->hr_tmp_block);
@@ -1792,7 +1837,10 @@ static ssize_t o2hb_region_dev_write(struct o2hb_region *reg,
live_threshold <<= 1;
spin_unlock(&o2hb_live_lock);
}
- atomic_set(&reg->hr_steady_iterations, live_threshold + 1);
+ ++live_threshold;
+ atomic_set(&reg->hr_steady_iterations, live_threshold);
+ /* unsteady_iterations is double the steady_iterations */
+ atomic_set(&reg->hr_unsteady_iterations, (live_threshold << 1));
hb_task = kthread_run(o2hb_thread, reg, "o2hb-%s",
reg->hr_item.ci_name);
@@ -1809,14 +1857,12 @@ static ssize_t o2hb_region_dev_write(struct o2hb_region *reg,
ret = wait_event_interruptible(o2hb_steady_queue,
atomic_read(&reg->hr_steady_iterations) == 0);
if (ret) {
- /* We got interrupted (hello ptrace!). Clean up */
- spin_lock(&o2hb_live_lock);
- hb_task = reg->hr_task;
- reg->hr_task = NULL;
- spin_unlock(&o2hb_live_lock);
+ atomic_set(&reg->hr_steady_iterations, 0);
+ reg->hr_aborted_start = 1;
+ }
- if (hb_task)
- kthread_stop(hb_task);
+ if (reg->hr_aborted_start) {
+ ret = -EIO;
goto out;
}
@@ -1833,8 +1879,8 @@ static ssize_t o2hb_region_dev_write(struct o2hb_region *reg,
ret = -EIO;
if (hb_task && o2hb_global_heartbeat_active())
- printk(KERN_NOTICE "o2hb: Heartbeat started on region %s\n",
- config_item_name(&reg->hr_item));
+ printk(KERN_NOTICE "o2hb: Heartbeat started on region %s (%s)\n",
+ config_item_name(&reg->hr_item), reg->hr_dev_name);
out:
if (filp)
@@ -2092,13 +2138,6 @@ static void o2hb_heartbeat_group_drop_item(struct config_group *group,
/* stop the thread when the user removes the region dir */
spin_lock(&o2hb_live_lock);
- if (o2hb_global_heartbeat_active()) {
- clear_bit(reg->hr_region_num, o2hb_region_bitmap);
- clear_bit(reg->hr_region_num, o2hb_live_region_bitmap);
- if (test_bit(reg->hr_region_num, o2hb_quorum_region_bitmap))
- quorum_region = 1;
- clear_bit(reg->hr_region_num, o2hb_quorum_region_bitmap);
- }
hb_task = reg->hr_task;
reg->hr_task = NULL;
reg->hr_item_dropped = 1;
@@ -2107,19 +2146,30 @@ static void o2hb_heartbeat_group_drop_item(struct config_group *group,
if (hb_task)
kthread_stop(hb_task);
+ if (o2hb_global_heartbeat_active()) {
+ spin_lock(&o2hb_live_lock);
+ clear_bit(reg->hr_region_num, o2hb_region_bitmap);
+ clear_bit(reg->hr_region_num, o2hb_live_region_bitmap);
+ if (test_bit(reg->hr_region_num, o2hb_quorum_region_bitmap))
+ quorum_region = 1;
+ clear_bit(reg->hr_region_num, o2hb_quorum_region_bitmap);
+ spin_unlock(&o2hb_live_lock);
+ printk(KERN_NOTICE "o2hb: Heartbeat %s on region %s (%s)\n",
+ ((atomic_read(&reg->hr_steady_iterations) == 0) ?
+ "stopped" : "start aborted"), config_item_name(item),
+ reg->hr_dev_name);
+ }
+
/*
* If we're racing a dev_write(), we need to wake them. They will
* check reg->hr_task
*/
if (atomic_read(&reg->hr_steady_iterations) != 0) {
+ reg->hr_aborted_start = 1;
atomic_set(&reg->hr_steady_iterations, 0);
wake_up(&o2hb_steady_queue);
}
- if (o2hb_global_heartbeat_active())
- printk(KERN_NOTICE "o2hb: Heartbeat stopped on region %s\n",
- config_item_name(&reg->hr_item));
-
config_item_put(item);
if (!o2hb_global_heartbeat_active() || !quorum_region)
diff --git a/fs/ocfs2/cluster/netdebug.c b/fs/ocfs2/cluster/netdebug.c
index 3a5835904b3..dc45deb19e6 100644
--- a/fs/ocfs2/cluster/netdebug.c
+++ b/fs/ocfs2/cluster/netdebug.c
@@ -47,6 +47,7 @@
#define SC_DEBUG_NAME "sock_containers"
#define NST_DEBUG_NAME "send_tracking"
#define STATS_DEBUG_NAME "stats"
+#define NODES_DEBUG_NAME "connected_nodes"
#define SHOW_SOCK_CONTAINERS 0
#define SHOW_SOCK_STATS 1
@@ -55,6 +56,7 @@ static struct dentry *o2net_dentry;
static struct dentry *sc_dentry;
static struct dentry *nst_dentry;
static struct dentry *stats_dentry;
+static struct dentry *nodes_dentry;
static DEFINE_SPINLOCK(o2net_debug_lock);
@@ -491,53 +493,87 @@ static const struct file_operations sc_seq_fops = {
.release = sc_fop_release,
};
-int o2net_debugfs_init(void)
+static int o2net_fill_bitmap(char *buf, int len)
{
- o2net_dentry = debugfs_create_dir(O2NET_DEBUG_DIR, NULL);
- if (!o2net_dentry) {
- mlog_errno(-ENOMEM);
- goto bail;
- }
+ unsigned long map[BITS_TO_LONGS(O2NM_MAX_NODES)];
+ int i = -1, out = 0;
- nst_dentry = debugfs_create_file(NST_DEBUG_NAME, S_IFREG|S_IRUSR,
- o2net_dentry, NULL,
- &nst_seq_fops);
- if (!nst_dentry) {
- mlog_errno(-ENOMEM);
- goto bail;
- }
+ o2net_fill_node_map(map, sizeof(map));
- sc_dentry = debugfs_create_file(SC_DEBUG_NAME, S_IFREG|S_IRUSR,
- o2net_dentry, NULL,
- &sc_seq_fops);
- if (!sc_dentry) {
- mlog_errno(-ENOMEM);
- goto bail;
- }
+ while ((i = find_next_bit(map, O2NM_MAX_NODES, i + 1)) < O2NM_MAX_NODES)
+ out += snprintf(buf + out, PAGE_SIZE - out, "%d ", i);
+ out += snprintf(buf + out, PAGE_SIZE - out, "\n");
- stats_dentry = debugfs_create_file(STATS_DEBUG_NAME, S_IFREG|S_IRUSR,
- o2net_dentry, NULL,
- &stats_seq_fops);
- if (!stats_dentry) {
- mlog_errno(-ENOMEM);
- goto bail;
- }
+ return out;
+}
+
+static int nodes_fop_open(struct inode *inode, struct file *file)
+{
+ char *buf;
+
+ buf = kmalloc(PAGE_SIZE, GFP_KERNEL);
+ if (!buf)
+ return -ENOMEM;
+
+ i_size_write(inode, o2net_fill_bitmap(buf, PAGE_SIZE));
+
+ file->private_data = buf;
return 0;
-bail:
- debugfs_remove(stats_dentry);
- debugfs_remove(sc_dentry);
- debugfs_remove(nst_dentry);
- debugfs_remove(o2net_dentry);
- return -ENOMEM;
}
+static int o2net_debug_release(struct inode *inode, struct file *file)
+{
+ kfree(file->private_data);
+ return 0;
+}
+
+static ssize_t o2net_debug_read(struct file *file, char __user *buf,
+ size_t nbytes, loff_t *ppos)
+{
+ return simple_read_from_buffer(buf, nbytes, ppos, file->private_data,
+ i_size_read(file->f_mapping->host));
+}
+
+static const struct file_operations nodes_fops = {
+ .open = nodes_fop_open,
+ .release = o2net_debug_release,
+ .read = o2net_debug_read,
+ .llseek = generic_file_llseek,
+};
+
void o2net_debugfs_exit(void)
{
+ debugfs_remove(nodes_dentry);
debugfs_remove(stats_dentry);
debugfs_remove(sc_dentry);
debugfs_remove(nst_dentry);
debugfs_remove(o2net_dentry);
}
+int o2net_debugfs_init(void)
+{
+ mode_t mode = S_IFREG|S_IRUSR;
+
+ o2net_dentry = debugfs_create_dir(O2NET_DEBUG_DIR, NULL);
+ if (o2net_dentry)
+ nst_dentry = debugfs_create_file(NST_DEBUG_NAME, mode,
+ o2net_dentry, NULL, &nst_seq_fops);
+ if (nst_dentry)
+ sc_dentry = debugfs_create_file(SC_DEBUG_NAME, mode,
+ o2net_dentry, NULL, &sc_seq_fops);
+ if (sc_dentry)
+ stats_dentry = debugfs_create_file(STATS_DEBUG_NAME, mode,
+ o2net_dentry, NULL, &stats_seq_fops);
+ if (stats_dentry)
+ nodes_dentry = debugfs_create_file(NODES_DEBUG_NAME, mode,
+ o2net_dentry, NULL, &nodes_fops);
+ if (nodes_dentry)
+ return 0;
+
+ o2net_debugfs_exit();
+ mlog_errno(-ENOMEM);
+ return -ENOMEM;
+}
+
#endif /* CONFIG_DEBUG_FS */
diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c
index ad7d0c155de..044e7b58d31 100644
--- a/fs/ocfs2/cluster/tcp.c
+++ b/fs/ocfs2/cluster/tcp.c
@@ -546,7 +546,7 @@ static void o2net_set_nn_state(struct o2net_node *nn,
}
if (was_valid && !valid) {
- printk(KERN_NOTICE "o2net: no longer connected to "
+ printk(KERN_NOTICE "o2net: No longer connected to "
SC_NODEF_FMT "\n", SC_NODEF_ARGS(old_sc));
o2net_complete_nodes_nsw(nn);
}
@@ -556,7 +556,7 @@ static void o2net_set_nn_state(struct o2net_node *nn,
cancel_delayed_work(&nn->nn_connect_expired);
printk(KERN_NOTICE "o2net: %s " SC_NODEF_FMT "\n",
o2nm_this_node() > sc->sc_node->nd_num ?
- "connected to" : "accepted connection from",
+ "Connected to" : "Accepted connection from",
SC_NODEF_ARGS(sc));
}
@@ -644,7 +644,7 @@ static void o2net_state_change(struct sock *sk)
o2net_sc_queue_work(sc, &sc->sc_connect_work);
break;
default:
- printk(KERN_INFO "o2net: connection to " SC_NODEF_FMT
+ printk(KERN_INFO "o2net: Connection to " SC_NODEF_FMT
" shutdown, state %d\n",
SC_NODEF_ARGS(sc), sk->sk_state);
o2net_sc_queue_work(sc, &sc->sc_shutdown_work);
@@ -1035,6 +1035,25 @@ static int o2net_tx_can_proceed(struct o2net_node *nn,
return ret;
}
+/* Get a map of all nodes to which this node is currently connected to */
+void o2net_fill_node_map(unsigned long *map, unsigned bytes)
+{
+ struct o2net_sock_container *sc;
+ int node, ret;
+
+ BUG_ON(bytes < (BITS_TO_LONGS(O2NM_MAX_NODES) * sizeof(unsigned long)));
+
+ memset(map, 0, bytes);
+ for (node = 0; node < O2NM_MAX_NODES; ++node) {
+ o2net_tx_can_proceed(o2net_nn_from_num(node), &sc, &ret);
+ if (!ret) {
+ set_bit(node, map);
+ sc_put(sc);
+ }
+ }
+}
+EXPORT_SYMBOL_GPL(o2net_fill_node_map);
+
int o2net_send_message_vec(u32 msg_type, u32 key, struct kvec *caller_vec,
size_t caller_veclen, u8 target_node, int *status)
{
@@ -1285,11 +1304,11 @@ static int o2net_check_handshake(struct o2net_sock_container *sc)
struct o2net_node *nn = o2net_nn_from_num(sc->sc_node->nd_num);
if (hand->protocol_version != cpu_to_be64(O2NET_PROTOCOL_VERSION)) {
- mlog(ML_NOTICE, SC_NODEF_FMT " advertised net protocol "
- "version %llu but %llu is required, disconnecting\n",
- SC_NODEF_ARGS(sc),
- (unsigned long long)be64_to_cpu(hand->protocol_version),
- O2NET_PROTOCOL_VERSION);
+ printk(KERN_NOTICE "o2net: " SC_NODEF_FMT " Advertised net "
+ "protocol version %llu but %llu is required. "
+ "Disconnecting.\n", SC_NODEF_ARGS(sc),
+ (unsigned long long)be64_to_cpu(hand->protocol_version),
+ O2NET_PROTOCOL_VERSION);
/* don't bother reconnecting if its the wrong version. */
o2net_ensure_shutdown(nn, sc, -ENOTCONN);
@@ -1303,33 +1322,33 @@ static int o2net_check_handshake(struct o2net_sock_container *sc)
*/
if (be32_to_cpu(hand->o2net_idle_timeout_ms) !=
o2net_idle_timeout()) {
- mlog(ML_NOTICE, SC_NODEF_FMT " uses a network idle timeout of "
- "%u ms, but we use %u ms locally. disconnecting\n",
- SC_NODEF_ARGS(sc),
- be32_to_cpu(hand->o2net_idle_timeout_ms),
- o2net_idle_timeout());
+ printk(KERN_NOTICE "o2net: " SC_NODEF_FMT " uses a network "
+ "idle timeout of %u ms, but we use %u ms locally. "
+ "Disconnecting.\n", SC_NODEF_ARGS(sc),
+ be32_to_cpu(hand->o2net_idle_timeout_ms),
+ o2net_idle_timeout());
o2net_ensure_shutdown(nn, sc, -ENOTCONN);
return -1;
}
if (be32_to_cpu(hand->o2net_keepalive_delay_ms) !=
o2net_keepalive_delay()) {
- mlog(ML_NOTICE, SC_NODEF_FMT " uses a keepalive delay of "
- "%u ms, but we use %u ms locally. disconnecting\n",
- SC_NODEF_ARGS(sc),
- be32_to_cpu(hand->o2net_keepalive_delay_ms),
- o2net_keepalive_delay());
+ printk(KERN_NOTICE "o2net: " SC_NODEF_FMT " uses a keepalive "
+ "delay of %u ms, but we use %u ms locally. "
+ "Disconnecting.\n", SC_NODEF_ARGS(sc),
+ be32_to_cpu(hand->o2net_keepalive_delay_ms),
+ o2net_keepalive_delay());
o2net_ensure_shutdown(nn, sc, -ENOTCONN);
return -1;
}
if (be32_to_cpu(hand->o2hb_heartbeat_timeout_ms) !=
O2HB_MAX_WRITE_TIMEOUT_MS) {
- mlog(ML_NOTICE, SC_NODEF_FMT " uses a heartbeat timeout of "
- "%u ms, but we use %u ms locally. disconnecting\n",
- SC_NODEF_ARGS(sc),
- be32_to_cpu(hand->o2hb_heartbeat_timeout_ms),
- O2HB_MAX_WRITE_TIMEOUT_MS);
+ printk(KERN_NOTICE "o2net: " SC_NODEF_FMT " uses a heartbeat "
+ "timeout of %u ms, but we use %u ms locally. "
+ "Disconnecting.\n", SC_NODEF_ARGS(sc),
+ be32_to_cpu(hand->o2hb_heartbeat_timeout_ms),
+ O2HB_MAX_WRITE_TIMEOUT_MS);
o2net_ensure_shutdown(nn, sc, -ENOTCONN);
return -1;
}
@@ -1540,28 +1559,16 @@ static void o2net_idle_timer(unsigned long data)
{
struct o2net_sock_container *sc = (struct o2net_sock_container *)data;
struct o2net_node *nn = o2net_nn_from_num(sc->sc_node->nd_num);
-
#ifdef CONFIG_DEBUG_FS
- ktime_t now = ktime_get();
+ unsigned long msecs = ktime_to_ms(ktime_get()) -
+ ktime_to_ms(sc->sc_tv_timer);
+#else
+ unsigned long msecs = o2net_idle_timeout();
#endif
- printk(KERN_NOTICE "o2net: connection to " SC_NODEF_FMT " has been idle for %u.%u "
- "seconds, shutting it down.\n", SC_NODEF_ARGS(sc),
- o2net_idle_timeout() / 1000,
- o2net_idle_timeout() % 1000);
-
-#ifdef CONFIG_DEBUG_FS
- mlog(ML_NOTICE, "Here are some times that might help debug the "
- "situation: (Timer: %lld, Now %lld, DataReady %lld, Advance %lld-%lld, "
- "Key 0x%08x, Func %u, FuncTime %lld-%lld)\n",
- (long long)ktime_to_us(sc->sc_tv_timer), (long long)ktime_to_us(now),
- (long long)ktime_to_us(sc->sc_tv_data_ready),
- (long long)ktime_to_us(sc->sc_tv_advance_start),
- (long long)ktime_to_us(sc->sc_tv_advance_stop),
- sc->sc_msg_key, sc->sc_msg_type,
- (long long)ktime_to_us(sc->sc_tv_func_start),
- (long long)ktime_to_us(sc->sc_tv_func_stop));
-#endif
+ printk(KERN_NOTICE "o2net: Connection to " SC_NODEF_FMT " has been "
+ "idle for %lu.%lu secs, shutting it down.\n", SC_NODEF_ARGS(sc),
+ msecs / 1000, msecs % 1000);
/*
* Initialize the nn_timeout so that the next connection attempt
@@ -1694,8 +1701,8 @@ static void o2net_start_connect(struct work_struct *work)
out:
if (ret) {
- mlog(ML_NOTICE, "connect attempt to " SC_NODEF_FMT " failed "
- "with errno %d\n", SC_NODEF_ARGS(sc), ret);
+ printk(KERN_NOTICE "o2net: Connect attempt to " SC_NODEF_FMT
+ " failed with errno %d\n", SC_NODEF_ARGS(sc), ret);
/* 0 err so that another will be queued and attempted
* from set_nn_state */
if (sc)
@@ -1718,8 +1725,8 @@ static void o2net_connect_expired(struct work_struct *work)
spin_lock(&nn->nn_lock);
if (!nn->nn_sc_valid) {
- mlog(ML_ERROR, "no connection established with node %u after "
- "%u.%u seconds, giving up and returning errors.\n",
+ printk(KERN_NOTICE "o2net: No connection established with "
+ "node %u after %u.%u seconds, giving up.\n",
o2net_num_from_nn(nn),
o2net_idle_timeout() / 1000,
o2net_idle_timeout() % 1000);
@@ -1862,21 +1869,21 @@ static int o2net_accept_one(struct socket *sock)
node = o2nm_get_node_by_ip(sin.sin_addr.s_addr);
if (node == NULL) {
- mlog(ML_NOTICE, "attempt to connect from unknown node at %pI4:%d\n",
- &sin.sin_addr.s_addr, ntohs(sin.sin_port));
+ printk(KERN_NOTICE "o2net: Attempt to connect from unknown "
+ "node at %pI4:%d\n", &sin.sin_addr.s_addr,
+ ntohs(sin.sin_port));
ret = -EINVAL;
goto out;
}
if (o2nm_this_node() >= node->nd_num) {
local_node = o2nm_get_node_by_num(o2nm_this_node());
- mlog(ML_NOTICE, "unexpected connect attempt seen at node '%s' ("
- "%u, %pI4:%d) from node '%s' (%u, %pI4:%d)\n",
- local_node->nd_name, local_node->nd_num,
- &(local_node->nd_ipv4_address),
- ntohs(local_node->nd_ipv4_port),
- node->nd_name, node->nd_num, &sin.sin_addr.s_addr,
- ntohs(sin.sin_port));
+ printk(KERN_NOTICE "o2net: Unexpected connect attempt seen "
+ "at node '%s' (%u, %pI4:%d) from node '%s' (%u, "
+ "%pI4:%d)\n", local_node->nd_name, local_node->nd_num,
+ &(local_node->nd_ipv4_address),
+ ntohs(local_node->nd_ipv4_port), node->nd_name,
+ node->nd_num, &sin.sin_addr.s_addr, ntohs(sin.sin_port));
ret = -EINVAL;
goto out;
}
@@ -1901,10 +1908,10 @@ static int o2net_accept_one(struct socket *sock)
ret = 0;
spin_unlock(&nn->nn_lock);
if (ret) {
- mlog(ML_NOTICE, "attempt to connect from node '%s' at "
- "%pI4:%d but it already has an open connection\n",
- node->nd_name, &sin.sin_addr.s_addr,
- ntohs(sin.sin_port));
+ printk(KERN_NOTICE "o2net: Attempt to connect from node '%s' "
+ "at %pI4:%d but it already has an open connection\n",
+ node->nd_name, &sin.sin_addr.s_addr,
+ ntohs(sin.sin_port));
goto out;
}
@@ -1984,7 +1991,7 @@ static int o2net_open_listening_sock(__be32 addr, __be16 port)
ret = sock_create(PF_INET, SOCK_STREAM, IPPROTO_TCP, &sock);
if (ret < 0) {
- mlog(ML_ERROR, "unable to create socket, ret=%d\n", ret);
+ printk(KERN_ERR "o2net: Error %d while creating socket\n", ret);
goto out;
}
@@ -2001,16 +2008,15 @@ static int o2net_open_listening_sock(__be32 addr, __be16 port)
sock->sk->sk_reuse = 1;
ret = sock->ops->bind(sock, (struct sockaddr *)&sin, sizeof(sin));
if (ret < 0) {
- mlog(ML_ERROR, "unable to bind socket at %pI4:%u, "
- "ret=%d\n", &addr, ntohs(port), ret);
+ printk(KERN_ERR "o2net: Error %d while binding socket at "
+ "%pI4:%u\n", ret, &addr, ntohs(port));
goto out;
}
ret = sock->ops->listen(sock, 64);
- if (ret < 0) {
- mlog(ML_ERROR, "unable to listen on %pI4:%u, ret=%d\n",
- &addr, ntohs(port), ret);
- }
+ if (ret < 0)
+ printk(KERN_ERR "o2net: Error %d while listening on %pI4:%u\n",
+ ret, &addr, ntohs(port));
out:
if (ret) {
diff --git a/fs/ocfs2/cluster/tcp.h b/fs/ocfs2/cluster/tcp.h
index fd6179eb26d..5bada2a69b5 100644
--- a/fs/ocfs2/cluster/tcp.h
+++ b/fs/ocfs2/cluster/tcp.h
@@ -106,6 +106,8 @@ int o2net_register_handler(u32 msg_type, u32 key, u32 max_len,
struct list_head *unreg_list);
void o2net_unregister_handler_list(struct list_head *list);
+void o2net_fill_node_map(unsigned long *map, unsigned bytes);
+
struct o2nm_node;
int o2net_register_hb_callbacks(void);
void o2net_unregister_hb_callbacks(void);
diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c
index e2878b5895f..8fe4e2892ab 100644
--- a/fs/ocfs2/dir.c
+++ b/fs/ocfs2/dir.c
@@ -1184,8 +1184,7 @@ static int __ocfs2_delete_entry(handle_t *handle, struct inode *dir,
if (pde)
le16_add_cpu(&pde->rec_len,
le16_to_cpu(de->rec_len));
- else
- de->inode = 0;
+ de->inode = 0;
dir->i_version++;
ocfs2_journal_dirty(handle, bh);
goto bail;
diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h
index d602abb51b6..a5952ceecba 100644
--- a/fs/ocfs2/dlm/dlmcommon.h
+++ b/fs/ocfs2/dlm/dlmcommon.h
@@ -859,8 +859,8 @@ void dlm_complete_recovery_thread(struct dlm_ctxt *dlm);
void dlm_wait_for_recovery(struct dlm_ctxt *dlm);
void dlm_kick_recovery_thread(struct dlm_ctxt *dlm);
int dlm_is_node_dead(struct dlm_ctxt *dlm, u8 node);
-int dlm_wait_for_node_death(struct dlm_ctxt *dlm, u8 node, int timeout);
-int dlm_wait_for_node_recovery(struct dlm_ctxt *dlm, u8 node, int timeout);
+void dlm_wait_for_node_death(struct dlm_ctxt *dlm, u8 node, int timeout);
+void dlm_wait_for_node_recovery(struct dlm_ctxt *dlm, u8 node, int timeout);
void dlm_put(struct dlm_ctxt *dlm);
struct dlm_ctxt *dlm_grab(struct dlm_ctxt *dlm);
@@ -877,9 +877,8 @@ static inline void dlm_lockres_get(struct dlm_lock_resource *res)
kref_get(&res->refs);
}
void dlm_lockres_put(struct dlm_lock_resource *res);
-void __dlm_unhash_lockres(struct dlm_lock_resource *res);
-void __dlm_insert_lockres(struct dlm_ctxt *dlm,
- struct dlm_lock_resource *res);
+void __dlm_unhash_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res);
+void __dlm_insert_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res);
struct dlm_lock_resource * __dlm_lookup_lockres_full(struct dlm_ctxt *dlm,
const char *name,
unsigned int len,
@@ -902,46 +901,15 @@ struct dlm_lock_resource *dlm_new_lockres(struct dlm_ctxt *dlm,
const char *name,
unsigned int namelen);
-#define dlm_lockres_set_refmap_bit(bit,res) \
- __dlm_lockres_set_refmap_bit(bit,res,__FILE__,__LINE__)
-#define dlm_lockres_clear_refmap_bit(bit,res) \
- __dlm_lockres_clear_refmap_bit(bit,res,__FILE__,__LINE__)
+void dlm_lockres_set_refmap_bit(struct dlm_ctxt *dlm,
+ struct dlm_lock_resource *res, int bit);
+void dlm_lockres_clear_refmap_bit(struct dlm_ctxt *dlm,
+ struct dlm_lock_resource *res, int bit);
-static inline void __dlm_lockres_set_refmap_bit(int bit,
- struct dlm_lock_resource *res,
- const char *file,
- int line)
-{
- //printk("%s:%d:%.*s: setting bit %d\n", file, line,
- // res->lockname.len, res->lockname.name, bit);
- set_bit(bit, res->refmap);
-}
-
-static inline void __dlm_lockres_clear_refmap_bit(int bit,
- struct dlm_lock_resource *res,
- const char *file,
- int line)
-{
- //printk("%s:%d:%.*s: clearing bit %d\n", file, line,
- // res->lockname.len, res->lockname.name, bit);
- clear_bit(bit, res->refmap);
-}
-
-void __dlm_lockres_drop_inflight_ref(struct dlm_ctxt *dlm,
- struct dlm_lock_resource *res,
- const char *file,
- int line);
-void __dlm_lockres_grab_inflight_ref(struct dlm_ctxt *dlm,
- struct dlm_lock_resource *res,
- int new_lockres,
- const char *file,
- int line);
-#define dlm_lockres_drop_inflight_ref(d,r) \
- __dlm_lockres_drop_inflight_ref(d,r,__FILE__,__LINE__)
-#define dlm_lockres_grab_inflight_ref(d,r) \
- __dlm_lockres_grab_inflight_ref(d,r,0,__FILE__,__LINE__)
-#define dlm_lockres_grab_inflight_ref_new(d,r) \
- __dlm_lockres_grab_inflight_ref(d,r,1,__FILE__,__LINE__)
+void dlm_lockres_drop_inflight_ref(struct dlm_ctxt *dlm,
+ struct dlm_lock_resource *res);
+void dlm_lockres_grab_inflight_ref(struct dlm_ctxt *dlm,
+ struct dlm_lock_resource *res);
void dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock);
void dlm_queue_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock);
diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c
index 6ed6b95dcf9..92f2ead0fab 100644
--- a/fs/ocfs2/dlm/dlmdomain.c
+++ b/fs/ocfs2/dlm/dlmdomain.c
@@ -157,16 +157,18 @@ static int dlm_protocol_compare(struct dlm_protocol_version *existing,
static void dlm_unregister_domain_handlers(struct dlm_ctxt *dlm);
-void __dlm_unhash_lockres(struct dlm_lock_resource *lockres)
+void __dlm_unhash_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res)
{
- if (!hlist_unhashed(&lockres->hash_node)) {
- hlist_del_init(&lockres->hash_node);
- dlm_lockres_put(lockres);
- }
+ if (hlist_unhashed(&res->hash_node))
+ return;
+
+ mlog(0, "%s: Unhash res %.*s\n", dlm->name, res->lockname.len,
+ res->lockname.name);
+ hlist_del_init(&res->hash_node);
+ dlm_lockres_put(res);
}
-void __dlm_insert_lockres(struct dlm_ctxt *dlm,
- struct dlm_lock_resource *res)
+void __dlm_insert_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res)
{
struct hlist_head *bucket;
struct qstr *q;
@@ -180,6 +182,9 @@ void __dlm_insert_lockres(struct dlm_ctxt *dlm,
dlm_lockres_get(res);
hlist_add_head(&res->hash_node, bucket);
+
+ mlog(0, "%s: Hash res %.*s\n", dlm->name, res->lockname.len,
+ res->lockname.name);
}
struct dlm_lock_resource * __dlm_lookup_lockres_full(struct dlm_ctxt *dlm,
@@ -539,17 +544,17 @@ again:
static void __dlm_print_nodes(struct dlm_ctxt *dlm)
{
- int node = -1;
+ int node = -1, num = 0;
assert_spin_locked(&dlm->spinlock);
- printk(KERN_NOTICE "o2dlm: Nodes in domain %s: ", dlm->name);
-
+ printk("( ");
while ((node = find_next_bit(dlm->domain_map, O2NM_MAX_NODES,
node + 1)) < O2NM_MAX_NODES) {
printk("%d ", node);
+ ++num;
}
- printk("\n");
+ printk(") %u nodes\n", num);
}
static int dlm_exit_domain_handler(struct o2net_msg *msg, u32 len, void *data,
@@ -566,11 +571,10 @@ static int dlm_exit_domain_handler(struct o2net_msg *msg, u32 len, void *data,
node = exit_msg->node_idx;
- printk(KERN_NOTICE "o2dlm: Node %u leaves domain %s\n", node, dlm->name);
-
spin_lock(&dlm->spinlock);
clear_bit(node, dlm->domain_map);
clear_bit(node, dlm->exit_domain_map);
+ printk(KERN_NOTICE "o2dlm: Node %u leaves domain %s ", node, dlm->name);
__dlm_print_nodes(dlm);
/* notify anything attached to the heartbeat events */
@@ -755,6 +759,7 @@ void dlm_unregister_domain(struct dlm_ctxt *dlm)
dlm_mark_domain_leaving(dlm);
dlm_leave_domain(dlm);
+ printk(KERN_NOTICE "o2dlm: Leaving domain %s\n", dlm->name);
dlm_force_free_mles(dlm);
dlm_complete_dlm_shutdown(dlm);
}
@@ -970,7 +975,7 @@ static int dlm_assert_joined_handler(struct o2net_msg *msg, u32 len, void *data,
clear_bit(assert->node_idx, dlm->exit_domain_map);
__dlm_set_joining_node(dlm, DLM_LOCK_RES_OWNER_UNKNOWN);
- printk(KERN_NOTICE "o2dlm: Node %u joins domain %s\n",
+ printk(KERN_NOTICE "o2dlm: Node %u joins domain %s ",
assert->node_idx, dlm->name);
__dlm_print_nodes(dlm);
@@ -1701,8 +1706,10 @@ static int dlm_try_to_join_domain(struct dlm_ctxt *dlm)
bail:
spin_lock(&dlm->spinlock);
__dlm_set_joining_node(dlm, DLM_LOCK_RES_OWNER_UNKNOWN);
- if (!status)
+ if (!status) {
+ printk(KERN_NOTICE "o2dlm: Joining domain %s ", dlm->name);
__dlm_print_nodes(dlm);
+ }
spin_unlock(&dlm->spinlock);
if (ctxt) {
@@ -2131,13 +2138,6 @@ struct dlm_ctxt * dlm_register_domain(const char *domain,
goto leave;
}
- if (!o2hb_check_local_node_heartbeating()) {
- mlog(ML_ERROR, "the local node has not been configured, or is "
- "not heartbeating\n");
- ret = -EPROTO;
- goto leave;
- }
-
mlog(0, "register called for domain \"%s\"\n", domain);
retry:
diff --git a/fs/ocfs2/dlm/dlmlock.c b/fs/ocfs2/dlm/dlmlock.c
index 8d39e0fd66f..975810b9849 100644
--- a/fs/ocfs2/dlm/dlmlock.c
+++ b/fs/ocfs2/dlm/dlmlock.c
@@ -183,10 +183,6 @@ static enum dlm_status dlmlock_master(struct dlm_ctxt *dlm,
kick_thread = 1;
}
}
- /* reduce the inflight count, this may result in the lockres
- * being purged below during calc_usage */
- if (lock->ml.node == dlm->node_num)
- dlm_lockres_drop_inflight_ref(dlm, res);
spin_unlock(&res->spinlock);
wake_up(&res->wq);
@@ -231,10 +227,16 @@ static enum dlm_status dlmlock_remote(struct dlm_ctxt *dlm,
lock->ml.type, res->lockname.len,
res->lockname.name, flags);
+ /*
+ * Wait if resource is getting recovered, remastered, etc.
+ * If the resource was remastered and new owner is self, then exit.
+ */
spin_lock(&res->spinlock);
-
- /* will exit this call with spinlock held */
__dlm_wait_on_lockres(res);
+ if (res->owner == dlm->node_num) {
+ spin_unlock(&res->spinlock);
+ return DLM_RECOVERING;
+ }
res->state |= DLM_LOCK_RES_IN_PROGRESS;
/* add lock to local (secondary) queue */
@@ -319,27 +321,23 @@ static enum dlm_status dlm_send_remote_lock_request(struct dlm_ctxt *dlm,
tmpret = o2net_send_message(DLM_CREATE_LOCK_MSG, dlm->key, &create,
sizeof(create), res->owner, &status);
if (tmpret >= 0) {
- // successfully sent and received
- ret = status; // this is already a dlm_status
+ ret = status;
if (ret == DLM_REJECTED) {
- mlog(ML_ERROR, "%s:%.*s: BUG. this is a stale lockres "
- "no longer owned by %u. that node is coming back "
- "up currently.\n", dlm->name, create.namelen,
+ mlog(ML_ERROR, "%s: res %.*s, Stale lockres no longer "
+ "owned by node %u. That node is coming back up "
+ "currently.\n", dlm->name, create.namelen,
create.name, res->owner);
dlm_print_one_lock_resource(res);
BUG();
}
} else {
- mlog(ML_ERROR, "Error %d when sending message %u (key 0x%x) to "
- "node %u\n", tmpret, DLM_CREATE_LOCK_MSG, dlm->key,
- res->owner);
- if (dlm_is_host_down(tmpret)) {
+ mlog(ML_ERROR, "%s: res %.*s, Error %d send CREATE LOCK to "
+ "node %u\n", dlm->name, create.namelen, create.name,
+ tmpret, res->owner);
+ if (dlm_is_host_down(tmpret))
ret = DLM_RECOVERING;
- mlog(0, "node %u died so returning DLM_RECOVERING "
- "from lock message!\n", res->owner);
- } else {
+ else
ret = dlm_err_to_dlm_status(tmpret);
- }
}
return ret;
@@ -440,7 +438,7 @@ struct dlm_lock * dlm_new_lock(int type, u8 node, u64 cookie,
/* zero memory only if kernel-allocated */
lksb = kzalloc(sizeof(*lksb), GFP_NOFS);
if (!lksb) {
- kfree(lock);
+ kmem_cache_free(dlm_lock_cache, lock);
return NULL;
}
kernel_allocated = 1;
@@ -718,18 +716,10 @@ retry_lock:
if (status == DLM_RECOVERING || status == DLM_MIGRATING ||
status == DLM_FORWARD) {
- mlog(0, "retrying lock with migration/"
- "recovery/in progress\n");
msleep(100);
- /* no waiting for dlm_reco_thread */
if (recovery) {
if (status != DLM_RECOVERING)
goto retry_lock;
-
- mlog(0, "%s: got RECOVERING "
- "for $RECOVERY lock, master "
- "was %u\n", dlm->name,
- res->owner);
/* wait to see the node go down, then
* drop down and allow the lockres to
* get cleaned up. need to remaster. */
@@ -741,6 +731,14 @@ retry_lock:
}
}
+ /* Inflight taken in dlm_get_lock_resource() is dropped here */
+ spin_lock(&res->spinlock);
+ dlm_lockres_drop_inflight_ref(dlm, res);
+ spin_unlock(&res->spinlock);
+
+ dlm_lockres_calc_usage(dlm, res);
+ dlm_kick_thread(dlm, res);
+
if (status != DLM_NORMAL) {
lock->lksb->flags &= ~DLM_LKSB_GET_LVB;
if (status != DLM_NOTQUEUED)
diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index 11eefb8c12e..005261c333b 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -631,39 +631,54 @@ error:
return NULL;
}
-void __dlm_lockres_grab_inflight_ref(struct dlm_ctxt *dlm,
- struct dlm_lock_resource *res,
- int new_lockres,
- const char *file,
- int line)
+void dlm_lockres_set_refmap_bit(struct dlm_ctxt *dlm,
+ struct dlm_lock_resource *res, int bit)
{
- if (!new_lockres)
- assert_spin_locked(&res->spinlock);
+ assert_spin_locked(&res->spinlock);
+
+ mlog(0, "res %.*s, set node %u, %ps()\n", res->lockname.len,
+ res->lockname.name, bit, __builtin_return_address(0));
+
+ set_bit(bit, res->refmap);
+}
+
+void dlm_lockres_clear_refmap_bit(struct dlm_ctxt *dlm,
+ struct dlm_lock_resource *res, int bit)
+{
+ assert_spin_locked(&res->spinlock);
+
+ mlog(0, "res %.*s, clr node %u, %ps()\n", res->lockname.len,
+ res->lockname.name, bit, __builtin_return_address(0));
+
+ clear_bit(bit, res->refmap);
+}
+
+
+void dlm_lockres_grab_inflight_ref(struct dlm_ctxt *dlm,
+ struct dlm_lock_resource *res)
+{
+ assert_spin_locked(&res->spinlock);
- if (!test_bit(dlm->node_num, res->refmap)) {
- BUG_ON(res->inflight_locks != 0);
- dlm_lockres_set_refmap_bit(dlm->node_num, res);
- }
res->inflight_locks++;
- mlog(0, "%s:%.*s: inflight++: now %u\n",
- dlm->name, res->lockname.len, res->lockname.name,
- res->inflight_locks);
+
+ mlog(0, "%s: res %.*s, inflight++: now %u, %ps()\n", dlm->name,
+ res->lockname.len, res->lockname.name, res->inflight_locks,
+ __builtin_return_address(0));
}
-void __dlm_lockres_drop_inflight_ref(struct dlm_ctxt *dlm,
- struct dlm_lock_resource *res,
- const char *file,
- int line)
+void dlm_lockres_drop_inflight_ref(struct dlm_ctxt *dlm,
+ struct dlm_lock_resource *res)
{
assert_spin_locked(&res->spinlock);
BUG_ON(res->inflight_locks == 0);
+
res->inflight_locks--;
- mlog(0, "%s:%.*s: inflight--: now %u\n",
- dlm->name, res->lockname.len, res->lockname.name,
- res->inflight_locks);
- if (res->inflight_locks == 0)
- dlm_lockres_clear_refmap_bit(dlm->node_num, res);
+
+ mlog(0, "%s: res %.*s, inflight--: now %u, %ps()\n", dlm->name,
+ res->lockname.len, res->lockname.name, res->inflight_locks,
+ __builtin_return_address(0));
+
wake_up(&res->wq);
}
@@ -697,7 +712,6 @@ struct dlm_lock_resource * dlm_get_lock_resource(struct dlm_ctxt *dlm,
unsigned int hash;
int tries = 0;
int bit, wait_on_recovery = 0;
- int drop_inflight_if_nonlocal = 0;
BUG_ON(!lockid);
@@ -709,36 +723,33 @@ lookup:
spin_lock(&dlm->spinlock);
tmpres = __dlm_lookup_lockres_full(dlm, lockid, namelen, hash);
if (tmpres) {
- int dropping_ref = 0;
-
spin_unlock(&dlm->spinlock);
-
spin_lock(&tmpres->spinlock);
- /* We wait for the other thread that is mastering the resource */
+ /* Wait on the thread that is mastering the resource */
if (tmpres->owner == DLM_LOCK_RES_OWNER_UNKNOWN) {
__dlm_wait_on_lockres(tmpres);
BUG_ON(tmpres->owner == DLM_LOCK_RES_OWNER_UNKNOWN);
+ spin_unlock(&tmpres->spinlock);
+ dlm_lockres_put(tmpres);
+ tmpres = NULL;
+ goto lookup;
}
- if (tmpres->owner == dlm->node_num) {
- BUG_ON(tmpres->state & DLM_LOCK_RES_DROPPING_REF);
- dlm_lockres_grab_inflight_ref(dlm, tmpres);
- } else if (tmpres->state & DLM_LOCK_RES_DROPPING_REF)
- dropping_ref = 1;
- spin_unlock(&tmpres->spinlock);
-
- /* wait until done messaging the master, drop our ref to allow
- * the lockres to be purged, start over. */
- if (dropping_ref) {
- spin_lock(&tmpres->spinlock);
- __dlm_wait_on_lockres_flags(tmpres, DLM_LOCK_RES_DROPPING_REF);
+ /* Wait on the resource purge to complete before continuing */
+ if (tmpres->state & DLM_LOCK_RES_DROPPING_REF) {
+ BUG_ON(tmpres->owner == dlm->node_num);
+ __dlm_wait_on_lockres_flags(tmpres,
+ DLM_LOCK_RES_DROPPING_REF);
spin_unlock(&tmpres->spinlock);
dlm_lockres_put(tmpres);
tmpres = NULL;
goto lookup;
}
- mlog(0, "found in hash!\n");
+ /* Grab inflight ref to pin the resource */
+ dlm_lockres_grab_inflight_ref(dlm, tmpres);
+
+ spin_unlock(&tmpres->spinlock);
if (res)
dlm_lockres_put(res);
res = tmpres;
@@ -829,8 +840,8 @@ lookup:
* but they might own this lockres. wait on them. */
bit = find_next_bit(dlm->recovery_map, O2NM_MAX_NODES, 0);
if (bit < O2NM_MAX_NODES) {
- mlog(ML_NOTICE, "%s:%.*s: at least one node (%d) to "
- "recover before lock mastery can begin\n",
+ mlog(0, "%s: res %.*s, At least one node (%d) "
+ "to recover before lock mastery can begin\n",
dlm->name, namelen, (char *)lockid, bit);
wait_on_recovery = 1;
}
@@ -843,12 +854,11 @@ lookup:
/* finally add the lockres to its hash bucket */
__dlm_insert_lockres(dlm, res);
- /* since this lockres is new it doesn't not require the spinlock */
- dlm_lockres_grab_inflight_ref_new(dlm, res);
- /* if this node does not become the master make sure to drop
- * this inflight reference below */
- drop_inflight_if_nonlocal = 1;
+ /* Grab inflight ref to pin the resource */
+ spin_lock(&res->spinlock);
+ dlm_lockres_grab_inflight_ref(dlm, res);
+ spin_unlock(&res->spinlock);
/* get an extra ref on the mle in case this is a BLOCK
* if so, the creator of the BLOCK may try to put the last
@@ -864,8 +874,8 @@ redo_request:
* dlm spinlock would be detectable be a change on the mle,
* so we only need to clear out the recovery map once. */
if (dlm_is_recovery_lock(lockid, namelen)) {
- mlog(ML_NOTICE, "%s: recovery map is not empty, but "
- "must master $RECOVERY lock now\n", dlm->name);
+ mlog(0, "%s: Recovery map is not empty, but must "
+ "master $RECOVERY lock now\n", dlm->name);
if (!dlm_pre_master_reco_lockres(dlm, res))
wait_on_recovery = 0;
else {
@@ -883,8 +893,8 @@ redo_request:
spin_lock(&dlm->spinlock);
bit = find_next_bit(dlm->recovery_map, O2NM_MAX_NODES, 0);
if (bit < O2NM_MAX_NODES) {
- mlog(ML_NOTICE, "%s:%.*s: at least one node (%d) to "
- "recover before lock mastery can begin\n",
+ mlog(0, "%s: res %.*s, At least one node (%d) "
+ "to recover before lock mastery can begin\n",
dlm->name, namelen, (char *)lockid, bit);
wait_on_recovery = 1;
} else
@@ -913,8 +923,8 @@ redo_request:
* yet, keep going until it does. this is how the
* master will know that asserts are needed back to
* the lower nodes. */
- mlog(0, "%s:%.*s: requests only up to %u but master "
- "is %u, keep going\n", dlm->name, namelen,
+ mlog(0, "%s: res %.*s, Requests only up to %u but "
+ "master is %u, keep going\n", dlm->name, namelen,
lockid, nodenum, mle->master);
}
}
@@ -924,13 +934,12 @@ wait:
ret = dlm_wait_for_lock_mastery(dlm, res, mle, &blocked);
if (ret < 0) {
wait_on_recovery = 1;
- mlog(0, "%s:%.*s: node map changed, redo the "
- "master request now, blocked=%d\n",
- dlm->name, res->lockname.len,
+ mlog(0, "%s: res %.*s, Node map changed, redo the master "
+ "request now, blocked=%d\n", dlm->name, res->lockname.len,
res->lockname.name, blocked);
if (++tries > 20) {
- mlog(ML_ERROR, "%s:%.*s: spinning on "
- "dlm_wait_for_lock_mastery, blocked=%d\n",
+ mlog(ML_ERROR, "%s: res %.*s, Spinning on "
+ "dlm_wait_for_lock_mastery, blocked = %d\n",
dlm->name, res->lockname.len,
res->lockname.name, blocked);
dlm_print_one_lock_resource(res);
@@ -940,7 +949,8 @@ wait:
goto redo_request;
}
- mlog(0, "lockres mastered by %u\n", res->owner);
+ mlog(0, "%s: res %.*s, Mastered by %u\n", dlm->name, res->lockname.len,
+ res->lockname.name, res->owner);
/* make sure we never continue without this */
BUG_ON(res->owner == O2NM_MAX_NODES);
@@ -952,8 +962,6 @@ wait:
wake_waiters:
spin_lock(&res->spinlock);
- if (res->owner != dlm->node_num && drop_inflight_if_nonlocal)
- dlm_lockres_drop_inflight_ref(dlm, res);
res->state &= ~DLM_LOCK_RES_IN_PROGRESS;
spin_unlock(&res->spinlock);
wake_up(&res->wq);
@@ -1426,9 +1434,7 @@ way_up_top:
}
if (res->owner == dlm->node_num) {
- mlog(0, "%s:%.*s: setting bit %u in refmap\n",
- dlm->name, namelen, name, request->node_idx);
- dlm_lockres_set_refmap_bit(request->node_idx, res);
+ dlm_lockres_set_refmap_bit(dlm, res, request->node_idx);
spin_unlock(&res->spinlock);
response = DLM_MASTER_RESP_YES;
if (mle)
@@ -1493,10 +1499,8 @@ way_up_top:
* go back and clean the mles on any
* other nodes */
dispatch_assert = 1;
- dlm_lockres_set_refmap_bit(request->node_idx, res);
- mlog(0, "%s:%.*s: setting bit %u in refmap\n",
- dlm->name, namelen, name,
- request->node_idx);
+ dlm_lockres_set_refmap_bit(dlm, res,
+ request->node_idx);
} else
response = DLM_MASTER_RESP_NO;
} else {
@@ -1702,7 +1706,7 @@ again:
"lockres, set the bit in the refmap\n",
namelen, lockname, to);
spin_lock(&res->spinlock);
- dlm_lockres_set_refmap_bit(to, res);
+ dlm_lockres_set_refmap_bit(dlm, res, to);
spin_unlock(&res->spinlock);
}
}
@@ -2187,8 +2191,6 @@ int dlm_drop_lockres_ref(struct dlm_ctxt *dlm, struct dlm_lock_resource *res)
namelen = res->lockname.len;
BUG_ON(namelen > O2NM_MAX_NAME_LEN);
- mlog(0, "%s:%.*s: sending deref to %d\n",
- dlm->name, namelen, lockname, res->owner);
memset(&deref, 0, sizeof(deref));
deref.node_idx = dlm->node_num;
deref.namelen = namelen;
@@ -2197,14 +2199,12 @@ int dlm_drop_lockres_ref(struct dlm_ctxt *dlm, struct dlm_lock_resource *res)
ret = o2net_send_message(DLM_DEREF_LOCKRES_MSG, dlm->key,
&deref, sizeof(deref), res->owner, &r);
if (ret < 0)
- mlog(ML_ERROR, "Error %d when sending message %u (key 0x%x) to "
- "node %u\n", ret, DLM_DEREF_LOCKRES_MSG, dlm->key,
- res->owner);
+ mlog(ML_ERROR, "%s: res %.*s, error %d send DEREF to node %u\n",
+ dlm->name, namelen, lockname, ret, res->owner);
else if (r < 0) {
/* BAD. other node says I did not have a ref. */
- mlog(ML_ERROR,"while dropping ref on %s:%.*s "
- "(master=%u) got %d.\n", dlm->name, namelen,
- lockname, res->owner, r);
+ mlog(ML_ERROR, "%s: res %.*s, DEREF to node %u got %d\n",
+ dlm->name, namelen, lockname, res->owner, r);
dlm_print_one_lock_resource(res);
BUG();
}
@@ -2260,7 +2260,7 @@ int dlm_deref_lockres_handler(struct o2net_msg *msg, u32 len, void *data,
else {
BUG_ON(res->state & DLM_LOCK_RES_DROPPING_REF);
if (test_bit(node, res->refmap)) {
- dlm_lockres_clear_refmap_bit(node, res);
+ dlm_lockres_clear_refmap_bit(dlm, res, node);
cleared = 1;
}
}
@@ -2320,7 +2320,7 @@ static void dlm_deref_lockres_worker(struct dlm_work_item *item, void *data)
BUG_ON(res->state & DLM_LOCK_RES_DROPPING_REF);
if (test_bit(node, res->refmap)) {
__dlm_wait_on_lockres_flags(res, DLM_LOCK_RES_SETREF_INPROG);
- dlm_lockres_clear_refmap_bit(node, res);
+ dlm_lockres_clear_refmap_bit(dlm, res, node);
cleared = 1;
}
spin_unlock(&res->spinlock);
@@ -2802,7 +2802,8 @@ static void dlm_remove_nonlocal_locks(struct dlm_ctxt *dlm,
BUG_ON(!list_empty(&lock->bast_list));
BUG_ON(lock->ast_pending);
BUG_ON(lock->bast_pending);
- dlm_lockres_clear_refmap_bit(lock->ml.node, res);
+ dlm_lockres_clear_refmap_bit(dlm, res,
+ lock->ml.node);
list_del_init(&lock->list);
dlm_lock_put(lock);
/* In a normal unlock, we would have added a
@@ -2823,7 +2824,7 @@ static void dlm_remove_nonlocal_locks(struct dlm_ctxt *dlm,
mlog(0, "%s:%.*s: node %u had a ref to this "
"migrating lockres, clearing\n", dlm->name,
res->lockname.len, res->lockname.name, bit);
- dlm_lockres_clear_refmap_bit(bit, res);
+ dlm_lockres_clear_refmap_bit(dlm, res, bit);
}
bit++;
}
@@ -2916,9 +2917,9 @@ static int dlm_do_migrate_request(struct dlm_ctxt *dlm,
&migrate, sizeof(migrate), nodenum,
&status);
if (ret < 0) {
- mlog(ML_ERROR, "Error %d when sending message %u (key "
- "0x%x) to node %u\n", ret, DLM_MIGRATE_REQUEST_MSG,
- dlm->key, nodenum);
+ mlog(ML_ERROR, "%s: res %.*s, Error %d send "
+ "MIGRATE_REQUEST to node %u\n", dlm->name,
+ migrate.namelen, migrate.name, ret, nodenum);
if (!dlm_is_host_down(ret)) {
mlog(ML_ERROR, "unhandled error=%d!\n", ret);
BUG();
@@ -2937,7 +2938,7 @@ static int dlm_do_migrate_request(struct dlm_ctxt *dlm,
dlm->name, res->lockname.len, res->lockname.name,
nodenum);
spin_lock(&res->spinlock);
- dlm_lockres_set_refmap_bit(nodenum, res);
+ dlm_lockres_set_refmap_bit(dlm, res, nodenum);
spin_unlock(&res->spinlock);
}
}
@@ -3271,7 +3272,7 @@ int dlm_finish_migration(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
* mastery reference here since old_master will briefly have
* a reference after the migration completes */
spin_lock(&res->spinlock);
- dlm_lockres_set_refmap_bit(old_master, res);
+ dlm_lockres_set_refmap_bit(dlm, res, old_master);
spin_unlock(&res->spinlock);
mlog(0, "now time to do a migrate request to other nodes\n");
diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
index 7efab6d28a2..01ebfd0bdad 100644
--- a/fs/ocfs2/dlm/dlmrecovery.c
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -362,40 +362,38 @@ static int dlm_is_node_recovered(struct dlm_ctxt *dlm, u8 node)
}
-int dlm_wait_for_node_death(struct dlm_ctxt *dlm, u8 node, int timeout)
+void dlm_wait_for_node_death(struct dlm_ctxt *dlm, u8 node, int timeout)
{
- if (timeout) {
- mlog(ML_NOTICE, "%s: waiting %dms for notification of "
- "death of node %u\n", dlm->name, timeout, node);
+ if (dlm_is_node_dead(dlm, node))
+ return;
+
+ printk(KERN_NOTICE "o2dlm: Waiting on the death of node %u in "
+ "domain %s\n", node, dlm->name);
+
+ if (timeout)
wait_event_timeout(dlm->dlm_reco_thread_wq,
- dlm_is_node_dead(dlm, node),
- msecs_to_jiffies(timeout));
- } else {
- mlog(ML_NOTICE, "%s: waiting indefinitely for notification "
- "of death of node %u\n", dlm->name, node);
+ dlm_is_node_dead(dlm, node),
+ msecs_to_jiffies(timeout));
+ else
wait_event(dlm->dlm_reco_thread_wq,
dlm_is_node_dead(dlm, node));
- }
- /* for now, return 0 */
- return 0;
}
-int dlm_wait_for_node_recovery(struct dlm_ctxt *dlm, u8 node, int timeout)
+void dlm_wait_for_node_recovery(struct dlm_ctxt *dlm, u8 node, int timeout)
{
- if (timeout) {
- mlog(0, "%s: waiting %dms for notification of "
- "recovery of node %u\n", dlm->name, timeout, node);
+ if (dlm_is_node_recovered(dlm, node))
+ return;
+
+ printk(KERN_NOTICE "o2dlm: Waiting on the recovery of node %u in "
+ "domain %s\n", node, dlm->name);
+
+ if (timeout)
wait_event_timeout(dlm->dlm_reco_thread_wq,
- dlm_is_node_recovered(dlm, node),
- msecs_to_jiffies(timeout));
- } else {
- mlog(0, "%s: waiting indefinitely for notification "
- "of recovery of node %u\n", dlm->name, node);
+ dlm_is_node_recovered(dlm, node),
+ msecs_to_jiffies(timeout));
+ else
wait_event(dlm->dlm_reco_thread_wq,
dlm_is_node_recovered(dlm, node));
- }
- /* for now, return 0 */
- return 0;
}
/* callers of the top-level api calls (dlmlock/dlmunlock) should
@@ -430,6 +428,8 @@ static void dlm_begin_recovery(struct dlm_ctxt *dlm)
{
spin_lock(&dlm->spinlock);
BUG_ON(dlm->reco.state & DLM_RECO_STATE_ACTIVE);
+ printk(KERN_NOTICE "o2dlm: Begin recovery on domain %s for node %u\n",
+ dlm->name, dlm->reco.dead_node);
dlm->reco.state |= DLM_RECO_STATE_ACTIVE;
spin_unlock(&dlm->spinlock);
}
@@ -440,9 +440,18 @@ static void dlm_end_recovery(struct dlm_ctxt *dlm)
BUG_ON(!(dlm->reco.state & DLM_RECO_STATE_ACTIVE));
dlm->reco.state &= ~DLM_RECO_STATE_ACTIVE;
spin_unlock(&dlm->spinlock);
+ printk(KERN_NOTICE "o2dlm: End recovery on domain %s\n", dlm->name);
wake_up(&dlm->reco.event);
}
+static void dlm_print_recovery_master(struct dlm_ctxt *dlm)
+{
+ printk(KERN_NOTICE "o2dlm: Node %u (%s) is the Recovery Master for the "
+ "dead node %u in domain %s\n", dlm->reco.new_master,
+ (dlm->node_num == dlm->reco.new_master ? "me" : "he"),
+ dlm->reco.dead_node, dlm->name);
+}
+
static int dlm_do_recovery(struct dlm_ctxt *dlm)
{
int status = 0;
@@ -505,9 +514,8 @@ static int dlm_do_recovery(struct dlm_ctxt *dlm)
}
mlog(0, "another node will master this recovery session.\n");
}
- mlog(0, "dlm=%s (%d), new_master=%u, this node=%u, dead_node=%u\n",
- dlm->name, task_pid_nr(dlm->dlm_reco_thread_task), dlm->reco.new_master,
- dlm->node_num, dlm->reco.dead_node);
+
+ dlm_print_recovery_master(dlm);
/* it is safe to start everything back up here
* because all of the dead node's lock resources
@@ -518,15 +526,13 @@ static int dlm_do_recovery(struct dlm_ctxt *dlm)
return 0;
master_here:
- mlog(ML_NOTICE, "(%d) Node %u is the Recovery Master for the Dead Node "
- "%u for Domain %s\n", task_pid_nr(dlm->dlm_reco_thread_task),
- dlm->node_num, dlm->reco.dead_node, dlm->name);
+ dlm_print_recovery_master(dlm);
status = dlm_remaster_locks(dlm, dlm->reco.dead_node);
if (status < 0) {
/* we should never hit this anymore */
- mlog(ML_ERROR, "error %d remastering locks for node %u, "
- "retrying.\n", status, dlm->reco.dead_node);
+ mlog(ML_ERROR, "%s: Error %d remastering locks for node %u, "
+ "retrying.\n", dlm->name, status, dlm->reco.dead_node);
/* yield a bit to allow any final network messages
* to get handled on remaining nodes */
msleep(100);
@@ -567,7 +573,7 @@ static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node)
BUG_ON(ndata->state != DLM_RECO_NODE_DATA_INIT);
ndata->state = DLM_RECO_NODE_DATA_REQUESTING;
- mlog(0, "requesting lock info from node %u\n",
+ mlog(0, "%s: Requesting lock info from node %u\n", dlm->name,
ndata->node_num);
if (ndata->node_num == dlm->node_num) {
@@ -640,7 +646,7 @@ static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node)
spin_unlock(&dlm_reco_state_lock);
}
- mlog(0, "done requesting all lock info\n");
+ mlog(0, "%s: Done requesting all lock info\n", dlm->name);
/* nodes should be sending reco data now
* just need to wait */
@@ -802,10 +808,9 @@ static int dlm_request_all_locks(struct dlm_ctxt *dlm, u8 request_from,
/* negative status is handled by caller */
if (ret < 0)
- mlog(ML_ERROR, "Error %d when sending message %u (key "
- "0x%x) to node %u\n", ret, DLM_LOCK_REQUEST_MSG,
- dlm->key, request_from);
-
+ mlog(ML_ERROR, "%s: Error %d send LOCK_REQUEST to node %u "
+ "to recover dead node %u\n", dlm->name, ret,
+ request_from, dead_node);
// return from here, then
// sleep until all received or error
return ret;
@@ -956,9 +961,9 @@ static int dlm_send_all_done_msg(struct dlm_ctxt *dlm, u8 dead_node, u8 send_to)
ret = o2net_send_message(DLM_RECO_DATA_DONE_MSG, dlm->key, &done_msg,
sizeof(done_msg), send_to, &tmpret);
if (ret < 0) {
- mlog(ML_ERROR, "Error %d when sending message %u (key "
- "0x%x) to node %u\n", ret, DLM_RECO_DATA_DONE_MSG,
- dlm->key, send_to);
+ mlog(ML_ERROR, "%s: Error %d send RECO_DATA_DONE to node %u "
+ "to recover dead node %u\n", dlm->name, ret, send_to,
+ dead_node);
if (!dlm_is_host_down(ret)) {
BUG();
}
@@ -1127,9 +1132,11 @@ static int dlm_send_mig_lockres_msg(struct dlm_ctxt *dlm,
if (ret < 0) {
/* XXX: negative status is not handled.
* this will end up killing this node. */
- mlog(ML_ERROR, "Error %d when sending message %u (key "
- "0x%x) to node %u\n", ret, DLM_MIG_LOCKRES_MSG,
- dlm->key, send_to);
+ mlog(ML_ERROR, "%s: res %.*s, Error %d send MIG_LOCKRES to "
+ "node %u (%s)\n", dlm->name, mres->lockname_len,
+ mres->lockname, ret, send_to,
+ (orig_flags & DLM_MRES_MIGRATION ?
+ "migration" : "recovery"));
} else {
/* might get an -ENOMEM back here */
ret = status;
@@ -1767,7 +1774,7 @@ static int dlm_process_recovery_data(struct dlm_ctxt *dlm,
dlm->name, mres->lockname_len, mres->lockname,
from);
spin_lock(&res->spinlock);
- dlm_lockres_set_refmap_bit(from, res);
+ dlm_lockres_set_refmap_bit(dlm, res, from);
spin_unlock(&res->spinlock);
added++;
break;
@@ -1965,7 +1972,7 @@ skip_lvb:
mlog(0, "%s:%.*s: added lock for node %u, "
"setting refmap bit\n", dlm->name,
res->lockname.len, res->lockname.name, ml->node);
- dlm_lockres_set_refmap_bit(ml->node, res);
+ dlm_lockres_set_refmap_bit(dlm, res, ml->node);
added++;
}
spin_unlock(&res->spinlock);
@@ -2084,6 +2091,9 @@ static void dlm_finish_local_lockres_recovery(struct dlm_ctxt *dlm,
list_for_each_entry_safe(res, next, &dlm->reco.resources, recovering) {
if (res->owner == dead_node) {
+ mlog(0, "%s: res %.*s, Changing owner from %u to %u\n",
+ dlm->name, res->lockname.len, res->lockname.name,
+ res->owner, new_master);
list_del_init(&res->recovering);
spin_lock(&res->spinlock);
/* new_master has our reference from
@@ -2105,40 +2115,30 @@ static void dlm_finish_local_lockres_recovery(struct dlm_ctxt *dlm,
for (i = 0; i < DLM_HASH_BUCKETS; i++) {
bucket = dlm_lockres_hash(dlm, i);
hlist_for_each_entry(res, hash_iter, bucket, hash_node) {
- if (res->state & DLM_LOCK_RES_RECOVERING) {
- if (res->owner == dead_node) {
- mlog(0, "(this=%u) res %.*s owner=%u "
- "was not on recovering list, but "
- "clearing state anyway\n",
- dlm->node_num, res->lockname.len,
- res->lockname.name, new_master);
- } else if (res->owner == dlm->node_num) {
- mlog(0, "(this=%u) res %.*s owner=%u "
- "was not on recovering list, "
- "owner is THIS node, clearing\n",
- dlm->node_num, res->lockname.len,
- res->lockname.name, new_master);
- } else
- continue;
+ if (!(res->state & DLM_LOCK_RES_RECOVERING))
+ continue;
- if (!list_empty(&res->recovering)) {
- mlog(0, "%s:%.*s: lockres was "
- "marked RECOVERING, owner=%u\n",
- dlm->name, res->lockname.len,
- res->lockname.name, res->owner);
- list_del_init(&res->recovering);
- dlm_lockres_put(res);
- }
- spin_lock(&res->spinlock);
- /* new_master has our reference from
- * the lock state sent during recovery */
- dlm_change_lockres_owner(dlm, res, new_master);
- res->state &= ~DLM_LOCK_RES_RECOVERING;
- if (__dlm_lockres_has_locks(res))
- __dlm_dirty_lockres(dlm, res);
- spin_unlock(&res->spinlock);
- wake_up(&res->wq);
+ if (res->owner != dead_node &&
+ res->owner != dlm->node_num)
+ continue;
+
+ if (!list_empty(&res->recovering)) {
+ list_del_init(&res->recovering);
+ dlm_lockres_put(res);
}
+
+ /* new_master has our reference from
+ * the lock state sent during recovery */
+ mlog(0, "%s: res %.*s, Changing owner from %u to %u\n",
+ dlm->name, res->lockname.len, res->lockname.name,
+ res->owner, new_master);
+ spin_lock(&res->spinlock);
+ dlm_change_lockres_owner(dlm, res, new_master);
+ res->state &= ~DLM_LOCK_RES_RECOVERING;
+ if (__dlm_lockres_has_locks(res))
+ __dlm_dirty_lockres(dlm, res);
+ spin_unlock(&res->spinlock);
+ wake_up(&res->wq);
}
}
}
@@ -2252,12 +2252,12 @@ static void dlm_free_dead_locks(struct dlm_ctxt *dlm,
res->lockname.len, res->lockname.name, freed, dead_node);
__dlm_print_one_lock_resource(res);
}
- dlm_lockres_clear_refmap_bit(dead_node, res);
+ dlm_lockres_clear_refmap_bit(dlm, res, dead_node);
} else if (test_bit(dead_node, res->refmap)) {
mlog(0, "%s:%.*s: dead node %u had a ref, but had "
"no locks and had not purged before dying\n", dlm->name,
res->lockname.len, res->lockname.name, dead_node);
- dlm_lockres_clear_refmap_bit(dead_node, res);
+ dlm_lockres_clear_refmap_bit(dlm, res, dead_node);
}
/* do not kick thread yet */
@@ -2324,9 +2324,9 @@ static void dlm_do_local_recovery_cleanup(struct dlm_ctxt *dlm, u8 dead_node)
dlm_revalidate_lvb(dlm, res, dead_node);
if (res->owner == dead_node) {
if (res->state & DLM_LOCK_RES_DROPPING_REF) {
- mlog(ML_NOTICE, "Ignore %.*s for "
+ mlog(ML_NOTICE, "%s: res %.*s, Skip "
"recovery as it is being freed\n",
- res->lockname.len,
+ dlm->name, res->lockname.len,
res->lockname.name);
} else
dlm_move_lockres_to_recovery_list(dlm,
diff --git a/fs/ocfs2/dlm/dlmthread.c b/fs/ocfs2/dlm/dlmthread.c
index 1d6d1d22c47..e73c833fc2a 100644
--- a/fs/ocfs2/dlm/dlmthread.c
+++ b/fs/ocfs2/dlm/dlmthread.c
@@ -94,24 +94,26 @@ int __dlm_lockres_unused(struct dlm_lock_resource *res)
{
int bit;
+ assert_spin_locked(&res->spinlock);
+
if (__dlm_lockres_has_locks(res))
return 0;
+ /* Locks are in the process of being created */
+ if (res->inflight_locks)
+ return 0;
+
if (!list_empty(&res->dirty) || res->state & DLM_LOCK_RES_DIRTY)
return 0;
if (res->state & DLM_LOCK_RES_RECOVERING)
return 0;
+ /* Another node has this resource with this node as the master */
bit = find_next_bit(res->refmap, O2NM_MAX_NODES, 0);
if (bit < O2NM_MAX_NODES)
return 0;
- /*
- * since the bit for dlm->node_num is not set, inflight_locks better
- * be zero
- */
- BUG_ON(res->inflight_locks != 0);
return 1;
}
@@ -185,8 +187,6 @@ static void dlm_purge_lockres(struct dlm_ctxt *dlm,
/* clear our bit from the master's refmap, ignore errors */
ret = dlm_drop_lockres_ref(dlm, res);
if (ret < 0) {
- mlog(ML_ERROR, "%s: deref %.*s failed %d\n", dlm->name,
- res->lockname.len, res->lockname.name, ret);
if (!dlm_is_host_down(ret))
BUG();
}
@@ -209,7 +209,7 @@ static void dlm_purge_lockres(struct dlm_ctxt *dlm,
BUG();
}
- __dlm_unhash_lockres(res);
+ __dlm_unhash_lockres(dlm, res);
/* lockres is not in the hash now. drop the flag and wake up
* any processes waiting in dlm_get_lock_resource. */
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index e1ed5e502ff..81a4cd22f80 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -1692,7 +1692,7 @@ int ocfs2_open_lock(struct inode *inode)
mlog(0, "inode %llu take PRMODE open lock\n",
(unsigned long long)OCFS2_I(inode)->ip_blkno);
- if (ocfs2_mount_local(osb))
+ if (ocfs2_is_hard_readonly(osb) || ocfs2_mount_local(osb))
goto out;
lockres = &OCFS2_I(inode)->ip_open_lockres;
@@ -1718,6 +1718,12 @@ int ocfs2_try_open_lock(struct inode *inode, int write)
(unsigned long long)OCFS2_I(inode)->ip_blkno,
write ? "EXMODE" : "PRMODE");
+ if (ocfs2_is_hard_readonly(osb)) {
+ if (write)
+ status = -EROFS;
+ goto out;
+ }
+
if (ocfs2_mount_local(osb))
goto out;
@@ -2298,7 +2304,7 @@ int ocfs2_inode_lock_full_nested(struct inode *inode,
if (ocfs2_is_hard_readonly(osb)) {
if (ex)
status = -EROFS;
- goto bail;
+ goto getbh;
}
if (ocfs2_mount_local(osb))
@@ -2356,7 +2362,7 @@ local:
mlog_errno(status);
goto bail;
}
-
+getbh:
if (ret_bh) {
status = ocfs2_assign_bh(inode, ret_bh, local_bh);
if (status < 0) {
@@ -2628,8 +2634,11 @@ int ocfs2_dentry_lock(struct dentry *dentry, int ex)
BUG_ON(!dl);
- if (ocfs2_is_hard_readonly(osb))
- return -EROFS;
+ if (ocfs2_is_hard_readonly(osb)) {
+ if (ex)
+ return -EROFS;
+ return 0;
+ }
if (ocfs2_mount_local(osb))
return 0;
@@ -2647,7 +2656,7 @@ void ocfs2_dentry_unlock(struct dentry *dentry, int ex)
struct ocfs2_dentry_lock *dl = dentry->d_fsdata;
struct ocfs2_super *osb = OCFS2_SB(dentry->d_sb);
- if (!ocfs2_mount_local(osb))
+ if (!ocfs2_is_hard_readonly(osb) && !ocfs2_mount_local(osb))
ocfs2_cluster_unlock(osb, &dl->dl_lockres, level);
}
diff --git a/fs/ocfs2/extent_map.c b/fs/ocfs2/extent_map.c
index 23457b491e8..2f5b92ef0e5 100644
--- a/fs/ocfs2/extent_map.c
+++ b/fs/ocfs2/extent_map.c
@@ -832,6 +832,102 @@ out:
return ret;
}
+int ocfs2_seek_data_hole_offset(struct file *file, loff_t *offset, int origin)
+{
+ struct inode *inode = file->f_mapping->host;
+ int ret;
+ unsigned int is_last = 0, is_data = 0;
+ u16 cs_bits = OCFS2_SB(inode->i_sb)->s_clustersize_bits;
+ u32 cpos, cend, clen, hole_size;
+ u64 extoff, extlen;
+ struct buffer_head *di_bh = NULL;
+ struct ocfs2_extent_rec rec;
+
+ BUG_ON(origin != SEEK_DATA && origin != SEEK_HOLE);
+
+ ret = ocfs2_inode_lock(inode, &di_bh, 0);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ down_read(&OCFS2_I(inode)->ip_alloc_sem);
+
+ if (*offset >= inode->i_size) {
+ ret = -ENXIO;
+ goto out_unlock;
+ }
+
+ if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
+ if (origin == SEEK_HOLE)
+ *offset = inode->i_size;
+ goto out_unlock;
+ }
+
+ clen = 0;
+ cpos = *offset >> cs_bits;
+ cend = ocfs2_clusters_for_bytes(inode->i_sb, inode->i_size);
+
+ while (cpos < cend && !is_last) {
+ ret = ocfs2_get_clusters_nocache(inode, di_bh, cpos, &hole_size,
+ &rec, &is_last);
+ if (ret) {
+ mlog_errno(ret);
+ goto out_unlock;
+ }
+
+ extoff = cpos;
+ extoff <<= cs_bits;
+
+ if (rec.e_blkno == 0ULL) {
+ clen = hole_size;
+ is_data = 0;
+ } else {
+ clen = le16_to_cpu(rec.e_leaf_clusters) -
+ (cpos - le32_to_cpu(rec.e_cpos));
+ is_data = (rec.e_flags & OCFS2_EXT_UNWRITTEN) ? 0 : 1;
+ }
+
+ if ((!is_data && origin == SEEK_HOLE) ||
+ (is_data && origin == SEEK_DATA)) {
+ if (extoff > *offset)
+ *offset = extoff;
+ goto out_unlock;
+ }
+
+ if (!is_last)
+ cpos += clen;
+ }
+
+ if (origin == SEEK_HOLE) {
+ extoff = cpos;
+ extoff <<= cs_bits;
+ extlen = clen;
+ extlen <<= cs_bits;
+
+ if ((extoff + extlen) > inode->i_size)
+ extlen = inode->i_size - extoff;
+ extoff += extlen;
+ if (extoff > *offset)
+ *offset = extoff;
+ goto out_unlock;
+ }
+
+ ret = -ENXIO;
+
+out_unlock:
+
+ brelse(di_bh);
+
+ up_read(&OCFS2_I(inode)->ip_alloc_sem);
+
+ ocfs2_inode_unlock(inode, 0);
+out:
+ if (ret && ret != -ENXIO)
+ ret = -ENXIO;
+ return ret;
+}
+
int ocfs2_read_virt_blocks(struct inode *inode, u64 v_block, int nr,
struct buffer_head *bhs[], int flags,
int (*validate)(struct super_block *sb,
diff --git a/fs/ocfs2/extent_map.h b/fs/ocfs2/extent_map.h
index e79d41c2c90..67ea57d2fd5 100644
--- a/fs/ocfs2/extent_map.h
+++ b/fs/ocfs2/extent_map.h
@@ -53,6 +53,8 @@ int ocfs2_extent_map_get_blocks(struct inode *inode, u64 v_blkno, u64 *p_blkno,
int ocfs2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
u64 map_start, u64 map_len);
+int ocfs2_seek_data_hole_offset(struct file *file, loff_t *offset, int origin);
+
int ocfs2_xattr_get_clusters(struct inode *inode, u32 v_cluster,
u32 *p_cluster, u32 *num_clusters,
struct ocfs2_extent_list *el,
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index de4ea1af041..6e396683c3d 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -1950,6 +1950,9 @@ static int __ocfs2_change_file_space(struct file *file, struct inode *inode,
if (ret < 0)
mlog_errno(ret);
+ if (file->f_flags & O_SYNC)
+ handle->h_sync = 1;
+
ocfs2_commit_trans(osb, handle);
out_inode_unlock:
@@ -2052,6 +2055,23 @@ out:
return ret;
}
+static void ocfs2_aiodio_wait(struct inode *inode)
+{
+ wait_queue_head_t *wq = ocfs2_ioend_wq(inode);
+
+ wait_event(*wq, (atomic_read(&OCFS2_I(inode)->ip_unaligned_aio) == 0));
+}
+
+static int ocfs2_is_io_unaligned(struct inode *inode, size_t count, loff_t pos)
+{
+ int blockmask = inode->i_sb->s_blocksize - 1;
+ loff_t final_size = pos + count;
+
+ if ((pos & blockmask) || (final_size & blockmask))
+ return 1;
+ return 0;
+}
+
static int ocfs2_prepare_inode_for_refcount(struct inode *inode,
struct file *file,
loff_t pos, size_t count,
@@ -2230,6 +2250,7 @@ static ssize_t ocfs2_file_aio_write(struct kiocb *iocb,
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
int full_coherency = !(osb->s_mount_opt &
OCFS2_MOUNT_COHERENCY_BUFFERED);
+ int unaligned_dio = 0;
trace_ocfs2_file_aio_write(inode, file, file->f_path.dentry,
(unsigned long long)OCFS2_I(inode)->ip_blkno,
@@ -2297,6 +2318,10 @@ relock:
goto out;
}
+ if (direct_io && !is_sync_kiocb(iocb))
+ unaligned_dio = ocfs2_is_io_unaligned(inode, iocb->ki_left,
+ *ppos);
+
/*
* We can't complete the direct I/O as requested, fall back to
* buffered I/O.
@@ -2311,6 +2336,18 @@ relock:
goto relock;
}
+ if (unaligned_dio) {
+ /*
+ * Wait on previous unaligned aio to complete before
+ * proceeding.
+ */
+ ocfs2_aiodio_wait(inode);
+
+ /* Mark the iocb as needing a decrement in ocfs2_dio_end_io */
+ atomic_inc(&OCFS2_I(inode)->ip_unaligned_aio);
+ ocfs2_iocb_set_unaligned_aio(iocb);
+ }
+
/*
* To later detect whether a journal commit for sync writes is
* necessary, we sample i_size, and cluster count here.
@@ -2382,8 +2419,12 @@ out_dio:
if ((ret == -EIOCBQUEUED) || (!ocfs2_iocb_is_rw_locked(iocb))) {
rw_level = -1;
have_alloc_sem = 0;
+ unaligned_dio = 0;
}
+ if (unaligned_dio)
+ atomic_dec(&OCFS2_I(inode)->ip_unaligned_aio);
+
out:
if (rw_level != -1)
ocfs2_rw_unlock(inode, rw_level);
@@ -2591,6 +2632,57 @@ bail:
return ret;
}
+/* Refer generic_file_llseek_unlocked() */
+static loff_t ocfs2_file_llseek(struct file *file, loff_t offset, int origin)
+{
+ struct inode *inode = file->f_mapping->host;
+ int ret = 0;
+
+ mutex_lock(&inode->i_mutex);
+
+ switch (origin) {
+ case SEEK_SET:
+ break;
+ case SEEK_END:
+ offset += inode->i_size;
+ break;
+ case SEEK_CUR:
+ if (offset == 0) {
+ offset = file->f_pos;
+ goto out;
+ }
+ offset += file->f_pos;
+ break;
+ case SEEK_DATA:
+ case SEEK_HOLE:
+ ret = ocfs2_seek_data_hole_offset(file, &offset, origin);
+ if (ret)
+ goto out;
+ break;
+ default:
+ ret = -EINVAL;
+ goto out;
+ }
+
+ if (offset < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET))
+ ret = -EINVAL;
+ if (!ret && offset > inode->i_sb->s_maxbytes)
+ ret = -EINVAL;
+ if (ret)
+ goto out;
+
+ if (offset != file->f_pos) {
+ file->f_pos = offset;
+ file->f_version = 0;
+ }
+
+out:
+ mutex_unlock(&inode->i_mutex);
+ if (ret)
+ return ret;
+ return offset;
+}
+
const struct inode_operations ocfs2_file_iops = {
.setattr = ocfs2_setattr,
.getattr = ocfs2_getattr,
@@ -2615,7 +2707,7 @@ const struct inode_operations ocfs2_special_file_iops = {
* ocfs2_fops_no_plocks and ocfs2_dops_no_plocks!
*/
const struct file_operations ocfs2_fops = {
- .llseek = generic_file_llseek,
+ .llseek = ocfs2_file_llseek,
.read = do_sync_read,
.write = do_sync_write,
.mmap = ocfs2_mmap,
@@ -2663,7 +2755,7 @@ const struct file_operations ocfs2_dops = {
* the cluster.
*/
const struct file_operations ocfs2_fops_no_plocks = {
- .llseek = generic_file_llseek,
+ .llseek = ocfs2_file_llseek,
.read = do_sync_read,
.write = do_sync_write,
.mmap = ocfs2_mmap,
diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c
index a22d2c09889..17454a904d7 100644
--- a/fs/ocfs2/inode.c
+++ b/fs/ocfs2/inode.c
@@ -951,7 +951,7 @@ static void ocfs2_cleanup_delete_inode(struct inode *inode,
trace_ocfs2_cleanup_delete_inode(
(unsigned long long)OCFS2_I(inode)->ip_blkno, sync_data);
if (sync_data)
- write_inode_now(inode, 1);
+ filemap_write_and_wait(inode->i_mapping);
truncate_inode_pages(&inode->i_data, 0);
}
diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h
index 1c508b149b3..88924a3133f 100644
--- a/fs/ocfs2/inode.h
+++ b/fs/ocfs2/inode.h
@@ -43,6 +43,9 @@ struct ocfs2_inode_info
/* protects extended attribute changes on this inode */
struct rw_semaphore ip_xattr_sem;
+ /* Number of outstanding AIO's which are not page aligned */
+ atomic_t ip_unaligned_aio;
+
/* These fields are protected by ip_lock */
spinlock_t ip_lock;
u32 ip_open_count;
diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c
index bc91072b721..726ff265b29 100644
--- a/fs/ocfs2/ioctl.c
+++ b/fs/ocfs2/ioctl.c
@@ -122,7 +122,7 @@ static int ocfs2_set_inode_attr(struct inode *inode, unsigned flags,
if ((oldflags & OCFS2_IMMUTABLE_FL) || ((flags ^ oldflags) &
(OCFS2_APPEND_FL | OCFS2_IMMUTABLE_FL))) {
if (!capable(CAP_LINUX_IMMUTABLE))
- goto bail_unlock;
+ goto bail_commit;
}
ocfs2_inode->ip_attr = flags;
@@ -132,6 +132,7 @@ static int ocfs2_set_inode_attr(struct inode *inode, unsigned flags,
if (status < 0)
mlog_errno(status);
+bail_commit:
ocfs2_commit_trans(osb, handle);
bail_unlock:
ocfs2_inode_unlock(inode, 1);
@@ -381,7 +382,7 @@ int ocfs2_info_handle_freeinode(struct inode *inode,
if (!oifi) {
status = -ENOMEM;
mlog_errno(status);
- goto bail;
+ goto out_err;
}
if (o2info_from_user(*oifi, req))
@@ -431,7 +432,7 @@ bail:
o2info_set_request_error(&oifi->ifi_req, req);
kfree(oifi);
-
+out_err:
return status;
}
@@ -666,7 +667,7 @@ int ocfs2_info_handle_freefrag(struct inode *inode,
if (!oiff) {
status = -ENOMEM;
mlog_errno(status);
- goto bail;
+ goto out_err;
}
if (o2info_from_user(*oiff, req))
@@ -716,7 +717,7 @@ bail:
o2info_set_request_error(&oiff->iff_req, req);
kfree(oiff);
-
+out_err:
return status;
}
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index 295d56454e8..0a42ae96dca 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -1544,9 +1544,9 @@ static int ocfs2_replay_journal(struct ocfs2_super *osb,
/* we need to run complete recovery for offline orphan slots */
ocfs2_replay_map_set_state(osb, REPLAY_NEEDED);
- mlog(ML_NOTICE, "Recovering node %d from slot %d on device (%u,%u)\n",
- node_num, slot_num,
- MAJOR(osb->sb->s_dev), MINOR(osb->sb->s_dev));
+ printk(KERN_NOTICE "ocfs2: Begin replay journal (node %d, slot %d) on "\
+ "device (%u,%u)\n", node_num, slot_num, MAJOR(osb->sb->s_dev),
+ MINOR(osb->sb->s_dev));
OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters);
@@ -1601,6 +1601,9 @@ static int ocfs2_replay_journal(struct ocfs2_super *osb,
jbd2_journal_destroy(journal);
+ printk(KERN_NOTICE "ocfs2: End replay journal (node %d, slot %d) on "\
+ "device (%u,%u)\n", node_num, slot_num, MAJOR(osb->sb->s_dev),
+ MINOR(osb->sb->s_dev));
done:
/* drop the lock on this nodes journal */
if (got_lock)
@@ -1808,6 +1811,20 @@ static inline unsigned long ocfs2_orphan_scan_timeout(void)
* every slot, queuing a recovery of the slot on the ocfs2_wq thread. This
* is done to catch any orphans that are left over in orphan directories.
*
+ * It scans all slots, even ones that are in use. It does so to handle the
+ * case described below:
+ *
+ * Node 1 has an inode it was using. The dentry went away due to memory
+ * pressure. Node 1 closes the inode, but it's on the free list. The node
+ * has the open lock.
+ * Node 2 unlinks the inode. It grabs the dentry lock to notify others,
+ * but node 1 has no dentry and doesn't get the message. It trylocks the
+ * open lock, sees that another node has a PR, and does nothing.
+ * Later node 2 runs its orphan dir. It igets the inode, trylocks the
+ * open lock, sees the PR still, and does nothing.
+ * Basically, we have to trigger an orphan iput on node 1. The only way
+ * for this to happen is if node 1 runs node 2's orphan dir.
+ *
* ocfs2_queue_orphan_scan gets called every ORPHAN_SCAN_SCHEDULE_TIMEOUT
* seconds. It gets an EX lock on os_lockres and checks sequence number
* stored in LVB. If the sequence number has changed, it means some other
diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h
index 68cf2f6d3c6..a3385b63ff5 100644
--- a/fs/ocfs2/journal.h
+++ b/fs/ocfs2/journal.h
@@ -441,10 +441,11 @@ static inline int ocfs2_mknod_credits(struct super_block *sb, int is_dir,
#define OCFS2_SIMPLE_DIR_EXTEND_CREDITS (2)
/* file update (nlink, etc) + directory mtime/ctime + dir entry block + quota
- * update on dir + index leaf + dx root update for free list */
+ * update on dir + index leaf + dx root update for free list +
+ * previous dirblock update in the free list */
static inline int ocfs2_link_credits(struct super_block *sb)
{
- return 2*OCFS2_INODE_UPDATE_CREDITS + 3 +
+ return 2*OCFS2_INODE_UPDATE_CREDITS + 4 +
ocfs2_quota_trans_credits(sb);
}
diff --git a/fs/ocfs2/mmap.c b/fs/ocfs2/mmap.c
index 3e9393ca39e..9cd41083e99 100644
--- a/fs/ocfs2/mmap.c
+++ b/fs/ocfs2/mmap.c
@@ -61,7 +61,7 @@ static int ocfs2_fault(struct vm_area_struct *area, struct vm_fault *vmf)
static int __ocfs2_page_mkwrite(struct file *file, struct buffer_head *di_bh,
struct page *page)
{
- int ret;
+ int ret = VM_FAULT_NOPAGE;
struct inode *inode = file->f_path.dentry->d_inode;
struct address_space *mapping = inode->i_mapping;
loff_t pos = page_offset(page);
@@ -71,32 +71,25 @@ static int __ocfs2_page_mkwrite(struct file *file, struct buffer_head *di_bh,
void *fsdata;
loff_t size = i_size_read(inode);
- /*
- * Another node might have truncated while we were waiting on
- * cluster locks.
- * We don't check size == 0 before the shift. This is borrowed
- * from do_generic_file_read.
- */
last_index = (size - 1) >> PAGE_CACHE_SHIFT;
- if (unlikely(!size || page->index > last_index)) {
- ret = -EINVAL;
- goto out;
- }
/*
- * The i_size check above doesn't catch the case where nodes
- * truncated and then re-extended the file. We'll re-check the
- * page mapping after taking the page lock inside of
- * ocfs2_write_begin_nolock().
+ * There are cases that lead to the page no longer bebongs to the
+ * mapping.
+ * 1) pagecache truncates locally due to memory pressure.
+ * 2) pagecache truncates when another is taking EX lock against
+ * inode lock. see ocfs2_data_convert_worker.
+ *
+ * The i_size check doesn't catch the case where nodes truncated and
+ * then re-extended the file. We'll re-check the page mapping after
+ * taking the page lock inside of ocfs2_write_begin_nolock().
+ *
+ * Let VM retry with these cases.
*/
- if (!PageUptodate(page) || page->mapping != inode->i_mapping) {
- /*
- * the page has been umapped in ocfs2_data_downconvert_worker.
- * So return 0 here and let VFS retry.
- */
- ret = 0;
+ if ((page->mapping != inode->i_mapping) ||
+ (!PageUptodate(page)) ||
+ (page_offset(page) >= size))
goto out;
- }
/*
* Call ocfs2_write_begin() and ocfs2_write_end() to take
@@ -116,17 +109,21 @@ static int __ocfs2_page_mkwrite(struct file *file, struct buffer_head *di_bh,
if (ret) {
if (ret != -ENOSPC)
mlog_errno(ret);
+ if (ret == -ENOMEM)
+ ret = VM_FAULT_OOM;
+ else
+ ret = VM_FAULT_SIGBUS;
goto out;
}
- ret = ocfs2_write_end_nolock(mapping, pos, len, len, locked_page,
- fsdata);
- if (ret < 0) {
- mlog_errno(ret);
+ if (!locked_page) {
+ ret = VM_FAULT_NOPAGE;
goto out;
}
+ ret = ocfs2_write_end_nolock(mapping, pos, len, len, locked_page,
+ fsdata);
BUG_ON(ret != len);
- ret = 0;
+ ret = VM_FAULT_LOCKED;
out:
return ret;
}
@@ -168,8 +165,6 @@ static int ocfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
out:
ocfs2_unblock_signals(&oldset);
- if (ret)
- ret = VM_FAULT_SIGBUS;
return ret;
}
diff --git a/fs/ocfs2/move_extents.c b/fs/ocfs2/move_extents.c
index d53cb706f14..184c76b8c29 100644
--- a/fs/ocfs2/move_extents.c
+++ b/fs/ocfs2/move_extents.c
@@ -745,7 +745,7 @@ static int ocfs2_move_extent(struct ocfs2_move_extents_context *context,
*/
ocfs2_probe_alloc_group(inode, gd_bh, &goal_bit, len, move_max_hop,
new_phys_cpos);
- if (!new_phys_cpos) {
+ if (!*new_phys_cpos) {
ret = -ENOSPC;
goto out_commit;
}
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index 409285854f6..d355e6e36b3 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -836,18 +836,65 @@ static inline unsigned int ocfs2_clusters_to_megabytes(struct super_block *sb,
static inline void _ocfs2_set_bit(unsigned int bit, unsigned long *bitmap)
{
- __test_and_set_bit_le(bit, bitmap);
+ __set_bit_le(bit, bitmap);
}
#define ocfs2_set_bit(bit, addr) _ocfs2_set_bit((bit), (unsigned long *)(addr))
static inline void _ocfs2_clear_bit(unsigned int bit, unsigned long *bitmap)
{
- __test_and_clear_bit_le(bit, bitmap);
+ __clear_bit_le(bit, bitmap);
}
#define ocfs2_clear_bit(bit, addr) _ocfs2_clear_bit((bit), (unsigned long *)(addr))
#define ocfs2_test_bit test_bit_le
#define ocfs2_find_next_zero_bit find_next_zero_bit_le
#define ocfs2_find_next_bit find_next_bit_le
+
+static inline void *correct_addr_and_bit_unaligned(int *bit, void *addr)
+{
+#if BITS_PER_LONG == 64
+ *bit += ((unsigned long) addr & 7UL) << 3;
+ addr = (void *) ((unsigned long) addr & ~7UL);
+#elif BITS_PER_LONG == 32
+ *bit += ((unsigned long) addr & 3UL) << 3;
+ addr = (void *) ((unsigned long) addr & ~3UL);
+#else
+#error "how many bits you are?!"
+#endif
+ return addr;
+}
+
+static inline void ocfs2_set_bit_unaligned(int bit, void *bitmap)
+{
+ bitmap = correct_addr_and_bit_unaligned(&bit, bitmap);
+ ocfs2_set_bit(bit, bitmap);
+}
+
+static inline void ocfs2_clear_bit_unaligned(int bit, void *bitmap)
+{
+ bitmap = correct_addr_and_bit_unaligned(&bit, bitmap);
+ ocfs2_clear_bit(bit, bitmap);
+}
+
+static inline int ocfs2_test_bit_unaligned(int bit, void *bitmap)
+{
+ bitmap = correct_addr_and_bit_unaligned(&bit, bitmap);
+ return ocfs2_test_bit(bit, bitmap);
+}
+
+static inline int ocfs2_find_next_zero_bit_unaligned(void *bitmap, int max,
+ int start)
+{
+ int fix = 0, ret, tmpmax;
+ bitmap = correct_addr_and_bit_unaligned(&fix, bitmap);
+ tmpmax = max + fix;
+ start += fix;
+
+ ret = ocfs2_find_next_zero_bit(bitmap, tmpmax, start) - fix;
+ if (ret > max)
+ return max;
+ return ret;
+}
+
#endif /* OCFS2_H */
diff --git a/fs/ocfs2/quota_local.c b/fs/ocfs2/quota_local.c
index dc8007fc924..f100bf70a90 100644
--- a/fs/ocfs2/quota_local.c
+++ b/fs/ocfs2/quota_local.c
@@ -404,7 +404,9 @@ struct ocfs2_quota_recovery *ocfs2_begin_quota_recovery(
int status = 0;
struct ocfs2_quota_recovery *rec;
- mlog(ML_NOTICE, "Beginning quota recovery in slot %u\n", slot_num);
+ printk(KERN_NOTICE "ocfs2: Beginning quota recovery on device (%s) for "
+ "slot %u\n", osb->dev_str, slot_num);
+
rec = ocfs2_alloc_quota_recovery();
if (!rec)
return ERR_PTR(-ENOMEM);
@@ -549,8 +551,8 @@ static int ocfs2_recover_local_quota_file(struct inode *lqinode,
goto out_commit;
}
lock_buffer(qbh);
- WARN_ON(!ocfs2_test_bit(bit, dchunk->dqc_bitmap));
- ocfs2_clear_bit(bit, dchunk->dqc_bitmap);
+ WARN_ON(!ocfs2_test_bit_unaligned(bit, dchunk->dqc_bitmap));
+ ocfs2_clear_bit_unaligned(bit, dchunk->dqc_bitmap);
le32_add_cpu(&dchunk->dqc_free, 1);
unlock_buffer(qbh);
ocfs2_journal_dirty(handle, qbh);
@@ -596,7 +598,9 @@ int ocfs2_finish_quota_recovery(struct ocfs2_super *osb,
struct inode *lqinode;
unsigned int flags;
- mlog(ML_NOTICE, "Finishing quota recovery in slot %u\n", slot_num);
+ printk(KERN_NOTICE "ocfs2: Finishing quota recovery on device (%s) for "
+ "slot %u\n", osb->dev_str, slot_num);
+
mutex_lock(&sb_dqopt(sb)->dqonoff_mutex);
for (type = 0; type < MAXQUOTAS; type++) {
if (list_empty(&(rec->r_list[type])))
@@ -612,8 +616,9 @@ int ocfs2_finish_quota_recovery(struct ocfs2_super *osb,
/* Someone else is holding the lock? Then he must be
* doing the recovery. Just skip the file... */
if (status == -EAGAIN) {
- mlog(ML_NOTICE, "skipping quota recovery for slot %d "
- "because quota file is locked.\n", slot_num);
+ printk(KERN_NOTICE "ocfs2: Skipping quota recovery on "
+ "device (%s) for slot %d because quota file is "
+ "locked.\n", osb->dev_str, slot_num);
status = 0;
goto out_put;
} else if (status < 0) {
@@ -944,7 +949,7 @@ static struct ocfs2_quota_chunk *ocfs2_find_free_entry(struct super_block *sb,
* ol_quota_entries_per_block(sb);
}
- found = ocfs2_find_next_zero_bit(dchunk->dqc_bitmap, len, 0);
+ found = ocfs2_find_next_zero_bit_unaligned(dchunk->dqc_bitmap, len, 0);
/* We failed? */
if (found == len) {
mlog(ML_ERROR, "Did not find empty entry in chunk %d with %u"
@@ -1208,7 +1213,7 @@ static void olq_alloc_dquot(struct buffer_head *bh, void *private)
struct ocfs2_local_disk_chunk *dchunk;
dchunk = (struct ocfs2_local_disk_chunk *)bh->b_data;
- ocfs2_set_bit(*offset, dchunk->dqc_bitmap);
+ ocfs2_set_bit_unaligned(*offset, dchunk->dqc_bitmap);
le32_add_cpu(&dchunk->dqc_free, -1);
}
@@ -1289,7 +1294,7 @@ int ocfs2_local_release_dquot(handle_t *handle, struct dquot *dquot)
(od->dq_chunk->qc_headerbh->b_data);
/* Mark structure as freed */
lock_buffer(od->dq_chunk->qc_headerbh);
- ocfs2_clear_bit(offset, dchunk->dqc_bitmap);
+ ocfs2_clear_bit_unaligned(offset, dchunk->dqc_bitmap);
le32_add_cpu(&dchunk->dqc_free, 1);
unlock_buffer(od->dq_chunk->qc_headerbh);
ocfs2_journal_dirty(handle, od->dq_chunk->qc_headerbh);
diff --git a/fs/ocfs2/slot_map.c b/fs/ocfs2/slot_map.c
index 26fc0014d50..1424c151ccc 100644
--- a/fs/ocfs2/slot_map.c
+++ b/fs/ocfs2/slot_map.c
@@ -493,8 +493,8 @@ int ocfs2_find_slot(struct ocfs2_super *osb)
goto bail;
}
} else
- mlog(ML_NOTICE, "slot %d is already allocated to this node!\n",
- slot);
+ printk(KERN_INFO "ocfs2: Slot %d on device (%s) was already "
+ "allocated to this node!\n", slot, osb->dev_str);
ocfs2_set_slot(si, slot, osb->node_num);
osb->slot_num = slot;
diff --git a/fs/ocfs2/stack_o2cb.c b/fs/ocfs2/stack_o2cb.c
index 19965b00c43..94368017edb 100644
--- a/fs/ocfs2/stack_o2cb.c
+++ b/fs/ocfs2/stack_o2cb.c
@@ -28,6 +28,7 @@
#include "cluster/masklog.h"
#include "cluster/nodemanager.h"
#include "cluster/heartbeat.h"
+#include "cluster/tcp.h"
#include "stackglue.h"
@@ -256,6 +257,61 @@ static void o2cb_dump_lksb(struct ocfs2_dlm_lksb *lksb)
}
/*
+ * Check if this node is heartbeating and is connected to all other
+ * heartbeating nodes.
+ */
+static int o2cb_cluster_check(void)
+{
+ u8 node_num;
+ int i;
+ unsigned long hbmap[BITS_TO_LONGS(O2NM_MAX_NODES)];
+ unsigned long netmap[BITS_TO_LONGS(O2NM_MAX_NODES)];
+
+ node_num = o2nm_this_node();
+ if (node_num == O2NM_MAX_NODES) {
+ printk(KERN_ERR "o2cb: This node has not been configured.\n");
+ return -EINVAL;
+ }
+
+ /*
+ * o2dlm expects o2net sockets to be created. If not, then
+ * dlm_join_domain() fails with a stack of errors which are both cryptic
+ * and incomplete. The idea here is to detect upfront whether we have
+ * managed to connect to all nodes or not. If not, then list the nodes
+ * to allow the user to check the configuration (incorrect IP, firewall,
+ * etc.) Yes, this is racy. But its not the end of the world.
+ */
+#define O2CB_MAP_STABILIZE_COUNT 60
+ for (i = 0; i < O2CB_MAP_STABILIZE_COUNT; ++i) {
+ o2hb_fill_node_map(hbmap, sizeof(hbmap));
+ if (!test_bit(node_num, hbmap)) {
+ printk(KERN_ERR "o2cb: %s heartbeat has not been "
+ "started.\n", (o2hb_global_heartbeat_active() ?
+ "Global" : "Local"));
+ return -EINVAL;
+ }
+ o2net_fill_node_map(netmap, sizeof(netmap));
+ /* Force set the current node to allow easy compare */
+ set_bit(node_num, netmap);
+ if (!memcmp(hbmap, netmap, sizeof(hbmap)))
+ return 0;
+ if (i < O2CB_MAP_STABILIZE_COUNT)
+ msleep(1000);
+ }
+
+ printk(KERN_ERR "o2cb: This node could not connect to nodes:");
+ i = -1;
+ while ((i = find_next_bit(hbmap, O2NM_MAX_NODES,
+ i + 1)) < O2NM_MAX_NODES) {
+ if (!test_bit(i, netmap))
+ printk(" %u", i);
+ }
+ printk(".\n");
+
+ return -ENOTCONN;
+}
+
+/*
* Called from the dlm when it's about to evict a node. This is how the
* classic stack signals node death.
*/
@@ -263,8 +319,8 @@ static void o2dlm_eviction_cb(int node_num, void *data)
{
struct ocfs2_cluster_connection *conn = data;
- mlog(ML_NOTICE, "o2dlm has evicted node %d from group %.*s\n",
- node_num, conn->cc_namelen, conn->cc_name);
+ printk(KERN_NOTICE "o2cb: o2dlm has evicted node %d from domain %.*s\n",
+ node_num, conn->cc_namelen, conn->cc_name);
conn->cc_recovery_handler(node_num, conn->cc_recovery_data);
}
@@ -280,12 +336,11 @@ static int o2cb_cluster_connect(struct ocfs2_cluster_connection *conn)
BUG_ON(conn == NULL);
BUG_ON(conn->cc_proto == NULL);
- /* for now we only have one cluster/node, make sure we see it
- * in the heartbeat universe */
- if (!o2hb_check_local_node_heartbeating()) {
- if (o2hb_global_heartbeat_active())
- mlog(ML_ERROR, "Global heartbeat not started\n");
- rc = -EINVAL;
+ /* Ensure cluster stack is up and all nodes are connected */
+ rc = o2cb_cluster_check();
+ if (rc) {
+ printk(KERN_ERR "o2cb: Cluster check failed. Fix errors "
+ "before retrying.\n");
goto out;
}
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 56f61027236..4994f8b0e60 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -54,6 +54,7 @@
#include "ocfs1_fs_compat.h"
#include "alloc.h"
+#include "aops.h"
#include "blockcheck.h"
#include "dlmglue.h"
#include "export.h"
@@ -1107,9 +1108,9 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
ocfs2_set_ro_flag(osb, 1);
- printk(KERN_NOTICE "Readonly device detected. No cluster "
- "services will be utilized for this mount. Recovery "
- "will be skipped.\n");
+ printk(KERN_NOTICE "ocfs2: Readonly device (%s) detected. "
+ "Cluster services will not be used for this mount. "
+ "Recovery will be skipped.\n", osb->dev_str);
}
if (!ocfs2_is_hard_readonly(osb)) {
@@ -1616,12 +1617,17 @@ static int ocfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
return 0;
}
+wait_queue_head_t ocfs2__ioend_wq[OCFS2_IOEND_WQ_HASH_SZ];
+
static int __init ocfs2_init(void)
{
- int status;
+ int status, i;
ocfs2_print_version();
+ for (i = 0; i < OCFS2_IOEND_WQ_HASH_SZ; i++)
+ init_waitqueue_head(&ocfs2__ioend_wq[i]);
+
status = init_ocfs2_uptodate_cache();
if (status < 0) {
mlog_errno(status);
@@ -1760,7 +1766,7 @@ static void ocfs2_inode_init_once(void *data)
ocfs2_extent_map_init(&oi->vfs_inode);
INIT_LIST_HEAD(&oi->ip_io_markers);
oi->ip_dir_start_lookup = 0;
-
+ atomic_set(&oi->ip_unaligned_aio, 0);
init_rwsem(&oi->ip_alloc_sem);
init_rwsem(&oi->ip_xattr_sem);
mutex_init(&oi->ip_io_mutex);
@@ -1974,7 +1980,8 @@ static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err)
* If we failed before we got a uuid_str yet, we can't stop
* heartbeat. Otherwise, do it.
*/
- if (!mnt_err && !ocfs2_mount_local(osb) && osb->uuid_str)
+ if (!mnt_err && !ocfs2_mount_local(osb) && osb->uuid_str &&
+ !ocfs2_is_hard_readonly(osb))
hangup_needed = 1;
if (osb->cconn)
@@ -2353,7 +2360,7 @@ static int ocfs2_initialize_super(struct super_block *sb,
mlog_errno(status);
goto bail;
}
- cleancache_init_shared_fs((char *)&uuid_net_key, sb);
+ cleancache_init_shared_fs((char *)&di->id2.i_super.s_uuid, sb);
bail:
return status;
@@ -2462,8 +2469,8 @@ static int ocfs2_check_volume(struct ocfs2_super *osb)
goto finally;
}
} else {
- mlog(ML_NOTICE, "File system was not unmounted cleanly, "
- "recovering volume.\n");
+ printk(KERN_NOTICE "ocfs2: File system on device (%s) was not "
+ "unmounted cleanly, recovering it.\n", osb->dev_str);
}
local = ocfs2_mount_local(osb);
diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index 194fb22ef79..aa9e8777b09 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -2376,16 +2376,18 @@ static int ocfs2_remove_value_outside(struct inode*inode,
}
ret = ocfs2_xattr_value_truncate(inode, vb, 0, &ctxt);
- if (ret < 0) {
- mlog_errno(ret);
- break;
- }
ocfs2_commit_trans(osb, ctxt.handle);
if (ctxt.meta_ac) {
ocfs2_free_alloc_context(ctxt.meta_ac);
ctxt.meta_ac = NULL;
}
+
+ if (ret < 0) {
+ mlog_errno(ret);
+ break;
+ }
+
}
if (ctxt.meta_ac)
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 2db1bd3173b..851ba3dcdc2 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -1652,46 +1652,12 @@ out:
return error;
}
-static int proc_pid_fd_link_getattr(struct vfsmount *mnt, struct dentry *dentry,
- struct kstat *stat)
-{
- struct inode *inode = dentry->d_inode;
- struct task_struct *task = get_proc_task(inode);
- int rc;
-
- if (task == NULL)
- return -ESRCH;
-
- rc = -EACCES;
- if (lock_trace(task))
- goto out_task;
-
- generic_fillattr(inode, stat);
- unlock_trace(task);
- rc = 0;
-out_task:
- put_task_struct(task);
- return rc;
-}
-
static const struct inode_operations proc_pid_link_inode_operations = {
.readlink = proc_pid_readlink,
.follow_link = proc_pid_follow_link,
.setattr = proc_setattr,
};
-static const struct inode_operations proc_fdinfo_link_inode_operations = {
- .setattr = proc_setattr,
- .getattr = proc_pid_fd_link_getattr,
-};
-
-static const struct inode_operations proc_fd_link_inode_operations = {
- .readlink = proc_pid_readlink,
- .follow_link = proc_pid_follow_link,
- .setattr = proc_setattr,
- .getattr = proc_pid_fd_link_getattr,
-};
-
/* building an inode */
@@ -1923,61 +1889,49 @@ out:
static int proc_fd_info(struct inode *inode, struct path *path, char *info)
{
- struct task_struct *task;
- struct files_struct *files;
+ struct task_struct *task = get_proc_task(inode);
+ struct files_struct *files = NULL;
struct file *file;
int fd = proc_fd(inode);
- int rc;
-
- task = get_proc_task(inode);
- if (!task)
- return -ENOENT;
-
- rc = -EACCES;
- if (lock_trace(task))
- goto out_task;
-
- rc = -ENOENT;
- files = get_files_struct(task);
- if (files == NULL)
- goto out_unlock;
- /*
- * We are not taking a ref to the file structure, so we must
- * hold ->file_lock.
- */
- spin_lock(&files->file_lock);
- file = fcheck_files(files, fd);
- if (file) {
- unsigned int f_flags;
- struct fdtable *fdt;
-
- fdt = files_fdtable(files);
- f_flags = file->f_flags & ~O_CLOEXEC;
- if (FD_ISSET(fd, fdt->close_on_exec))
- f_flags |= O_CLOEXEC;
-
- if (path) {
- *path = file->f_path;
- path_get(&file->f_path);
+ if (task) {
+ files = get_files_struct(task);
+ put_task_struct(task);
+ }
+ if (files) {
+ /*
+ * We are not taking a ref to the file structure, so we must
+ * hold ->file_lock.
+ */
+ spin_lock(&files->file_lock);
+ file = fcheck_files(files, fd);
+ if (file) {
+ unsigned int f_flags;
+ struct fdtable *fdt;
+
+ fdt = files_fdtable(files);
+ f_flags = file->f_flags & ~O_CLOEXEC;
+ if (FD_ISSET(fd, fdt->close_on_exec))
+ f_flags |= O_CLOEXEC;
+
+ if (path) {
+ *path = file->f_path;
+ path_get(&file->f_path);
+ }
+ if (info)
+ snprintf(info, PROC_FDINFO_MAX,
+ "pos:\t%lli\n"
+ "flags:\t0%o\n",
+ (long long) file->f_pos,
+ f_flags);
+ spin_unlock(&files->file_lock);
+ put_files_struct(files);
+ return 0;
}
- if (info)
- snprintf(info, PROC_FDINFO_MAX,
- "pos:\t%lli\n"
- "flags:\t0%o\n",
- (long long) file->f_pos,
- f_flags);
- rc = 0;
- } else
- rc = -ENOENT;
- spin_unlock(&files->file_lock);
- put_files_struct(files);
-
-out_unlock:
- unlock_trace(task);
-out_task:
- put_task_struct(task);
- return rc;
+ spin_unlock(&files->file_lock);
+ put_files_struct(files);
+ }
+ return -ENOENT;
}
static int proc_fd_link(struct inode *inode, struct path *path)
@@ -2072,7 +2026,7 @@ static struct dentry *proc_fd_instantiate(struct inode *dir,
spin_unlock(&files->file_lock);
put_files_struct(files);
- inode->i_op = &proc_fd_link_inode_operations;
+ inode->i_op = &proc_pid_link_inode_operations;
inode->i_size = 64;
ei->op.proc_get_link = proc_fd_link;
d_set_d_op(dentry, &tid_fd_dentry_operations);
@@ -2104,12 +2058,7 @@ static struct dentry *proc_lookupfd_common(struct inode *dir,
if (fd == ~0U)
goto out;
- result = ERR_PTR(-EACCES);
- if (lock_trace(task))
- goto out;
-
result = instantiate(dir, dentry, task, &fd);
- unlock_trace(task);
out:
put_task_struct(task);
out_no_task:
@@ -2129,28 +2078,23 @@ static int proc_readfd_common(struct file * filp, void * dirent,
retval = -ENOENT;
if (!p)
goto out_no_task;
-
- retval = -EACCES;
- if (lock_trace(p))
- goto out;
-
retval = 0;
fd = filp->f_pos;
switch (fd) {
case 0:
if (filldir(dirent, ".", 1, 0, inode->i_ino, DT_DIR) < 0)
- goto out_unlock;
+ goto out;
filp->f_pos++;
case 1:
ino = parent_ino(dentry);
if (filldir(dirent, "..", 2, 1, ino, DT_DIR) < 0)
- goto out_unlock;
+ goto out;
filp->f_pos++;
default:
files = get_files_struct(p);
if (!files)
- goto out_unlock;
+ goto out;
rcu_read_lock();
for (fd = filp->f_pos-2;
fd < files_fdtable(files)->max_fds;
@@ -2174,9 +2118,6 @@ static int proc_readfd_common(struct file * filp, void * dirent,
rcu_read_unlock();
put_files_struct(files);
}
-
-out_unlock:
- unlock_trace(p);
out:
put_task_struct(p);
out_no_task:
@@ -2254,7 +2195,6 @@ static struct dentry *proc_fdinfo_instantiate(struct inode *dir,
ei->fd = fd;
inode->i_mode = S_IFREG | S_IRUSR;
inode->i_fop = &proc_fdinfo_file_operations;
- inode->i_op = &proc_fdinfo_link_inode_operations;
d_set_d_op(dentry, &tid_fd_dentry_operations);
d_add(dentry, inode);
/* Close the race of the process dying before we return the dentry */
diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c
index 586174168e2..80e4645f799 100644
--- a/fs/proc/meminfo.c
+++ b/fs/proc/meminfo.c
@@ -131,12 +131,13 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
K(i.freeswap),
K(global_page_state(NR_FILE_DIRTY)),
K(global_page_state(NR_WRITEBACK)),
- K(global_page_state(NR_ANON_PAGES)
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ K(global_page_state(NR_ANON_PAGES)
+ global_page_state(NR_ANON_TRANSPARENT_HUGEPAGES) *
- HPAGE_PMD_NR
+ HPAGE_PMD_NR),
+#else
+ K(global_page_state(NR_ANON_PAGES)),
#endif
- ),
K(global_page_state(NR_FILE_MAPPED)),
K(global_page_state(NR_SHMEM)),
K(global_page_state(NR_SLAB_RECLAIMABLE) +
diff --git a/fs/proc/stat.c b/fs/proc/stat.c
index 42b274da92c..2a30d67dd6b 100644
--- a/fs/proc/stat.c
+++ b/fs/proc/stat.c
@@ -32,7 +32,7 @@ static cputime64_t get_idle_time(int cpu)
idle = kstat_cpu(cpu).cpustat.idle;
idle = cputime64_add(idle, arch_idle_time(cpu));
} else
- idle = usecs_to_cputime(idle_time);
+ idle = nsecs_to_jiffies64(1000 * idle_time);
return idle;
}
@@ -46,7 +46,7 @@ static cputime64_t get_iowait_time(int cpu)
/* !NO_HZ so we can rely on cpustat.iowait */
iowait = kstat_cpu(cpu).cpustat.iowait;
else
- iowait = usecs_to_cputime(iowait_time);
+ iowait = nsecs_to_jiffies64(1000 * iowait_time);
return iowait;
}
diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c
index 2bd620f0d79..57bbf9078ac 100644
--- a/fs/pstore/platform.c
+++ b/fs/pstore/platform.c
@@ -167,6 +167,7 @@ int pstore_register(struct pstore_info *psi)
}
psinfo = psi;
+ mutex_init(&psinfo->read_mutex);
spin_unlock(&pstore_lock);
if (owner && !try_module_get(owner)) {
@@ -195,30 +196,32 @@ EXPORT_SYMBOL_GPL(pstore_register);
void pstore_get_records(int quiet)
{
struct pstore_info *psi = psinfo;
+ char *buf = NULL;
ssize_t size;
u64 id;
enum pstore_type_id type;
struct timespec time;
int failed = 0, rc;
- unsigned long flags;
if (!psi)
return;
- spin_lock_irqsave(&psinfo->buf_lock, flags);
+ mutex_lock(&psi->read_mutex);
rc = psi->open(psi);
if (rc)
goto out;
- while ((size = psi->read(&id, &type, &time, psi)) > 0) {
- rc = pstore_mkfile(type, psi->name, id, psi->buf, (size_t)size,
+ while ((size = psi->read(&id, &type, &time, &buf, psi)) > 0) {
+ rc = pstore_mkfile(type, psi->name, id, buf, (size_t)size,
time, psi);
+ kfree(buf);
+ buf = NULL;
if (rc && (rc != -EEXIST || !quiet))
failed++;
}
psi->close(psi);
out:
- spin_unlock_irqrestore(&psinfo->buf_lock, flags);
+ mutex_unlock(&psi->read_mutex);
if (failed)
printk(KERN_WARNING "pstore: failed to load %d record(s) from '%s'\n",
diff --git a/fs/seq_file.c b/fs/seq_file.c
index 05d6b0e78c9..dba43c3ea3a 100644
--- a/fs/seq_file.c
+++ b/fs/seq_file.c
@@ -449,8 +449,6 @@ EXPORT_SYMBOL(seq_path);
/*
* Same as seq_path, but relative to supplied root.
- *
- * root may be changed, see __d_path().
*/
int seq_path_root(struct seq_file *m, struct path *path, struct path *root,
char *esc)
@@ -463,6 +461,8 @@ int seq_path_root(struct seq_file *m, struct path *path, struct path *root,
char *p;
p = __d_path(path, root, buf, size);
+ if (!p)
+ return SEQ_SKIP;
res = PTR_ERR(p);
if (!IS_ERR(p)) {
char *end = mangle_path(buf, p, esc);
@@ -474,7 +474,7 @@ int seq_path_root(struct seq_file *m, struct path *path, struct path *root,
}
seq_commit(m, res);
- return res < 0 ? res : 0;
+ return res < 0 && res != -ENAMETOOLONG ? res : 0;
}
/*
diff --git a/fs/xfs/xfs_acl.c b/fs/xfs/xfs_acl.c
index b6c4b3795c4..76e4266d2e7 100644
--- a/fs/xfs/xfs_acl.c
+++ b/fs/xfs/xfs_acl.c
@@ -42,6 +42,8 @@ xfs_acl_from_disk(struct xfs_acl *aclp)
int count, i;
count = be32_to_cpu(aclp->acl_cnt);
+ if (count > XFS_ACL_MAX_ENTRIES)
+ return ERR_PTR(-EFSCORRUPTED);
acl = posix_acl_alloc(count, GFP_KERNEL);
if (!acl)
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 33b13310ee0..574d4ee9b62 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -189,7 +189,7 @@ xfs_end_io(
int error = 0;
if (XFS_FORCED_SHUTDOWN(ip->i_mount)) {
- error = -EIO;
+ ioend->io_error = -EIO;
goto done;
}
if (ioend->io_error)
diff --git a/fs/xfs/xfs_attr_leaf.c b/fs/xfs/xfs_attr_leaf.c
index d4906e7c978..c1b55e59655 100644
--- a/fs/xfs/xfs_attr_leaf.c
+++ b/fs/xfs/xfs_attr_leaf.c
@@ -110,6 +110,7 @@ xfs_attr_namesp_match(int arg_flags, int ondisk_flags)
/*
* Query whether the requested number of additional bytes of extended
* attribute space will be able to fit inline.
+ *
* Returns zero if not, else the di_forkoff fork offset to be used in the
* literal area for attribute data once the new bytes have been added.
*
@@ -122,7 +123,7 @@ xfs_attr_shortform_bytesfit(xfs_inode_t *dp, int bytes)
int offset;
int minforkoff; /* lower limit on valid forkoff locations */
int maxforkoff; /* upper limit on valid forkoff locations */
- int dsize;
+ int dsize;
xfs_mount_t *mp = dp->i_mount;
offset = (XFS_LITINO(mp) - bytes) >> 3; /* rounded down */
@@ -136,47 +137,60 @@ xfs_attr_shortform_bytesfit(xfs_inode_t *dp, int bytes)
return (offset >= minforkoff) ? minforkoff : 0;
}
- if (!(mp->m_flags & XFS_MOUNT_ATTR2)) {
- if (bytes <= XFS_IFORK_ASIZE(dp))
- return dp->i_d.di_forkoff;
+ /*
+ * If the requested numbers of bytes is smaller or equal to the
+ * current attribute fork size we can always proceed.
+ *
+ * Note that if_bytes in the data fork might actually be larger than
+ * the current data fork size is due to delalloc extents. In that
+ * case either the extent count will go down when they are converted
+ * to real extents, or the delalloc conversion will take care of the
+ * literal area rebalancing.
+ */
+ if (bytes <= XFS_IFORK_ASIZE(dp))
+ return dp->i_d.di_forkoff;
+
+ /*
+ * For attr2 we can try to move the forkoff if there is space in the
+ * literal area, but for the old format we are done if there is no
+ * space in the fixed attribute fork.
+ */
+ if (!(mp->m_flags & XFS_MOUNT_ATTR2))
return 0;
- }
dsize = dp->i_df.if_bytes;
-
+
switch (dp->i_d.di_format) {
case XFS_DINODE_FMT_EXTENTS:
- /*
+ /*
* If there is no attr fork and the data fork is extents,
- * determine if creating the default attr fork will result
- * in the extents form migrating to btree. If so, the
- * minimum offset only needs to be the space required for
+ * determine if creating the default attr fork will result
+ * in the extents form migrating to btree. If so, the
+ * minimum offset only needs to be the space required for
* the btree root.
- */
+ */
if (!dp->i_d.di_forkoff && dp->i_df.if_bytes >
xfs_default_attroffset(dp))
dsize = XFS_BMDR_SPACE_CALC(MINDBTPTRS);
break;
-
case XFS_DINODE_FMT_BTREE:
/*
- * If have data btree then keep forkoff if we have one,
- * otherwise we are adding a new attr, so then we set
- * minforkoff to where the btree root can finish so we have
+ * If we have a data btree then keep forkoff if we have one,
+ * otherwise we are adding a new attr, so then we set
+ * minforkoff to where the btree root can finish so we have
* plenty of room for attrs
*/
if (dp->i_d.di_forkoff) {
- if (offset < dp->i_d.di_forkoff)
+ if (offset < dp->i_d.di_forkoff)
return 0;
- else
- return dp->i_d.di_forkoff;
- } else
- dsize = XFS_BMAP_BROOT_SPACE(dp->i_df.if_broot);
+ return dp->i_d.di_forkoff;
+ }
+ dsize = XFS_BMAP_BROOT_SPACE(dp->i_df.if_broot);
break;
}
-
- /*
- * A data fork btree root must have space for at least
+
+ /*
+ * A data fork btree root must have space for at least
* MINDBTPTRS key/ptr pairs if the data fork is small or empty.
*/
minforkoff = MAX(dsize, XFS_BMDR_SPACE_CALC(MINDBTPTRS));
@@ -186,10 +200,10 @@ xfs_attr_shortform_bytesfit(xfs_inode_t *dp, int bytes)
maxforkoff = XFS_LITINO(mp) - XFS_BMDR_SPACE_CALC(MINABTPTRS);
maxforkoff = maxforkoff >> 3; /* rounded down */
- if (offset >= minforkoff && offset < maxforkoff)
- return offset;
if (offset >= maxforkoff)
return maxforkoff;
+ if (offset >= minforkoff)
+ return offset;
return 0;
}
diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c
index c68baeb0974..d0ab7883705 100644
--- a/fs/xfs/xfs_bmap.c
+++ b/fs/xfs/xfs_bmap.c
@@ -2383,6 +2383,8 @@ xfs_bmap_btalloc(
int tryagain;
int error;
+ ASSERT(ap->length);
+
mp = ap->ip->i_mount;
align = ap->userdata ? xfs_get_extsz_hint(ap->ip) : 0;
if (unlikely(align)) {
@@ -4629,6 +4631,8 @@ xfs_bmapi_allocate(
int error;
int rt;
+ ASSERT(bma->length > 0);
+
rt = (whichfork == XFS_DATA_FORK) && XFS_IS_REALTIME_INODE(bma->ip);
/*
@@ -4849,6 +4853,7 @@ xfs_bmapi_write(
ASSERT(*nmap <= XFS_BMAP_MAX_NMAP);
ASSERT(!(flags & XFS_BMAPI_IGSTATE));
ASSERT(tp != NULL);
+ ASSERT(len > 0);
whichfork = (flags & XFS_BMAPI_ATTRFORK) ?
XFS_ATTR_FORK : XFS_DATA_FORK;
@@ -4918,9 +4923,22 @@ xfs_bmapi_write(
bma.eof = eof;
bma.conv = !!(flags & XFS_BMAPI_CONVERT);
bma.wasdel = wasdelay;
- bma.length = len;
bma.offset = bno;
+ /*
+ * There's a 32/64 bit type mismatch between the
+ * allocation length request (which can be 64 bits in
+ * length) and the bma length request, which is
+ * xfs_extlen_t and therefore 32 bits. Hence we have to
+ * check for 32-bit overflows and handle them here.
+ */
+ if (len > (xfs_filblks_t)MAXEXTLEN)
+ bma.length = MAXEXTLEN;
+ else
+ bma.length = len;
+
+ ASSERT(len > 0);
+ ASSERT(bma.length > 0);
error = xfs_bmapi_allocate(&bma, flags);
if (error)
goto error0;
diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
index 1a3513881bc..eac97ef81e2 100644
--- a/fs/xfs/xfs_buf_item.c
+++ b/fs/xfs/xfs_buf_item.c
@@ -656,7 +656,7 @@ xfs_buf_item_committing(
/*
* This is the ops vector shared by all buf log items.
*/
-static struct xfs_item_ops xfs_buf_item_ops = {
+static const struct xfs_item_ops xfs_buf_item_ops = {
.iop_size = xfs_buf_item_size,
.iop_format = xfs_buf_item_format,
.iop_pin = xfs_buf_item_pin,
diff --git a/fs/xfs/xfs_dquot_item.c b/fs/xfs/xfs_dquot_item.c
index bb3f71d236d..0dee0b71029 100644
--- a/fs/xfs/xfs_dquot_item.c
+++ b/fs/xfs/xfs_dquot_item.c
@@ -295,7 +295,7 @@ xfs_qm_dquot_logitem_committing(
/*
* This is the ops vector for dquots
*/
-static struct xfs_item_ops xfs_dquot_item_ops = {
+static const struct xfs_item_ops xfs_dquot_item_ops = {
.iop_size = xfs_qm_dquot_logitem_size,
.iop_format = xfs_qm_dquot_logitem_format,
.iop_pin = xfs_qm_dquot_logitem_pin,
@@ -483,7 +483,7 @@ xfs_qm_qoff_logitem_committing(
{
}
-static struct xfs_item_ops xfs_qm_qoffend_logitem_ops = {
+static const struct xfs_item_ops xfs_qm_qoffend_logitem_ops = {
.iop_size = xfs_qm_qoff_logitem_size,
.iop_format = xfs_qm_qoff_logitem_format,
.iop_pin = xfs_qm_qoff_logitem_pin,
@@ -498,7 +498,7 @@ static struct xfs_item_ops xfs_qm_qoffend_logitem_ops = {
/*
* This is the ops vector shared by all quotaoff-start log items.
*/
-static struct xfs_item_ops xfs_qm_qoff_logitem_ops = {
+static const struct xfs_item_ops xfs_qm_qoff_logitem_ops = {
.iop_size = xfs_qm_qoff_logitem_size,
.iop_format = xfs_qm_qoff_logitem_format,
.iop_pin = xfs_qm_qoff_logitem_pin,
diff --git a/fs/xfs/xfs_export.c b/fs/xfs/xfs_export.c
index da108977b21..558910f5e3c 100644
--- a/fs/xfs/xfs_export.c
+++ b/fs/xfs/xfs_export.c
@@ -98,22 +98,22 @@ xfs_fs_encode_fh(
switch (fileid_type) {
case FILEID_INO32_GEN_PARENT:
spin_lock(&dentry->d_lock);
- fid->i32.parent_ino = dentry->d_parent->d_inode->i_ino;
+ fid->i32.parent_ino = XFS_I(dentry->d_parent->d_inode)->i_ino;
fid->i32.parent_gen = dentry->d_parent->d_inode->i_generation;
spin_unlock(&dentry->d_lock);
/*FALLTHRU*/
case FILEID_INO32_GEN:
- fid->i32.ino = inode->i_ino;
+ fid->i32.ino = XFS_I(inode)->i_ino;
fid->i32.gen = inode->i_generation;
break;
case FILEID_INO32_GEN_PARENT | XFS_FILEID_TYPE_64FLAG:
spin_lock(&dentry->d_lock);
- fid64->parent_ino = dentry->d_parent->d_inode->i_ino;
+ fid64->parent_ino = XFS_I(dentry->d_parent->d_inode)->i_ino;
fid64->parent_gen = dentry->d_parent->d_inode->i_generation;
spin_unlock(&dentry->d_lock);
/*FALLTHRU*/
case FILEID_INO32_GEN | XFS_FILEID_TYPE_64FLAG:
- fid64->ino = inode->i_ino;
+ fid64->ino = XFS_I(inode)->i_ino;
fid64->gen = inode->i_generation;
break;
}
diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c
index d22e6262343..35c2aff38b2 100644
--- a/fs/xfs/xfs_extfree_item.c
+++ b/fs/xfs/xfs_extfree_item.c
@@ -217,7 +217,7 @@ xfs_efi_item_committing(
/*
* This is the ops vector shared by all efi log items.
*/
-static struct xfs_item_ops xfs_efi_item_ops = {
+static const struct xfs_item_ops xfs_efi_item_ops = {
.iop_size = xfs_efi_item_size,
.iop_format = xfs_efi_item_format,
.iop_pin = xfs_efi_item_pin,
@@ -477,7 +477,7 @@ xfs_efd_item_committing(
/*
* This is the ops vector shared by all efd log items.
*/
-static struct xfs_item_ops xfs_efd_item_ops = {
+static const struct xfs_item_ops xfs_efd_item_ops = {
.iop_size = xfs_efd_item_size,
.iop_format = xfs_efd_item_format,
.iop_pin = xfs_efd_item_pin,
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index c0237c602f1..755ee816488 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -2835,6 +2835,27 @@ corrupt_out:
return XFS_ERROR(EFSCORRUPTED);
}
+void
+xfs_promote_inode(
+ struct xfs_inode *ip)
+{
+ struct xfs_buf *bp;
+
+ ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED));
+
+ bp = xfs_incore(ip->i_mount->m_ddev_targp, ip->i_imap.im_blkno,
+ ip->i_imap.im_len, XBF_TRYLOCK);
+ if (!bp)
+ return;
+
+ if (XFS_BUF_ISDELAYWRITE(bp)) {
+ xfs_buf_delwri_promote(bp);
+ wake_up_process(ip->i_mount->m_ddev_targp->bt_task);
+ }
+
+ xfs_buf_relse(bp);
+}
+
/*
* Return a pointer to the extent record at file index idx.
*/
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index 760140d1dd6..b4cd4739f98 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -498,6 +498,7 @@ int xfs_iunlink(struct xfs_trans *, xfs_inode_t *);
void xfs_iext_realloc(xfs_inode_t *, int, int);
void xfs_iunpin_wait(xfs_inode_t *);
int xfs_iflush(xfs_inode_t *, uint);
+void xfs_promote_inode(struct xfs_inode *);
void xfs_lock_inodes(xfs_inode_t **, int, uint);
void xfs_lock_two_inodes(xfs_inode_t *, xfs_inode_t *, uint);
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index b7cf21ba240..abaafdbb3e6 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -795,7 +795,7 @@ xfs_inode_item_committing(
/*
* This is the ops vector shared by all buf log items.
*/
-static struct xfs_item_ops xfs_inode_item_ops = {
+static const struct xfs_item_ops xfs_inode_item_ops = {
.iop_size = xfs_inode_item_size,
.iop_format = xfs_inode_item_format,
.iop_pin = xfs_inode_item_pin,
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index 2758a6277c5..34817adf4b9 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -150,6 +150,117 @@ xlog_grant_add_space(
} while (head_val != old);
}
+STATIC bool
+xlog_reserveq_wake(
+ struct log *log,
+ int *free_bytes)
+{
+ struct xlog_ticket *tic;
+ int need_bytes;
+
+ list_for_each_entry(tic, &log->l_reserveq, t_queue) {
+ if (tic->t_flags & XLOG_TIC_PERM_RESERV)
+ need_bytes = tic->t_unit_res * tic->t_cnt;
+ else
+ need_bytes = tic->t_unit_res;
+
+ if (*free_bytes < need_bytes)
+ return false;
+ *free_bytes -= need_bytes;
+
+ trace_xfs_log_grant_wake_up(log, tic);
+ wake_up(&tic->t_wait);
+ }
+
+ return true;
+}
+
+STATIC bool
+xlog_writeq_wake(
+ struct log *log,
+ int *free_bytes)
+{
+ struct xlog_ticket *tic;
+ int need_bytes;
+
+ list_for_each_entry(tic, &log->l_writeq, t_queue) {
+ ASSERT(tic->t_flags & XLOG_TIC_PERM_RESERV);
+
+ need_bytes = tic->t_unit_res;
+
+ if (*free_bytes < need_bytes)
+ return false;
+ *free_bytes -= need_bytes;
+
+ trace_xfs_log_regrant_write_wake_up(log, tic);
+ wake_up(&tic->t_wait);
+ }
+
+ return true;
+}
+
+STATIC int
+xlog_reserveq_wait(
+ struct log *log,
+ struct xlog_ticket *tic,
+ int need_bytes)
+{
+ list_add_tail(&tic->t_queue, &log->l_reserveq);
+
+ do {
+ if (XLOG_FORCED_SHUTDOWN(log))
+ goto shutdown;
+ xlog_grant_push_ail(log, need_bytes);
+
+ XFS_STATS_INC(xs_sleep_logspace);
+ trace_xfs_log_grant_sleep(log, tic);
+
+ xlog_wait(&tic->t_wait, &log->l_grant_reserve_lock);
+ trace_xfs_log_grant_wake(log, tic);
+
+ spin_lock(&log->l_grant_reserve_lock);
+ if (XLOG_FORCED_SHUTDOWN(log))
+ goto shutdown;
+ } while (xlog_space_left(log, &log->l_grant_reserve_head) < need_bytes);
+
+ list_del_init(&tic->t_queue);
+ return 0;
+shutdown:
+ list_del_init(&tic->t_queue);
+ return XFS_ERROR(EIO);
+}
+
+STATIC int
+xlog_writeq_wait(
+ struct log *log,
+ struct xlog_ticket *tic,
+ int need_bytes)
+{
+ list_add_tail(&tic->t_queue, &log->l_writeq);
+
+ do {
+ if (XLOG_FORCED_SHUTDOWN(log))
+ goto shutdown;
+ xlog_grant_push_ail(log, need_bytes);
+
+ XFS_STATS_INC(xs_sleep_logspace);
+ trace_xfs_log_regrant_write_sleep(log, tic);
+
+ xlog_wait(&tic->t_wait, &log->l_grant_write_lock);
+ trace_xfs_log_regrant_write_wake(log, tic);
+
+ spin_lock(&log->l_grant_write_lock);
+ if (XLOG_FORCED_SHUTDOWN(log))
+ goto shutdown;
+ } while (xlog_space_left(log, &log->l_grant_write_head) < need_bytes);
+
+ list_del_init(&tic->t_queue);
+ return 0;
+shutdown:
+ list_del_init(&tic->t_queue);
+ return XFS_ERROR(EIO);
+}
+
static void
xlog_tic_reset_res(xlog_ticket_t *tic)
{
@@ -350,8 +461,19 @@ xfs_log_reserve(
retval = xlog_grant_log_space(log, internal_ticket);
}
+ if (unlikely(retval)) {
+ /*
+ * If we are failing, make sure the ticket doesn't have any
+ * current reservations. We don't want to add this back
+ * when the ticket/ transaction gets cancelled.
+ */
+ internal_ticket->t_curr_res = 0;
+ /* ungrant will give back unit_res * t_cnt. */
+ internal_ticket->t_cnt = 0;
+ }
+
return retval;
-} /* xfs_log_reserve */
+}
/*
@@ -626,7 +748,7 @@ xfs_log_item_init(
struct xfs_mount *mp,
struct xfs_log_item *item,
int type,
- struct xfs_item_ops *ops)
+ const struct xfs_item_ops *ops)
{
item->li_mountp = mp;
item->li_ailp = mp->m_ail;
@@ -2481,8 +2603,8 @@ restart:
/*
* Atomically get the log space required for a log ticket.
*
- * Once a ticket gets put onto the reserveq, it will only return after
- * the needed reservation is satisfied.
+ * Once a ticket gets put onto the reserveq, it will only return after the
+ * needed reservation is satisfied.
*
* This function is structured so that it has a lock free fast path. This is
* necessary because every new transaction reservation will come through this
@@ -2490,113 +2612,53 @@ restart:
* every pass.
*
* As tickets are only ever moved on and off the reserveq under the
- * l_grant_reserve_lock, we only need to take that lock if we are going
- * to add the ticket to the queue and sleep. We can avoid taking the lock if the
- * ticket was never added to the reserveq because the t_queue list head will be
- * empty and we hold the only reference to it so it can safely be checked
- * unlocked.
+ * l_grant_reserve_lock, we only need to take that lock if we are going to add
+ * the ticket to the queue and sleep. We can avoid taking the lock if the ticket
+ * was never added to the reserveq because the t_queue list head will be empty
+ * and we hold the only reference to it so it can safely be checked unlocked.
*/
STATIC int
-xlog_grant_log_space(xlog_t *log,
- xlog_ticket_t *tic)
+xlog_grant_log_space(
+ struct log *log,
+ struct xlog_ticket *tic)
{
- int free_bytes;
- int need_bytes;
+ int free_bytes, need_bytes;
+ int error = 0;
-#ifdef DEBUG
- if (log->l_flags & XLOG_ACTIVE_RECOVERY)
- panic("grant Recovery problem");
-#endif
+ ASSERT(!(log->l_flags & XLOG_ACTIVE_RECOVERY));
trace_xfs_log_grant_enter(log, tic);
+ /*
+ * If there are other waiters on the queue then give them a chance at
+ * logspace before us. Wake up the first waiters, if we do not wake
+ * up all the waiters then go to sleep waiting for more free space,
+ * otherwise try to get some space for this transaction.
+ */
need_bytes = tic->t_unit_res;
if (tic->t_flags & XFS_LOG_PERM_RESERV)
need_bytes *= tic->t_ocnt;
-
- /* something is already sleeping; insert new transaction at end */
- if (!list_empty_careful(&log->l_reserveq)) {
- spin_lock(&log->l_grant_reserve_lock);
- /* recheck the queue now we are locked */
- if (list_empty(&log->l_reserveq)) {
- spin_unlock(&log->l_grant_reserve_lock);
- goto redo;
- }
- list_add_tail(&tic->t_queue, &log->l_reserveq);
-
- trace_xfs_log_grant_sleep1(log, tic);
-
- /*
- * Gotta check this before going to sleep, while we're
- * holding the grant lock.
- */
- if (XLOG_FORCED_SHUTDOWN(log))
- goto error_return;
-
- XFS_STATS_INC(xs_sleep_logspace);
- xlog_wait(&tic->t_wait, &log->l_grant_reserve_lock);
-
- /*
- * If we got an error, and the filesystem is shutting down,
- * we'll catch it down below. So just continue...
- */
- trace_xfs_log_grant_wake1(log, tic);
- }
-
-redo:
- if (XLOG_FORCED_SHUTDOWN(log))
- goto error_return_unlocked;
-
free_bytes = xlog_space_left(log, &log->l_grant_reserve_head);
- if (free_bytes < need_bytes) {
+ if (!list_empty_careful(&log->l_reserveq)) {
spin_lock(&log->l_grant_reserve_lock);
- if (list_empty(&tic->t_queue))
- list_add_tail(&tic->t_queue, &log->l_reserveq);
-
- trace_xfs_log_grant_sleep2(log, tic);
-
- if (XLOG_FORCED_SHUTDOWN(log))
- goto error_return;
-
- xlog_grant_push_ail(log, need_bytes);
-
- XFS_STATS_INC(xs_sleep_logspace);
- xlog_wait(&tic->t_wait, &log->l_grant_reserve_lock);
-
- trace_xfs_log_grant_wake2(log, tic);
- goto redo;
- }
-
- if (!list_empty(&tic->t_queue)) {
+ if (!xlog_reserveq_wake(log, &free_bytes) ||
+ free_bytes < need_bytes)
+ error = xlog_reserveq_wait(log, tic, need_bytes);
+ spin_unlock(&log->l_grant_reserve_lock);
+ } else if (free_bytes < need_bytes) {
spin_lock(&log->l_grant_reserve_lock);
- list_del_init(&tic->t_queue);
+ error = xlog_reserveq_wait(log, tic, need_bytes);
spin_unlock(&log->l_grant_reserve_lock);
}
+ if (error)
+ return error;
- /* we've got enough space */
xlog_grant_add_space(log, &log->l_grant_reserve_head, need_bytes);
xlog_grant_add_space(log, &log->l_grant_write_head, need_bytes);
trace_xfs_log_grant_exit(log, tic);
xlog_verify_grant_tail(log);
return 0;
-
-error_return_unlocked:
- spin_lock(&log->l_grant_reserve_lock);
-error_return:
- list_del_init(&tic->t_queue);
- spin_unlock(&log->l_grant_reserve_lock);
- trace_xfs_log_grant_error(log, tic);
-
- /*
- * If we are failing, make sure the ticket doesn't have any
- * current reservations. We don't want to add this back when
- * the ticket/transaction gets cancelled.
- */
- tic->t_curr_res = 0;
- tic->t_cnt = 0; /* ungrant will give back unit_res * t_cnt. */
- return XFS_ERROR(EIO);
-} /* xlog_grant_log_space */
-
+}
/*
* Replenish the byte reservation required by moving the grant write head.
@@ -2605,10 +2667,12 @@ error_return:
* free fast path.
*/
STATIC int
-xlog_regrant_write_log_space(xlog_t *log,
- xlog_ticket_t *tic)
+xlog_regrant_write_log_space(
+ struct log *log,
+ struct xlog_ticket *tic)
{
- int free_bytes, need_bytes;
+ int free_bytes, need_bytes;
+ int error = 0;
tic->t_curr_res = tic->t_unit_res;
xlog_tic_reset_res(tic);
@@ -2616,104 +2680,38 @@ xlog_regrant_write_log_space(xlog_t *log,
if (tic->t_cnt > 0)
return 0;
-#ifdef DEBUG
- if (log->l_flags & XLOG_ACTIVE_RECOVERY)
- panic("regrant Recovery problem");
-#endif
+ ASSERT(!(log->l_flags & XLOG_ACTIVE_RECOVERY));
trace_xfs_log_regrant_write_enter(log, tic);
- if (XLOG_FORCED_SHUTDOWN(log))
- goto error_return_unlocked;
- /* If there are other waiters on the queue then give them a
- * chance at logspace before us. Wake up the first waiters,
- * if we do not wake up all the waiters then go to sleep waiting
- * for more free space, otherwise try to get some space for
- * this transaction.
+ /*
+ * If there are other waiters on the queue then give them a chance at
+ * logspace before us. Wake up the first waiters, if we do not wake
+ * up all the waiters then go to sleep waiting for more free space,
+ * otherwise try to get some space for this transaction.
*/
need_bytes = tic->t_unit_res;
- if (!list_empty_careful(&log->l_writeq)) {
- struct xlog_ticket *ntic;
-
- spin_lock(&log->l_grant_write_lock);
- free_bytes = xlog_space_left(log, &log->l_grant_write_head);
- list_for_each_entry(ntic, &log->l_writeq, t_queue) {
- ASSERT(ntic->t_flags & XLOG_TIC_PERM_RESERV);
-
- if (free_bytes < ntic->t_unit_res)
- break;
- free_bytes -= ntic->t_unit_res;
- wake_up(&ntic->t_wait);
- }
-
- if (ntic != list_first_entry(&log->l_writeq,
- struct xlog_ticket, t_queue)) {
- if (list_empty(&tic->t_queue))
- list_add_tail(&tic->t_queue, &log->l_writeq);
- trace_xfs_log_regrant_write_sleep1(log, tic);
-
- xlog_grant_push_ail(log, need_bytes);
-
- XFS_STATS_INC(xs_sleep_logspace);
- xlog_wait(&tic->t_wait, &log->l_grant_write_lock);
- trace_xfs_log_regrant_write_wake1(log, tic);
- } else
- spin_unlock(&log->l_grant_write_lock);
- }
-
-redo:
- if (XLOG_FORCED_SHUTDOWN(log))
- goto error_return_unlocked;
-
free_bytes = xlog_space_left(log, &log->l_grant_write_head);
- if (free_bytes < need_bytes) {
+ if (!list_empty_careful(&log->l_writeq)) {
spin_lock(&log->l_grant_write_lock);
- if (list_empty(&tic->t_queue))
- list_add_tail(&tic->t_queue, &log->l_writeq);
-
- if (XLOG_FORCED_SHUTDOWN(log))
- goto error_return;
-
- xlog_grant_push_ail(log, need_bytes);
-
- XFS_STATS_INC(xs_sleep_logspace);
- trace_xfs_log_regrant_write_sleep2(log, tic);
- xlog_wait(&tic->t_wait, &log->l_grant_write_lock);
-
- trace_xfs_log_regrant_write_wake2(log, tic);
- goto redo;
- }
-
- if (!list_empty(&tic->t_queue)) {
+ if (!xlog_writeq_wake(log, &free_bytes) ||
+ free_bytes < need_bytes)
+ error = xlog_writeq_wait(log, tic, need_bytes);
+ spin_unlock(&log->l_grant_write_lock);
+ } else if (free_bytes < need_bytes) {
spin_lock(&log->l_grant_write_lock);
- list_del_init(&tic->t_queue);
+ error = xlog_writeq_wait(log, tic, need_bytes);
spin_unlock(&log->l_grant_write_lock);
}
- /* we've got enough space */
+ if (error)
+ return error;
+
xlog_grant_add_space(log, &log->l_grant_write_head, need_bytes);
trace_xfs_log_regrant_write_exit(log, tic);
xlog_verify_grant_tail(log);
return 0;
-
-
- error_return_unlocked:
- spin_lock(&log->l_grant_write_lock);
- error_return:
- list_del_init(&tic->t_queue);
- spin_unlock(&log->l_grant_write_lock);
- trace_xfs_log_regrant_write_error(log, tic);
-
- /*
- * If we are failing, make sure the ticket doesn't have any
- * current reservations. We don't want to add this back when
- * the ticket/transaction gets cancelled.
- */
- tic->t_curr_res = 0;
- tic->t_cnt = 0; /* ungrant will give back unit_res * t_cnt. */
- return XFS_ERROR(EIO);
-} /* xlog_regrant_write_log_space */
-
+}
/* The first cnt-1 times through here we don't need to
* move the grant write head because the permanent
diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h
index 78c9039994a..3f7bf451c03 100644
--- a/fs/xfs/xfs_log.h
+++ b/fs/xfs/xfs_log.h
@@ -137,7 +137,7 @@ struct xfs_trans;
void xfs_log_item_init(struct xfs_mount *mp,
struct xfs_log_item *item,
int type,
- struct xfs_item_ops *ops);
+ const struct xfs_item_ops *ops);
xfs_lsn_t xfs_log_done(struct xfs_mount *mp,
struct xlog_ticket *ticket,
diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c
index 5cff443f6cd..0bbb1a41998 100644
--- a/fs/xfs/xfs_qm.c
+++ b/fs/xfs/xfs_qm.c
@@ -674,7 +674,8 @@ xfs_qm_dqattach_one(
* disk and we didn't ask it to allocate;
* ESRCH if quotas got turned off suddenly.
*/
- error = xfs_qm_dqget(ip->i_mount, ip, id, type, XFS_QMOPT_DOWARN, &dqp);
+ error = xfs_qm_dqget(ip->i_mount, ip, id, type,
+ doalloc | XFS_QMOPT_DOWARN, &dqp);
if (error)
return error;
diff --git a/fs/xfs/xfs_sync.c b/fs/xfs/xfs_sync.c
index aa3dc1a4d53..be5c51d8f75 100644
--- a/fs/xfs/xfs_sync.c
+++ b/fs/xfs/xfs_sync.c
@@ -770,6 +770,17 @@ restart:
if (!xfs_iflock_nowait(ip)) {
if (!(sync_mode & SYNC_WAIT))
goto out;
+
+ /*
+ * If we only have a single dirty inode in a cluster there is
+ * a fair chance that the AIL push may have pushed it into
+ * the buffer, but xfsbufd won't touch it until 30 seconds
+ * from now, and thus we will lock up here.
+ *
+ * Promote the inode buffer to the front of the delwri list
+ * and wake up xfsbufd now.
+ */
+ xfs_promote_inode(ip);
xfs_iflock(ip);
}
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index f1d2802b2f0..49403579887 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -834,18 +834,14 @@ DEFINE_LOGGRANT_EVENT(xfs_log_umount_write);
DEFINE_LOGGRANT_EVENT(xfs_log_grant_enter);
DEFINE_LOGGRANT_EVENT(xfs_log_grant_exit);
DEFINE_LOGGRANT_EVENT(xfs_log_grant_error);
-DEFINE_LOGGRANT_EVENT(xfs_log_grant_sleep1);
-DEFINE_LOGGRANT_EVENT(xfs_log_grant_wake1);
-DEFINE_LOGGRANT_EVENT(xfs_log_grant_sleep2);
-DEFINE_LOGGRANT_EVENT(xfs_log_grant_wake2);
+DEFINE_LOGGRANT_EVENT(xfs_log_grant_sleep);
+DEFINE_LOGGRANT_EVENT(xfs_log_grant_wake);
DEFINE_LOGGRANT_EVENT(xfs_log_grant_wake_up);
DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_enter);
DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_exit);
DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_error);
-DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_sleep1);
-DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_wake1);
-DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_sleep2);
-DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_wake2);
+DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_sleep);
+DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_wake);
DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_wake_up);
DEFINE_LOGGRANT_EVENT(xfs_log_regrant_reserve_enter);
DEFINE_LOGGRANT_EVENT(xfs_log_regrant_reserve_exit);
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
index 603f3eb5204..3ae713c0abd 100644
--- a/fs/xfs/xfs_trans.h
+++ b/fs/xfs/xfs_trans.h
@@ -326,7 +326,7 @@ typedef struct xfs_log_item {
struct xfs_log_item *);
/* buffer item iodone */
/* callback func */
- struct xfs_item_ops *li_ops; /* function list */
+ const struct xfs_item_ops *li_ops; /* function list */
/* delayed logging */
struct list_head li_cil; /* CIL pointers */
@@ -341,7 +341,7 @@ typedef struct xfs_log_item {
{ XFS_LI_IN_AIL, "IN_AIL" }, \
{ XFS_LI_ABORTED, "ABORTED" }
-typedef struct xfs_item_ops {
+struct xfs_item_ops {
uint (*iop_size)(xfs_log_item_t *);
void (*iop_format)(xfs_log_item_t *, struct xfs_log_iovec *);
void (*iop_pin)(xfs_log_item_t *);
@@ -352,7 +352,7 @@ typedef struct xfs_item_ops {
void (*iop_push)(xfs_log_item_t *);
bool (*iop_pushbuf)(xfs_log_item_t *);
void (*iop_committing)(xfs_log_item_t *, xfs_lsn_t);
-} xfs_item_ops_t;
+};
#define IOP_SIZE(ip) (*(ip)->li_ops->iop_size)(ip)
#define IOP_FORMAT(ip,vp) (*(ip)->li_ops->iop_format)(ip, vp)
diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c
index 4ecf2a54906..ce9268a2f56 100644
--- a/fs/xfs/xfs_vnodeops.c
+++ b/fs/xfs/xfs_vnodeops.c
@@ -112,7 +112,7 @@ xfs_readlink(
char *link)
{
xfs_mount_t *mp = ip->i_mount;
- int pathlen;
+ xfs_fsize_t pathlen;
int error = 0;
trace_xfs_readlink(ip);
@@ -122,13 +122,19 @@ xfs_readlink(
xfs_ilock(ip, XFS_ILOCK_SHARED);
- ASSERT(S_ISLNK(ip->i_d.di_mode));
- ASSERT(ip->i_d.di_size <= MAXPATHLEN);
-
pathlen = ip->i_d.di_size;
if (!pathlen)
goto out;
+ if (pathlen < 0 || pathlen > MAXPATHLEN) {
+ xfs_alert(mp, "%s: inode (%llu) bad symlink length (%lld)",
+ __func__, (unsigned long long) ip->i_ino,
+ (long long) pathlen);
+ ASSERT(0);
+ return XFS_ERROR(EFSCORRUPTED);
+ }
+
+
if (ip->i_df.if_flags & XFS_IFINLINE) {
memcpy(link, ip->i_df.if_u1.if_data, pathlen);
link[pathlen] = '\0';