summaryrefslogtreecommitdiffstats
path: root/fs
diff options
context:
space:
mode:
Diffstat (limited to 'fs')
-rw-r--r--fs/9p/fid.c1
-rw-r--r--fs/Kconfig27
-rw-r--r--fs/Kconfig.binfmt2
-rw-r--r--fs/bio.c158
-rw-r--r--fs/cramfs/inode.c1
-rw-r--r--fs/dcache.c114
-rw-r--r--fs/dlm/Makefile1
-rw-r--r--fs/dlm/config.c50
-rw-r--r--fs/dlm/config.h3
-rw-r--r--fs/dlm/dlm_internal.h9
-rw-r--r--fs/dlm/lock.c5
-rw-r--r--fs/dlm/lock.h1
-rw-r--r--fs/dlm/main.c7
-rw-r--r--fs/dlm/member.c34
-rw-r--r--fs/dlm/plock.c (renamed from fs/gfs2/locking/dlm/plock.c)169
-rw-r--r--fs/dlm/recoverd.c1
-rw-r--r--fs/ext2/ialloc.c2
-rw-r--r--fs/ext2/inode.c4
-rw-r--r--fs/ext2/ioctl.c57
-rw-r--r--fs/ext3/ialloc.c2
-rw-r--r--fs/ext3/inode.c6
-rw-r--r--fs/ext3/ioctl.c103
-rw-r--r--fs/ext4/ialloc.c2
-rw-r--r--fs/ext4/inode.c6
-rw-r--r--fs/ext4/ioctl.c86
-rw-r--r--fs/fat/file.c12
-rw-r--r--fs/file_table.c42
-rw-r--r--fs/gfs2/Kconfig2
-rw-r--r--fs/gfs2/Makefile2
-rw-r--r--fs/gfs2/acl.c6
-rw-r--r--fs/gfs2/bmap.c670
-rw-r--r--fs/gfs2/dir.c84
-rw-r--r--fs/gfs2/eattr.c58
-rw-r--r--fs/gfs2/glock.c188
-rw-r--r--fs/gfs2/glock.h14
-rw-r--r--fs/gfs2/glops.c10
-rw-r--r--fs/gfs2/incore.h40
-rw-r--r--fs/gfs2/inode.c72
-rw-r--r--fs/gfs2/inode.h22
-rw-r--r--fs/gfs2/lm.c210
-rw-r--r--fs/gfs2/lm.h42
-rw-r--r--fs/gfs2/locking/dlm/Makefile2
-rw-r--r--fs/gfs2/locking/dlm/lock.c7
-rw-r--r--fs/gfs2/locking/dlm/lock_dlm.h13
-rw-r--r--fs/gfs2/locking/dlm/main.c10
-rw-r--r--fs/gfs2/locking/dlm/mount.c21
-rw-r--r--fs/gfs2/locking/dlm/sysfs.c2
-rw-r--r--fs/gfs2/locking/dlm/thread.c10
-rw-r--r--fs/gfs2/locking/nolock/main.c2
-rw-r--r--fs/gfs2/log.c19
-rw-r--r--fs/gfs2/lops.c21
-rw-r--r--fs/gfs2/lops.h11
-rw-r--r--fs/gfs2/main.c10
-rw-r--r--fs/gfs2/ops_address.c44
-rw-r--r--fs/gfs2/ops_dentry.c4
-rw-r--r--fs/gfs2/ops_export.c2
-rw-r--r--fs/gfs2/ops_file.c37
-rw-r--r--fs/gfs2/ops_fstype.c80
-rw-r--r--fs/gfs2/ops_inode.c42
-rw-r--r--fs/gfs2/ops_inode.h1
-rw-r--r--fs/gfs2/ops_super.c1
-rw-r--r--fs/gfs2/quota.c74
-rw-r--r--fs/gfs2/quota.h17
-rw-r--r--fs/gfs2/recovery.c15
-rw-r--r--fs/gfs2/rgrp.c370
-rw-r--r--fs/gfs2/rgrp.h8
-rw-r--r--fs/gfs2/super.c6
-rw-r--r--fs/gfs2/super.h1
-rw-r--r--fs/gfs2/sys.c7
-rw-r--r--fs/gfs2/trans.c25
-rw-r--r--fs/gfs2/trans.h2
-rw-r--r--fs/gfs2/util.c24
-rw-r--r--fs/gfs2/util.h2
-rw-r--r--fs/hfsplus/ioctl.c40
-rw-r--r--fs/inode.c51
-rw-r--r--fs/internal.h11
-rw-r--r--fs/jffs2/jffs2_fs_i.h2
-rw-r--r--fs/jffs2/jffs2_fs_sb.h2
-rw-r--r--fs/jfs/ioctl.c33
-rw-r--r--fs/jfs/jfs_dmap.c11
-rw-r--r--fs/jfs/jfs_dmap.h2
-rw-r--r--fs/jfs/jfs_imap.c15
-rw-r--r--fs/jfs/jfs_xtree.c26
-rw-r--r--fs/locks.c1
-rw-r--r--fs/namei.c275
-rw-r--r--fs/namespace.c647
-rw-r--r--fs/ncpfs/ioctl.c54
-rw-r--r--fs/nfs/dir.c3
-rw-r--r--fs/nfsd/nfs4proc.c7
-rw-r--r--fs/nfsd/nfs4recover.c16
-rw-r--r--fs/nfsd/nfs4state.c3
-rw-r--r--fs/nfsd/vfs.c72
-rw-r--r--fs/ocfs2/Makefile14
-rw-r--r--fs/ocfs2/alloc.c465
-rw-r--r--fs/ocfs2/aops.c6
-rw-r--r--fs/ocfs2/cluster/Makefile2
-rw-r--r--fs/ocfs2/cluster/netdebug.c441
-rw-r--r--fs/ocfs2/cluster/nodemanager.c5
-rw-r--r--fs/ocfs2/cluster/sys.c9
-rw-r--r--fs/ocfs2/cluster/tcp.c164
-rw-r--r--fs/ocfs2/cluster/tcp.h32
-rw-r--r--fs/ocfs2/cluster/tcp_internal.h26
-rw-r--r--fs/ocfs2/dlm/Makefile2
-rw-r--r--fs/ocfs2/dlm/dlmcommon.h49
-rw-r--r--fs/ocfs2/dlm/dlmdebug.c911
-rw-r--r--fs/ocfs2/dlm/dlmdebug.h86
-rw-r--r--fs/ocfs2/dlm/dlmdomain.c70
-rw-r--r--fs/ocfs2/dlm/dlmlock.c22
-rw-r--r--fs/ocfs2/dlm/dlmmaster.c200
-rw-r--r--fs/ocfs2/dlmglue.c645
-rw-r--r--fs/ocfs2/dlmglue.h5
-rw-r--r--fs/ocfs2/file.c4
-rw-r--r--fs/ocfs2/heartbeat.c184
-rw-r--r--fs/ocfs2/heartbeat.h17
-rw-r--r--fs/ocfs2/ioctl.c24
-rw-r--r--fs/ocfs2/ioctl.h3
-rw-r--r--fs/ocfs2/journal.c211
-rw-r--r--fs/ocfs2/journal.h4
-rw-r--r--fs/ocfs2/localalloc.c4
-rw-r--r--fs/ocfs2/namei.c4
-rw-r--r--fs/ocfs2/ocfs2.h77
-rw-r--r--fs/ocfs2/ocfs2_fs.h79
-rw-r--r--fs/ocfs2/ocfs2_lockid.h2
-rw-r--r--fs/ocfs2/slot_map.c454
-rw-r--r--fs/ocfs2/slot_map.h32
-rw-r--r--fs/ocfs2/stack_o2cb.c420
-rw-r--r--fs/ocfs2/stack_user.c883
-rw-r--r--fs/ocfs2/stackglue.c568
-rw-r--r--fs/ocfs2/stackglue.h261
-rw-r--r--fs/ocfs2/suballoc.c103
-rw-r--r--fs/ocfs2/suballoc.h1
-rw-r--r--fs/ocfs2/super.c208
-rw-r--r--fs/open.c149
-rw-r--r--fs/partitions/check.c4
-rw-r--r--fs/pipe.c3
-rw-r--r--fs/pnode.c60
-rw-r--r--fs/pnode.h2
-rw-r--r--fs/proc/base.c125
-rw-r--r--fs/proc/proc_net.c6
-rw-r--r--fs/read_write.c6
-rw-r--r--fs/reiserfs/ioctl.c63
-rw-r--r--fs/reiserfs/journal.c2
-rw-r--r--fs/reiserfs/xattr.c1
-rw-r--r--fs/select.c2
-rw-r--r--fs/seq_file.c113
-rw-r--r--fs/super.c25
-rw-r--r--fs/sysfs/dir.c1
-rw-r--r--fs/sysfs/file.c6
-rw-r--r--fs/sysfs/symlink.c9
-rw-r--r--fs/udf/Makefile2
-rw-r--r--fs/udf/balloc.c13
-rw-r--r--fs/udf/crc.c172
-rw-r--r--fs/udf/dir.c83
-rw-r--r--fs/udf/ecma_167.h13
-rw-r--r--fs/udf/file.c47
-rw-r--r--fs/udf/ialloc.c13
-rw-r--r--fs/udf/inode.c208
-rw-r--r--fs/udf/lowlevel.c1
-rw-r--r--fs/udf/misc.c26
-rw-r--r--fs/udf/namei.c218
-rw-r--r--fs/udf/partition.c67
-rw-r--r--fs/udf/super.c1262
-rw-r--r--fs/udf/symlink.c1
-rw-r--r--fs/udf/truncate.c81
-rw-r--r--fs/udf/udf_i.h30
-rw-r--r--fs/udf/udf_sb.h109
-rw-r--r--fs/udf/udfdecl.h67
-rw-r--r--fs/udf/udfend.h22
-rw-r--r--fs/udf/udftime.c35
-rw-r--r--fs/udf/unicode.c62
-rw-r--r--fs/utimes.c18
-rw-r--r--fs/xattr.c39
-rw-r--r--fs/xfs/Kconfig12
-rw-r--r--fs/xfs/linux-2.6/kmem.c6
-rw-r--r--fs/xfs/linux-2.6/sema.h2
-rw-r--r--fs/xfs/linux-2.6/xfs_aops.c12
-rw-r--r--fs/xfs/linux-2.6/xfs_buf.c8
-rw-r--r--fs/xfs/linux-2.6/xfs_buf.h8
-rw-r--r--fs/xfs/linux-2.6/xfs_cred.h2
-rw-r--r--fs/xfs/linux-2.6/xfs_export.c14
-rw-r--r--fs/xfs/linux-2.6/xfs_file.c13
-rw-r--r--fs/xfs/linux-2.6/xfs_fs_subr.c36
-rw-r--r--fs/xfs/linux-2.6/xfs_ioctl.c689
-rw-r--r--fs/xfs/linux-2.6/xfs_iops.c230
-rw-r--r--fs/xfs/linux-2.6/xfs_linux.h1
-rw-r--r--fs/xfs/linux-2.6/xfs_lrw.c79
-rw-r--r--fs/xfs/linux-2.6/xfs_lrw.h3
-rw-r--r--fs/xfs/linux-2.6/xfs_stats.h4
-rw-r--r--fs/xfs/linux-2.6/xfs_super.c27
-rw-r--r--fs/xfs/linux-2.6/xfs_super.h8
-rw-r--r--fs/xfs/linux-2.6/xfs_vfs.h1
-rw-r--r--fs/xfs/linux-2.6/xfs_vnode.h30
-rw-r--r--fs/xfs/quota/xfs_dquot.c20
-rw-r--r--fs/xfs/quota/xfs_dquot_item.c14
-rw-r--r--fs/xfs/quota/xfs_qm.c76
-rw-r--r--fs/xfs/quota/xfs_qm.h2
-rw-r--r--fs/xfs/quota/xfs_qm_stats.h4
-rw-r--r--fs/xfs/quota/xfs_qm_syscalls.c44
-rw-r--r--fs/xfs/support/ktrace.c37
-rw-r--r--fs/xfs/support/ktrace.h3
-rw-r--r--fs/xfs/xfs.h2
-rw-r--r--fs/xfs/xfs_acl.c16
-rw-r--r--fs/xfs/xfs_alloc.c65
-rw-r--r--fs/xfs/xfs_attr.c10
-rw-r--r--fs/xfs/xfs_attr_leaf.c2
-rw-r--r--fs/xfs/xfs_bmap.c59
-rw-r--r--fs/xfs/xfs_bmap.h2
-rw-r--r--fs/xfs/xfs_bmap_btree.c54
-rw-r--r--fs/xfs/xfs_buf_item.c7
-rw-r--r--fs/xfs/xfs_dir2.c62
-rw-r--r--fs/xfs/xfs_dir2.h12
-rw-r--r--fs/xfs/xfs_filestream.c2
-rw-r--r--fs/xfs/xfs_ialloc.c44
-rw-r--r--fs/xfs/xfs_iget.c49
-rw-r--r--fs/xfs/xfs_inode.c823
-rw-r--r--fs/xfs/xfs_inode.h23
-rw-r--r--fs/xfs/xfs_inode_item.c8
-rw-r--r--fs/xfs/xfs_inode_item.h8
-rw-r--r--fs/xfs/xfs_iomap.c7
-rw-r--r--fs/xfs/xfs_itable.c7
-rw-r--r--fs/xfs/xfs_log.c259
-rw-r--r--fs/xfs/xfs_log.h5
-rw-r--r--fs/xfs/xfs_log_priv.h93
-rw-r--r--fs/xfs/xfs_log_recover.c123
-rw-r--r--fs/xfs/xfs_mount.c66
-rw-r--r--fs/xfs/xfs_mount.h30
-rw-r--r--fs/xfs/xfs_rename.c121
-rw-r--r--fs/xfs/xfs_rtalloc.c41
-rw-r--r--fs/xfs/xfs_rw.c8
-rw-r--r--fs/xfs/xfs_trans.h8
-rw-r--r--fs/xfs/xfs_trans_ail.c151
-rw-r--r--fs/xfs/xfs_trans_buf.c15
-rw-r--r--fs/xfs/xfs_types.h5
-rw-r--r--fs/xfs/xfs_utils.c26
-rw-r--r--fs/xfs/xfs_utils.h15
-rw-r--r--fs/xfs/xfs_vfsops.c76
-rw-r--r--fs/xfs/xfs_vnodeops.c505
-rw-r--r--fs/xfs/xfs_vnodeops.h33
238 files changed, 12005 insertions, 6571 deletions
diff --git a/fs/9p/fid.c b/fs/9p/fid.c
index dfebdbe7440..3031e3233dd 100644
--- a/fs/9p/fid.c
+++ b/fs/9p/fid.c
@@ -26,7 +26,6 @@
#include <linux/fs.h>
#include <linux/sched.h>
#include <linux/idr.h>
-#include <asm/semaphore.h>
#include <net/9p/9p.h>
#include <net/9p/client.h>
diff --git a/fs/Kconfig b/fs/Kconfig
index c509123bea4..8b18a875867 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -444,6 +444,32 @@ config OCFS2_FS
For more information on OCFS2, see the file
<file:Documentation/filesystems/ocfs2.txt>.
+config OCFS2_FS_O2CB
+ tristate "O2CB Kernelspace Clustering"
+ depends on OCFS2_FS
+ default y
+ help
+ OCFS2 includes a simple kernelspace clustering package, the OCFS2
+ Cluster Base. It only requires a very small userspace component
+ to configure it. This comes with the standard ocfs2-tools package.
+ O2CB is limited to maintaining a cluster for OCFS2 file systems.
+ It cannot manage any other cluster applications.
+
+ It is always safe to say Y here, as the clustering method is
+ run-time selectable.
+
+config OCFS2_FS_USERSPACE_CLUSTER
+ tristate "OCFS2 Userspace Clustering"
+ depends on OCFS2_FS && DLM
+ default y
+ help
+ This option will allow OCFS2 to use userspace clustering services
+ in conjunction with the DLM in fs/dlm. If you are using a
+ userspace cluster manager, say Y here.
+
+ It is safe to say Y, as the clustering method is run-time
+ selectable.
+
config OCFS2_DEBUG_MASKLOG
bool "OCFS2 logging support"
depends on OCFS2_FS
@@ -663,6 +689,7 @@ config ZISOFS
config UDF_FS
tristate "UDF file system support"
+ select CRC_ITU_T
help
This is the new file system used on some CD-ROMs and DVDs. Say Y if
you intend to mount DVD discs or CDRW's written in packet mode, or
diff --git a/fs/Kconfig.binfmt b/fs/Kconfig.binfmt
index b5c3b6114ad..853845abcca 100644
--- a/fs/Kconfig.binfmt
+++ b/fs/Kconfig.binfmt
@@ -62,7 +62,7 @@ config BINFMT_SHARED_FLAT
config BINFMT_AOUT
tristate "Kernel support for a.out and ECOFF binaries"
depends on ARCH_SUPPORTS_AOUT && \
- (X86_32 || ALPHA || ARM || M68K || SPARC32)
+ (X86_32 || ALPHA || ARM || M68K)
---help---
A.out (Assembler.OUTput) is a set of formats for libraries and
executables used in the earliest versions of UNIX. Linux used
diff --git a/fs/bio.c b/fs/bio.c
index 553b5b7960a..6e0b6f66df0 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -444,22 +444,27 @@ int bio_add_page(struct bio *bio, struct page *page, unsigned int len,
struct bio_map_data {
struct bio_vec *iovecs;
- void __user *userptr;
+ int nr_sgvecs;
+ struct sg_iovec *sgvecs;
};
-static void bio_set_map_data(struct bio_map_data *bmd, struct bio *bio)
+static void bio_set_map_data(struct bio_map_data *bmd, struct bio *bio,
+ struct sg_iovec *iov, int iov_count)
{
memcpy(bmd->iovecs, bio->bi_io_vec, sizeof(struct bio_vec) * bio->bi_vcnt);
+ memcpy(bmd->sgvecs, iov, sizeof(struct sg_iovec) * iov_count);
+ bmd->nr_sgvecs = iov_count;
bio->bi_private = bmd;
}
static void bio_free_map_data(struct bio_map_data *bmd)
{
kfree(bmd->iovecs);
+ kfree(bmd->sgvecs);
kfree(bmd);
}
-static struct bio_map_data *bio_alloc_map_data(int nr_segs)
+static struct bio_map_data *bio_alloc_map_data(int nr_segs, int iov_count)
{
struct bio_map_data *bmd = kmalloc(sizeof(*bmd), GFP_KERNEL);
@@ -467,13 +472,71 @@ static struct bio_map_data *bio_alloc_map_data(int nr_segs)
return NULL;
bmd->iovecs = kmalloc(sizeof(struct bio_vec) * nr_segs, GFP_KERNEL);
- if (bmd->iovecs)
+ if (!bmd->iovecs) {
+ kfree(bmd);
+ return NULL;
+ }
+
+ bmd->sgvecs = kmalloc(sizeof(struct sg_iovec) * iov_count, GFP_KERNEL);
+ if (bmd->sgvecs)
return bmd;
+ kfree(bmd->iovecs);
kfree(bmd);
return NULL;
}
+static int __bio_copy_iov(struct bio *bio, struct sg_iovec *iov, int iov_count,
+ int uncopy)
+{
+ int ret = 0, i;
+ struct bio_vec *bvec;
+ int iov_idx = 0;
+ unsigned int iov_off = 0;
+ int read = bio_data_dir(bio) == READ;
+
+ __bio_for_each_segment(bvec, bio, i, 0) {
+ char *bv_addr = page_address(bvec->bv_page);
+ unsigned int bv_len = bvec->bv_len;
+
+ while (bv_len && iov_idx < iov_count) {
+ unsigned int bytes;
+ char *iov_addr;
+
+ bytes = min_t(unsigned int,
+ iov[iov_idx].iov_len - iov_off, bv_len);
+ iov_addr = iov[iov_idx].iov_base + iov_off;
+
+ if (!ret) {
+ if (!read && !uncopy)
+ ret = copy_from_user(bv_addr, iov_addr,
+ bytes);
+ if (read && uncopy)
+ ret = copy_to_user(iov_addr, bv_addr,
+ bytes);
+
+ if (ret)
+ ret = -EFAULT;
+ }
+
+ bv_len -= bytes;
+ bv_addr += bytes;
+ iov_addr += bytes;
+ iov_off += bytes;
+
+ if (iov[iov_idx].iov_len == iov_off) {
+ iov_idx++;
+ iov_off = 0;
+ }
+ }
+
+ if (uncopy)
+ __free_page(bvec->bv_page);
+ }
+
+ return ret;
+}
+
/**
* bio_uncopy_user - finish previously mapped bio
* @bio: bio being terminated
@@ -484,55 +547,56 @@ static struct bio_map_data *bio_alloc_map_data(int nr_segs)
int bio_uncopy_user(struct bio *bio)
{
struct bio_map_data *bmd = bio->bi_private;
- const int read = bio_data_dir(bio) == READ;
- struct bio_vec *bvec;
- int i, ret = 0;
+ int ret;
- __bio_for_each_segment(bvec, bio, i, 0) {
- char *addr = page_address(bvec->bv_page);
- unsigned int len = bmd->iovecs[i].bv_len;
+ ret = __bio_copy_iov(bio, bmd->sgvecs, bmd->nr_sgvecs, 1);
- if (read && !ret && copy_to_user(bmd->userptr, addr, len))
- ret = -EFAULT;
-
- __free_page(bvec->bv_page);
- bmd->userptr += len;
- }
bio_free_map_data(bmd);
bio_put(bio);
return ret;
}
/**
- * bio_copy_user - copy user data to bio
+ * bio_copy_user_iov - copy user data to bio
* @q: destination block queue
- * @uaddr: start of user address
- * @len: length in bytes
+ * @iov: the iovec.
+ * @iov_count: number of elements in the iovec
* @write_to_vm: bool indicating writing to pages or not
*
* Prepares and returns a bio for indirect user io, bouncing data
* to/from kernel pages as necessary. Must be paired with
* call bio_uncopy_user() on io completion.
*/
-struct bio *bio_copy_user(struct request_queue *q, unsigned long uaddr,
- unsigned int len, int write_to_vm)
+struct bio *bio_copy_user_iov(struct request_queue *q, struct sg_iovec *iov,
+ int iov_count, int write_to_vm)
{
- unsigned long end = (uaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
- unsigned long start = uaddr >> PAGE_SHIFT;
struct bio_map_data *bmd;
struct bio_vec *bvec;
struct page *page;
struct bio *bio;
int i, ret;
+ int nr_pages = 0;
+ unsigned int len = 0;
- bmd = bio_alloc_map_data(end - start);
+ for (i = 0; i < iov_count; i++) {
+ unsigned long uaddr;
+ unsigned long end;
+ unsigned long start;
+
+ uaddr = (unsigned long)iov[i].iov_base;
+ end = (uaddr + iov[i].iov_len + PAGE_SIZE - 1) >> PAGE_SHIFT;
+ start = uaddr >> PAGE_SHIFT;
+
+ nr_pages += end - start;
+ len += iov[i].iov_len;
+ }
+
+ bmd = bio_alloc_map_data(nr_pages, iov_count);
if (!bmd)
return ERR_PTR(-ENOMEM);
- bmd->userptr = (void __user *) uaddr;
-
ret = -ENOMEM;
- bio = bio_alloc(GFP_KERNEL, end - start);
+ bio = bio_alloc(GFP_KERNEL, nr_pages);
if (!bio)
goto out_bmd;
@@ -564,22 +628,12 @@ struct bio *bio_copy_user(struct request_queue *q, unsigned long uaddr,
* success
*/
if (!write_to_vm) {
- char __user *p = (char __user *) uaddr;
-
- /*
- * for a write, copy in data to kernel pages
- */
- ret = -EFAULT;
- bio_for_each_segment(bvec, bio, i) {
- char *addr = page_address(bvec->bv_page);
-
- if (copy_from_user(addr, p, bvec->bv_len))
- goto cleanup;
- p += bvec->bv_len;
- }
+ ret = __bio_copy_iov(bio, iov, iov_count, 0);
+ if (ret)
+ goto cleanup;
}
- bio_set_map_data(bmd, bio);
+ bio_set_map_data(bmd, bio, iov, iov_count);
return bio;
cleanup:
bio_for_each_segment(bvec, bio, i)
@@ -591,6 +645,28 @@ out_bmd:
return ERR_PTR(ret);
}
+/**
+ * bio_copy_user - copy user data to bio
+ * @q: destination block queue
+ * @uaddr: start of user address
+ * @len: length in bytes
+ * @write_to_vm: bool indicating writing to pages or not
+ *
+ * Prepares and returns a bio for indirect user io, bouncing data
+ * to/from kernel pages as necessary. Must be paired with
+ * call bio_uncopy_user() on io completion.
+ */
+struct bio *bio_copy_user(struct request_queue *q, unsigned long uaddr,
+ unsigned int len, int write_to_vm)
+{
+ struct sg_iovec iov;
+
+ iov.iov_base = (void __user *)uaddr;
+ iov.iov_len = len;
+
+ return bio_copy_user_iov(q, &iov, 1, write_to_vm);
+}
+
static struct bio *__bio_map_user_iov(struct request_queue *q,
struct block_device *bdev,
struct sg_iovec *iov, int iov_count,
diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c
index 350680fd7da..0c3b618c15b 100644
--- a/fs/cramfs/inode.c
+++ b/fs/cramfs/inode.c
@@ -23,7 +23,6 @@
#include <linux/buffer_head.h>
#include <linux/vfs.h>
#include <linux/mutex.h>
-#include <asm/semaphore.h>
#include <asm/uaccess.h>
diff --git a/fs/dcache.c b/fs/dcache.c
index 43455776711..3ee588d5f58 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -1746,12 +1746,21 @@ shouldnt_be_hashed:
goto shouldnt_be_hashed;
}
+static int prepend(char **buffer, int *buflen, const char *str,
+ int namelen)
+{
+ *buflen -= namelen;
+ if (*buflen < 0)
+ return -ENAMETOOLONG;
+ *buffer -= namelen;
+ memcpy(*buffer, str, namelen);
+ return 0;
+}
+
/**
* d_path - return the path of a dentry
- * @dentry: dentry to report
- * @vfsmnt: vfsmnt to which the dentry belongs
- * @root: root dentry
- * @rootmnt: vfsmnt to which the root dentry belongs
+ * @path: the dentry/vfsmount to report
+ * @root: root vfsmnt/dentry (may be modified by this function)
* @buffer: buffer to return value in
* @buflen: buffer length
*
@@ -1761,23 +1770,22 @@ shouldnt_be_hashed:
* Returns the buffer or an error code if the path was too long.
*
* "buflen" should be positive. Caller holds the dcache_lock.
+ *
+ * If path is not reachable from the supplied root, then the value of
+ * root is changed (without modifying refcounts).
*/
-static char *__d_path(struct dentry *dentry, struct vfsmount *vfsmnt,
- struct path *root, char *buffer, int buflen)
+char *__d_path(const struct path *path, struct path *root,
+ char *buffer, int buflen)
{
+ struct dentry *dentry = path->dentry;
+ struct vfsmount *vfsmnt = path->mnt;
char * end = buffer+buflen;
char * retval;
- int namelen;
-
- *--end = '\0';
- buflen--;
- if (!IS_ROOT(dentry) && d_unhashed(dentry)) {
- buflen -= 10;
- end -= 10;
- if (buflen < 0)
+
+ prepend(&end, &buflen, "\0", 1);
+ if (!IS_ROOT(dentry) && d_unhashed(dentry) &&
+ (prepend(&end, &buflen, " (deleted)", 10) != 0))
goto Elong;
- memcpy(end, " (deleted)", 10);
- }
if (buflen < 1)
goto Elong;
@@ -1804,13 +1812,10 @@ static char *__d_path(struct dentry *dentry, struct vfsmount *vfsmnt,
}
parent = dentry->d_parent;
prefetch(parent);
- namelen = dentry->d_name.len;
- buflen -= namelen + 1;
- if (buflen < 0)
+ if ((prepend(&end, &buflen, dentry->d_name.name,
+ dentry->d_name.len) != 0) ||
+ (prepend(&end, &buflen, "/", 1) != 0))
goto Elong;
- end -= namelen;
- memcpy(end, dentry->d_name.name, namelen);
- *--end = '/';
retval = end;
dentry = parent;
}
@@ -1818,12 +1823,12 @@ static char *__d_path(struct dentry *dentry, struct vfsmount *vfsmnt,
return retval;
global_root:
- namelen = dentry->d_name.len;
- buflen -= namelen;
- if (buflen < 0)
+ retval += 1; /* hit the slash */
+ if (prepend(&retval, &buflen, dentry->d_name.name,
+ dentry->d_name.len) != 0)
goto Elong;
- retval -= namelen-1; /* hit the slash */
- memcpy(retval, dentry->d_name.name, namelen);
+ root->mnt = vfsmnt;
+ root->dentry = dentry;
return retval;
Elong:
return ERR_PTR(-ENAMETOOLONG);
@@ -1846,6 +1851,7 @@ char *d_path(struct path *path, char *buf, int buflen)
{
char *res;
struct path root;
+ struct path tmp;
/*
* We have various synthetic filesystems that never get mounted. On
@@ -1859,10 +1865,11 @@ char *d_path(struct path *path, char *buf, int buflen)
read_lock(&current->fs->lock);
root = current->fs->root;
- path_get(&current->fs->root);
+ path_get(&root);
read_unlock(&current->fs->lock);
spin_lock(&dcache_lock);
- res = __d_path(path->dentry, path->mnt, &root, buf, buflen);
+ tmp = root;
+ res = __d_path(path, &tmp, buf, buflen);
spin_unlock(&dcache_lock);
path_put(&root);
return res;
@@ -1890,6 +1897,48 @@ char *dynamic_dname(struct dentry *dentry, char *buffer, int buflen,
}
/*
+ * Write full pathname from the root of the filesystem into the buffer.
+ */
+char *dentry_path(struct dentry *dentry, char *buf, int buflen)
+{
+ char *end = buf + buflen;
+ char *retval;
+
+ spin_lock(&dcache_lock);
+ prepend(&end, &buflen, "\0", 1);
+ if (!IS_ROOT(dentry) && d_unhashed(dentry) &&
+ (prepend(&end, &buflen, "//deleted", 9) != 0))
+ goto Elong;
+ if (buflen < 1)
+ goto Elong;
+ /* Get '/' right */
+ retval = end-1;
+ *retval = '/';
+
+ for (;;) {
+ struct dentry *parent;
+ if (IS_ROOT(dentry))
+ break;
+
+ parent = dentry->d_parent;
+ prefetch(parent);
+
+ if ((prepend(&end, &buflen, dentry->d_name.name,
+ dentry->d_name.len) != 0) ||
+ (prepend(&end, &buflen, "/", 1) != 0))
+ goto Elong;
+
+ retval = end;
+ dentry = parent;
+ }
+ spin_unlock(&dcache_lock);
+ return retval;
+Elong:
+ spin_unlock(&dcache_lock);
+ return ERR_PTR(-ENAMETOOLONG);
+}
+
+/*
* NOTE! The user-level library version returns a
* character pointer. The kernel system call just
* returns the length of the buffer filled (which
@@ -1918,9 +1967,9 @@ asmlinkage long sys_getcwd(char __user *buf, unsigned long size)
read_lock(&current->fs->lock);
pwd = current->fs->pwd;
- path_get(&current->fs->pwd);
+ path_get(&pwd);
root = current->fs->root;
- path_get(&current->fs->root);
+ path_get(&root);
read_unlock(&current->fs->lock);
error = -ENOENT;
@@ -1928,9 +1977,10 @@ asmlinkage long sys_getcwd(char __user *buf, unsigned long size)
spin_lock(&dcache_lock);
if (pwd.dentry->d_parent == pwd.dentry || !d_unhashed(pwd.dentry)) {
unsigned long len;
+ struct path tmp = root;
char * cwd;
- cwd = __d_path(pwd.dentry, pwd.mnt, &root, page, PAGE_SIZE);
+ cwd = __d_path(&pwd, &tmp, page, PAGE_SIZE);
spin_unlock(&dcache_lock);
error = PTR_ERR(cwd);
diff --git a/fs/dlm/Makefile b/fs/dlm/Makefile
index d248e60951b..ca1c9124c8c 100644
--- a/fs/dlm/Makefile
+++ b/fs/dlm/Makefile
@@ -10,6 +10,7 @@ dlm-y := ast.o \
midcomms.o \
netlink.o \
lowcomms.o \
+ plock.o \
rcom.o \
recover.o \
recoverd.o \
diff --git a/fs/dlm/config.c b/fs/dlm/config.c
index c3ad1dff3b2..eac23bd288b 100644
--- a/fs/dlm/config.c
+++ b/fs/dlm/config.c
@@ -114,7 +114,7 @@ struct cluster_attribute {
};
static ssize_t cluster_set(struct cluster *cl, unsigned int *cl_field,
- unsigned int *info_field, int check_zero,
+ int *info_field, int check_zero,
const char *buf, size_t len)
{
unsigned int x;
@@ -284,6 +284,7 @@ struct node {
struct list_head list; /* space->members */
int nodeid;
int weight;
+ int new;
};
static struct configfs_group_operations clusters_ops = {
@@ -565,6 +566,7 @@ static struct config_item *make_node(struct config_group *g, const char *name)
config_item_init_type_name(&nd->item, name, &node_type);
nd->nodeid = -1;
nd->weight = 1; /* default weight of 1 if none is set */
+ nd->new = 1; /* set to 0 once it's been read by dlm_nodeid_list() */
mutex_lock(&sp->members_lock);
list_add(&nd->list, &sp->members);
@@ -805,12 +807,13 @@ static void put_comm(struct comm *cm)
}
/* caller must free mem */
-int dlm_nodeid_list(char *lsname, int **ids_out)
+int dlm_nodeid_list(char *lsname, int **ids_out, int *ids_count_out,
+ int **new_out, int *new_count_out)
{
struct space *sp;
struct node *nd;
- int i = 0, rv = 0;
- int *ids;
+ int i = 0, rv = 0, ids_count = 0, new_count = 0;
+ int *ids, *new;
sp = get_space(lsname);
if (!sp)
@@ -818,23 +821,50 @@ int dlm_nodeid_list(char *lsname, int **ids_out)
mutex_lock(&sp->members_lock);
if (!sp->members_count) {
- rv = 0;
+ rv = -EINVAL;
+ printk(KERN_ERR "dlm: zero members_count\n");
goto out;
}
- ids = kcalloc(sp->members_count, sizeof(int), GFP_KERNEL);
+ ids_count = sp->members_count;
+
+ ids = kcalloc(ids_count, sizeof(int), GFP_KERNEL);
if (!ids) {
rv = -ENOMEM;
goto out;
}
- rv = sp->members_count;
- list_for_each_entry(nd, &sp->members, list)
+ list_for_each_entry(nd, &sp->members, list) {
ids[i++] = nd->nodeid;
+ if (nd->new)
+ new_count++;
+ }
+
+ if (ids_count != i)
+ printk(KERN_ERR "dlm: bad nodeid count %d %d\n", ids_count, i);
+
+ if (!new_count)
+ goto out_ids;
+
+ new = kcalloc(new_count, sizeof(int), GFP_KERNEL);
+ if (!new) {
+ kfree(ids);
+ rv = -ENOMEM;
+ goto out;
+ }
- if (rv != i)
- printk("bad nodeid count %d %d\n", rv, i);
+ i = 0;
+ list_for_each_entry(nd, &sp->members, list) {
+ if (nd->new) {
+ new[i++] = nd->nodeid;
+ nd->new = 0;
+ }
+ }
+ *new_count_out = new_count;
+ *new_out = new;
+ out_ids:
+ *ids_count_out = ids_count;
*ids_out = ids;
out:
mutex_unlock(&sp->members_lock);
diff --git a/fs/dlm/config.h b/fs/dlm/config.h
index a3170fe2209..4f1d6fce58c 100644
--- a/fs/dlm/config.h
+++ b/fs/dlm/config.h
@@ -35,7 +35,8 @@ extern struct dlm_config_info dlm_config;
int dlm_config_init(void);
void dlm_config_exit(void);
int dlm_node_weight(char *lsname, int nodeid);
-int dlm_nodeid_list(char *lsname, int **ids_out);
+int dlm_nodeid_list(char *lsname, int **ids_out, int *ids_count_out,
+ int **new_out, int *new_count_out);
int dlm_nodeid_to_addr(int nodeid, struct sockaddr_storage *addr);
int dlm_addr_to_nodeid(struct sockaddr_storage *addr, int *nodeid);
int dlm_our_nodeid(void);
diff --git a/fs/dlm/dlm_internal.h b/fs/dlm/dlm_internal.h
index d30ea8b433a..5a7ac33b629 100644
--- a/fs/dlm/dlm_internal.h
+++ b/fs/dlm/dlm_internal.h
@@ -37,14 +37,11 @@
#include <linux/jhash.h>
#include <linux/miscdevice.h>
#include <linux/mutex.h>
-#include <asm/semaphore.h>
#include <asm/uaccess.h>
#include <linux/dlm.h>
#include "config.h"
-#define DLM_LOCKSPACE_LEN 64
-
/* Size of the temp buffer midcomms allocates on the stack.
We try to make this large enough so most messages fit.
FIXME: should sctp make this unnecessary? */
@@ -133,8 +130,10 @@ struct dlm_member {
struct dlm_recover {
struct list_head list;
- int *nodeids;
+ int *nodeids; /* nodeids of all members */
int node_count;
+ int *new; /* nodeids of new members */
+ int new_count;
uint64_t seq;
};
@@ -580,6 +579,8 @@ static inline int dlm_no_directory(struct dlm_ls *ls)
int dlm_netlink_init(void);
void dlm_netlink_exit(void);
void dlm_timeout_warn(struct dlm_lkb *lkb);
+int dlm_plock_init(void);
+void dlm_plock_exit(void);
#ifdef CONFIG_DLM_DEBUG
int dlm_register_debugfs(void);
diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c
index 8f250ac8b92..2d3d1027ce2 100644
--- a/fs/dlm/lock.c
+++ b/fs/dlm/lock.c
@@ -165,7 +165,7 @@ void dlm_print_lkb(struct dlm_lkb *lkb)
lkb->lkb_grmode, lkb->lkb_wait_type, lkb->lkb_ast_type);
}
-void dlm_print_rsb(struct dlm_rsb *r)
+static void dlm_print_rsb(struct dlm_rsb *r)
{
printk(KERN_ERR "rsb: nodeid %d flags %lx first %x rlc %d name %s\n",
r->res_nodeid, r->res_flags, r->res_first_lkid,
@@ -1956,8 +1956,7 @@ static void confirm_master(struct dlm_rsb *r, int error)
list_del_init(&lkb->lkb_rsb_lookup);
r->res_first_lkid = lkb->lkb_id;
_request_lock(r, lkb);
- } else
- r->res_nodeid = -1;
+ }
break;
default:
diff --git a/fs/dlm/lock.h b/fs/dlm/lock.h
index 05d9c82e646..88e93c80cc2 100644
--- a/fs/dlm/lock.h
+++ b/fs/dlm/lock.h
@@ -13,7 +13,6 @@
#ifndef __LOCK_DOT_H__
#define __LOCK_DOT_H__
-void dlm_print_rsb(struct dlm_rsb *r);
void dlm_dump_rsb(struct dlm_rsb *r);
void dlm_print_lkb(struct dlm_lkb *lkb);
void dlm_receive_message_saved(struct dlm_ls *ls, struct dlm_message *ms);
diff --git a/fs/dlm/main.c b/fs/dlm/main.c
index 58487fb95a4..b80e0aa3cfa 100644
--- a/fs/dlm/main.c
+++ b/fs/dlm/main.c
@@ -46,10 +46,16 @@ static int __init init_dlm(void)
if (error)
goto out_user;
+ error = dlm_plock_init();
+ if (error)
+ goto out_netlink;
+
printk("DLM (built %s %s) installed\n", __DATE__, __TIME__);
return 0;
+ out_netlink:
+ dlm_netlink_exit();
out_user:
dlm_user_exit();
out_debug:
@@ -66,6 +72,7 @@ static int __init init_dlm(void)
static void __exit exit_dlm(void)
{
+ dlm_plock_exit();
dlm_netlink_exit();
dlm_user_exit();
dlm_config_exit();
diff --git a/fs/dlm/member.c b/fs/dlm/member.c
index fa17f5a2788..26133f05ae3 100644
--- a/fs/dlm/member.c
+++ b/fs/dlm/member.c
@@ -210,6 +210,23 @@ int dlm_recover_members(struct dlm_ls *ls, struct dlm_recover *rv, int *neg_out)
}
}
+ /* Add an entry to ls_nodes_gone for members that were removed and
+ then added again, so that previous state for these nodes will be
+ cleared during recovery. */
+
+ for (i = 0; i < rv->new_count; i++) {
+ if (!dlm_is_member(ls, rv->new[i]))
+ continue;
+ log_debug(ls, "new nodeid %d is a re-added member", rv->new[i]);
+
+ memb = kzalloc(sizeof(struct dlm_member), GFP_KERNEL);
+ if (!memb)
+ return -ENOMEM;
+ memb->nodeid = rv->new[i];
+ list_add_tail(&memb->list, &ls->ls_nodes_gone);
+ neg++;
+ }
+
/* add new members to ls_nodes */
for (i = 0; i < rv->node_count; i++) {
@@ -314,15 +331,16 @@ int dlm_ls_stop(struct dlm_ls *ls)
int dlm_ls_start(struct dlm_ls *ls)
{
struct dlm_recover *rv = NULL, *rv_old;
- int *ids = NULL;
- int error, count;
+ int *ids = NULL, *new = NULL;
+ int error, ids_count = 0, new_count = 0;
rv = kzalloc(sizeof(struct dlm_recover), GFP_KERNEL);
if (!rv)
return -ENOMEM;
- error = count = dlm_nodeid_list(ls->ls_name, &ids);
- if (error <= 0)
+ error = dlm_nodeid_list(ls->ls_name, &ids, &ids_count,
+ &new, &new_count);
+ if (error < 0)
goto fail;
spin_lock(&ls->ls_recover_lock);
@@ -337,14 +355,19 @@ int dlm_ls_start(struct dlm_ls *ls)
}
rv->nodeids = ids;
- rv->node_count = count;
+ rv->node_count = ids_count;
+ rv->new = new;
+ rv->new_count = new_count;
rv->seq = ++ls->ls_recover_seq;
rv_old = ls->ls_recover_args;
ls->ls_recover_args = rv;
spin_unlock(&ls->ls_recover_lock);
if (rv_old) {
+ log_error(ls, "unused recovery %llx %d",
+ (unsigned long long)rv_old->seq, rv_old->node_count);
kfree(rv_old->nodeids);
+ kfree(rv_old->new);
kfree(rv_old);
}
@@ -354,6 +377,7 @@ int dlm_ls_start(struct dlm_ls *ls)
fail:
kfree(rv);
kfree(ids);
+ kfree(new);
return error;
}
diff --git a/fs/gfs2/locking/dlm/plock.c b/fs/dlm/plock.c
index 2ebd374b314..d6d6e370f89 100644
--- a/fs/gfs2/locking/dlm/plock.c
+++ b/fs/dlm/plock.c
@@ -1,17 +1,19 @@
/*
- * Copyright (C) 2005 Red Hat, Inc. All rights reserved.
+ * Copyright (C) 2005-2008 Red Hat, Inc. All rights reserved.
*
* This copyrighted material is made available to anyone wishing to use,
* modify, copy, or redistribute it subject to the terms and conditions
* of the GNU General Public License version 2.
*/
+#include <linux/fs.h>
#include <linux/miscdevice.h>
-#include <linux/lock_dlm_plock.h>
#include <linux/poll.h>
+#include <linux/dlm.h>
+#include <linux/dlm_plock.h>
-#include "lock_dlm.h"
-
+#include "dlm_internal.h"
+#include "lockspace.h"
static spinlock_t ops_lock;
static struct list_head send_list;
@@ -22,7 +24,7 @@ static wait_queue_head_t recv_wq;
struct plock_op {
struct list_head list;
int done;
- struct gdlm_plock_info info;
+ struct dlm_plock_info info;
};
struct plock_xop {
@@ -34,22 +36,22 @@ struct plock_xop {
};
-static inline void set_version(struct gdlm_plock_info *info)
+static inline void set_version(struct dlm_plock_info *info)
{
- info->version[0] = GDLM_PLOCK_VERSION_MAJOR;
- info->version[1] = GDLM_PLOCK_VERSION_MINOR;
- info->version[2] = GDLM_PLOCK_VERSION_PATCH;
+ info->version[0] = DLM_PLOCK_VERSION_MAJOR;
+ info->version[1] = DLM_PLOCK_VERSION_MINOR;
+ info->version[2] = DLM_PLOCK_VERSION_PATCH;
}
-static int check_version(struct gdlm_plock_info *info)
+static int check_version(struct dlm_plock_info *info)
{
- if ((GDLM_PLOCK_VERSION_MAJOR != info->version[0]) ||
- (GDLM_PLOCK_VERSION_MINOR < info->version[1])) {
- log_error("plock device version mismatch: "
+ if ((DLM_PLOCK_VERSION_MAJOR != info->version[0]) ||
+ (DLM_PLOCK_VERSION_MINOR < info->version[1])) {
+ log_print("plock device version mismatch: "
"kernel (%u.%u.%u), user (%u.%u.%u)",
- GDLM_PLOCK_VERSION_MAJOR,
- GDLM_PLOCK_VERSION_MINOR,
- GDLM_PLOCK_VERSION_PATCH,
+ DLM_PLOCK_VERSION_MAJOR,
+ DLM_PLOCK_VERSION_MINOR,
+ DLM_PLOCK_VERSION_PATCH,
info->version[0],
info->version[1],
info->version[2]);
@@ -68,25 +70,31 @@ static void send_op(struct plock_op *op)
wake_up(&send_wq);
}
-int gdlm_plock(void *lockspace, struct lm_lockname *name,
- struct file *file, int cmd, struct file_lock *fl)
+int dlm_posix_lock(dlm_lockspace_t *lockspace, u64 number, struct file *file,
+ int cmd, struct file_lock *fl)
{
- struct gdlm_ls *ls = lockspace;
+ struct dlm_ls *ls;
struct plock_op *op;
struct plock_xop *xop;
int rv;
+ ls = dlm_find_lockspace_local(lockspace);
+ if (!ls)
+ return -EINVAL;
+
xop = kzalloc(sizeof(*xop), GFP_KERNEL);
- if (!xop)
- return -ENOMEM;
+ if (!xop) {
+ rv = -ENOMEM;
+ goto out;
+ }
op = &xop->xop;
- op->info.optype = GDLM_PLOCK_OP_LOCK;
+ op->info.optype = DLM_PLOCK_OP_LOCK;
op->info.pid = fl->fl_pid;
op->info.ex = (fl->fl_type == F_WRLCK);
op->info.wait = IS_SETLKW(cmd);
- op->info.fsid = ls->id;
- op->info.number = name->ln_number;
+ op->info.fsid = ls->ls_global_id;
+ op->info.number = number;
op->info.start = fl->fl_start;
op->info.end = fl->fl_end;
if (fl->fl_lmops && fl->fl_lmops->fl_grant) {
@@ -107,12 +115,15 @@ int gdlm_plock(void *lockspace, struct lm_lockname *name,
if (xop->callback == NULL)
wait_event(recv_wq, (op->done != 0));
- else
- return -EINPROGRESS;
+ else {
+ rv = -EINPROGRESS;
+ goto out;
+ }
spin_lock(&ops_lock);
if (!list_empty(&op->list)) {
- printk(KERN_INFO "plock op on list\n");
+ log_error(ls, "dlm_posix_lock: op on list %llx",
+ (unsigned long long)number);
list_del(&op->list);
}
spin_unlock(&ops_lock);
@@ -121,17 +132,19 @@ int gdlm_plock(void *lockspace, struct lm_lockname *name,
if (!rv) {
if (posix_lock_file_wait(file, fl) < 0)
- log_error("gdlm_plock: vfs lock error %x,%llx",
- name->ln_type,
- (unsigned long long)name->ln_number);
+ log_error(ls, "dlm_posix_lock: vfs lock error %llx",
+ (unsigned long long)number);
}
kfree(xop);
+out:
+ dlm_put_lockspace(ls);
return rv;
}
+EXPORT_SYMBOL_GPL(dlm_posix_lock);
/* Returns failure iff a succesful lock operation should be canceled */
-static int gdlm_plock_callback(struct plock_op *op)
+static int dlm_plock_callback(struct plock_op *op)
{
struct file *file;
struct file_lock *fl;
@@ -142,7 +155,8 @@ static int gdlm_plock_callback(struct plock_op *op)
spin_lock(&ops_lock);
if (!list_empty(&op->list)) {
- printk(KERN_INFO "plock op on list\n");
+ log_print("dlm_plock_callback: op on list %llx",
+ (unsigned long long)op->info.number);
list_del(&op->list);
}
spin_unlock(&ops_lock);
@@ -165,19 +179,19 @@ static int gdlm_plock_callback(struct plock_op *op)
* This can only happen in the case of kmalloc() failure.
* The filesystem's own lock is the authoritative lock,
* so a failure to get the lock locally is not a disaster.
- * As long as GFS cannot reliably cancel locks (especially
+ * As long as the fs cannot reliably cancel locks (especially
* in a low-memory situation), we're better off ignoring
* this failure than trying to recover.
*/
- log_error("gdlm_plock: vfs lock error file %p fl %p",
- file, fl);
+ log_print("dlm_plock_callback: vfs lock error %llx file %p fl %p",
+ (unsigned long long)op->info.number, file, fl);
}
rv = notify(flc, NULL, 0);
if (rv) {
/* XXX: We need to cancel the fs lock here: */
- printk("gfs2 lock granted after lock request failed;"
- " dangling lock!\n");
+ log_print("dlm_plock_callback: lock granted after lock request "
+ "failed; dangling lock!\n");
goto out;
}
@@ -186,25 +200,31 @@ out:
return rv;
}
-int gdlm_punlock(void *lockspace, struct lm_lockname *name,
- struct file *file, struct file_lock *fl)
+int dlm_posix_unlock(dlm_lockspace_t *lockspace, u64 number, struct file *file,
+ struct file_lock *fl)
{
- struct gdlm_ls *ls = lockspace;
+ struct dlm_ls *ls;
struct plock_op *op;
int rv;
+ ls = dlm_find_lockspace_local(lockspace);
+ if (!ls)
+ return -EINVAL;
+
op = kzalloc(sizeof(*op), GFP_KERNEL);
- if (!op)
- return -ENOMEM;
+ if (!op) {
+ rv = -ENOMEM;
+ goto out;
+ }
if (posix_lock_file_wait(file, fl) < 0)
- log_error("gdlm_punlock: vfs unlock error %x,%llx",
- name->ln_type, (unsigned long long)name->ln_number);
+ log_error(ls, "dlm_posix_unlock: vfs unlock error %llx",
+ (unsigned long long)number);
- op->info.optype = GDLM_PLOCK_OP_UNLOCK;
+ op->info.optype = DLM_PLOCK_OP_UNLOCK;
op->info.pid = fl->fl_pid;
- op->info.fsid = ls->id;
- op->info.number = name->ln_number;
+ op->info.fsid = ls->ls_global_id;
+ op->info.number = number;
op->info.start = fl->fl_start;
op->info.end = fl->fl_end;
if (fl->fl_lmops && fl->fl_lmops->fl_grant)
@@ -217,7 +237,8 @@ int gdlm_punlock(void *lockspace, struct lm_lockname *name,
spin_lock(&ops_lock);
if (!list_empty(&op->list)) {
- printk(KERN_INFO "punlock op on list\n");
+ log_error(ls, "dlm_posix_unlock: op on list %llx",
+ (unsigned long long)number);
list_del(&op->list);
}
spin_unlock(&ops_lock);
@@ -228,25 +249,34 @@ int gdlm_punlock(void *lockspace, struct lm_lockname *name,
rv = 0;
kfree(op);
+out:
+ dlm_put_lockspace(ls);
return rv;
}
+EXPORT_SYMBOL_GPL(dlm_posix_unlock);
-int gdlm_plock_get(void *lockspace, struct lm_lockname *name,
- struct file *file, struct file_lock *fl)
+int dlm_posix_get(dlm_lockspace_t *lockspace, u64 number, struct file *file,
+ struct file_lock *fl)
{
- struct gdlm_ls *ls = lockspace;
+ struct dlm_ls *ls;
struct plock_op *op;
int rv;
+ ls = dlm_find_lockspace_local(lockspace);
+ if (!ls)
+ return -EINVAL;
+
op = kzalloc(sizeof(*op), GFP_KERNEL);
- if (!op)
- return -ENOMEM;
+ if (!op) {
+ rv = -ENOMEM;
+ goto out;
+ }
- op->info.optype = GDLM_PLOCK_OP_GET;
+ op->info.optype = DLM_PLOCK_OP_GET;
op->info.pid = fl->fl_pid;
op->info.ex = (fl->fl_type == F_WRLCK);
- op->info.fsid = ls->id;
- op->info.number = name->ln_number;
+ op->info.fsid = ls->ls_global_id;
+ op->info.number = number;
op->info.start = fl->fl_start;
op->info.end = fl->fl_end;
if (fl->fl_lmops && fl->fl_lmops->fl_grant)
@@ -259,7 +289,8 @@ int gdlm_plock_get(void *lockspace, struct lm_lockname *name,
spin_lock(&ops_lock);
if (!list_empty(&op->list)) {
- printk(KERN_INFO "plock_get op on list\n");
+ log_error(ls, "dlm_posix_get: op on list %llx",
+ (unsigned long long)number);
list_del(&op->list);
}
spin_unlock(&ops_lock);
@@ -281,14 +312,17 @@ int gdlm_plock_get(void *lockspace, struct lm_lockname *name,
}
kfree(op);
+out:
+ dlm_put_lockspace(ls);
return rv;
}
+EXPORT_SYMBOL_GPL(dlm_posix_get);
/* a read copies out one plock request from the send list */
static ssize_t dev_read(struct file *file, char __user *u, size_t count,
loff_t *ppos)
{
- struct gdlm_plock_info info;
+ struct dlm_plock_info info;
struct plock_op *op = NULL;
if (count < sizeof(info))
@@ -315,7 +349,7 @@ static ssize_t dev_read(struct file *file, char __user *u, size_t count,
static ssize_t dev_write(struct file *file, const char __user *u, size_t count,
loff_t *ppos)
{
- struct gdlm_plock_info info;
+ struct dlm_plock_info info;
struct plock_op *op;
int found = 0;
@@ -345,12 +379,12 @@ static ssize_t dev_write(struct file *file, const char __user *u, size_t count,
struct plock_xop *xop;
xop = (struct plock_xop *)op;
if (xop->callback)
- count = gdlm_plock_callback(op);
+ count = dlm_plock_callback(op);
else
wake_up(&recv_wq);
} else
- printk(KERN_INFO "gdlm dev_write no op %x %llx\n", info.fsid,
- (unsigned long long)info.number);
+ log_print("dev_write no op %x %llx", info.fsid,
+ (unsigned long long)info.number);
return count;
}
@@ -377,11 +411,11 @@ static const struct file_operations dev_fops = {
static struct miscdevice plock_dev_misc = {
.minor = MISC_DYNAMIC_MINOR,
- .name = GDLM_PLOCK_MISC_NAME,
+ .name = DLM_PLOCK_MISC_NAME,
.fops = &dev_fops
};
-int gdlm_plock_init(void)
+int dlm_plock_init(void)
{
int rv;
@@ -393,14 +427,13 @@ int gdlm_plock_init(void)
rv = misc_register(&plock_dev_misc);
if (rv)
- printk(KERN_INFO "gdlm_plock_init: misc_register failed %d",
- rv);
+ log_print("dlm_plock_init: misc_register failed %d", rv);
return rv;
}
-void gdlm_plock_exit(void)
+void dlm_plock_exit(void)
{
if (misc_deregister(&plock_dev_misc) < 0)
- printk(KERN_INFO "gdlm_plock_exit: misc_deregister failed");
+ log_print("dlm_plock_exit: misc_deregister failed");
}
diff --git a/fs/dlm/recoverd.c b/fs/dlm/recoverd.c
index 997f9531d59..fd677c8c3d3 100644
--- a/fs/dlm/recoverd.c
+++ b/fs/dlm/recoverd.c
@@ -257,6 +257,7 @@ static void do_ls_recovery(struct dlm_ls *ls)
if (rv) {
ls_recover(ls, rv);
kfree(rv->nodeids);
+ kfree(rv->new);
kfree(rv);
}
}
diff --git a/fs/ext2/ialloc.c b/fs/ext2/ialloc.c
index 5deb8b74e64..08f647d8188 100644
--- a/fs/ext2/ialloc.c
+++ b/fs/ext2/ialloc.c
@@ -253,7 +253,7 @@ static int find_group_dir(struct super_block *sb, struct inode *parent)
* it has too few free inodes left (min_inodes) or
* it has too few free blocks left (min_blocks) or
* it's already running too large debt (max_debt).
- * Parent's group is prefered, if it doesn't satisfy these
+ * Parent's group is preferred, if it doesn't satisfy these
* conditions we search cyclically through the rest. If none
* of the groups look good we just look for a group with more
* free inodes than average (starting at parent's group).
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index c6200680542..b8a2990bab8 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -239,7 +239,7 @@ no_block:
* @inode: owner
* @ind: descriptor of indirect block.
*
- * This function returns the prefered place for block allocation.
+ * This function returns the preferred place for block allocation.
* It is used when heuristic for sequential allocation fails.
* Rules are:
* + if there is a block to the left of our position - allocate near it.
@@ -283,7 +283,7 @@ static unsigned long ext2_find_near(struct inode *inode, Indirect *ind)
}
/**
- * ext2_find_goal - find a prefered place for allocation.
+ * ext2_find_goal - find a preferred place for allocation.
* @inode: owner
* @block: block we want
* @partial: pointer to the last triple within a chain
diff --git a/fs/ext2/ioctl.c b/fs/ext2/ioctl.c
index b8ea11fee5c..de876fa793e 100644
--- a/fs/ext2/ioctl.c
+++ b/fs/ext2/ioctl.c
@@ -12,6 +12,7 @@
#include <linux/time.h>
#include <linux/sched.h>
#include <linux/compat.h>
+#include <linux/mount.h>
#include <linux/smp_lock.h>
#include <asm/current.h>
#include <asm/uaccess.h>
@@ -23,6 +24,7 @@ long ext2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
struct ext2_inode_info *ei = EXT2_I(inode);
unsigned int flags;
unsigned short rsv_window_size;
+ int ret;
ext2_debug ("cmd = %u, arg = %lu\n", cmd, arg);
@@ -34,14 +36,19 @@ long ext2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
case EXT2_IOC_SETFLAGS: {
unsigned int oldflags;
- if (IS_RDONLY(inode))
- return -EROFS;
+ ret = mnt_want_write(filp->f_path.mnt);
+ if (ret)
+ return ret;
- if (!is_owner_or_cap(inode))
- return -EACCES;
+ if (!is_owner_or_cap(inode)) {
+ ret = -EACCES;
+ goto setflags_out;
+ }
- if (get_user(flags, (int __user *) arg))
- return -EFAULT;
+ if (get_user(flags, (int __user *) arg)) {
+ ret = -EFAULT;
+ goto setflags_out;
+ }
if (!S_ISDIR(inode->i_mode))
flags &= ~EXT2_DIRSYNC_FL;
@@ -50,7 +57,8 @@ long ext2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
/* Is it quota file? Do not allow user to mess with it */
if (IS_NOQUOTA(inode)) {
mutex_unlock(&inode->i_mutex);
- return -EPERM;
+ ret = -EPERM;
+ goto setflags_out;
}
oldflags = ei->i_flags;
@@ -63,7 +71,8 @@ long ext2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
if ((flags ^ oldflags) & (EXT2_APPEND_FL | EXT2_IMMUTABLE_FL)) {
if (!capable(CAP_LINUX_IMMUTABLE)) {
mutex_unlock(&inode->i_mutex);
- return -EPERM;
+ ret = -EPERM;
+ goto setflags_out;
}
}
@@ -75,20 +84,26 @@ long ext2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
ext2_set_inode_flags(inode);
inode->i_ctime = CURRENT_TIME_SEC;
mark_inode_dirty(inode);
- return 0;
+setflags_out:
+ mnt_drop_write(filp->f_path.mnt);
+ return ret;
}
case EXT2_IOC_GETVERSION:
return put_user(inode->i_generation, (int __user *) arg);
case EXT2_IOC_SETVERSION:
if (!is_owner_or_cap(inode))
return -EPERM;
- if (IS_RDONLY(inode))
- return -EROFS;
- if (get_user(inode->i_generation, (int __user *) arg))
- return -EFAULT;
- inode->i_ctime = CURRENT_TIME_SEC;
- mark_inode_dirty(inode);
- return 0;
+ ret = mnt_want_write(filp->f_path.mnt);
+ if (ret)
+ return ret;
+ if (get_user(inode->i_generation, (int __user *) arg)) {
+ ret = -EFAULT;
+ } else {
+ inode->i_ctime = CURRENT_TIME_SEC;
+ mark_inode_dirty(inode);
+ }
+ mnt_drop_write(filp->f_path.mnt);
+ return ret;
case EXT2_IOC_GETRSVSZ:
if (test_opt(inode->i_sb, RESERVATION)
&& S_ISREG(inode->i_mode)
@@ -102,15 +117,16 @@ long ext2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
if (!test_opt(inode->i_sb, RESERVATION) ||!S_ISREG(inode->i_mode))
return -ENOTTY;
- if (IS_RDONLY(inode))
- return -EROFS;
-
- if ((current->fsuid != inode->i_uid) && !capable(CAP_FOWNER))
+ if (!is_owner_or_cap(inode))
return -EACCES;
if (get_user(rsv_window_size, (int __user *)arg))
return -EFAULT;
+ ret = mnt_want_write(filp->f_path.mnt);
+ if (ret)
+ return ret;
+
if (rsv_window_size > EXT2_MAX_RESERVE_BLOCKS)
rsv_window_size = EXT2_MAX_RESERVE_BLOCKS;
@@ -131,6 +147,7 @@ long ext2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
rsv->rsv_goal_size = rsv_window_size;
}
mutex_unlock(&ei->truncate_mutex);
+ mnt_drop_write(filp->f_path.mnt);
return 0;
}
default:
diff --git a/fs/ext3/ialloc.c b/fs/ext3/ialloc.c
index 4f4020c5468..96dd5573e49 100644
--- a/fs/ext3/ialloc.c
+++ b/fs/ext3/ialloc.c
@@ -239,7 +239,7 @@ static int find_group_dir(struct super_block *sb, struct inode *parent)
* it has too few free inodes left (min_inodes) or
* it has too few free blocks left (min_blocks) or
* it's already running too large debt (max_debt).
- * Parent's group is prefered, if it doesn't satisfy these
+ * Parent's group is preferred, if it doesn't satisfy these
* conditions we search cyclically through the rest. If none
* of the groups look good we just look for a group with more
* free inodes than average (starting at parent's group).
diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
index eb95670a27e..c683609b0e3 100644
--- a/fs/ext3/inode.c
+++ b/fs/ext3/inode.c
@@ -392,7 +392,7 @@ no_block:
* @inode: owner
* @ind: descriptor of indirect block.
*
- * This function returns the prefered place for block allocation.
+ * This function returns the preferred place for block allocation.
* It is used when heuristic for sequential allocation fails.
* Rules are:
* + if there is a block to the left of our position - allocate near it.
@@ -436,12 +436,12 @@ static ext3_fsblk_t ext3_find_near(struct inode *inode, Indirect *ind)
}
/**
- * ext3_find_goal - find a prefered place for allocation.
+ * ext3_find_goal - find a preferred place for allocation.
* @inode: owner
* @block: block we want
* @partial: pointer to the last triple within a chain
*
- * Normally this function find the prefered place for block allocation,
+ * Normally this function finds the preferred place for block allocation,
* returns it.
*/
diff --git a/fs/ext3/ioctl.c b/fs/ext3/ioctl.c
index 023a070f55f..0d0c7015164 100644
--- a/fs/ext3/ioctl.c
+++ b/fs/ext3/ioctl.c
@@ -12,6 +12,7 @@
#include <linux/capability.h>
#include <linux/ext3_fs.h>
#include <linux/ext3_jbd.h>
+#include <linux/mount.h>
#include <linux/time.h>
#include <linux/compat.h>
#include <linux/smp_lock.h>
@@ -38,14 +39,19 @@ int ext3_ioctl (struct inode * inode, struct file * filp, unsigned int cmd,
unsigned int oldflags;
unsigned int jflag;
- if (IS_RDONLY(inode))
- return -EROFS;
+ err = mnt_want_write(filp->f_path.mnt);
+ if (err)
+ return err;
- if (!is_owner_or_cap(inode))
- return -EACCES;
+ if (!is_owner_or_cap(inode)) {
+ err = -EACCES;
+ goto flags_out;
+ }
- if (get_user(flags, (int __user *) arg))
- return -EFAULT;
+ if (get_user(flags, (int __user *) arg)) {
+ err = -EFAULT;
+ goto flags_out;
+ }
if (!S_ISDIR(inode->i_mode))
flags &= ~EXT3_DIRSYNC_FL;
@@ -54,7 +60,8 @@ int ext3_ioctl (struct inode * inode, struct file * filp, unsigned int cmd,
/* Is it quota file? Do not allow user to mess with it */
if (IS_NOQUOTA(inode)) {
mutex_unlock(&inode->i_mutex);
- return -EPERM;
+ err = -EPERM;
+ goto flags_out;
}
oldflags = ei->i_flags;
@@ -70,7 +77,8 @@ int ext3_ioctl (struct inode * inode, struct file * filp, unsigned int cmd,
if ((flags ^ oldflags) & (EXT3_APPEND_FL | EXT3_IMMUTABLE_FL)) {
if (!capable(CAP_LINUX_IMMUTABLE)) {
mutex_unlock(&inode->i_mutex);
- return -EPERM;
+ err = -EPERM;
+ goto flags_out;
}
}
@@ -81,7 +89,8 @@ int ext3_ioctl (struct inode * inode, struct file * filp, unsigned int cmd,
if ((jflag ^ oldflags) & (EXT3_JOURNAL_DATA_FL)) {
if (!capable(CAP_SYS_RESOURCE)) {
mutex_unlock(&inode->i_mutex);
- return -EPERM;
+ err = -EPERM;
+ goto flags_out;
}
}
@@ -89,7 +98,8 @@ int ext3_ioctl (struct inode * inode, struct file * filp, unsigned int cmd,
handle = ext3_journal_start(inode, 1);
if (IS_ERR(handle)) {
mutex_unlock(&inode->i_mutex);
- return PTR_ERR(handle);
+ err = PTR_ERR(handle);
+ goto flags_out;
}
if (IS_SYNC(inode))
handle->h_sync = 1;
@@ -115,6 +125,8 @@ flags_err:
if ((jflag ^ oldflags) & (EXT3_JOURNAL_DATA_FL))
err = ext3_change_inode_journal_flag(inode, jflag);
mutex_unlock(&inode->i_mutex);
+flags_out:
+ mnt_drop_write(filp->f_path.mnt);
return err;
}
case EXT3_IOC_GETVERSION:
@@ -129,14 +141,18 @@ flags_err:
if (!is_owner_or_cap(inode))
return -EPERM;
- if (IS_RDONLY(inode))
- return -EROFS;
- if (get_user(generation, (int __user *) arg))
- return -EFAULT;
-
+ err = mnt_want_write(filp->f_path.mnt);
+ if (err)
+ return err;
+ if (get_user(generation, (int __user *) arg)) {
+ err = -EFAULT;
+ goto setversion_out;
+ }
handle = ext3_journal_start(inode, 1);
- if (IS_ERR(handle))
- return PTR_ERR(handle);
+ if (IS_ERR(handle)) {
+ err = PTR_ERR(handle);
+ goto setversion_out;
+ }
err = ext3_reserve_inode_write(handle, inode, &iloc);
if (err == 0) {
inode->i_ctime = CURRENT_TIME_SEC;
@@ -144,6 +160,8 @@ flags_err:
err = ext3_mark_iloc_dirty(handle, inode, &iloc);
}
ext3_journal_stop(handle);
+setversion_out:
+ mnt_drop_write(filp->f_path.mnt);
return err;
}
#ifdef CONFIG_JBD_DEBUG
@@ -179,18 +197,24 @@ flags_err:
}
return -ENOTTY;
case EXT3_IOC_SETRSVSZ: {
+ int err;
if (!test_opt(inode->i_sb, RESERVATION) ||!S_ISREG(inode->i_mode))
return -ENOTTY;
- if (IS_RDONLY(inode))
- return -EROFS;
+ err = mnt_want_write(filp->f_path.mnt);
+ if (err)
+ return err;
- if (!is_owner_or_cap(inode))
- return -EACCES;
+ if (!is_owner_or_cap(inode)) {
+ err = -EACCES;
+ goto setrsvsz_out;
+ }
- if (get_user(rsv_window_size, (int __user *)arg))
- return -EFAULT;
+ if (get_user(rsv_window_size, (int __user *)arg)) {
+ err = -EFAULT;
+ goto setrsvsz_out;
+ }
if (rsv_window_size > EXT3_MAX_RESERVE_BLOCKS)
rsv_window_size = EXT3_MAX_RESERVE_BLOCKS;
@@ -208,7 +232,9 @@ flags_err:
rsv->rsv_goal_size = rsv_window_size;
}
mutex_unlock(&ei->truncate_mutex);
- return 0;
+setrsvsz_out:
+ mnt_drop_write(filp->f_path.mnt);
+ return err;
}
case EXT3_IOC_GROUP_EXTEND: {
ext3_fsblk_t n_blocks_count;
@@ -218,17 +244,20 @@ flags_err:
if (!capable(CAP_SYS_RESOURCE))
return -EPERM;
- if (IS_RDONLY(inode))
- return -EROFS;
-
- if (get_user(n_blocks_count, (__u32 __user *)arg))
- return -EFAULT;
+ err = mnt_want_write(filp->f_path.mnt);
+ if (err)
+ return err;
+ if (get_user(n_blocks_count, (__u32 __user *)arg)) {
+ err = -EFAULT;
+ goto group_extend_out;
+ }
err = ext3_group_extend(sb, EXT3_SB(sb)->s_es, n_blocks_count);
journal_lock_updates(EXT3_SB(sb)->s_journal);
journal_flush(EXT3_SB(sb)->s_journal);
journal_unlock_updates(EXT3_SB(sb)->s_journal);
-
+group_extend_out:
+ mnt_drop_write(filp->f_path.mnt);
return err;
}
case EXT3_IOC_GROUP_ADD: {
@@ -239,18 +268,22 @@ flags_err:
if (!capable(CAP_SYS_RESOURCE))
return -EPERM;
- if (IS_RDONLY(inode))
- return -EROFS;
+ err = mnt_want_write(filp->f_path.mnt);
+ if (err)
+ return err;
if (copy_from_user(&input, (struct ext3_new_group_input __user *)arg,
- sizeof(input)))
- return -EFAULT;
+ sizeof(input))) {
+ err = -EFAULT;
+ goto group_add_out;
+ }
err = ext3_group_add(sb, &input);
journal_lock_updates(EXT3_SB(sb)->s_journal);
journal_flush(EXT3_SB(sb)->s_journal);
journal_unlock_updates(EXT3_SB(sb)->s_journal);
-
+group_add_out:
+ mnt_drop_write(filp->f_path.mnt);
return err;
}
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 8036b9b5376..486e46a3918 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -305,7 +305,7 @@ static int find_group_dir(struct super_block *sb, struct inode *parent,
* it has too few free inodes left (min_inodes) or
* it has too few free blocks left (min_blocks) or
* it's already running too large debt (max_debt).
- * Parent's group is prefered, if it doesn't satisfy these
+ * Parent's group is preferred, if it doesn't satisfy these
* conditions we search cyclically through the rest. If none
* of the groups look good we just look for a group with more
* free inodes than average (starting at parent's group).
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 945cbf6cb1f..8fab233cb05 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -382,7 +382,7 @@ no_block:
* @inode: owner
* @ind: descriptor of indirect block.
*
- * This function returns the prefered place for block allocation.
+ * This function returns the preferred place for block allocation.
* It is used when heuristic for sequential allocation fails.
* Rules are:
* + if there is a block to the left of our position - allocate near it.
@@ -432,12 +432,12 @@ static ext4_fsblk_t ext4_find_near(struct inode *inode, Indirect *ind)
}
/**
- * ext4_find_goal - find a prefered place for allocation.
+ * ext4_find_goal - find a preferred place for allocation.
* @inode: owner
* @block: block we want
* @partial: pointer to the last triple within a chain
*
- * Normally this function find the prefered place for block allocation,
+ * Normally this function finds the preferred place for block allocation,
* returns it.
*/
static ext4_fsblk_t ext4_find_goal(struct inode *inode, ext4_lblk_t block,
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index 2ed7c37f897..25b13ede808 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -15,6 +15,7 @@
#include <linux/time.h>
#include <linux/compat.h>
#include <linux/smp_lock.h>
+#include <linux/mount.h>
#include <asm/uaccess.h>
int ext4_ioctl (struct inode * inode, struct file * filp, unsigned int cmd,
@@ -38,24 +39,25 @@ int ext4_ioctl (struct inode * inode, struct file * filp, unsigned int cmd,
unsigned int oldflags;
unsigned int jflag;
- if (IS_RDONLY(inode))
- return -EROFS;
-
if (!is_owner_or_cap(inode))
return -EACCES;
if (get_user(flags, (int __user *) arg))
return -EFAULT;
+ err = mnt_want_write(filp->f_path.mnt);
+ if (err)
+ return err;
+
if (!S_ISDIR(inode->i_mode))
flags &= ~EXT4_DIRSYNC_FL;
+ err = -EPERM;
mutex_lock(&inode->i_mutex);
/* Is it quota file? Do not allow user to mess with it */
- if (IS_NOQUOTA(inode)) {
- mutex_unlock(&inode->i_mutex);
- return -EPERM;
- }
+ if (IS_NOQUOTA(inode))
+ goto flags_out;
+
oldflags = ei->i_flags;
/* The JOURNAL_DATA flag is modifiable only by root */
@@ -68,10 +70,8 @@ int ext4_ioctl (struct inode * inode, struct file * filp, unsigned int cmd,
* This test looks nicer. Thanks to Pauline Middelink
*/
if ((flags ^ oldflags) & (EXT4_APPEND_FL | EXT4_IMMUTABLE_FL)) {
- if (!capable(CAP_LINUX_IMMUTABLE)) {
- mutex_unlock(&inode->i_mutex);
- return -EPERM;
- }
+ if (!capable(CAP_LINUX_IMMUTABLE))
+ goto flags_out;
}
/*
@@ -79,17 +79,14 @@ int ext4_ioctl (struct inode * inode, struct file * filp, unsigned int cmd,
* the relevant capability.
*/
if ((jflag ^ oldflags) & (EXT4_JOURNAL_DATA_FL)) {
- if (!capable(CAP_SYS_RESOURCE)) {
- mutex_unlock(&inode->i_mutex);
- return -EPERM;
- }
+ if (!capable(CAP_SYS_RESOURCE))
+ goto flags_out;
}
-
handle = ext4_journal_start(inode, 1);
if (IS_ERR(handle)) {
- mutex_unlock(&inode->i_mutex);
- return PTR_ERR(handle);
+ err = PTR_ERR(handle);
+ goto flags_out;
}
if (IS_SYNC(inode))
handle->h_sync = 1;
@@ -107,14 +104,14 @@ int ext4_ioctl (struct inode * inode, struct file * filp, unsigned int cmd,
err = ext4_mark_iloc_dirty(handle, inode, &iloc);
flags_err:
ext4_journal_stop(handle);
- if (err) {
- mutex_unlock(&inode->i_mutex);
- return err;
- }
+ if (err)
+ goto flags_out;
if ((jflag ^ oldflags) & (EXT4_JOURNAL_DATA_FL))
err = ext4_change_inode_journal_flag(inode, jflag);
+flags_out:
mutex_unlock(&inode->i_mutex);
+ mnt_drop_write(filp->f_path.mnt);
return err;
}
case EXT4_IOC_GETVERSION:
@@ -129,14 +126,20 @@ flags_err:
if (!is_owner_or_cap(inode))
return -EPERM;
- if (IS_RDONLY(inode))
- return -EROFS;
- if (get_user(generation, (int __user *) arg))
- return -EFAULT;
+
+ err = mnt_want_write(filp->f_path.mnt);
+ if (err)
+ return err;
+ if (get_user(generation, (int __user *) arg)) {
+ err = -EFAULT;
+ goto setversion_out;
+ }
handle = ext4_journal_start(inode, 1);
- if (IS_ERR(handle))
- return PTR_ERR(handle);
+ if (IS_ERR(handle)) {
+ err = PTR_ERR(handle);
+ goto setversion_out;
+ }
err = ext4_reserve_inode_write(handle, inode, &iloc);
if (err == 0) {
inode->i_ctime = ext4_current_time(inode);
@@ -144,6 +147,8 @@ flags_err:
err = ext4_mark_iloc_dirty(handle, inode, &iloc);
}
ext4_journal_stop(handle);
+setversion_out:
+ mnt_drop_write(filp->f_path.mnt);
return err;
}
#ifdef CONFIG_JBD2_DEBUG
@@ -179,19 +184,21 @@ flags_err:
}
return -ENOTTY;
case EXT4_IOC_SETRSVSZ: {
+ int err;
if (!test_opt(inode->i_sb, RESERVATION) ||!S_ISREG(inode->i_mode))
return -ENOTTY;
- if (IS_RDONLY(inode))
- return -EROFS;
-
if (!is_owner_or_cap(inode))
return -EACCES;
if (get_user(rsv_window_size, (int __user *)arg))
return -EFAULT;
+ err = mnt_want_write(filp->f_path.mnt);
+ if (err)
+ return err;
+
if (rsv_window_size > EXT4_MAX_RESERVE_BLOCKS)
rsv_window_size = EXT4_MAX_RESERVE_BLOCKS;
@@ -208,6 +215,7 @@ flags_err:
rsv->rsv_goal_size = rsv_window_size;
}
up_write(&ei->i_data_sem);
+ mnt_drop_write(filp->f_path.mnt);
return 0;
}
case EXT4_IOC_GROUP_EXTEND: {
@@ -218,16 +226,18 @@ flags_err:
if (!capable(CAP_SYS_RESOURCE))
return -EPERM;
- if (IS_RDONLY(inode))
- return -EROFS;
-
if (get_user(n_blocks_count, (__u32 __user *)arg))
return -EFAULT;
+ err = mnt_want_write(filp->f_path.mnt);
+ if (err)
+ return err;
+
err = ext4_group_extend(sb, EXT4_SB(sb)->s_es, n_blocks_count);
jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal);
jbd2_journal_flush(EXT4_SB(sb)->s_journal);
jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal);
+ mnt_drop_write(filp->f_path.mnt);
return err;
}
@@ -239,17 +249,19 @@ flags_err:
if (!capable(CAP_SYS_RESOURCE))
return -EPERM;
- if (IS_RDONLY(inode))
- return -EROFS;
-
if (copy_from_user(&input, (struct ext4_new_group_input __user *)arg,
sizeof(input)))
return -EFAULT;
+ err = mnt_want_write(filp->f_path.mnt);
+ if (err)
+ return err;
+
err = ext4_group_add(sb, &input);
jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal);
jbd2_journal_flush(EXT4_SB(sb)->s_journal);
jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal);
+ mnt_drop_write(filp->f_path.mnt);
return err;
}
diff --git a/fs/fat/file.c b/fs/fat/file.c
index c614175876e..2a3bed96704 100644
--- a/fs/fat/file.c
+++ b/fs/fat/file.c
@@ -8,6 +8,7 @@
#include <linux/capability.h>
#include <linux/module.h>
+#include <linux/mount.h>
#include <linux/time.h>
#include <linux/msdos_fs.h>
#include <linux/smp_lock.h>
@@ -46,10 +47,9 @@ int fat_generic_ioctl(struct inode *inode, struct file *filp,
mutex_lock(&inode->i_mutex);
- if (IS_RDONLY(inode)) {
- err = -EROFS;
- goto up;
- }
+ err = mnt_want_write(filp->f_path.mnt);
+ if (err)
+ goto up_no_drop_write;
/*
* ATTR_VOLUME and ATTR_DIR cannot be changed; this also
@@ -105,7 +105,9 @@ int fat_generic_ioctl(struct inode *inode, struct file *filp,
MSDOS_I(inode)->i_attrs = attr & ATTR_UNUSED;
mark_inode_dirty(inode);
- up:
+up:
+ mnt_drop_write(filp->f_path.mnt);
+up_no_drop_write:
mutex_unlock(&inode->i_mutex);
return err;
}
diff --git a/fs/file_table.c b/fs/file_table.c
index 986ff4ed0a7..7a0a9b87225 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -42,6 +42,7 @@ static inline void file_free_rcu(struct rcu_head *head)
static inline void file_free(struct file *f)
{
percpu_counter_dec(&nr_files);
+ file_check_state(f);
call_rcu(&f->f_u.fu_rcuhead, file_free_rcu);
}
@@ -199,6 +200,18 @@ int init_file(struct file *file, struct vfsmount *mnt, struct dentry *dentry,
file->f_mapping = dentry->d_inode->i_mapping;
file->f_mode = mode;
file->f_op = fop;
+
+ /*
+ * These mounts don't really matter in practice
+ * for r/o bind mounts. They aren't userspace-
+ * visible. We do this for consistency, and so
+ * that we can do debugging checks at __fput()
+ */
+ if ((mode & FMODE_WRITE) && !special_file(dentry->d_inode->i_mode)) {
+ file_take_write(file);
+ error = mnt_want_write(mnt);
+ WARN_ON(error);
+ }
return error;
}
EXPORT_SYMBOL(init_file);
@@ -211,6 +224,31 @@ void fput(struct file *file)
EXPORT_SYMBOL(fput);
+/**
+ * drop_file_write_access - give up ability to write to a file
+ * @file: the file to which we will stop writing
+ *
+ * This is a central place which will give up the ability
+ * to write to @file, along with access to write through
+ * its vfsmount.
+ */
+void drop_file_write_access(struct file *file)
+{
+ struct vfsmount *mnt = file->f_path.mnt;
+ struct dentry *dentry = file->f_path.dentry;
+ struct inode *inode = dentry->d_inode;
+
+ put_write_access(inode);
+
+ if (special_file(inode->i_mode))
+ return;
+ if (file_check_writeable(file) != 0)
+ return;
+ mnt_drop_write(mnt);
+ file_release_write(file);
+}
+EXPORT_SYMBOL_GPL(drop_file_write_access);
+
/* __fput is called from task context when aio completion releases the last
* last use of a struct file *. Do not use otherwise.
*/
@@ -236,10 +274,10 @@ void __fput(struct file *file)
if (unlikely(S_ISCHR(inode->i_mode) && inode->i_cdev != NULL))
cdev_put(inode->i_cdev);
fops_put(file->f_op);
- if (file->f_mode & FMODE_WRITE)
- put_write_access(inode);
put_pid(file->f_owner.pid);
file_kill(file);
+ if (file->f_mode & FMODE_WRITE)
+ drop_file_write_access(file);
file->f_path.dentry = NULL;
file->f_path.mnt = NULL;
file_free(file);
diff --git a/fs/gfs2/Kconfig b/fs/gfs2/Kconfig
index de8e64c03f7..7f7947e3dfb 100644
--- a/fs/gfs2/Kconfig
+++ b/fs/gfs2/Kconfig
@@ -1,6 +1,6 @@
config GFS2_FS
tristate "GFS2 file system support"
- depends on EXPERIMENTAL
+ depends on EXPERIMENTAL && (64BIT || (LSF && LBD))
select FS_POSIX_ACL
select CRC32
help
diff --git a/fs/gfs2/Makefile b/fs/gfs2/Makefile
index 8fff11058ce..e2350df02a0 100644
--- a/fs/gfs2/Makefile
+++ b/fs/gfs2/Makefile
@@ -1,6 +1,6 @@
obj-$(CONFIG_GFS2_FS) += gfs2.o
gfs2-y := acl.o bmap.o daemon.o dir.o eaops.o eattr.o glock.o \
- glops.o inode.o lm.o log.o lops.o locking.o main.o meta_io.o \
+ glops.o inode.o log.o lops.o locking.o main.o meta_io.o \
mount.o ops_address.o ops_dentry.o ops_export.o ops_file.o \
ops_fstype.o ops_inode.o ops_super.o quota.o \
recovery.o rgrp.o super.o sys.o trans.o util.o
diff --git a/fs/gfs2/acl.c b/fs/gfs2/acl.c
index 1047a8c7226..3e9bd46f27e 100644
--- a/fs/gfs2/acl.c
+++ b/fs/gfs2/acl.c
@@ -116,7 +116,7 @@ static int acl_get(struct gfs2_inode *ip, int access, struct posix_acl **acl,
goto out;
er.er_data_len = GFS2_EA_DATA_LEN(el->el_ea);
- er.er_data = kmalloc(er.er_data_len, GFP_KERNEL);
+ er.er_data = kmalloc(er.er_data_len, GFP_NOFS);
error = -ENOMEM;
if (!er.er_data)
goto out;
@@ -222,7 +222,7 @@ int gfs2_acl_create(struct gfs2_inode *dip, struct gfs2_inode *ip)
return error;
}
- clone = posix_acl_clone(acl, GFP_KERNEL);
+ clone = posix_acl_clone(acl, GFP_NOFS);
error = -ENOMEM;
if (!clone)
goto out;
@@ -272,7 +272,7 @@ int gfs2_acl_chmod(struct gfs2_inode *ip, struct iattr *attr)
if (!acl)
return gfs2_setattr_simple(ip, attr);
- clone = posix_acl_clone(acl, GFP_KERNEL);
+ clone = posix_acl_clone(acl, GFP_NOFS);
error = -ENOMEM;
if (!clone)
goto out;
diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index e9456ebd3bb..c19184f2e70 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -33,6 +33,7 @@
* keep it small.
*/
struct metapath {
+ struct buffer_head *mp_bh[GFS2_MAX_META_HEIGHT];
__u16 mp_list[GFS2_MAX_META_HEIGHT];
};
@@ -135,9 +136,10 @@ int gfs2_unstuff_dinode(struct gfs2_inode *ip, struct page *page)
/* Get a free block, fill it with the stuffed data,
and write it out to disk */
+ unsigned int n = 1;
+ block = gfs2_alloc_block(ip, &n);
if (isdir) {
- block = gfs2_alloc_meta(ip);
-
+ gfs2_trans_add_unrevoke(GFS2_SB(&ip->i_inode), block, 1);
error = gfs2_dir_get_new_buffer(ip, block, &bh);
if (error)
goto out_brelse;
@@ -145,8 +147,6 @@ int gfs2_unstuff_dinode(struct gfs2_inode *ip, struct page *page)
dibh, sizeof(struct gfs2_dinode));
brelse(bh);
} else {
- block = gfs2_alloc_data(ip);
-
error = gfs2_unstuffer_page(ip, dibh, block, page);
if (error)
goto out_brelse;
@@ -161,12 +161,11 @@ int gfs2_unstuff_dinode(struct gfs2_inode *ip, struct page *page)
if (ip->i_di.di_size) {
*(__be64 *)(di + 1) = cpu_to_be64(block);
- ip->i_di.di_blocks++;
- gfs2_set_inode_blocks(&ip->i_inode);
- di->di_blocks = cpu_to_be64(ip->i_di.di_blocks);
+ gfs2_add_inode_blocks(&ip->i_inode, 1);
+ di->di_blocks = cpu_to_be64(gfs2_get_inode_blocks(&ip->i_inode));
}
- ip->i_di.di_height = 1;
+ ip->i_height = 1;
di->di_height = cpu_to_be16(1);
out_brelse:
@@ -176,114 +175,13 @@ out:
return error;
}
-/**
- * calc_tree_height - Calculate the height of a metadata tree
- * @ip: The GFS2 inode
- * @size: The proposed size of the file
- *
- * Work out how tall a metadata tree needs to be in order to accommodate a
- * file of a particular size. If size is less than the current size of
- * the inode, then the current size of the inode is used instead of the
- * supplied one.
- *
- * Returns: the height the tree should be
- */
-
-static unsigned int calc_tree_height(struct gfs2_inode *ip, u64 size)
-{
- struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
- u64 *arr;
- unsigned int max, height;
-
- if (ip->i_di.di_size > size)
- size = ip->i_di.di_size;
-
- if (gfs2_is_dir(ip)) {
- arr = sdp->sd_jheightsize;
- max = sdp->sd_max_jheight;
- } else {
- arr = sdp->sd_heightsize;
- max = sdp->sd_max_height;
- }
-
- for (height = 0; height < max; height++)
- if (arr[height] >= size)
- break;
-
- return height;
-}
-
-/**
- * build_height - Build a metadata tree of the requested height
- * @ip: The GFS2 inode
- * @height: The height to build to
- *
- *
- * Returns: errno
- */
-
-static int build_height(struct inode *inode, unsigned height)
-{
- struct gfs2_inode *ip = GFS2_I(inode);
- unsigned new_height = height - ip->i_di.di_height;
- struct buffer_head *dibh;
- struct buffer_head *blocks[GFS2_MAX_META_HEIGHT];
- struct gfs2_dinode *di;
- int error;
- __be64 *bp;
- u64 bn;
- unsigned n;
-
- if (height <= ip->i_di.di_height)
- return 0;
-
- error = gfs2_meta_inode_buffer(ip, &dibh);
- if (error)
- return error;
-
- for(n = 0; n < new_height; n++) {
- bn = gfs2_alloc_meta(ip);
- blocks[n] = gfs2_meta_new(ip->i_gl, bn);
- gfs2_trans_add_bh(ip->i_gl, blocks[n], 1);
- }
-
- n = 0;
- bn = blocks[0]->b_blocknr;
- if (new_height > 1) {
- for(; n < new_height-1; n++) {
- gfs2_metatype_set(blocks[n], GFS2_METATYPE_IN,
- GFS2_FORMAT_IN);
- gfs2_buffer_clear_tail(blocks[n],
- sizeof(struct gfs2_meta_header));
- bp = (__be64 *)(blocks[n]->b_data +
- sizeof(struct gfs2_meta_header));
- *bp = cpu_to_be64(blocks[n+1]->b_blocknr);
- brelse(blocks[n]);
- blocks[n] = NULL;
- }
- }
- gfs2_metatype_set(blocks[n], GFS2_METATYPE_IN, GFS2_FORMAT_IN);
- gfs2_buffer_copy_tail(blocks[n], sizeof(struct gfs2_meta_header),
- dibh, sizeof(struct gfs2_dinode));
- brelse(blocks[n]);
- gfs2_trans_add_bh(ip->i_gl, dibh, 1);
- di = (struct gfs2_dinode *)dibh->b_data;
- gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode));
- *(__be64 *)(di + 1) = cpu_to_be64(bn);
- ip->i_di.di_height += new_height;
- ip->i_di.di_blocks += new_height;
- gfs2_set_inode_blocks(&ip->i_inode);
- di->di_height = cpu_to_be16(ip->i_di.di_height);
- di->di_blocks = cpu_to_be64(ip->i_di.di_blocks);
- brelse(dibh);
- return error;
-}
/**
* find_metapath - Find path through the metadata tree
- * @ip: The inode pointer
+ * @sdp: The superblock
* @mp: The metapath to return the result in
* @block: The disk block to look up
+ * @height: The pre-calculated height of the metadata tree
*
* This routine returns a struct metapath structure that defines a path
* through the metadata of inode "ip" to get to block "block".
@@ -338,21 +236,29 @@ static int build_height(struct inode *inode, unsigned height)
*
*/
-static void find_metapath(struct gfs2_inode *ip, u64 block,
- struct metapath *mp)
+static void find_metapath(const struct gfs2_sbd *sdp, u64 block,
+ struct metapath *mp, unsigned int height)
{
- struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
- u64 b = block;
unsigned int i;
- for (i = ip->i_di.di_height; i--;)
- mp->mp_list[i] = do_div(b, sdp->sd_inptrs);
+ for (i = height; i--;)
+ mp->mp_list[i] = do_div(block, sdp->sd_inptrs);
+
+}
+static inline unsigned int zero_metapath_length(const struct metapath *mp,
+ unsigned height)
+{
+ unsigned int i;
+ for (i = 0; i < height - 1; i++) {
+ if (mp->mp_list[i] != 0)
+ return i;
+ }
+ return height;
}
/**
* metapointer - Return pointer to start of metadata in a buffer
- * @bh: The buffer
* @height: The metadata height (0 = dinode)
* @mp: The metapath
*
@@ -361,93 +267,302 @@ static void find_metapath(struct gfs2_inode *ip, u64 block,
* metadata tree.
*/
-static inline __be64 *metapointer(struct buffer_head *bh, int *boundary,
- unsigned int height, const struct metapath *mp)
+static inline __be64 *metapointer(unsigned int height, const struct metapath *mp)
{
+ struct buffer_head *bh = mp->mp_bh[height];
unsigned int head_size = (height > 0) ?
sizeof(struct gfs2_meta_header) : sizeof(struct gfs2_dinode);
- __be64 *ptr;
- *boundary = 0;
- ptr = ((__be64 *)(bh->b_data + head_size)) + mp->mp_list[height];
- if (ptr + 1 == (__be64 *)(bh->b_data + bh->b_size))
- *boundary = 1;
- return ptr;
+ return ((__be64 *)(bh->b_data + head_size)) + mp->mp_list[height];
}
/**
- * lookup_block - Get the next metadata block in metadata tree
- * @ip: The GFS2 inode
- * @bh: Buffer containing the pointers to metadata blocks
- * @height: The height of the tree (0 = dinode)
+ * lookup_metapath - Walk the metadata tree to a specific point
+ * @ip: The inode
* @mp: The metapath
- * @create: Non-zero if we may create a new meatdata block
- * @new: Used to indicate if we did create a new metadata block
- * @block: the returned disk block number
*
- * Given a metatree, complete to a particular height, checks to see if the next
- * height of the tree exists. If not the next height of the tree is created.
- * The block number of the next height of the metadata tree is returned.
+ * Assumes that the inode's buffer has already been looked up and
+ * hooked onto mp->mp_bh[0] and that the metapath has been initialised
+ * by find_metapath().
+ *
+ * If this function encounters part of the tree which has not been
+ * allocated, it returns the current height of the tree at the point
+ * at which it found the unallocated block. Blocks which are found are
+ * added to the mp->mp_bh[] list.
*
+ * Returns: error or height of metadata tree
*/
-static int lookup_block(struct gfs2_inode *ip, struct buffer_head *bh,
- unsigned int height, struct metapath *mp, int create,
- int *new, u64 *block)
+static int lookup_metapath(struct gfs2_inode *ip, struct metapath *mp)
{
- int boundary;
- __be64 *ptr = metapointer(bh, &boundary, height, mp);
+ unsigned int end_of_metadata = ip->i_height - 1;
+ unsigned int x;
+ __be64 *ptr;
+ u64 dblock;
+ int ret;
- if (*ptr) {
- *block = be64_to_cpu(*ptr);
- return boundary;
- }
+ for (x = 0; x < end_of_metadata; x++) {
+ ptr = metapointer(x, mp);
+ dblock = be64_to_cpu(*ptr);
+ if (!dblock)
+ return x + 1;
- *block = 0;
+ ret = gfs2_meta_indirect_buffer(ip, x+1, dblock, 0, &mp->mp_bh[x+1]);
+ if (ret)
+ return ret;
+ }
- if (!create)
- return 0;
+ return ip->i_height;
+}
- if (height == ip->i_di.di_height - 1 && !gfs2_is_dir(ip))
- *block = gfs2_alloc_data(ip);
- else
- *block = gfs2_alloc_meta(ip);
+static inline void release_metapath(struct metapath *mp)
+{
+ int i;
- gfs2_trans_add_bh(ip->i_gl, bh, 1);
+ for (i = 0; i < GFS2_MAX_META_HEIGHT; i++) {
+ if (mp->mp_bh[i] == NULL)
+ break;
+ brelse(mp->mp_bh[i]);
+ }
+}
- *ptr = cpu_to_be64(*block);
- ip->i_di.di_blocks++;
- gfs2_set_inode_blocks(&ip->i_inode);
+/**
+ * gfs2_extent_length - Returns length of an extent of blocks
+ * @start: Start of the buffer
+ * @len: Length of the buffer in bytes
+ * @ptr: Current position in the buffer
+ * @limit: Max extent length to return (0 = unlimited)
+ * @eob: Set to 1 if we hit "end of block"
+ *
+ * If the first block is zero (unallocated) it will return the number of
+ * unallocated blocks in the extent, otherwise it will return the number
+ * of contiguous blocks in the extent.
+ *
+ * Returns: The length of the extent (minimum of one block)
+ */
- *new = 1;
- return 0;
+static inline unsigned int gfs2_extent_length(void *start, unsigned int len, __be64 *ptr, unsigned limit, int *eob)
+{
+ const __be64 *end = (start + len);
+ const __be64 *first = ptr;
+ u64 d = be64_to_cpu(*ptr);
+
+ *eob = 0;
+ do {
+ ptr++;
+ if (ptr >= end)
+ break;
+ if (limit && --limit == 0)
+ break;
+ if (d)
+ d++;
+ } while(be64_to_cpu(*ptr) == d);
+ if (ptr >= end)
+ *eob = 1;
+ return (ptr - first);
}
-static inline void bmap_lock(struct inode *inode, int create)
+static inline void bmap_lock(struct gfs2_inode *ip, int create)
{
- struct gfs2_inode *ip = GFS2_I(inode);
if (create)
down_write(&ip->i_rw_mutex);
else
down_read(&ip->i_rw_mutex);
}
-static inline void bmap_unlock(struct inode *inode, int create)
+static inline void bmap_unlock(struct gfs2_inode *ip, int create)
{
- struct gfs2_inode *ip = GFS2_I(inode);
if (create)
up_write(&ip->i_rw_mutex);
else
up_read(&ip->i_rw_mutex);
}
+static inline __be64 *gfs2_indirect_init(struct metapath *mp,
+ struct gfs2_glock *gl, unsigned int i,
+ unsigned offset, u64 bn)
+{
+ __be64 *ptr = (__be64 *)(mp->mp_bh[i - 1]->b_data +
+ ((i > 1) ? sizeof(struct gfs2_meta_header) :
+ sizeof(struct gfs2_dinode)));
+ BUG_ON(i < 1);
+ BUG_ON(mp->mp_bh[i] != NULL);
+ mp->mp_bh[i] = gfs2_meta_new(gl, bn);
+ gfs2_trans_add_bh(gl, mp->mp_bh[i], 1);
+ gfs2_metatype_set(mp->mp_bh[i], GFS2_METATYPE_IN, GFS2_FORMAT_IN);
+ gfs2_buffer_clear_tail(mp->mp_bh[i], sizeof(struct gfs2_meta_header));
+ ptr += offset;
+ *ptr = cpu_to_be64(bn);
+ return ptr;
+}
+
+enum alloc_state {
+ ALLOC_DATA = 0,
+ ALLOC_GROW_DEPTH = 1,
+ ALLOC_GROW_HEIGHT = 2,
+ /* ALLOC_UNSTUFF = 3, TBD and rather complicated */
+};
+
+/**
+ * gfs2_bmap_alloc - Build a metadata tree of the requested height
+ * @inode: The GFS2 inode
+ * @lblock: The logical starting block of the extent
+ * @bh_map: This is used to return the mapping details
+ * @mp: The metapath
+ * @sheight: The starting height (i.e. what's already mapped)
+ * @height: The height to build to
+ * @maxlen: The max number of data blocks to alloc
+ *
+ * In this routine we may have to alloc:
+ * i) Indirect blocks to grow the metadata tree height
+ * ii) Indirect blocks to fill in lower part of the metadata tree
+ * iii) Data blocks
+ *
+ * The function is in two parts. The first part works out the total
+ * number of blocks which we need. The second part does the actual
+ * allocation asking for an extent at a time (if enough contiguous free
+ * blocks are available, there will only be one request per bmap call)
+ * and uses the state machine to initialise the blocks in order.
+ *
+ * Returns: errno on error
+ */
+
+static int gfs2_bmap_alloc(struct inode *inode, const sector_t lblock,
+ struct buffer_head *bh_map, struct metapath *mp,
+ const unsigned int sheight,
+ const unsigned int height,
+ const unsigned int maxlen)
+{
+ struct gfs2_inode *ip = GFS2_I(inode);
+ struct gfs2_sbd *sdp = GFS2_SB(inode);
+ struct buffer_head *dibh = mp->mp_bh[0];
+ u64 bn, dblock = 0;
+ unsigned n, i, blks, alloced = 0, iblks = 0, zmpl = 0;
+ unsigned dblks = 0;
+ unsigned ptrs_per_blk;
+ const unsigned end_of_metadata = height - 1;
+ int eob = 0;
+ enum alloc_state state;
+ __be64 *ptr;
+ __be64 zero_bn = 0;
+
+ BUG_ON(sheight < 1);
+ BUG_ON(dibh == NULL);
+
+ gfs2_trans_add_bh(ip->i_gl, dibh, 1);
+
+ if (height == sheight) {
+ struct buffer_head *bh;
+ /* Bottom indirect block exists, find unalloced extent size */
+ ptr = metapointer(end_of_metadata, mp);
+ bh = mp->mp_bh[end_of_metadata];
+ dblks = gfs2_extent_length(bh->b_data, bh->b_size, ptr, maxlen,
+ &eob);
+ BUG_ON(dblks < 1);
+ state = ALLOC_DATA;
+ } else {
+ /* Need to allocate indirect blocks */
+ ptrs_per_blk = height > 1 ? sdp->sd_inptrs : sdp->sd_diptrs;
+ dblks = min(maxlen, ptrs_per_blk - mp->mp_list[end_of_metadata]);
+ if (height == ip->i_height) {
+ /* Writing into existing tree, extend tree down */
+ iblks = height - sheight;
+ state = ALLOC_GROW_DEPTH;
+ } else {
+ /* Building up tree height */
+ state = ALLOC_GROW_HEIGHT;
+ iblks = height - ip->i_height;
+ zmpl = zero_metapath_length(mp, height);
+ iblks -= zmpl;
+ iblks += height;
+ }
+ }
+
+ /* start of the second part of the function (state machine) */
+
+ blks = dblks + iblks;
+ i = sheight;
+ do {
+ n = blks - alloced;
+ bn = gfs2_alloc_block(ip, &n);
+ alloced += n;
+ if (state != ALLOC_DATA || gfs2_is_jdata(ip))
+ gfs2_trans_add_unrevoke(sdp, bn, n);
+ switch (state) {
+ /* Growing height of tree */
+ case ALLOC_GROW_HEIGHT:
+ if (i == 1) {
+ ptr = (__be64 *)(dibh->b_data +
+ sizeof(struct gfs2_dinode));
+ zero_bn = *ptr;
+ }
+ for (; i - 1 < height - ip->i_height && n > 0; i++, n--)
+ gfs2_indirect_init(mp, ip->i_gl, i, 0, bn++);
+ if (i - 1 == height - ip->i_height) {
+ i--;
+ gfs2_buffer_copy_tail(mp->mp_bh[i],
+ sizeof(struct gfs2_meta_header),
+ dibh, sizeof(struct gfs2_dinode));
+ gfs2_buffer_clear_tail(dibh,
+ sizeof(struct gfs2_dinode) +
+ sizeof(__be64));
+ ptr = (__be64 *)(mp->mp_bh[i]->b_data +
+ sizeof(struct gfs2_meta_header));
+ *ptr = zero_bn;
+ state = ALLOC_GROW_DEPTH;
+ for(i = zmpl; i < height; i++) {
+ if (mp->mp_bh[i] == NULL)
+ break;
+ brelse(mp->mp_bh[i]);
+ mp->mp_bh[i] = NULL;
+ }
+ i = zmpl;
+ }
+ if (n == 0)
+ break;
+ /* Branching from existing tree */
+ case ALLOC_GROW_DEPTH:
+ if (i > 1 && i < height)
+ gfs2_trans_add_bh(ip->i_gl, mp->mp_bh[i-1], 1);
+ for (; i < height && n > 0; i++, n--)
+ gfs2_indirect_init(mp, ip->i_gl, i,
+ mp->mp_list[i-1], bn++);
+ if (i == height)
+ state = ALLOC_DATA;
+ if (n == 0)
+ break;
+ /* Tree complete, adding data blocks */
+ case ALLOC_DATA:
+ BUG_ON(n > dblks);
+ BUG_ON(mp->mp_bh[end_of_metadata] == NULL);
+ gfs2_trans_add_bh(ip->i_gl, mp->mp_bh[end_of_metadata], 1);
+ dblks = n;
+ ptr = metapointer(end_of_metadata, mp);
+ dblock = bn;
+ while (n-- > 0)
+ *ptr++ = cpu_to_be64(bn++);
+ break;
+ }
+ } while (state != ALLOC_DATA);
+
+ ip->i_height = height;
+ gfs2_add_inode_blocks(&ip->i_inode, alloced);
+ gfs2_dinode_out(ip, mp->mp_bh[0]->b_data);
+ map_bh(bh_map, inode->i_sb, dblock);
+ bh_map->b_size = dblks << inode->i_blkbits;
+ set_buffer_new(bh_map);
+ return 0;
+}
+
/**
* gfs2_block_map - Map a block from an inode to a disk block
* @inode: The inode
* @lblock: The logical block number
* @bh_map: The bh to be mapped
+ * @create: True if it's ok to alloc blocks to satisfy the request
*
- * Find the block number on the current device which corresponds to an
- * inode's block. If the block had to be created, "new" will be set.
+ * Sets buffer_mapped() if successful, sets buffer_boundary() if a
+ * read of metadata will be required before the next block can be
+ * mapped. Sets buffer_new() if new blocks were allocated.
*
* Returns: errno
*/
@@ -457,97 +572,78 @@ int gfs2_block_map(struct inode *inode, sector_t lblock,
{
struct gfs2_inode *ip = GFS2_I(inode);
struct gfs2_sbd *sdp = GFS2_SB(inode);
- struct buffer_head *bh;
- unsigned int bsize;
- unsigned int height;
- unsigned int end_of_metadata;
- unsigned int x;
- int error = 0;
- int new = 0;
- u64 dblock = 0;
- int boundary;
- unsigned int maxlen = bh_map->b_size >> inode->i_blkbits;
- struct metapath mp;
+ unsigned int bsize = sdp->sd_sb.sb_bsize;
+ const unsigned int maxlen = bh_map->b_size >> inode->i_blkbits;
+ const u64 *arr = sdp->sd_heightsize;
+ __be64 *ptr;
u64 size;
- struct buffer_head *dibh = NULL;
+ struct metapath mp;
+ int ret;
+ int eob;
+ unsigned int len;
+ struct buffer_head *bh;
+ u8 height;
BUG_ON(maxlen == 0);
- if (gfs2_assert_warn(sdp, !gfs2_is_stuffed(ip)))
- return 0;
-
- bmap_lock(inode, create);
+ memset(mp.mp_bh, 0, sizeof(mp.mp_bh));
+ bmap_lock(ip, create);
clear_buffer_mapped(bh_map);
clear_buffer_new(bh_map);
clear_buffer_boundary(bh_map);
- bsize = gfs2_is_dir(ip) ? sdp->sd_jbsize : sdp->sd_sb.sb_bsize;
- size = (lblock + 1) * bsize;
-
- if (size > ip->i_di.di_size) {
- height = calc_tree_height(ip, size);
- if (ip->i_di.di_height < height) {
- if (!create)
- goto out_ok;
-
- error = build_height(inode, height);
- if (error)
- goto out_fail;
- }
+ if (gfs2_is_dir(ip)) {
+ bsize = sdp->sd_jbsize;
+ arr = sdp->sd_jheightsize;
}
- find_metapath(ip, lblock, &mp);
- end_of_metadata = ip->i_di.di_height - 1;
- error = gfs2_meta_inode_buffer(ip, &bh);
- if (error)
- goto out_fail;
- dibh = bh;
- get_bh(dibh);
+ ret = gfs2_meta_inode_buffer(ip, &mp.mp_bh[0]);
+ if (ret)
+ goto out;
- for (x = 0; x < end_of_metadata; x++) {
- lookup_block(ip, bh, x, &mp, create, &new, &dblock);
- brelse(bh);
- if (!dblock)
- goto out_ok;
+ height = ip->i_height;
+ size = (lblock + 1) * bsize;
+ while (size > arr[height])
+ height++;
+ find_metapath(sdp, lblock, &mp, height);
+ ret = 1;
+ if (height > ip->i_height || gfs2_is_stuffed(ip))
+ goto do_alloc;
+ ret = lookup_metapath(ip, &mp);
+ if (ret < 0)
+ goto out;
+ if (ret != ip->i_height)
+ goto do_alloc;
+ ptr = metapointer(ip->i_height - 1, &mp);
+ if (*ptr == 0)
+ goto do_alloc;
+ map_bh(bh_map, inode->i_sb, be64_to_cpu(*ptr));
+ bh = mp.mp_bh[ip->i_height - 1];
+ len = gfs2_extent_length(bh->b_data, bh->b_size, ptr, maxlen, &eob);
+ bh_map->b_size = (len << inode->i_blkbits);
+ if (eob)
+ set_buffer_boundary(bh_map);
+ ret = 0;
+out:
+ release_metapath(&mp);
+ bmap_unlock(ip, create);
+ return ret;
- error = gfs2_meta_indirect_buffer(ip, x+1, dblock, new, &bh);
- if (error)
- goto out_fail;
+do_alloc:
+ /* All allocations are done here, firstly check create flag */
+ if (!create) {
+ BUG_ON(gfs2_is_stuffed(ip));
+ ret = 0;
+ goto out;
}
- boundary = lookup_block(ip, bh, end_of_metadata, &mp, create, &new, &dblock);
- if (dblock) {
- map_bh(bh_map, inode->i_sb, dblock);
- if (boundary)
- set_buffer_boundary(bh_map);
- if (new) {
- gfs2_trans_add_bh(ip->i_gl, dibh, 1);
- gfs2_dinode_out(ip, dibh->b_data);
- set_buffer_new(bh_map);
- goto out_brelse;
- }
- while(--maxlen && !buffer_boundary(bh_map)) {
- u64 eblock;
-
- mp.mp_list[end_of_metadata]++;
- boundary = lookup_block(ip, bh, end_of_metadata, &mp, 0, &new, &eblock);
- if (eblock != ++dblock)
- break;
- bh_map->b_size += (1 << inode->i_blkbits);
- if (boundary)
- set_buffer_boundary(bh_map);
- }
- }
-out_brelse:
- brelse(bh);
-out_ok:
- error = 0;
-out_fail:
- if (dibh)
- brelse(dibh);
- bmap_unlock(inode, create);
- return error;
+ /* At this point ret is the tree depth of already allocated blocks */
+ ret = gfs2_bmap_alloc(inode, lblock, bh_map, &mp, ret, height, maxlen);
+ goto out;
}
+/*
+ * Deprecated: do not use in new code
+ */
int gfs2_extent_map(struct inode *inode, u64 lblock, int *new, u64 *dblock, unsigned *extlen)
{
struct buffer_head bh = { .b_state = 0, .b_blocknr = 0 };
@@ -558,7 +654,7 @@ int gfs2_extent_map(struct inode *inode, u64 lblock, int *new, u64 *dblock, unsi
BUG_ON(!dblock);
BUG_ON(!new);
- bh.b_size = 1 << (inode->i_blkbits + 5);
+ bh.b_size = 1 << (inode->i_blkbits + (create ? 0 : 5));
ret = gfs2_block_map(inode, lblock, &bh, create);
*extlen = bh.b_size >> inode->i_blkbits;
*dblock = bh.b_blocknr;
@@ -621,7 +717,7 @@ static int recursive_scan(struct gfs2_inode *ip, struct buffer_head *dibh,
if (error)
goto out;
- if (height < ip->i_di.di_height - 1)
+ if (height < ip->i_height - 1)
for (; top < bottom; top++, first = 0) {
if (!*top)
continue;
@@ -679,7 +775,7 @@ static int do_strip(struct gfs2_inode *ip, struct buffer_head *dibh,
sm->sm_first = 0;
}
- metadata = (height != ip->i_di.di_height - 1);
+ metadata = (height != ip->i_height - 1);
if (metadata)
revokes = (height) ? sdp->sd_inptrs : sdp->sd_diptrs;
@@ -713,7 +809,7 @@ static int do_strip(struct gfs2_inode *ip, struct buffer_head *dibh,
else
goto out; /* Nothing to do */
- gfs2_rlist_alloc(&rlist, LM_ST_EXCLUSIVE, 0);
+ gfs2_rlist_alloc(&rlist, LM_ST_EXCLUSIVE);
for (x = 0; x < rlist.rl_rgrps; x++) {
struct gfs2_rgrpd *rgd;
@@ -760,10 +856,7 @@ static int do_strip(struct gfs2_inode *ip, struct buffer_head *dibh,
}
*p = 0;
- if (!ip->i_di.di_blocks)
- gfs2_consist_inode(ip);
- ip->i_di.di_blocks--;
- gfs2_set_inode_blocks(&ip->i_inode);
+ gfs2_add_inode_blocks(&ip->i_inode, -1);
}
if (bstart) {
if (metadata)
@@ -804,19 +897,16 @@ static int do_grow(struct gfs2_inode *ip, u64 size)
struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
struct gfs2_alloc *al;
struct buffer_head *dibh;
- unsigned int h;
int error;
al = gfs2_alloc_get(ip);
+ if (!al)
+ return -ENOMEM;
- error = gfs2_quota_lock(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
+ error = gfs2_quota_lock_check(ip);
if (error)
goto out;
- error = gfs2_quota_check(ip, ip->i_inode.i_uid, ip->i_inode.i_gid);
- if (error)
- goto out_gunlock_q;
-
al->al_requested = sdp->sd_max_height + RES_DATA;
error = gfs2_inplace_reserve(ip);
@@ -829,34 +919,25 @@ static int do_grow(struct gfs2_inode *ip, u64 size)
if (error)
goto out_ipres;
+ error = gfs2_meta_inode_buffer(ip, &dibh);
+ if (error)
+ goto out_end_trans;
+
if (size > sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode)) {
if (gfs2_is_stuffed(ip)) {
error = gfs2_unstuff_dinode(ip, NULL);
if (error)
- goto out_end_trans;
- }
-
- h = calc_tree_height(ip, size);
- if (ip->i_di.di_height < h) {
- down_write(&ip->i_rw_mutex);
- error = build_height(&ip->i_inode, h);
- up_write(&ip->i_rw_mutex);
- if (error)
- goto out_end_trans;
+ goto out_brelse;
}
}
ip->i_di.di_size = size;
ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME;
-
- error = gfs2_meta_inode_buffer(ip, &dibh);
- if (error)
- goto out_end_trans;
-
gfs2_trans_add_bh(ip->i_gl, dibh, 1);
gfs2_dinode_out(ip, dibh->b_data);
- brelse(dibh);
+out_brelse:
+ brelse(dibh);
out_end_trans:
gfs2_trans_end(sdp);
out_ipres:
@@ -986,7 +1067,8 @@ out:
static int trunc_dealloc(struct gfs2_inode *ip, u64 size)
{
- unsigned int height = ip->i_di.di_height;
+ struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+ unsigned int height = ip->i_height;
u64 lblock;
struct metapath mp;
int error;
@@ -994,10 +1076,11 @@ static int trunc_dealloc(struct gfs2_inode *ip, u64 size)
if (!size)
lblock = 0;
else
- lblock = (size - 1) >> GFS2_SB(&ip->i_inode)->sd_sb.sb_bsize_shift;
+ lblock = (size - 1) >> sdp->sd_sb.sb_bsize_shift;
- find_metapath(ip, lblock, &mp);
- gfs2_alloc_get(ip);
+ find_metapath(sdp, lblock, &mp, ip->i_height);
+ if (!gfs2_alloc_get(ip))
+ return -ENOMEM;
error = gfs2_quota_hold(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
if (error)
@@ -1037,10 +1120,8 @@ static int trunc_end(struct gfs2_inode *ip)
goto out;
if (!ip->i_di.di_size) {
- ip->i_di.di_height = 0;
- ip->i_di.di_goal_meta =
- ip->i_di.di_goal_data =
- ip->i_no_addr;
+ ip->i_height = 0;
+ ip->i_goal = ip->i_no_addr;
gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode));
}
ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME;
@@ -1197,10 +1278,9 @@ int gfs2_write_alloc_required(struct gfs2_inode *ip, u64 offset,
unsigned int len, int *alloc_required)
{
struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
- u64 lblock, lblock_stop, dblock;
- u32 extlen;
- int new = 0;
- int error = 0;
+ struct buffer_head bh;
+ unsigned int shift;
+ u64 lblock, lblock_stop, size;
*alloc_required = 0;
@@ -1214,6 +1294,8 @@ int gfs2_write_alloc_required(struct gfs2_inode *ip, u64 offset,
return 0;
}
+ *alloc_required = 1;
+ shift = sdp->sd_sb.sb_bsize_shift;
if (gfs2_is_dir(ip)) {
unsigned int bsize = sdp->sd_jbsize;
lblock = offset;
@@ -1221,27 +1303,25 @@ int gfs2_write_alloc_required(struct gfs2_inode *ip, u64 offset,
lblock_stop = offset + len + bsize - 1;
do_div(lblock_stop, bsize);
} else {
- unsigned int shift = sdp->sd_sb.sb_bsize_shift;
u64 end_of_file = (ip->i_di.di_size + sdp->sd_sb.sb_bsize - 1) >> shift;
lblock = offset >> shift;
lblock_stop = (offset + len + sdp->sd_sb.sb_bsize - 1) >> shift;
- if (lblock_stop > end_of_file) {
- *alloc_required = 1;
+ if (lblock_stop > end_of_file)
return 0;
- }
}
- for (; lblock < lblock_stop; lblock += extlen) {
- error = gfs2_extent_map(&ip->i_inode, lblock, &new, &dblock, &extlen);
- if (error)
- return error;
-
- if (!dblock) {
- *alloc_required = 1;
+ size = (lblock_stop - lblock) << shift;
+ do {
+ bh.b_state = 0;
+ bh.b_size = size;
+ gfs2_block_map(&ip->i_inode, lblock, &bh, 0);
+ if (!buffer_mapped(&bh))
return 0;
- }
- }
+ size -= bh.b_size;
+ lblock += (bh.b_size >> ip->i_inode.i_blkbits);
+ } while(size > 0);
+ *alloc_required = 0;
return 0;
}
diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c
index c34709512b1..eed040d8ba3 100644
--- a/fs/gfs2/dir.c
+++ b/fs/gfs2/dir.c
@@ -159,6 +159,7 @@ static int gfs2_dir_write_data(struct gfs2_inode *ip, const char *buf,
unsigned int o;
int copied = 0;
int error = 0;
+ int new = 0;
if (!size)
return 0;
@@ -183,7 +184,6 @@ static int gfs2_dir_write_data(struct gfs2_inode *ip, const char *buf,
while (copied < size) {
unsigned int amount;
struct buffer_head *bh;
- int new = 0;
amount = size - copied;
if (amount > sdp->sd_sb.sb_bsize - o)
@@ -757,7 +757,7 @@ static struct gfs2_dirent *gfs2_dirent_search(struct inode *inode,
if (ip->i_di.di_flags & GFS2_DIF_EXHASH) {
struct gfs2_leaf *leaf;
- unsigned hsize = 1 << ip->i_di.di_depth;
+ unsigned hsize = 1 << ip->i_depth;
unsigned index;
u64 ln;
if (hsize * sizeof(u64) != ip->i_di.di_size) {
@@ -765,7 +765,7 @@ static struct gfs2_dirent *gfs2_dirent_search(struct inode *inode,
return ERR_PTR(-EIO);
}
- index = name->hash >> (32 - ip->i_di.di_depth);
+ index = name->hash >> (32 - ip->i_depth);
error = get_first_leaf(ip, index, &bh);
if (error)
return ERR_PTR(error);
@@ -803,14 +803,15 @@ got_dent:
static struct gfs2_leaf *new_leaf(struct inode *inode, struct buffer_head **pbh, u16 depth)
{
struct gfs2_inode *ip = GFS2_I(inode);
- u64 bn = gfs2_alloc_meta(ip);
+ unsigned int n = 1;
+ u64 bn = gfs2_alloc_block(ip, &n);
struct buffer_head *bh = gfs2_meta_new(ip->i_gl, bn);
struct gfs2_leaf *leaf;
struct gfs2_dirent *dent;
struct qstr name = { .name = "", .len = 0, .hash = 0 };
if (!bh)
return NULL;
-
+ gfs2_trans_add_unrevoke(GFS2_SB(inode), bn, 1);
gfs2_trans_add_bh(ip->i_gl, bh, 1);
gfs2_metatype_set(bh, GFS2_METATYPE_LF, GFS2_FORMAT_LF);
leaf = (struct gfs2_leaf *)bh->b_data;
@@ -905,12 +906,11 @@ static int dir_make_exhash(struct inode *inode)
*lp = cpu_to_be64(bn);
dip->i_di.di_size = sdp->sd_sb.sb_bsize / 2;
- dip->i_di.di_blocks++;
- gfs2_set_inode_blocks(&dip->i_inode);
+ gfs2_add_inode_blocks(&dip->i_inode, 1);
dip->i_di.di_flags |= GFS2_DIF_EXHASH;
for (x = sdp->sd_hash_ptrs, y = -1; x; x >>= 1, y++) ;
- dip->i_di.di_depth = y;
+ dip->i_depth = y;
gfs2_dinode_out(dip, dibh->b_data);
@@ -941,7 +941,7 @@ static int dir_split_leaf(struct inode *inode, const struct qstr *name)
int x, moved = 0;
int error;
- index = name->hash >> (32 - dip->i_di.di_depth);
+ index = name->hash >> (32 - dip->i_depth);
error = get_leaf_nr(dip, index, &leaf_no);
if (error)
return error;
@@ -952,7 +952,7 @@ static int dir_split_leaf(struct inode *inode, const struct qstr *name)
return error;
oleaf = (struct gfs2_leaf *)obh->b_data;
- if (dip->i_di.di_depth == be16_to_cpu(oleaf->lf_depth)) {
+ if (dip->i_depth == be16_to_cpu(oleaf->lf_depth)) {
brelse(obh);
return 1; /* can't split */
}
@@ -967,10 +967,10 @@ static int dir_split_leaf(struct inode *inode, const struct qstr *name)
bn = nbh->b_blocknr;
/* Compute the start and len of leaf pointers in the hash table. */
- len = 1 << (dip->i_di.di_depth - be16_to_cpu(oleaf->lf_depth));
+ len = 1 << (dip->i_depth - be16_to_cpu(oleaf->lf_depth));
half_len = len >> 1;
if (!half_len) {
- printk(KERN_WARNING "di_depth %u lf_depth %u index %u\n", dip->i_di.di_depth, be16_to_cpu(oleaf->lf_depth), index);
+ printk(KERN_WARNING "i_depth %u lf_depth %u index %u\n", dip->i_depth, be16_to_cpu(oleaf->lf_depth), index);
gfs2_consist_inode(dip);
error = -EIO;
goto fail_brelse;
@@ -997,7 +997,7 @@ static int dir_split_leaf(struct inode *inode, const struct qstr *name)
kfree(lp);
/* Compute the divider */
- divider = (start + half_len) << (32 - dip->i_di.di_depth);
+ divider = (start + half_len) << (32 - dip->i_depth);
/* Copy the entries */
dirent_first(dip, obh, &dent);
@@ -1021,13 +1021,13 @@ static int dir_split_leaf(struct inode *inode, const struct qstr *name)
new->de_inum = dent->de_inum; /* No endian worries */
new->de_type = dent->de_type; /* No endian worries */
- nleaf->lf_entries = cpu_to_be16(be16_to_cpu(nleaf->lf_entries)+1);
+ be16_add_cpu(&nleaf->lf_entries, 1);
dirent_del(dip, obh, prev, dent);
if (!oleaf->lf_entries)
gfs2_consist_inode(dip);
- oleaf->lf_entries = cpu_to_be16(be16_to_cpu(oleaf->lf_entries)-1);
+ be16_add_cpu(&oleaf->lf_entries, -1);
if (!prev)
prev = dent;
@@ -1044,8 +1044,7 @@ static int dir_split_leaf(struct inode *inode, const struct qstr *name)
error = gfs2_meta_inode_buffer(dip, &dibh);
if (!gfs2_assert_withdraw(GFS2_SB(&dip->i_inode), !error)) {
gfs2_trans_add_bh(dip->i_gl, dibh, 1);
- dip->i_di.di_blocks++;
- gfs2_set_inode_blocks(&dip->i_inode);
+ gfs2_add_inode_blocks(&dip->i_inode, 1);
gfs2_dinode_out(dip, dibh->b_data);
brelse(dibh);
}
@@ -1082,7 +1081,7 @@ static int dir_double_exhash(struct gfs2_inode *dip)
int x;
int error = 0;
- hsize = 1 << dip->i_di.di_depth;
+ hsize = 1 << dip->i_depth;
if (hsize * sizeof(u64) != dip->i_di.di_size) {
gfs2_consist_inode(dip);
return -EIO;
@@ -1090,7 +1089,7 @@ static int dir_double_exhash(struct gfs2_inode *dip)
/* Allocate both the "from" and "to" buffers in one big chunk */
- buf = kcalloc(3, sdp->sd_hash_bsize, GFP_KERNEL | __GFP_NOFAIL);
+ buf = kcalloc(3, sdp->sd_hash_bsize, GFP_NOFS | __GFP_NOFAIL);
for (block = dip->i_di.di_size >> sdp->sd_hash_bsize_shift; block--;) {
error = gfs2_dir_read_data(dip, (char *)buf,
@@ -1125,7 +1124,7 @@ static int dir_double_exhash(struct gfs2_inode *dip)
error = gfs2_meta_inode_buffer(dip, &dibh);
if (!gfs2_assert_withdraw(sdp, !error)) {
- dip->i_di.di_depth++;
+ dip->i_depth++;
gfs2_dinode_out(dip, dibh->b_data);
brelse(dibh);
}
@@ -1370,16 +1369,16 @@ static int dir_e_read(struct inode *inode, u64 *offset, void *opaque,
int error = 0;
unsigned depth = 0;
- hsize = 1 << dip->i_di.di_depth;
+ hsize = 1 << dip->i_depth;
if (hsize * sizeof(u64) != dip->i_di.di_size) {
gfs2_consist_inode(dip);
return -EIO;
}
hash = gfs2_dir_offset2hash(*offset);
- index = hash >> (32 - dip->i_di.di_depth);
+ index = hash >> (32 - dip->i_depth);
- lp = kmalloc(sdp->sd_hash_bsize, GFP_KERNEL);
+ lp = kmalloc(sdp->sd_hash_bsize, GFP_NOFS);
if (!lp)
return -ENOMEM;
@@ -1405,7 +1404,7 @@ static int dir_e_read(struct inode *inode, u64 *offset, void *opaque,
if (error)
break;
- len = 1 << (dip->i_di.di_depth - depth);
+ len = 1 << (dip->i_depth - depth);
index = (index & ~(len - 1)) + len;
}
@@ -1444,7 +1443,7 @@ int gfs2_dir_read(struct inode *inode, u64 *offset, void *opaque,
error = -ENOMEM;
/* 96 is max number of dirents which can be stuffed into an inode */
- darr = kmalloc(96 * sizeof(struct gfs2_dirent *), GFP_KERNEL);
+ darr = kmalloc(96 * sizeof(struct gfs2_dirent *), GFP_NOFS);
if (darr) {
g.pdent = darr;
g.offset = 0;
@@ -1549,7 +1548,7 @@ static int dir_new_leaf(struct inode *inode, const struct qstr *name)
u32 index;
u64 bn;
- index = name->hash >> (32 - ip->i_di.di_depth);
+ index = name->hash >> (32 - ip->i_depth);
error = get_first_leaf(ip, index, &obh);
if (error)
return error;
@@ -1579,8 +1578,7 @@ static int dir_new_leaf(struct inode *inode, const struct qstr *name)
if (error)
return error;
gfs2_trans_add_bh(ip->i_gl, bh, 1);
- ip->i_di.di_blocks++;
- gfs2_set_inode_blocks(&ip->i_inode);
+ gfs2_add_inode_blocks(&ip->i_inode, 1);
gfs2_dinode_out(ip, bh->b_data);
brelse(bh);
return 0;
@@ -1616,7 +1614,7 @@ int gfs2_dir_add(struct inode *inode, const struct qstr *name,
dent->de_type = cpu_to_be16(type);
if (ip->i_di.di_flags & GFS2_DIF_EXHASH) {
leaf = (struct gfs2_leaf *)bh->b_data;
- leaf->lf_entries = cpu_to_be16(be16_to_cpu(leaf->lf_entries) + 1);
+ be16_add_cpu(&leaf->lf_entries, 1);
}
brelse(bh);
error = gfs2_meta_inode_buffer(ip, &bh);
@@ -1641,7 +1639,7 @@ int gfs2_dir_add(struct inode *inode, const struct qstr *name,
continue;
if (error < 0)
break;
- if (ip->i_di.di_depth < GFS2_DIR_MAX_DEPTH) {
+ if (ip->i_depth < GFS2_DIR_MAX_DEPTH) {
error = dir_double_exhash(ip);
if (error)
break;
@@ -1785,13 +1783,13 @@ static int foreach_leaf(struct gfs2_inode *dip, leaf_call_t lc, void *data)
u64 leaf_no;
int error = 0;
- hsize = 1 << dip->i_di.di_depth;
+ hsize = 1 << dip->i_depth;
if (hsize * sizeof(u64) != dip->i_di.di_size) {
gfs2_consist_inode(dip);
return -EIO;
}
- lp = kmalloc(sdp->sd_hash_bsize, GFP_KERNEL);
+ lp = kmalloc(sdp->sd_hash_bsize, GFP_NOFS);
if (!lp)
return -ENOMEM;
@@ -1817,7 +1815,7 @@ static int foreach_leaf(struct gfs2_inode *dip, leaf_call_t lc, void *data)
if (error)
goto out;
leaf = (struct gfs2_leaf *)bh->b_data;
- len = 1 << (dip->i_di.di_depth - be16_to_cpu(leaf->lf_depth));
+ len = 1 << (dip->i_depth - be16_to_cpu(leaf->lf_depth));
brelse(bh);
error = lc(dip, index, len, leaf_no, data);
@@ -1866,15 +1864,18 @@ static int leaf_dealloc(struct gfs2_inode *dip, u32 index, u32 len,
memset(&rlist, 0, sizeof(struct gfs2_rgrp_list));
- ht = kzalloc(size, GFP_KERNEL);
+ ht = kzalloc(size, GFP_NOFS);
if (!ht)
return -ENOMEM;
- gfs2_alloc_get(dip);
+ if (!gfs2_alloc_get(dip)) {
+ error = -ENOMEM;
+ goto out;
+ }
error = gfs2_quota_hold(dip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
if (error)
- goto out;
+ goto out_put;
error = gfs2_rindex_hold(sdp, &dip->i_alloc->al_ri_gh);
if (error)
@@ -1894,7 +1895,7 @@ static int leaf_dealloc(struct gfs2_inode *dip, u32 index, u32 len,
l_blocks++;
}
- gfs2_rlist_alloc(&rlist, LM_ST_EXCLUSIVE, 0);
+ gfs2_rlist_alloc(&rlist, LM_ST_EXCLUSIVE);
for (x = 0; x < rlist.rl_rgrps; x++) {
struct gfs2_rgrpd *rgd;
@@ -1921,11 +1922,7 @@ static int leaf_dealloc(struct gfs2_inode *dip, u32 index, u32 len,
brelse(bh);
gfs2_free_meta(dip, blk, 1);
-
- if (!dip->i_di.di_blocks)
- gfs2_consist_inode(dip);
- dip->i_di.di_blocks--;
- gfs2_set_inode_blocks(&dip->i_inode);
+ gfs2_add_inode_blocks(&dip->i_inode, -1);
}
error = gfs2_dir_write_data(dip, ht, index * sizeof(u64), size);
@@ -1952,8 +1949,9 @@ out_rlist:
gfs2_glock_dq_uninit(&dip->i_alloc->al_ri_gh);
out_qs:
gfs2_quota_unhold(dip);
-out:
+out_put:
gfs2_alloc_put(dip);
+out:
kfree(ht);
return error;
}
diff --git a/fs/gfs2/eattr.c b/fs/gfs2/eattr.c
index bee99704ea1..e3f76f451b0 100644
--- a/fs/gfs2/eattr.c
+++ b/fs/gfs2/eattr.c
@@ -277,10 +277,7 @@ static int ea_dealloc_unstuffed(struct gfs2_inode *ip, struct buffer_head *bh,
}
*dataptrs = 0;
- if (!ip->i_di.di_blocks)
- gfs2_consist_inode(ip);
- ip->i_di.di_blocks--;
- gfs2_set_inode_blocks(&ip->i_inode);
+ gfs2_add_inode_blocks(&ip->i_inode, -1);
}
if (bstart)
gfs2_free_meta(ip, bstart, blen);
@@ -321,6 +318,8 @@ static int ea_remove_unstuffed(struct gfs2_inode *ip, struct buffer_head *bh,
int error;
al = gfs2_alloc_get(ip);
+ if (!al)
+ return -ENOMEM;
error = gfs2_quota_hold(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
if (error)
@@ -449,7 +448,7 @@ static int ea_get_unstuffed(struct gfs2_inode *ip, struct gfs2_ea_header *ea,
unsigned int x;
int error = 0;
- bh = kcalloc(nptrs, sizeof(struct buffer_head *), GFP_KERNEL);
+ bh = kcalloc(nptrs, sizeof(struct buffer_head *), GFP_NOFS);
if (!bh)
return -ENOMEM;
@@ -582,10 +581,11 @@ static int ea_alloc_blk(struct gfs2_inode *ip, struct buffer_head **bhp)
{
struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
struct gfs2_ea_header *ea;
+ unsigned int n = 1;
u64 block;
- block = gfs2_alloc_meta(ip);
-
+ block = gfs2_alloc_block(ip, &n);
+ gfs2_trans_add_unrevoke(sdp, block, 1);
*bhp = gfs2_meta_new(ip->i_gl, block);
gfs2_trans_add_bh(ip->i_gl, *bhp, 1);
gfs2_metatype_set(*bhp, GFS2_METATYPE_EA, GFS2_FORMAT_EA);
@@ -597,8 +597,7 @@ static int ea_alloc_blk(struct gfs2_inode *ip, struct buffer_head **bhp)
ea->ea_flags = GFS2_EAFLAG_LAST;
ea->ea_num_ptrs = 0;
- ip->i_di.di_blocks++;
- gfs2_set_inode_blocks(&ip->i_inode);
+ gfs2_add_inode_blocks(&ip->i_inode, 1);
return 0;
}
@@ -642,15 +641,15 @@ static int ea_write(struct gfs2_inode *ip, struct gfs2_ea_header *ea,
struct buffer_head *bh;
u64 block;
int mh_size = sizeof(struct gfs2_meta_header);
+ unsigned int n = 1;
- block = gfs2_alloc_meta(ip);
-
+ block = gfs2_alloc_block(ip, &n);
+ gfs2_trans_add_unrevoke(sdp, block, 1);
bh = gfs2_meta_new(ip->i_gl, block);
gfs2_trans_add_bh(ip->i_gl, bh, 1);
gfs2_metatype_set(bh, GFS2_METATYPE_ED, GFS2_FORMAT_ED);
- ip->i_di.di_blocks++;
- gfs2_set_inode_blocks(&ip->i_inode);
+ gfs2_add_inode_blocks(&ip->i_inode, 1);
copy = data_len > sdp->sd_jbsize ? sdp->sd_jbsize :
data_len;
@@ -684,15 +683,13 @@ static int ea_alloc_skeleton(struct gfs2_inode *ip, struct gfs2_ea_request *er,
int error;
al = gfs2_alloc_get(ip);
+ if (!al)
+ return -ENOMEM;
- error = gfs2_quota_lock(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
+ error = gfs2_quota_lock_check(ip);
if (error)
goto out;
- error = gfs2_quota_check(ip, ip->i_inode.i_uid, ip->i_inode.i_gid);
- if (error)
- goto out_gunlock_q;
-
al->al_requested = blks;
error = gfs2_inplace_reserve(ip);
@@ -966,9 +963,9 @@ static int ea_set_block(struct gfs2_inode *ip, struct gfs2_ea_request *er,
gfs2_trans_add_bh(ip->i_gl, indbh, 1);
} else {
u64 blk;
-
- blk = gfs2_alloc_meta(ip);
-
+ unsigned int n = 1;
+ blk = gfs2_alloc_block(ip, &n);
+ gfs2_trans_add_unrevoke(sdp, blk, 1);
indbh = gfs2_meta_new(ip->i_gl, blk);
gfs2_trans_add_bh(ip->i_gl, indbh, 1);
gfs2_metatype_set(indbh, GFS2_METATYPE_IN, GFS2_FORMAT_IN);
@@ -978,8 +975,7 @@ static int ea_set_block(struct gfs2_inode *ip, struct gfs2_ea_request *er,
*eablk = cpu_to_be64(ip->i_di.di_eattr);
ip->i_di.di_eattr = blk;
ip->i_di.di_flags |= GFS2_DIF_EA_INDIRECT;
- ip->i_di.di_blocks++;
- gfs2_set_inode_blocks(&ip->i_inode);
+ gfs2_add_inode_blocks(&ip->i_inode, 1);
eablk++;
}
@@ -1210,7 +1206,7 @@ static int ea_acl_chmod_unstuffed(struct gfs2_inode *ip,
unsigned int x;
int error;
- bh = kcalloc(nptrs, sizeof(struct buffer_head *), GFP_KERNEL);
+ bh = kcalloc(nptrs, sizeof(struct buffer_head *), GFP_NOFS);
if (!bh)
return -ENOMEM;
@@ -1347,7 +1343,7 @@ static int ea_dealloc_indirect(struct gfs2_inode *ip)
else
goto out;
- gfs2_rlist_alloc(&rlist, LM_ST_EXCLUSIVE, 0);
+ gfs2_rlist_alloc(&rlist, LM_ST_EXCLUSIVE);
for (x = 0; x < rlist.rl_rgrps; x++) {
struct gfs2_rgrpd *rgd;
@@ -1387,10 +1383,7 @@ static int ea_dealloc_indirect(struct gfs2_inode *ip)
}
*eablk = 0;
- if (!ip->i_di.di_blocks)
- gfs2_consist_inode(ip);
- ip->i_di.di_blocks--;
- gfs2_set_inode_blocks(&ip->i_inode);
+ gfs2_add_inode_blocks(&ip->i_inode, -1);
}
if (bstart)
gfs2_free_meta(ip, bstart, blen);
@@ -1442,10 +1435,7 @@ static int ea_dealloc_block(struct gfs2_inode *ip)
gfs2_free_meta(ip, ip->i_di.di_eattr, 1);
ip->i_di.di_eattr = 0;
- if (!ip->i_di.di_blocks)
- gfs2_consist_inode(ip);
- ip->i_di.di_blocks--;
- gfs2_set_inode_blocks(&ip->i_inode);
+ gfs2_add_inode_blocks(&ip->i_inode, -1);
error = gfs2_meta_inode_buffer(ip, &dibh);
if (!error) {
@@ -1474,6 +1464,8 @@ int gfs2_ea_dealloc(struct gfs2_inode *ip)
int error;
al = gfs2_alloc_get(ip);
+ if (!al)
+ return -ENOMEM;
error = gfs2_quota_hold(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
if (error)
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index 7175a4d0643..d636b3e80f5 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -1,6 +1,6 @@
/*
* Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
- * Copyright (C) 2004-2007 Red Hat, Inc. All rights reserved.
+ * Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved.
*
* This copyrighted material is made available to anyone wishing to use,
* modify, copy, or redistribute it subject to the terms and conditions
@@ -35,7 +35,6 @@
#include "glock.h"
#include "glops.h"
#include "inode.h"
-#include "lm.h"
#include "lops.h"
#include "meta_io.h"
#include "quota.h"
@@ -183,7 +182,8 @@ static void glock_free(struct gfs2_glock *gl)
struct gfs2_sbd *sdp = gl->gl_sbd;
struct inode *aspace = gl->gl_aspace;
- gfs2_lm_put_lock(sdp, gl->gl_lock);
+ if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
+ sdp->sd_lockstruct.ls_ops->lm_put_lock(gl->gl_lock);
if (aspace)
gfs2_aspace_put(aspace);
@@ -197,7 +197,7 @@ static void glock_free(struct gfs2_glock *gl)
*
*/
-void gfs2_glock_hold(struct gfs2_glock *gl)
+static void gfs2_glock_hold(struct gfs2_glock *gl)
{
atomic_inc(&gl->gl_ref);
}
@@ -293,6 +293,16 @@ static void glock_work_func(struct work_struct *work)
gfs2_glock_put(gl);
}
+static int gfs2_lm_get_lock(struct gfs2_sbd *sdp, struct lm_lockname *name,
+ void **lockp)
+{
+ int error = -EIO;
+ if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
+ error = sdp->sd_lockstruct.ls_ops->lm_get_lock(
+ sdp->sd_lockstruct.ls_lockspace, name, lockp);
+ return error;
+}
+
/**
* gfs2_glock_get() - Get a glock, or create one if one doesn't exist
* @sdp: The GFS2 superblock
@@ -338,8 +348,6 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number,
gl->gl_ip = 0;
gl->gl_ops = glops;
gl->gl_req_gh = NULL;
- gl->gl_req_bh = NULL;
- gl->gl_vn = 0;
gl->gl_stamp = jiffies;
gl->gl_tchange = jiffies;
gl->gl_object = NULL;
@@ -595,11 +603,12 @@ static void run_queue(struct gfs2_glock *gl)
blocked = rq_mutex(gh);
} else if (test_bit(GLF_DEMOTE, &gl->gl_flags)) {
blocked = rq_demote(gl);
- if (gl->gl_waiters2 && !blocked) {
+ if (test_bit(GLF_WAITERS2, &gl->gl_flags) &&
+ !blocked) {
set_bit(GLF_DEMOTE, &gl->gl_flags);
gl->gl_demote_state = LM_ST_UNLOCKED;
}
- gl->gl_waiters2 = 0;
+ clear_bit(GLF_WAITERS2, &gl->gl_flags);
} else if (!list_empty(&gl->gl_waiters3)) {
gh = list_entry(gl->gl_waiters3.next,
struct gfs2_holder, gh_list);
@@ -710,7 +719,7 @@ static void handle_callback(struct gfs2_glock *gl, unsigned int state,
} else if (gl->gl_demote_state != LM_ST_UNLOCKED &&
gl->gl_demote_state != state) {
if (test_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags))
- gl->gl_waiters2 = 1;
+ set_bit(GLF_WAITERS2, &gl->gl_flags);
else
gl->gl_demote_state = LM_ST_UNLOCKED;
}
@@ -743,6 +752,43 @@ static void state_change(struct gfs2_glock *gl, unsigned int new_state)
}
/**
+ * drop_bh - Called after a lock module unlock completes
+ * @gl: the glock
+ * @ret: the return status
+ *
+ * Doesn't wake up the process waiting on the struct gfs2_holder (if any)
+ * Doesn't drop the reference on the glock the top half took out
+ *
+ */
+
+static void drop_bh(struct gfs2_glock *gl, unsigned int ret)
+{
+ struct gfs2_sbd *sdp = gl->gl_sbd;
+ struct gfs2_holder *gh = gl->gl_req_gh;
+
+ gfs2_assert_warn(sdp, test_bit(GLF_LOCK, &gl->gl_flags));
+ gfs2_assert_warn(sdp, list_empty(&gl->gl_holders));
+ gfs2_assert_warn(sdp, !ret);
+
+ state_change(gl, LM_ST_UNLOCKED);
+
+ if (test_and_clear_bit(GLF_CONV_DEADLK, &gl->gl_flags)) {
+ spin_lock(&gl->gl_spin);
+ gh->gh_error = 0;
+ spin_unlock(&gl->gl_spin);
+ gfs2_glock_xmote_th(gl, gl->gl_req_gh);
+ gfs2_glock_put(gl);
+ return;
+ }
+
+ spin_lock(&gl->gl_spin);
+ gfs2_demote_wake(gl);
+ clear_bit(GLF_LOCK, &gl->gl_flags);
+ spin_unlock(&gl->gl_spin);
+ gfs2_glock_put(gl);
+}
+
+/**
* xmote_bh - Called after the lock module is done acquiring a lock
* @gl: The glock in question
* @ret: the int returned from the lock module
@@ -754,25 +800,19 @@ static void xmote_bh(struct gfs2_glock *gl, unsigned int ret)
struct gfs2_sbd *sdp = gl->gl_sbd;
const struct gfs2_glock_operations *glops = gl->gl_ops;
struct gfs2_holder *gh = gl->gl_req_gh;
- int prev_state = gl->gl_state;
int op_done = 1;
+ if (!gh && (ret & LM_OUT_ST_MASK) == LM_ST_UNLOCKED) {
+ drop_bh(gl, ret);
+ return;
+ }
+
gfs2_assert_warn(sdp, test_bit(GLF_LOCK, &gl->gl_flags));
gfs2_assert_warn(sdp, list_empty(&gl->gl_holders));
gfs2_assert_warn(sdp, !(ret & LM_OUT_ASYNC));
state_change(gl, ret & LM_OUT_ST_MASK);
- if (prev_state != LM_ST_UNLOCKED && !(ret & LM_OUT_CACHEABLE)) {
- if (glops->go_inval)
- glops->go_inval(gl, DIO_METADATA);
- } else if (gl->gl_state == LM_ST_DEFERRED) {
- /* We might not want to do this here.
- Look at moving to the inode glops. */
- if (glops->go_inval)
- glops->go_inval(gl, 0);
- }
-
/* Deal with each possible exit condition */
if (!gh) {
@@ -782,7 +822,6 @@ static void xmote_bh(struct gfs2_glock *gl, unsigned int ret)
} else {
spin_lock(&gl->gl_spin);
if (gl->gl_state != gl->gl_demote_state) {
- gl->gl_req_bh = NULL;
spin_unlock(&gl->gl_spin);
gfs2_glock_drop_th(gl);
gfs2_glock_put(gl);
@@ -793,6 +832,14 @@ static void xmote_bh(struct gfs2_glock *gl, unsigned int ret)
}
} else {
spin_lock(&gl->gl_spin);
+ if (ret & LM_OUT_CONV_DEADLK) {
+ gh->gh_error = 0;
+ set_bit(GLF_CONV_DEADLK, &gl->gl_flags);
+ spin_unlock(&gl->gl_spin);
+ gfs2_glock_drop_th(gl);
+ gfs2_glock_put(gl);
+ return;
+ }
list_del_init(&gh->gh_list);
gh->gh_error = -EIO;
if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
@@ -824,7 +871,6 @@ out:
if (op_done) {
spin_lock(&gl->gl_spin);
gl->gl_req_gh = NULL;
- gl->gl_req_bh = NULL;
clear_bit(GLF_LOCK, &gl->gl_flags);
spin_unlock(&gl->gl_spin);
}
@@ -835,6 +881,17 @@ out:
gfs2_holder_wake(gh);
}
+static unsigned int gfs2_lm_lock(struct gfs2_sbd *sdp, void *lock,
+ unsigned int cur_state, unsigned int req_state,
+ unsigned int flags)
+{
+ int ret = 0;
+ if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
+ ret = sdp->sd_lockstruct.ls_ops->lm_lock(lock, cur_state,
+ req_state, flags);
+ return ret;
+}
+
/**
* gfs2_glock_xmote_th - Call into the lock module to acquire or change a glock
* @gl: The glock in question
@@ -856,6 +913,8 @@ static void gfs2_glock_xmote_th(struct gfs2_glock *gl, struct gfs2_holder *gh)
if (glops->go_xmote_th)
glops->go_xmote_th(gl);
+ if (state == LM_ST_DEFERRED && glops->go_inval)
+ glops->go_inval(gl, DIO_METADATA);
gfs2_assert_warn(sdp, test_bit(GLF_LOCK, &gl->gl_flags));
gfs2_assert_warn(sdp, list_empty(&gl->gl_holders));
@@ -863,7 +922,6 @@ static void gfs2_glock_xmote_th(struct gfs2_glock *gl, struct gfs2_holder *gh)
gfs2_assert_warn(sdp, state != gl->gl_state);
gfs2_glock_hold(gl);
- gl->gl_req_bh = xmote_bh;
lck_ret = gfs2_lm_lock(sdp, gl->gl_lock, gl->gl_state, state, lck_flags);
@@ -876,49 +934,13 @@ static void gfs2_glock_xmote_th(struct gfs2_glock *gl, struct gfs2_holder *gh)
xmote_bh(gl, lck_ret);
}
-/**
- * drop_bh - Called after a lock module unlock completes
- * @gl: the glock
- * @ret: the return status
- *
- * Doesn't wake up the process waiting on the struct gfs2_holder (if any)
- * Doesn't drop the reference on the glock the top half took out
- *
- */
-
-static void drop_bh(struct gfs2_glock *gl, unsigned int ret)
+static unsigned int gfs2_lm_unlock(struct gfs2_sbd *sdp, void *lock,
+ unsigned int cur_state)
{
- struct gfs2_sbd *sdp = gl->gl_sbd;
- const struct gfs2_glock_operations *glops = gl->gl_ops;
- struct gfs2_holder *gh = gl->gl_req_gh;
-
- gfs2_assert_warn(sdp, test_bit(GLF_LOCK, &gl->gl_flags));
- gfs2_assert_warn(sdp, list_empty(&gl->gl_holders));
- gfs2_assert_warn(sdp, !ret);
-
- state_change(gl, LM_ST_UNLOCKED);
-
- if (glops->go_inval)
- glops->go_inval(gl, DIO_METADATA);
-
- if (gh) {
- spin_lock(&gl->gl_spin);
- list_del_init(&gh->gh_list);
- gh->gh_error = 0;
- spin_unlock(&gl->gl_spin);
- }
-
- spin_lock(&gl->gl_spin);
- gfs2_demote_wake(gl);
- gl->gl_req_gh = NULL;
- gl->gl_req_bh = NULL;
- clear_bit(GLF_LOCK, &gl->gl_flags);
- spin_unlock(&gl->gl_spin);
-
- gfs2_glock_put(gl);
-
- if (gh)
- gfs2_holder_wake(gh);
+ int ret = 0;
+ if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
+ ret = sdp->sd_lockstruct.ls_ops->lm_unlock(lock, cur_state);
+ return ret;
}
/**
@@ -935,13 +957,14 @@ static void gfs2_glock_drop_th(struct gfs2_glock *gl)
if (glops->go_xmote_th)
glops->go_xmote_th(gl);
+ if (glops->go_inval)
+ glops->go_inval(gl, DIO_METADATA);
gfs2_assert_warn(sdp, test_bit(GLF_LOCK, &gl->gl_flags));
gfs2_assert_warn(sdp, list_empty(&gl->gl_holders));
gfs2_assert_warn(sdp, gl->gl_state != LM_ST_UNLOCKED);
gfs2_glock_hold(gl);
- gl->gl_req_bh = drop_bh;
ret = gfs2_lm_unlock(sdp, gl->gl_lock, gl->gl_state);
@@ -964,16 +987,17 @@ static void gfs2_glock_drop_th(struct gfs2_glock *gl)
static void do_cancels(struct gfs2_holder *gh)
{
struct gfs2_glock *gl = gh->gh_gl;
+ struct gfs2_sbd *sdp = gl->gl_sbd;
spin_lock(&gl->gl_spin);
while (gl->gl_req_gh != gh &&
!test_bit(HIF_HOLDER, &gh->gh_iflags) &&
!list_empty(&gh->gh_list)) {
- if (gl->gl_req_bh && !(gl->gl_req_gh &&
- (gl->gl_req_gh->gh_flags & GL_NOCANCEL))) {
+ if (!(gl->gl_req_gh && (gl->gl_req_gh->gh_flags & GL_NOCANCEL))) {
spin_unlock(&gl->gl_spin);
- gfs2_lm_cancel(gl->gl_sbd, gl->gl_lock);
+ if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
+ sdp->sd_lockstruct.ls_ops->lm_cancel(gl->gl_lock);
msleep(100);
spin_lock(&gl->gl_spin);
} else {
@@ -1041,7 +1065,6 @@ static int glock_wait_internal(struct gfs2_holder *gh)
spin_lock(&gl->gl_spin);
gl->gl_req_gh = NULL;
- gl->gl_req_bh = NULL;
clear_bit(GLF_LOCK, &gl->gl_flags);
run_queue(gl);
spin_unlock(&gl->gl_spin);
@@ -1428,6 +1451,14 @@ void gfs2_glock_dq_uninit_m(unsigned int num_gh, struct gfs2_holder *ghs)
gfs2_glock_dq_uninit(&ghs[x]);
}
+static int gfs2_lm_hold_lvb(struct gfs2_sbd *sdp, void *lock, char **lvbp)
+{
+ int error = -EIO;
+ if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
+ error = sdp->sd_lockstruct.ls_ops->lm_hold_lvb(lock, lvbp);
+ return error;
+}
+
/**
* gfs2_lvb_hold - attach a LVB from a glock
* @gl: The glock in question
@@ -1463,12 +1494,15 @@ int gfs2_lvb_hold(struct gfs2_glock *gl)
void gfs2_lvb_unhold(struct gfs2_glock *gl)
{
+ struct gfs2_sbd *sdp = gl->gl_sbd;
+
gfs2_glock_hold(gl);
gfs2_glmutex_lock(gl);
gfs2_assert(gl->gl_sbd, atomic_read(&gl->gl_lvb_count) > 0);
if (atomic_dec_and_test(&gl->gl_lvb_count)) {
- gfs2_lm_unhold_lvb(gl->gl_sbd, gl->gl_lock, gl->gl_lvb);
+ if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
+ sdp->sd_lockstruct.ls_ops->lm_unhold_lvb(gl->gl_lock, gl->gl_lvb);
gl->gl_lvb = NULL;
gfs2_glock_put(gl);
}
@@ -1534,8 +1568,7 @@ void gfs2_glock_cb(void *cb_data, unsigned int type, void *data)
gl = gfs2_glock_find(sdp, &async->lc_name);
if (gfs2_assert_warn(sdp, gl))
return;
- if (!gfs2_assert_warn(sdp, gl->gl_req_bh))
- gl->gl_req_bh(gl, async->lc_ret);
+ xmote_bh(gl, async->lc_ret);
if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0)
gfs2_glock_put(gl);
up_read(&gfs2_umount_flush_sem);
@@ -1594,10 +1627,10 @@ void gfs2_glock_schedule_for_reclaim(struct gfs2_glock *gl)
gfs2_glock_hold(gl);
list_add(&gl->gl_reclaim, &sdp->sd_reclaim_list);
atomic_inc(&sdp->sd_reclaim_count);
- }
- spin_unlock(&sdp->sd_reclaim_lock);
-
- wake_up(&sdp->sd_reclaim_wq);
+ spin_unlock(&sdp->sd_reclaim_lock);
+ wake_up(&sdp->sd_reclaim_wq);
+ } else
+ spin_unlock(&sdp->sd_reclaim_lock);
}
/**
@@ -1897,7 +1930,6 @@ static int dump_glock(struct glock_iter *gi, struct gfs2_glock *gl)
print_dbg(gi, " gl_owner = -1\n");
print_dbg(gi, " gl_ip = %lu\n", gl->gl_ip);
print_dbg(gi, " req_gh = %s\n", (gl->gl_req_gh) ? "yes" : "no");
- print_dbg(gi, " req_bh = %s\n", (gl->gl_req_bh) ? "yes" : "no");
print_dbg(gi, " lvb_count = %d\n", atomic_read(&gl->gl_lvb_count));
print_dbg(gi, " object = %s\n", (gl->gl_object) ? "yes" : "no");
print_dbg(gi, " reclaim = %s\n",
diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h
index 2f9c6d136b3..cdad3e6f815 100644
--- a/fs/gfs2/glock.h
+++ b/fs/gfs2/glock.h
@@ -32,24 +32,23 @@
#define GLR_TRYFAILED 13
#define GLR_CANCELED 14
-static inline int gfs2_glock_is_locked_by_me(struct gfs2_glock *gl)
+static inline struct gfs2_holder *gfs2_glock_is_locked_by_me(struct gfs2_glock *gl)
{
struct gfs2_holder *gh;
- int locked = 0;
struct pid *pid;
/* Look in glock's list of holders for one with current task as owner */
spin_lock(&gl->gl_spin);
pid = task_pid(current);
list_for_each_entry(gh, &gl->gl_holders, gh_list) {
- if (gh->gh_owner_pid == pid) {
- locked = 1;
- break;
- }
+ if (gh->gh_owner_pid == pid)
+ goto out;
}
+ gh = NULL;
+out:
spin_unlock(&gl->gl_spin);
- return locked;
+ return gh;
}
static inline int gfs2_glock_is_held_excl(struct gfs2_glock *gl)
@@ -79,7 +78,6 @@ static inline int gfs2_glock_is_blocking(struct gfs2_glock *gl)
int gfs2_glock_get(struct gfs2_sbd *sdp,
u64 number, const struct gfs2_glock_operations *glops,
int create, struct gfs2_glock **glp);
-void gfs2_glock_hold(struct gfs2_glock *gl);
int gfs2_glock_put(struct gfs2_glock *gl);
void gfs2_holder_init(struct gfs2_glock *gl, unsigned int state, unsigned flags,
struct gfs2_holder *gh);
diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c
index c663b7a0f41..d31badadef8 100644
--- a/fs/gfs2/glops.c
+++ b/fs/gfs2/glops.c
@@ -1,6 +1,6 @@
/*
* Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
- * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
+ * Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved.
*
* This copyrighted material is made available to anyone wishing to use,
* modify, copy, or redistribute it subject to the terms and conditions
@@ -126,7 +126,13 @@ static void meta_go_inval(struct gfs2_glock *gl, int flags)
return;
gfs2_meta_inval(gl);
- gl->gl_vn++;
+ if (gl->gl_object == GFS2_I(gl->gl_sbd->sd_rindex))
+ gl->gl_sbd->sd_rindex_uptodate = 0;
+ else if (gl->gl_ops == &gfs2_rgrp_glops && gl->gl_object) {
+ struct gfs2_rgrpd *rgd = (struct gfs2_rgrpd *)gl->gl_object;
+
+ rgd->rd_flags &= ~GFS2_RDF_UPTODATE;
+ }
}
/**
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index 525dcae352d..9c2c0b90b22 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -1,6 +1,6 @@
/*
* Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
- * Copyright (C) 2004-2007 Red Hat, Inc. All rights reserved.
+ * Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved.
*
* This copyrighted material is made available to anyone wishing to use,
* modify, copy, or redistribute it subject to the terms and conditions
@@ -44,7 +44,6 @@ struct gfs2_log_header_host {
struct gfs2_log_operations {
void (*lo_add) (struct gfs2_sbd *sdp, struct gfs2_log_element *le);
- void (*lo_incore_commit) (struct gfs2_sbd *sdp, struct gfs2_trans *tr);
void (*lo_before_commit) (struct gfs2_sbd *sdp);
void (*lo_after_commit) (struct gfs2_sbd *sdp, struct gfs2_ail *ai);
void (*lo_before_scan) (struct gfs2_jdesc *jd,
@@ -70,7 +69,6 @@ struct gfs2_bitmap {
};
struct gfs2_rgrp_host {
- u32 rg_flags;
u32 rg_free;
u32 rg_dinodes;
u64 rg_igeneration;
@@ -87,17 +85,17 @@ struct gfs2_rgrpd {
u32 rd_data; /* num of data blocks in rgrp */
u32 rd_bitbytes; /* number of bytes in data bitmaps */
struct gfs2_rgrp_host rd_rg;
- u64 rd_rg_vn;
struct gfs2_bitmap *rd_bits;
unsigned int rd_bh_count;
struct mutex rd_mutex;
u32 rd_free_clone;
struct gfs2_log_element rd_le;
- u32 rd_last_alloc_data;
- u32 rd_last_alloc_meta;
+ u32 rd_last_alloc;
struct gfs2_sbd *rd_sbd;
- unsigned long rd_flags;
-#define GFS2_RDF_CHECK 0x0001 /* Need to check for unlinked inodes */
+ unsigned char rd_flags;
+#define GFS2_RDF_CHECK 0x01 /* Need to check for unlinked inodes */
+#define GFS2_RDF_NOALLOC 0x02 /* rg prohibits allocation */
+#define GFS2_RDF_UPTODATE 0x04 /* rg is up to date */
};
enum gfs2_state_bits {
@@ -168,6 +166,8 @@ enum {
GLF_DIRTY = 5,
GLF_DEMOTE_IN_PROGRESS = 6,
GLF_LFLUSH = 7,
+ GLF_WAITERS2 = 8,
+ GLF_CONV_DEADLK = 9,
};
struct gfs2_glock {
@@ -187,18 +187,15 @@ struct gfs2_glock {
struct list_head gl_holders;
struct list_head gl_waiters1; /* HIF_MUTEX */
struct list_head gl_waiters3; /* HIF_PROMOTE */
- int gl_waiters2; /* GIF_DEMOTE */
const struct gfs2_glock_operations *gl_ops;
struct gfs2_holder *gl_req_gh;
- gfs2_glop_bh_t gl_req_bh;
void *gl_lock;
char *gl_lvb;
atomic_t gl_lvb_count;
- u64 gl_vn;
unsigned long gl_stamp;
unsigned long gl_tchange;
void *gl_object;
@@ -213,6 +210,8 @@ struct gfs2_glock {
struct delayed_work gl_work;
};
+#define GFS2_MIN_LVB_SIZE 32 /* Min size of LVB that gfs2 supports */
+
struct gfs2_alloc {
/* Quota stuff */
@@ -241,14 +240,9 @@ enum {
struct gfs2_dinode_host {
u64 di_size; /* number of bytes in file */
- u64 di_blocks; /* number of blocks in file */
- u64 di_goal_meta; /* rgrp to alloc from next */
- u64 di_goal_data; /* data block goal */
u64 di_generation; /* generation number for NFS */
u32 di_flags; /* GFS2_DIF_... */
- u16 di_height; /* height of metadata */
/* These only apply to directories */
- u16 di_depth; /* Number of bits in the table */
u32 di_entries; /* The number of entries in the directory */
u64 di_eattr; /* extended attribute block number */
};
@@ -265,9 +259,10 @@ struct gfs2_inode {
struct gfs2_holder i_iopen_gh;
struct gfs2_holder i_gh; /* for prepare/commit_write only */
struct gfs2_alloc *i_alloc;
- u64 i_last_rg_alloc;
-
+ u64 i_goal; /* goal block for allocations */
struct rw_semaphore i_rw_mutex;
+ u8 i_height;
+ u8 i_depth;
};
/*
@@ -490,9 +485,9 @@ struct gfs2_sbd {
u32 sd_qc_per_block;
u32 sd_max_dirres; /* Max blocks needed to add a directory entry */
u32 sd_max_height; /* Max height of a file's metadata tree */
- u64 sd_heightsize[GFS2_MAX_META_HEIGHT];
+ u64 sd_heightsize[GFS2_MAX_META_HEIGHT + 1];
u32 sd_max_jheight; /* Max height of journaled file's meta tree */
- u64 sd_jheightsize[GFS2_MAX_META_HEIGHT];
+ u64 sd_jheightsize[GFS2_MAX_META_HEIGHT + 1];
struct gfs2_args sd_args; /* Mount arguments */
struct gfs2_tune sd_tune; /* Filesystem tuning structure */
@@ -533,7 +528,7 @@ struct gfs2_sbd {
/* Resource group stuff */
- u64 sd_rindex_vn;
+ int sd_rindex_uptodate;
spinlock_t sd_rindex_spin;
struct mutex sd_rindex_mutex;
struct list_head sd_rindex_list;
@@ -637,9 +632,6 @@ struct gfs2_sbd {
/* Counters */
- atomic_t sd_glock_count;
- atomic_t sd_glock_held_count;
- atomic_t sd_inode_count;
atomic_t sd_reclaimed;
char sd_fsname[GFS2_FSNAME_LEN];
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 37725ade3c5..3a9ef526c30 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -1,6 +1,6 @@
/*
* Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
- * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
+ * Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved.
*
* This copyrighted material is made available to anyone wishing to use,
* modify, copy, or redistribute it subject to the terms and conditions
@@ -149,7 +149,8 @@ void gfs2_set_iop(struct inode *inode)
} else if (S_ISLNK(mode)) {
inode->i_op = &gfs2_symlink_iops;
} else {
- inode->i_op = &gfs2_dev_iops;
+ inode->i_op = &gfs2_file_iops;
+ init_special_inode(inode, inode->i_mode, inode->i_rdev);
}
unlock_new_inode(inode);
@@ -248,12 +249,10 @@ static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf)
{
struct gfs2_dinode_host *di = &ip->i_di;
const struct gfs2_dinode *str = buf;
+ u16 height, depth;
- if (ip->i_no_addr != be64_to_cpu(str->di_num.no_addr)) {
- if (gfs2_consist_inode(ip))
- gfs2_dinode_print(ip);
- return -EIO;
- }
+ if (unlikely(ip->i_no_addr != be64_to_cpu(str->di_num.no_addr)))
+ goto corrupt;
ip->i_no_formal_ino = be64_to_cpu(str->di_num.no_formal_ino);
ip->i_inode.i_mode = be32_to_cpu(str->di_mode);
ip->i_inode.i_rdev = 0;
@@ -275,8 +274,7 @@ static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf)
ip->i_inode.i_nlink = be32_to_cpu(str->di_nlink);
di->di_size = be64_to_cpu(str->di_size);
i_size_write(&ip->i_inode, di->di_size);
- di->di_blocks = be64_to_cpu(str->di_blocks);
- gfs2_set_inode_blocks(&ip->i_inode);
+ gfs2_set_inode_blocks(&ip->i_inode, be64_to_cpu(str->di_blocks));
ip->i_inode.i_atime.tv_sec = be64_to_cpu(str->di_atime);
ip->i_inode.i_atime.tv_nsec = be32_to_cpu(str->di_atime_nsec);
ip->i_inode.i_mtime.tv_sec = be64_to_cpu(str->di_mtime);
@@ -284,15 +282,20 @@ static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf)
ip->i_inode.i_ctime.tv_sec = be64_to_cpu(str->di_ctime);
ip->i_inode.i_ctime.tv_nsec = be32_to_cpu(str->di_ctime_nsec);
- di->di_goal_meta = be64_to_cpu(str->di_goal_meta);
- di->di_goal_data = be64_to_cpu(str->di_goal_data);
+ ip->i_goal = be64_to_cpu(str->di_goal_meta);
di->di_generation = be64_to_cpu(str->di_generation);
di->di_flags = be32_to_cpu(str->di_flags);
gfs2_set_inode_flags(&ip->i_inode);
- di->di_height = be16_to_cpu(str->di_height);
-
- di->di_depth = be16_to_cpu(str->di_depth);
+ height = be16_to_cpu(str->di_height);
+ if (unlikely(height > GFS2_MAX_META_HEIGHT))
+ goto corrupt;
+ ip->i_height = (u8)height;
+
+ depth = be16_to_cpu(str->di_depth);
+ if (unlikely(depth > GFS2_DIR_MAX_DEPTH))
+ goto corrupt;
+ ip->i_depth = (u8)depth;
di->di_entries = be32_to_cpu(str->di_entries);
di->di_eattr = be64_to_cpu(str->di_eattr);
@@ -300,6 +303,10 @@ static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf)
gfs2_set_aops(&ip->i_inode);
return 0;
+corrupt:
+ if (gfs2_consist_inode(ip))
+ gfs2_dinode_print(ip);
+ return -EIO;
}
/**
@@ -337,13 +344,15 @@ int gfs2_dinode_dealloc(struct gfs2_inode *ip)
struct gfs2_rgrpd *rgd;
int error;
- if (ip->i_di.di_blocks != 1) {
+ if (gfs2_get_inode_blocks(&ip->i_inode) != 1) {
if (gfs2_consist_inode(ip))
gfs2_dinode_print(ip);
return -EIO;
}
al = gfs2_alloc_get(ip);
+ if (!al)
+ return -ENOMEM;
error = gfs2_quota_hold(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
if (error)
@@ -487,7 +496,7 @@ struct inode *gfs2_lookupi(struct inode *dir, const struct qstr *name,
return dir;
}
- if (gfs2_glock_is_locked_by_me(dip->i_gl) == 0) {
+ if (gfs2_glock_is_locked_by_me(dip->i_gl) == NULL) {
error = gfs2_glock_nq_init(dip->i_gl, LM_ST_SHARED, 0, &d_gh);
if (error)
return ERR_PTR(error);
@@ -818,7 +827,8 @@ static int make_dinode(struct gfs2_inode *dip, struct gfs2_glock *gl,
int error;
munge_mode_uid_gid(dip, &mode, &uid, &gid);
- gfs2_alloc_get(dip);
+ if (!gfs2_alloc_get(dip))
+ return -ENOMEM;
error = gfs2_quota_lock(dip, uid, gid);
if (error)
@@ -853,6 +863,8 @@ static int link_dinode(struct gfs2_inode *dip, const struct qstr *name,
int error;
al = gfs2_alloc_get(dip);
+ if (!al)
+ return -ENOMEM;
error = gfs2_quota_lock(dip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
if (error)
@@ -1219,7 +1231,7 @@ int gfs2_readlinki(struct gfs2_inode *ip, char **buf, unsigned int *len)
x = ip->i_di.di_size + 1;
if (x > *len) {
- *buf = kmalloc(x, GFP_KERNEL);
+ *buf = kmalloc(x, GFP_NOFS);
if (!*buf) {
error = -ENOMEM;
goto out_brelse;
@@ -1391,21 +1403,21 @@ void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf)
str->di_gid = cpu_to_be32(ip->i_inode.i_gid);
str->di_nlink = cpu_to_be32(ip->i_inode.i_nlink);
str->di_size = cpu_to_be64(di->di_size);
- str->di_blocks = cpu_to_be64(di->di_blocks);
+ str->di_blocks = cpu_to_be64(gfs2_get_inode_blocks(&ip->i_inode));
str->di_atime = cpu_to_be64(ip->i_inode.i_atime.tv_sec);
str->di_mtime = cpu_to_be64(ip->i_inode.i_mtime.tv_sec);
str->di_ctime = cpu_to_be64(ip->i_inode.i_ctime.tv_sec);
- str->di_goal_meta = cpu_to_be64(di->di_goal_meta);
- str->di_goal_data = cpu_to_be64(di->di_goal_data);
+ str->di_goal_meta = cpu_to_be64(ip->i_goal);
+ str->di_goal_data = cpu_to_be64(ip->i_goal);
str->di_generation = cpu_to_be64(di->di_generation);
str->di_flags = cpu_to_be32(di->di_flags);
- str->di_height = cpu_to_be16(di->di_height);
+ str->di_height = cpu_to_be16(ip->i_height);
str->di_payload_format = cpu_to_be32(S_ISDIR(ip->i_inode.i_mode) &&
!(ip->i_di.di_flags & GFS2_DIF_EXHASH) ?
GFS2_FORMAT_DE : 0);
- str->di_depth = cpu_to_be16(di->di_depth);
+ str->di_depth = cpu_to_be16(ip->i_depth);
str->di_entries = cpu_to_be32(di->di_entries);
str->di_eattr = cpu_to_be64(di->di_eattr);
@@ -1423,15 +1435,13 @@ void gfs2_dinode_print(const struct gfs2_inode *ip)
printk(KERN_INFO " no_addr = %llu\n",
(unsigned long long)ip->i_no_addr);
printk(KERN_INFO " di_size = %llu\n", (unsigned long long)di->di_size);
- printk(KERN_INFO " di_blocks = %llu\n",
- (unsigned long long)di->di_blocks);
- printk(KERN_INFO " di_goal_meta = %llu\n",
- (unsigned long long)di->di_goal_meta);
- printk(KERN_INFO " di_goal_data = %llu\n",
- (unsigned long long)di->di_goal_data);
+ printk(KERN_INFO " blocks = %llu\n",
+ (unsigned long long)gfs2_get_inode_blocks(&ip->i_inode));
+ printk(KERN_INFO " i_goal = %llu\n",
+ (unsigned long long)ip->i_goal);
printk(KERN_INFO " di_flags = 0x%.8X\n", di->di_flags);
- printk(KERN_INFO " di_height = %u\n", di->di_height);
- printk(KERN_INFO " di_depth = %u\n", di->di_depth);
+ printk(KERN_INFO " i_height = %u\n", ip->i_height);
+ printk(KERN_INFO " i_depth = %u\n", ip->i_depth);
printk(KERN_INFO " di_entries = %u\n", di->di_entries);
printk(KERN_INFO " di_eattr = %llu\n",
(unsigned long long)di->di_eattr);
diff --git a/fs/gfs2/inode.h b/fs/gfs2/inode.h
index d4465066261..580da454b38 100644
--- a/fs/gfs2/inode.h
+++ b/fs/gfs2/inode.h
@@ -10,9 +10,11 @@
#ifndef __INODE_DOT_H__
#define __INODE_DOT_H__
+#include "util.h"
+
static inline int gfs2_is_stuffed(const struct gfs2_inode *ip)
{
- return !ip->i_di.di_height;
+ return !ip->i_height;
}
static inline int gfs2_is_jdata(const struct gfs2_inode *ip)
@@ -37,13 +39,25 @@ static inline int gfs2_is_dir(const struct gfs2_inode *ip)
return S_ISDIR(ip->i_inode.i_mode);
}
-static inline void gfs2_set_inode_blocks(struct inode *inode)
+static inline void gfs2_set_inode_blocks(struct inode *inode, u64 blocks)
+{
+ inode->i_blocks = blocks <<
+ (GFS2_SB(inode)->sd_sb.sb_bsize_shift - GFS2_BASIC_BLOCK_SHIFT);
+}
+
+static inline u64 gfs2_get_inode_blocks(const struct inode *inode)
{
- struct gfs2_inode *ip = GFS2_I(inode);
- inode->i_blocks = ip->i_di.di_blocks <<
+ return inode->i_blocks >>
(GFS2_SB(inode)->sd_sb.sb_bsize_shift - GFS2_BASIC_BLOCK_SHIFT);
}
+static inline void gfs2_add_inode_blocks(struct inode *inode, s64 change)
+{
+ gfs2_assert(GFS2_SB(inode), (change >= 0 || inode->i_blocks > -change));
+ change *= (GFS2_SB(inode)->sd_sb.sb_bsize/GFS2_BASIC_BLOCK);
+ inode->i_blocks += change;
+}
+
static inline int gfs2_check_inum(const struct gfs2_inode *ip, u64 no_addr,
u64 no_formal_ino)
{
diff --git a/fs/gfs2/lm.c b/fs/gfs2/lm.c
deleted file mode 100644
index cfcc39b86a5..00000000000
--- a/fs/gfs2/lm.c
+++ /dev/null
@@ -1,210 +0,0 @@
-/*
- * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
- * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
- *
- * This copyrighted material is made available to anyone wishing to use,
- * modify, copy, or redistribute it subject to the terms and conditions
- * of the GNU General Public License version 2.
- */
-
-#include <linux/slab.h>
-#include <linux/spinlock.h>
-#include <linux/completion.h>
-#include <linux/buffer_head.h>
-#include <linux/delay.h>
-#include <linux/gfs2_ondisk.h>
-#include <linux/lm_interface.h>
-
-#include "gfs2.h"
-#include "incore.h"
-#include "glock.h"
-#include "lm.h"
-#include "super.h"
-#include "util.h"
-
-/**
- * gfs2_lm_mount - mount a locking protocol
- * @sdp: the filesystem
- * @args: mount arguements
- * @silent: if 1, don't complain if the FS isn't a GFS2 fs
- *
- * Returns: errno
- */
-
-int gfs2_lm_mount(struct gfs2_sbd *sdp, int silent)
-{
- char *proto = sdp->sd_proto_name;
- char *table = sdp->sd_table_name;
- int flags = 0;
- int error;
-
- if (sdp->sd_args.ar_spectator)
- flags |= LM_MFLAG_SPECTATOR;
-
- fs_info(sdp, "Trying to join cluster \"%s\", \"%s\"\n", proto, table);
-
- error = gfs2_mount_lockproto(proto, table, sdp->sd_args.ar_hostdata,
- gfs2_glock_cb, sdp,
- GFS2_MIN_LVB_SIZE, flags,
- &sdp->sd_lockstruct, &sdp->sd_kobj);
- if (error) {
- fs_info(sdp, "can't mount proto=%s, table=%s, hostdata=%s\n",
- proto, table, sdp->sd_args.ar_hostdata);
- goto out;
- }
-
- if (gfs2_assert_warn(sdp, sdp->sd_lockstruct.ls_lockspace) ||
- gfs2_assert_warn(sdp, sdp->sd_lockstruct.ls_ops) ||
- gfs2_assert_warn(sdp, sdp->sd_lockstruct.ls_lvb_size >=
- GFS2_MIN_LVB_SIZE)) {
- gfs2_unmount_lockproto(&sdp->sd_lockstruct);
- goto out;
- }
-
- if (sdp->sd_args.ar_spectator)
- snprintf(sdp->sd_fsname, GFS2_FSNAME_LEN, "%s.s", table);
- else
- snprintf(sdp->sd_fsname, GFS2_FSNAME_LEN, "%s.%u", table,
- sdp->sd_lockstruct.ls_jid);
-
- fs_info(sdp, "Joined cluster. Now mounting FS...\n");
-
- if ((sdp->sd_lockstruct.ls_flags & LM_LSFLAG_LOCAL) &&
- !sdp->sd_args.ar_ignore_local_fs) {
- sdp->sd_args.ar_localflocks = 1;
- sdp->sd_args.ar_localcaching = 1;
- }
-
-out:
- return error;
-}
-
-void gfs2_lm_others_may_mount(struct gfs2_sbd *sdp)
-{
- if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
- sdp->sd_lockstruct.ls_ops->lm_others_may_mount(
- sdp->sd_lockstruct.ls_lockspace);
-}
-
-void gfs2_lm_unmount(struct gfs2_sbd *sdp)
-{
- if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
- gfs2_unmount_lockproto(&sdp->sd_lockstruct);
-}
-
-int gfs2_lm_withdraw(struct gfs2_sbd *sdp, char *fmt, ...)
-{
- va_list args;
-
- if (test_and_set_bit(SDF_SHUTDOWN, &sdp->sd_flags))
- return 0;
-
- va_start(args, fmt);
- vprintk(fmt, args);
- va_end(args);
-
- fs_err(sdp, "about to withdraw this file system\n");
- BUG_ON(sdp->sd_args.ar_debug);
-
- fs_err(sdp, "telling LM to withdraw\n");
- gfs2_withdraw_lockproto(&sdp->sd_lockstruct);
- fs_err(sdp, "withdrawn\n");
- dump_stack();
-
- return -1;
-}
-
-int gfs2_lm_get_lock(struct gfs2_sbd *sdp, struct lm_lockname *name,
- void **lockp)
-{
- int error = -EIO;
- if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
- error = sdp->sd_lockstruct.ls_ops->lm_get_lock(
- sdp->sd_lockstruct.ls_lockspace, name, lockp);
- return error;
-}
-
-void gfs2_lm_put_lock(struct gfs2_sbd *sdp, void *lock)
-{
- if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
- sdp->sd_lockstruct.ls_ops->lm_put_lock(lock);
-}
-
-unsigned int gfs2_lm_lock(struct gfs2_sbd *sdp, void *lock,
- unsigned int cur_state, unsigned int req_state,
- unsigned int flags)
-{
- int ret = 0;
- if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
- ret = sdp->sd_lockstruct.ls_ops->lm_lock(lock, cur_state,
- req_state, flags);
- return ret;
-}
-
-unsigned int gfs2_lm_unlock(struct gfs2_sbd *sdp, void *lock,
- unsigned int cur_state)
-{
- int ret = 0;
- if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
- ret = sdp->sd_lockstruct.ls_ops->lm_unlock(lock, cur_state);
- return ret;
-}
-
-void gfs2_lm_cancel(struct gfs2_sbd *sdp, void *lock)
-{
- if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
- sdp->sd_lockstruct.ls_ops->lm_cancel(lock);
-}
-
-int gfs2_lm_hold_lvb(struct gfs2_sbd *sdp, void *lock, char **lvbp)
-{
- int error = -EIO;
- if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
- error = sdp->sd_lockstruct.ls_ops->lm_hold_lvb(lock, lvbp);
- return error;
-}
-
-void gfs2_lm_unhold_lvb(struct gfs2_sbd *sdp, void *lock, char *lvb)
-{
- if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
- sdp->sd_lockstruct.ls_ops->lm_unhold_lvb(lock, lvb);
-}
-
-int gfs2_lm_plock_get(struct gfs2_sbd *sdp, struct lm_lockname *name,
- struct file *file, struct file_lock *fl)
-{
- int error = -EIO;
- if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
- error = sdp->sd_lockstruct.ls_ops->lm_plock_get(
- sdp->sd_lockstruct.ls_lockspace, name, file, fl);
- return error;
-}
-
-int gfs2_lm_plock(struct gfs2_sbd *sdp, struct lm_lockname *name,
- struct file *file, int cmd, struct file_lock *fl)
-{
- int error = -EIO;
- if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
- error = sdp->sd_lockstruct.ls_ops->lm_plock(
- sdp->sd_lockstruct.ls_lockspace, name, file, cmd, fl);
- return error;
-}
-
-int gfs2_lm_punlock(struct gfs2_sbd *sdp, struct lm_lockname *name,
- struct file *file, struct file_lock *fl)
-{
- int error = -EIO;
- if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
- error = sdp->sd_lockstruct.ls_ops->lm_punlock(
- sdp->sd_lockstruct.ls_lockspace, name, file, fl);
- return error;
-}
-
-void gfs2_lm_recovery_done(struct gfs2_sbd *sdp, unsigned int jid,
- unsigned int message)
-{
- if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
- sdp->sd_lockstruct.ls_ops->lm_recovery_done(
- sdp->sd_lockstruct.ls_lockspace, jid, message);
-}
-
diff --git a/fs/gfs2/lm.h b/fs/gfs2/lm.h
deleted file mode 100644
index 21cdc30ee08..00000000000
--- a/fs/gfs2/lm.h
+++ /dev/null
@@ -1,42 +0,0 @@
-/*
- * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
- * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
- *
- * This copyrighted material is made available to anyone wishing to use,
- * modify, copy, or redistribute it subject to the terms and conditions
- * of the GNU General Public License version 2.
- */
-
-#ifndef __LM_DOT_H__
-#define __LM_DOT_H__
-
-struct gfs2_sbd;
-
-#define GFS2_MIN_LVB_SIZE 32
-
-int gfs2_lm_mount(struct gfs2_sbd *sdp, int silent);
-void gfs2_lm_others_may_mount(struct gfs2_sbd *sdp);
-void gfs2_lm_unmount(struct gfs2_sbd *sdp);
-int gfs2_lm_withdraw(struct gfs2_sbd *sdp, char *fmt, ...)
- __attribute__ ((format(printf, 2, 3)));
-int gfs2_lm_get_lock(struct gfs2_sbd *sdp, struct lm_lockname *name,
- void **lockp);
-void gfs2_lm_put_lock(struct gfs2_sbd *sdp, void *lock);
-unsigned int gfs2_lm_lock(struct gfs2_sbd *sdp, void *lock,
- unsigned int cur_state, unsigned int req_state,
- unsigned int flags);
-unsigned int gfs2_lm_unlock(struct gfs2_sbd *sdp, void *lock,
- unsigned int cur_state);
-void gfs2_lm_cancel(struct gfs2_sbd *sdp, void *lock);
-int gfs2_lm_hold_lvb(struct gfs2_sbd *sdp, void *lock, char **lvbp);
-void gfs2_lm_unhold_lvb(struct gfs2_sbd *sdp, void *lock, char *lvb);
-int gfs2_lm_plock_get(struct gfs2_sbd *sdp, struct lm_lockname *name,
- struct file *file, struct file_lock *fl);
-int gfs2_lm_plock(struct gfs2_sbd *sdp, struct lm_lockname *name,
- struct file *file, int cmd, struct file_lock *fl);
-int gfs2_lm_punlock(struct gfs2_sbd *sdp, struct lm_lockname *name,
- struct file *file, struct file_lock *fl);
-void gfs2_lm_recovery_done(struct gfs2_sbd *sdp, unsigned int jid,
- unsigned int message);
-
-#endif /* __LM_DOT_H__ */
diff --git a/fs/gfs2/locking/dlm/Makefile b/fs/gfs2/locking/dlm/Makefile
index 89b93b6b45c..2609bb6cd01 100644
--- a/fs/gfs2/locking/dlm/Makefile
+++ b/fs/gfs2/locking/dlm/Makefile
@@ -1,3 +1,3 @@
obj-$(CONFIG_GFS2_FS_LOCKING_DLM) += lock_dlm.o
-lock_dlm-y := lock.o main.o mount.o sysfs.o thread.o plock.o
+lock_dlm-y := lock.o main.o mount.o sysfs.o thread.o
diff --git a/fs/gfs2/locking/dlm/lock.c b/fs/gfs2/locking/dlm/lock.c
index 542a797ac89..cf7ea8abec8 100644
--- a/fs/gfs2/locking/dlm/lock.c
+++ b/fs/gfs2/locking/dlm/lock.c
@@ -137,7 +137,8 @@ static inline unsigned int make_flags(struct gdlm_lock *lp,
/* Conversion deadlock avoidance by DLM */
- if (!test_bit(LFL_FORCE_PROMOTE, &lp->flags) &&
+ if (!(lp->ls->fsflags & LM_MFLAG_CONV_NODROP) &&
+ !test_bit(LFL_FORCE_PROMOTE, &lp->flags) &&
!(lkf & DLM_LKF_NOQUEUE) &&
cur > DLM_LOCK_NL && req > DLM_LOCK_NL && cur != req)
lkf |= DLM_LKF_CONVDEADLK;
@@ -164,7 +165,7 @@ static int gdlm_create_lp(struct gdlm_ls *ls, struct lm_lockname *name,
{
struct gdlm_lock *lp;
- lp = kzalloc(sizeof(struct gdlm_lock), GFP_KERNEL);
+ lp = kzalloc(sizeof(struct gdlm_lock), GFP_NOFS);
if (!lp)
return -ENOMEM;
@@ -382,7 +383,7 @@ static int gdlm_add_lvb(struct gdlm_lock *lp)
{
char *lvb;
- lvb = kzalloc(GDLM_LVB_SIZE, GFP_KERNEL);
+ lvb = kzalloc(GDLM_LVB_SIZE, GFP_NOFS);
if (!lvb)
return -ENOMEM;
diff --git a/fs/gfs2/locking/dlm/lock_dlm.h b/fs/gfs2/locking/dlm/lock_dlm.h
index 9e8265d2837..a243cf69c54 100644
--- a/fs/gfs2/locking/dlm/lock_dlm.h
+++ b/fs/gfs2/locking/dlm/lock_dlm.h
@@ -25,6 +25,7 @@
#include <net/sock.h>
#include <linux/dlm.h>
+#include <linux/dlm_plock.h>
#include <linux/lm_interface.h>
/*
@@ -173,15 +174,9 @@ void gdlm_cancel(void *);
int gdlm_hold_lvb(void *, char **);
void gdlm_unhold_lvb(void *, char *);
-/* plock.c */
+/* mount.c */
+
+extern const struct lm_lockops gdlm_ops;
-int gdlm_plock_init(void);
-void gdlm_plock_exit(void);
-int gdlm_plock(void *, struct lm_lockname *, struct file *, int,
- struct file_lock *);
-int gdlm_plock_get(void *, struct lm_lockname *, struct file *,
- struct file_lock *);
-int gdlm_punlock(void *, struct lm_lockname *, struct file *,
- struct file_lock *);
#endif
diff --git a/fs/gfs2/locking/dlm/main.c b/fs/gfs2/locking/dlm/main.c
index a0e7eda643e..b9a03a7ff80 100644
--- a/fs/gfs2/locking/dlm/main.c
+++ b/fs/gfs2/locking/dlm/main.c
@@ -11,8 +11,6 @@
#include "lock_dlm.h"
-extern struct lm_lockops gdlm_ops;
-
static int __init init_lock_dlm(void)
{
int error;
@@ -30,13 +28,6 @@ static int __init init_lock_dlm(void)
return error;
}
- error = gdlm_plock_init();
- if (error) {
- gdlm_sysfs_exit();
- gfs2_unregister_lockproto(&gdlm_ops);
- return error;
- }
-
printk(KERN_INFO
"Lock_DLM (built %s %s) installed\n", __DATE__, __TIME__);
return 0;
@@ -44,7 +35,6 @@ static int __init init_lock_dlm(void)
static void __exit exit_lock_dlm(void)
{
- gdlm_plock_exit();
gdlm_sysfs_exit();
gfs2_unregister_lockproto(&gdlm_ops);
}
diff --git a/fs/gfs2/locking/dlm/mount.c b/fs/gfs2/locking/dlm/mount.c
index f2efff42422..470bdf650b5 100644
--- a/fs/gfs2/locking/dlm/mount.c
+++ b/fs/gfs2/locking/dlm/mount.c
@@ -236,6 +236,27 @@ static void gdlm_withdraw(void *lockspace)
gdlm_kobject_release(ls);
}
+static int gdlm_plock(void *lockspace, struct lm_lockname *name,
+ struct file *file, int cmd, struct file_lock *fl)
+{
+ struct gdlm_ls *ls = lockspace;
+ return dlm_posix_lock(ls->dlm_lockspace, name->ln_number, file, cmd, fl);
+}
+
+static int gdlm_punlock(void *lockspace, struct lm_lockname *name,
+ struct file *file, struct file_lock *fl)
+{
+ struct gdlm_ls *ls = lockspace;
+ return dlm_posix_unlock(ls->dlm_lockspace, name->ln_number, file, fl);
+}
+
+static int gdlm_plock_get(void *lockspace, struct lm_lockname *name,
+ struct file *file, struct file_lock *fl)
+{
+ struct gdlm_ls *ls = lockspace;
+ return dlm_posix_get(ls->dlm_lockspace, name->ln_number, file, fl);
+}
+
const struct lm_lockops gdlm_ops = {
.lm_proto_name = "lock_dlm",
.lm_mount = gdlm_mount,
diff --git a/fs/gfs2/locking/dlm/sysfs.c b/fs/gfs2/locking/dlm/sysfs.c
index a87b0983976..8479da47049 100644
--- a/fs/gfs2/locking/dlm/sysfs.c
+++ b/fs/gfs2/locking/dlm/sysfs.c
@@ -12,8 +12,6 @@
#include "lock_dlm.h"
-extern struct lm_lockops gdlm_ops;
-
static ssize_t proto_name_show(struct gdlm_ls *ls, char *buf)
{
return sprintf(buf, "%s\n", gdlm_ops.lm_proto_name);
diff --git a/fs/gfs2/locking/dlm/thread.c b/fs/gfs2/locking/dlm/thread.c
index 521694fc19d..e53db6fd28a 100644
--- a/fs/gfs2/locking/dlm/thread.c
+++ b/fs/gfs2/locking/dlm/thread.c
@@ -135,7 +135,15 @@ static void process_complete(struct gdlm_lock *lp)
lp->lksb.sb_status, lp->lockname.ln_type,
(unsigned long long)lp->lockname.ln_number,
lp->flags);
- return;
+ if (lp->lksb.sb_status == -EDEADLOCK &&
+ lp->ls->fsflags & LM_MFLAG_CONV_NODROP) {
+ lp->req = lp->cur;
+ acb.lc_ret |= LM_OUT_CONV_DEADLK;
+ if (lp->cur == DLM_LOCK_IV)
+ lp->lksb.sb_lkid = 0;
+ goto out;
+ } else
+ return;
}
/*
diff --git a/fs/gfs2/locking/nolock/main.c b/fs/gfs2/locking/nolock/main.c
index d3b8ce6fbbe..284a5ece8d9 100644
--- a/fs/gfs2/locking/nolock/main.c
+++ b/fs/gfs2/locking/nolock/main.c
@@ -140,7 +140,7 @@ static int nolock_hold_lvb(void *lock, char **lvbp)
struct nolock_lockspace *nl = lock;
int error = 0;
- *lvbp = kzalloc(nl->nl_lvb_size, GFP_KERNEL);
+ *lvbp = kzalloc(nl->nl_lvb_size, GFP_NOFS);
if (!*lvbp)
error = -ENOMEM;
diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c
index 161ab6f2058..548264b1836 100644
--- a/fs/gfs2/log.c
+++ b/fs/gfs2/log.c
@@ -769,8 +769,8 @@ static void log_refund(struct gfs2_sbd *sdp, struct gfs2_trans *tr)
sdp->sd_log_commited_revoke += tr->tr_num_revoke - tr->tr_num_revoke_rm;
gfs2_assert_withdraw(sdp, ((int)sdp->sd_log_commited_revoke) >= 0);
reserved = calc_reserved(sdp);
+ gfs2_assert_withdraw(sdp, sdp->sd_log_blks_reserved + tr->tr_reserved >= reserved);
unused = sdp->sd_log_blks_reserved - reserved + tr->tr_reserved;
- gfs2_assert_withdraw(sdp, unused >= 0);
atomic_add(unused, &sdp->sd_log_blks_free);
gfs2_assert_withdraw(sdp, atomic_read(&sdp->sd_log_blks_free) <=
sdp->sd_jdesc->jd_blocks);
@@ -779,6 +779,21 @@ static void log_refund(struct gfs2_sbd *sdp, struct gfs2_trans *tr)
gfs2_log_unlock(sdp);
}
+static void buf_lo_incore_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr)
+{
+ struct list_head *head = &tr->tr_list_buf;
+ struct gfs2_bufdata *bd;
+
+ gfs2_log_lock(sdp);
+ while (!list_empty(head)) {
+ bd = list_entry(head->next, struct gfs2_bufdata, bd_list_tr);
+ list_del_init(&bd->bd_list_tr);
+ tr->tr_num_buf--;
+ }
+ gfs2_log_unlock(sdp);
+ gfs2_assert_warn(sdp, !tr->tr_num_buf);
+}
+
/**
* gfs2_log_commit - Commit a transaction to the log
* @sdp: the filesystem
@@ -790,7 +805,7 @@ static void log_refund(struct gfs2_sbd *sdp, struct gfs2_trans *tr)
void gfs2_log_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr)
{
log_refund(sdp, tr);
- lops_incore_commit(sdp, tr);
+ buf_lo_incore_commit(sdp, tr);
sdp->sd_vfs->s_dirt = 1;
up_read(&sdp->sd_log_flush_lock);
diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c
index fae59d69d01..4390f6f4047 100644
--- a/fs/gfs2/lops.c
+++ b/fs/gfs2/lops.c
@@ -152,21 +152,6 @@ out:
unlock_buffer(bd->bd_bh);
}
-static void buf_lo_incore_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr)
-{
- struct list_head *head = &tr->tr_list_buf;
- struct gfs2_bufdata *bd;
-
- gfs2_log_lock(sdp);
- while (!list_empty(head)) {
- bd = list_entry(head->next, struct gfs2_bufdata, bd_list_tr);
- list_del_init(&bd->bd_list_tr);
- tr->tr_num_buf--;
- }
- gfs2_log_unlock(sdp);
- gfs2_assert_warn(sdp, !tr->tr_num_buf);
-}
-
static void buf_lo_before_commit(struct gfs2_sbd *sdp)
{
struct buffer_head *bh;
@@ -419,8 +404,10 @@ static int revoke_lo_scan_elements(struct gfs2_jdesc *jd, unsigned int start,
blkno = be64_to_cpu(*(__be64 *)(bh->b_data + offset));
error = gfs2_revoke_add(sdp, blkno, start);
- if (error < 0)
+ if (error < 0) {
+ brelse(bh);
return error;
+ }
else if (error)
sdp->sd_found_revokes++;
@@ -737,7 +724,6 @@ static void databuf_lo_after_commit(struct gfs2_sbd *sdp, struct gfs2_ail *ai)
const struct gfs2_log_operations gfs2_buf_lops = {
.lo_add = buf_lo_add,
- .lo_incore_commit = buf_lo_incore_commit,
.lo_before_commit = buf_lo_before_commit,
.lo_after_commit = buf_lo_after_commit,
.lo_before_scan = buf_lo_before_scan,
@@ -763,7 +749,6 @@ const struct gfs2_log_operations gfs2_rg_lops = {
const struct gfs2_log_operations gfs2_databuf_lops = {
.lo_add = databuf_lo_add,
- .lo_incore_commit = buf_lo_incore_commit,
.lo_before_commit = databuf_lo_before_commit,
.lo_after_commit = databuf_lo_after_commit,
.lo_scan_elements = databuf_lo_scan_elements,
diff --git a/fs/gfs2/lops.h b/fs/gfs2/lops.h
index 41a00df7558..3c0b2737658 100644
--- a/fs/gfs2/lops.h
+++ b/fs/gfs2/lops.h
@@ -1,6 +1,6 @@
/*
* Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
- * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
+ * Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved.
*
* This copyrighted material is made available to anyone wishing to use,
* modify, copy, or redistribute it subject to the terms and conditions
@@ -57,15 +57,6 @@ static inline void lops_add(struct gfs2_sbd *sdp, struct gfs2_log_element *le)
le->le_ops->lo_add(sdp, le);
}
-static inline void lops_incore_commit(struct gfs2_sbd *sdp,
- struct gfs2_trans *tr)
-{
- int x;
- for (x = 0; gfs2_log_ops[x]; x++)
- if (gfs2_log_ops[x]->lo_incore_commit)
- gfs2_log_ops[x]->lo_incore_commit(sdp, tr);
-}
-
static inline void lops_before_commit(struct gfs2_sbd *sdp)
{
int x;
diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c
index 9c7765c12d6..053e2ebbbd5 100644
--- a/fs/gfs2/main.c
+++ b/fs/gfs2/main.c
@@ -89,6 +89,12 @@ static int __init init_gfs2_fs(void)
if (!gfs2_bufdata_cachep)
goto fail;
+ gfs2_rgrpd_cachep = kmem_cache_create("gfs2_rgrpd",
+ sizeof(struct gfs2_rgrpd),
+ 0, 0, NULL);
+ if (!gfs2_rgrpd_cachep)
+ goto fail;
+
error = register_filesystem(&gfs2_fs_type);
if (error)
goto fail;
@@ -108,6 +114,9 @@ fail_unregister:
fail:
gfs2_glock_exit();
+ if (gfs2_rgrpd_cachep)
+ kmem_cache_destroy(gfs2_rgrpd_cachep);
+
if (gfs2_bufdata_cachep)
kmem_cache_destroy(gfs2_bufdata_cachep);
@@ -133,6 +142,7 @@ static void __exit exit_gfs2_fs(void)
unregister_filesystem(&gfs2_fs_type);
unregister_filesystem(&gfs2meta_fs_type);
+ kmem_cache_destroy(gfs2_rgrpd_cachep);
kmem_cache_destroy(gfs2_bufdata_cachep);
kmem_cache_destroy(gfs2_inode_cachep);
kmem_cache_destroy(gfs2_glock_cachep);
diff --git a/fs/gfs2/ops_address.c b/fs/gfs2/ops_address.c
index ac772b6d9db..90a04a6e378 100644
--- a/fs/gfs2/ops_address.c
+++ b/fs/gfs2/ops_address.c
@@ -1,6 +1,6 @@
/*
* Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
- * Copyright (C) 2004-2007 Red Hat, Inc. All rights reserved.
+ * Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved.
*
* This copyrighted material is made available to anyone wishing to use,
* modify, copy, or redistribute it subject to the terms and conditions
@@ -21,7 +21,6 @@
#include <linux/gfs2_ondisk.h>
#include <linux/lm_interface.h>
#include <linux/backing-dev.h>
-#include <linux/pagevec.h>
#include "gfs2.h"
#include "incore.h"
@@ -104,11 +103,9 @@ static int gfs2_writepage_common(struct page *page,
loff_t i_size = i_size_read(inode);
pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
unsigned offset;
- int ret = -EIO;
if (gfs2_assert_withdraw(sdp, gfs2_glock_is_held_excl(ip->i_gl)))
goto out;
- ret = 0;
if (current->journal_info)
goto redirty;
/* Is the page fully outside i_size? (truncate in progress) */
@@ -280,7 +277,7 @@ static int gfs2_write_jdata_pagevec(struct address_space *mapping,
int i;
int ret;
- ret = gfs2_trans_begin(sdp, nrblocks, 0);
+ ret = gfs2_trans_begin(sdp, nrblocks, nrblocks);
if (ret < 0)
return ret;
@@ -510,23 +507,26 @@ static int __gfs2_readpage(void *file, struct page *page)
static int gfs2_readpage(struct file *file, struct page *page)
{
struct gfs2_inode *ip = GFS2_I(page->mapping->host);
- struct gfs2_holder gh;
+ struct gfs2_holder *gh;
int error;
- gfs2_holder_init(ip->i_gl, LM_ST_SHARED, GL_ATIME|LM_FLAG_TRY_1CB, &gh);
- error = gfs2_glock_nq_atime(&gh);
- if (unlikely(error)) {
+ gh = gfs2_glock_is_locked_by_me(ip->i_gl);
+ if (!gh) {
+ gh = kmalloc(sizeof(struct gfs2_holder), GFP_NOFS);
+ if (!gh)
+ return -ENOBUFS;
+ gfs2_holder_init(ip->i_gl, LM_ST_SHARED, GL_ATIME, gh);
unlock_page(page);
- goto out;
+ error = gfs2_glock_nq_atime(gh);
+ if (likely(error != 0))
+ goto out;
+ return AOP_TRUNCATED_PAGE;
}
error = __gfs2_readpage(file, page);
- gfs2_glock_dq(&gh);
+ gfs2_glock_dq(gh);
out:
- gfs2_holder_uninit(&gh);
- if (error == GLR_TRYFAILED) {
- yield();
- return AOP_TRUNCATED_PAGE;
- }
+ gfs2_holder_uninit(gh);
+ kfree(gh);
return error;
}
@@ -648,15 +648,15 @@ static int gfs2_write_begin(struct file *file, struct address_space *mapping,
if (alloc_required) {
al = gfs2_alloc_get(ip);
+ if (!al) {
+ error = -ENOMEM;
+ goto out_unlock;
+ }
- error = gfs2_quota_lock(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
+ error = gfs2_quota_lock_check(ip);
if (error)
goto out_alloc_put;
- error = gfs2_quota_check(ip, ip->i_inode.i_uid, ip->i_inode.i_gid);
- if (error)
- goto out_qunlock;
-
al->al_requested = data_blocks + ind_blocks;
error = gfs2_inplace_reserve(ip);
if (error)
@@ -828,7 +828,7 @@ static int gfs2_write_end(struct file *file, struct address_space *mapping,
unsigned int to = from + len;
int ret;
- BUG_ON(gfs2_glock_is_locked_by_me(ip->i_gl) == 0);
+ BUG_ON(gfs2_glock_is_locked_by_me(ip->i_gl) == NULL);
ret = gfs2_meta_inode_buffer(ip, &dibh);
if (unlikely(ret)) {
diff --git a/fs/gfs2/ops_dentry.c b/fs/gfs2/ops_dentry.c
index 793e334d098..4a5e676b442 100644
--- a/fs/gfs2/ops_dentry.c
+++ b/fs/gfs2/ops_dentry.c
@@ -43,7 +43,7 @@ static int gfs2_drevalidate(struct dentry *dentry, struct nameidata *nd)
struct gfs2_holder d_gh;
struct gfs2_inode *ip = NULL;
int error;
- int had_lock=0;
+ int had_lock = 0;
if (inode) {
if (is_bad_inode(inode))
@@ -54,7 +54,7 @@ static int gfs2_drevalidate(struct dentry *dentry, struct nameidata *nd)
if (sdp->sd_args.ar_localcaching)
goto valid;
- had_lock = gfs2_glock_is_locked_by_me(dip->i_gl);
+ had_lock = (gfs2_glock_is_locked_by_me(dip->i_gl) != NULL);
if (!had_lock) {
error = gfs2_glock_nq_init(dip->i_gl, LM_ST_SHARED, 0, &d_gh);
if (error)
diff --git a/fs/gfs2/ops_export.c b/fs/gfs2/ops_export.c
index 334c7f85351..990d9f4bc46 100644
--- a/fs/gfs2/ops_export.c
+++ b/fs/gfs2/ops_export.c
@@ -204,8 +204,6 @@ static struct dentry *gfs2_get_dentry(struct super_block *sb,
inode = gfs2_inode_lookup(sb, DT_UNKNOWN,
inum->no_addr,
0, 0);
- if (!inode)
- goto fail;
if (IS_ERR(inode)) {
error = PTR_ERR(inode);
goto fail;
diff --git a/fs/gfs2/ops_file.c b/fs/gfs2/ops_file.c
index f4842f2548c..e1b7d525a06 100644
--- a/fs/gfs2/ops_file.c
+++ b/fs/gfs2/ops_file.c
@@ -30,7 +30,6 @@
#include "glock.h"
#include "glops.h"
#include "inode.h"
-#include "lm.h"
#include "log.h"
#include "meta_io.h"
#include "quota.h"
@@ -39,6 +38,7 @@
#include "util.h"
#include "eaops.h"
#include "ops_address.h"
+#include "ops_inode.h"
/**
* gfs2_llseek - seek to a location in a file
@@ -369,12 +369,9 @@ static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct page *page)
if (al == NULL)
goto out_unlock;
- ret = gfs2_quota_lock(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
+ ret = gfs2_quota_lock_check(ip);
if (ret)
goto out_alloc_put;
- ret = gfs2_quota_check(ip, ip->i_inode.i_uid, ip->i_inode.i_gid);
- if (ret)
- goto out_quota_unlock;
al->al_requested = data_blocks + ind_blocks;
ret = gfs2_inplace_reserve(ip);
if (ret)
@@ -596,6 +593,36 @@ static int gfs2_setlease(struct file *file, long arg, struct file_lock **fl)
return generic_setlease(file, arg, fl);
}
+static int gfs2_lm_plock_get(struct gfs2_sbd *sdp, struct lm_lockname *name,
+ struct file *file, struct file_lock *fl)
+{
+ int error = -EIO;
+ if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
+ error = sdp->sd_lockstruct.ls_ops->lm_plock_get(
+ sdp->sd_lockstruct.ls_lockspace, name, file, fl);
+ return error;
+}
+
+static int gfs2_lm_plock(struct gfs2_sbd *sdp, struct lm_lockname *name,
+ struct file *file, int cmd, struct file_lock *fl)
+{
+ int error = -EIO;
+ if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
+ error = sdp->sd_lockstruct.ls_ops->lm_plock(
+ sdp->sd_lockstruct.ls_lockspace, name, file, cmd, fl);
+ return error;
+}
+
+static int gfs2_lm_punlock(struct gfs2_sbd *sdp, struct lm_lockname *name,
+ struct file *file, struct file_lock *fl)
+{
+ int error = -EIO;
+ if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
+ error = sdp->sd_lockstruct.ls_ops->lm_punlock(
+ sdp->sd_lockstruct.ls_lockspace, name, file, fl);
+ return error;
+}
+
/**
* gfs2_lock - acquire/release a posix lock on a file
* @file: the file pointer
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index 4bee6aa845e..ef9c6c4f80f 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -1,6 +1,6 @@
/*
* Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
- * Copyright (C) 2004-2007 Red Hat, Inc. All rights reserved.
+ * Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved.
*
* This copyrighted material is made available to anyone wishing to use,
* modify, copy, or redistribute it subject to the terms and conditions
@@ -26,7 +26,6 @@
#include "glock.h"
#include "glops.h"
#include "inode.h"
-#include "lm.h"
#include "mount.h"
#include "ops_fstype.h"
#include "ops_dentry.h"
@@ -363,6 +362,13 @@ static int map_journal_extents(struct gfs2_sbd *sdp)
return rc;
}
+static void gfs2_lm_others_may_mount(struct gfs2_sbd *sdp)
+{
+ if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
+ sdp->sd_lockstruct.ls_ops->lm_others_may_mount(
+ sdp->sd_lockstruct.ls_lockspace);
+}
+
static int init_journal(struct gfs2_sbd *sdp, int undo)
{
struct gfs2_holder ji_gh;
@@ -542,7 +548,7 @@ static int init_inodes(struct gfs2_sbd *sdp, int undo)
}
ip = GFS2_I(sdp->sd_rindex);
set_bit(GLF_STICKY, &ip->i_gl->gl_flags);
- sdp->sd_rindex_vn = ip->i_gl->gl_vn - 1;
+ sdp->sd_rindex_uptodate = 0;
/* Read in the quota inode */
sdp->sd_quota_inode = gfs2_lookup_simple(sdp->sd_master_dir, "quota");
@@ -705,6 +711,69 @@ fail:
}
/**
+ * gfs2_lm_mount - mount a locking protocol
+ * @sdp: the filesystem
+ * @args: mount arguments
+ * @silent: if 1, don't complain if the FS isn't a GFS2 fs
+ *
+ * Returns: errno
+ */
+
+static int gfs2_lm_mount(struct gfs2_sbd *sdp, int silent)
+{
+ char *proto = sdp->sd_proto_name;
+ char *table = sdp->sd_table_name;
+ int flags = LM_MFLAG_CONV_NODROP;
+ int error;
+
+ if (sdp->sd_args.ar_spectator)
+ flags |= LM_MFLAG_SPECTATOR;
+
+ fs_info(sdp, "Trying to join cluster \"%s\", \"%s\"\n", proto, table);
+
+ error = gfs2_mount_lockproto(proto, table, sdp->sd_args.ar_hostdata,
+ gfs2_glock_cb, sdp,
+ GFS2_MIN_LVB_SIZE, flags,
+ &sdp->sd_lockstruct, &sdp->sd_kobj);
+ if (error) {
+ fs_info(sdp, "can't mount proto=%s, table=%s, hostdata=%s\n",
+ proto, table, sdp->sd_args.ar_hostdata);
+ goto out;
+ }
+
+ if (gfs2_assert_warn(sdp, sdp->sd_lockstruct.ls_lockspace) ||
+ gfs2_assert_warn(sdp, sdp->sd_lockstruct.ls_ops) ||
+ gfs2_assert_warn(sdp, sdp->sd_lockstruct.ls_lvb_size >=
+ GFS2_MIN_LVB_SIZE)) {
+ gfs2_unmount_lockproto(&sdp->sd_lockstruct);
+ goto out;
+ }
+
+ if (sdp->sd_args.ar_spectator)
+ snprintf(sdp->sd_fsname, GFS2_FSNAME_LEN, "%s.s", table);
+ else
+ snprintf(sdp->sd_fsname, GFS2_FSNAME_LEN, "%s.%u", table,
+ sdp->sd_lockstruct.ls_jid);
+
+ fs_info(sdp, "Joined cluster. Now mounting FS...\n");
+
+ if ((sdp->sd_lockstruct.ls_flags & LM_LSFLAG_LOCAL) &&
+ !sdp->sd_args.ar_ignore_local_fs) {
+ sdp->sd_args.ar_localflocks = 1;
+ sdp->sd_args.ar_localcaching = 1;
+ }
+
+out:
+ return error;
+}
+
+void gfs2_lm_unmount(struct gfs2_sbd *sdp)
+{
+ if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
+ gfs2_unmount_lockproto(&sdp->sd_lockstruct);
+}
+
+/**
* fill_super - Read in superblock
* @sb: The VFS superblock
* @data: Mount options
@@ -874,7 +943,6 @@ static struct super_block* get_gfs2_sb(const char *dev_name)
{
struct kstat stat;
struct nameidata nd;
- struct file_system_type *fstype;
struct super_block *sb = NULL, *s;
int error;
@@ -886,8 +954,7 @@ static struct super_block* get_gfs2_sb(const char *dev_name)
}
error = vfs_getattr(nd.path.mnt, nd.path.dentry, &stat);
- fstype = get_fs_type("gfs2");
- list_for_each_entry(s, &fstype->fs_supers, s_instances) {
+ list_for_each_entry(s, &gfs2_fs_type.fs_supers, s_instances) {
if ((S_ISBLK(stat.mode) && s->s_dev == stat.rdev) ||
(S_ISDIR(stat.mode) &&
s == nd.path.dentry->d_inode->i_sb)) {
@@ -931,7 +998,6 @@ static int gfs2_get_sb_meta(struct file_system_type *fs_type, int flags,
error = PTR_ERR(new);
goto error;
}
- module_put(fs_type->owner);
new->s_flags = flags;
strlcpy(new->s_id, sb->s_id, sizeof(new->s_id));
sb_set_blocksize(new, sb->s_blocksize);
diff --git a/fs/gfs2/ops_inode.c b/fs/gfs2/ops_inode.c
index e87412902be..2686ad4c002 100644
--- a/fs/gfs2/ops_inode.c
+++ b/fs/gfs2/ops_inode.c
@@ -200,15 +200,15 @@ static int gfs2_link(struct dentry *old_dentry, struct inode *dir,
if (alloc_required) {
struct gfs2_alloc *al = gfs2_alloc_get(dip);
+ if (!al) {
+ error = -ENOMEM;
+ goto out_gunlock;
+ }
- error = gfs2_quota_lock(dip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
+ error = gfs2_quota_lock_check(dip);
if (error)
goto out_alloc;
- error = gfs2_quota_check(dip, dip->i_inode.i_uid, dip->i_inode.i_gid);
- if (error)
- goto out_gunlock_q;
-
al->al_requested = sdp->sd_max_dirres;
error = gfs2_inplace_reserve(dip);
@@ -716,15 +716,15 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry,
if (alloc_required) {
struct gfs2_alloc *al = gfs2_alloc_get(ndip);
+ if (!al) {
+ error = -ENOMEM;
+ goto out_gunlock;
+ }
- error = gfs2_quota_lock(ndip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
+ error = gfs2_quota_lock_check(ndip);
if (error)
goto out_alloc;
- error = gfs2_quota_check(ndip, ndip->i_inode.i_uid, ndip->i_inode.i_gid);
- if (error)
- goto out_gunlock_q;
-
al->al_requested = sdp->sd_max_dirres;
error = gfs2_inplace_reserve(ndip);
@@ -898,7 +898,7 @@ static int gfs2_permission(struct inode *inode, int mask, struct nameidata *nd)
int error;
int unlock = 0;
- if (gfs2_glock_is_locked_by_me(ip->i_gl) == 0) {
+ if (gfs2_glock_is_locked_by_me(ip->i_gl) == NULL) {
error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &i_gh);
if (error)
return error;
@@ -953,7 +953,8 @@ static int setattr_chown(struct inode *inode, struct iattr *attr)
if (!(attr->ia_valid & ATTR_GID) || ogid == ngid)
ogid = ngid = NO_QUOTA_CHANGE;
- gfs2_alloc_get(ip);
+ if (!gfs2_alloc_get(ip))
+ return -ENOMEM;
error = gfs2_quota_lock(ip, nuid, ngid);
if (error)
@@ -981,8 +982,9 @@ static int setattr_chown(struct inode *inode, struct iattr *attr)
brelse(dibh);
if (ouid != NO_QUOTA_CHANGE || ogid != NO_QUOTA_CHANGE) {
- gfs2_quota_change(ip, -ip->i_di.di_blocks, ouid, ogid);
- gfs2_quota_change(ip, ip->i_di.di_blocks, nuid, ngid);
+ u64 blocks = gfs2_get_inode_blocks(&ip->i_inode);
+ gfs2_quota_change(ip, -blocks, ouid, ogid);
+ gfs2_quota_change(ip, blocks, nuid, ngid);
}
out_end_trans:
@@ -1064,7 +1066,7 @@ static int gfs2_getattr(struct vfsmount *mnt, struct dentry *dentry,
int error;
int unlock = 0;
- if (gfs2_glock_is_locked_by_me(ip->i_gl) == 0) {
+ if (gfs2_glock_is_locked_by_me(ip->i_gl) == NULL) {
error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &gh);
if (error)
return error;
@@ -1148,16 +1150,6 @@ const struct inode_operations gfs2_file_iops = {
.removexattr = gfs2_removexattr,
};
-const struct inode_operations gfs2_dev_iops = {
- .permission = gfs2_permission,
- .setattr = gfs2_setattr,
- .getattr = gfs2_getattr,
- .setxattr = gfs2_setxattr,
- .getxattr = gfs2_getxattr,
- .listxattr = gfs2_listxattr,
- .removexattr = gfs2_removexattr,
-};
-
const struct inode_operations gfs2_dir_iops = {
.create = gfs2_create,
.lookup = gfs2_lookup,
diff --git a/fs/gfs2/ops_inode.h b/fs/gfs2/ops_inode.h
index fd8cee231e1..14b4b797622 100644
--- a/fs/gfs2/ops_inode.h
+++ b/fs/gfs2/ops_inode.h
@@ -15,7 +15,6 @@
extern const struct inode_operations gfs2_file_iops;
extern const struct inode_operations gfs2_dir_iops;
extern const struct inode_operations gfs2_symlink_iops;
-extern const struct inode_operations gfs2_dev_iops;
extern const struct file_operations gfs2_file_fops;
extern const struct file_operations gfs2_dir_fops;
extern const struct file_operations gfs2_file_fops_nolock;
diff --git a/fs/gfs2/ops_super.c b/fs/gfs2/ops_super.c
index 5e524217944..2278c68b7e3 100644
--- a/fs/gfs2/ops_super.c
+++ b/fs/gfs2/ops_super.c
@@ -25,7 +25,6 @@
#include "incore.h"
#include "glock.h"
#include "inode.h"
-#include "lm.h"
#include "log.h"
#include "mount.h"
#include "ops_super.h"
diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c
index a08dabd6ce9..56aaf915c59 100644
--- a/fs/gfs2/quota.c
+++ b/fs/gfs2/quota.c
@@ -94,7 +94,7 @@ static int qd_alloc(struct gfs2_sbd *sdp, int user, u32 id,
struct gfs2_quota_data *qd;
int error;
- qd = kzalloc(sizeof(struct gfs2_quota_data), GFP_KERNEL);
+ qd = kzalloc(sizeof(struct gfs2_quota_data), GFP_NOFS);
if (!qd)
return -ENOMEM;
@@ -616,16 +616,9 @@ static int gfs2_adjust_quota(struct gfs2_inode *ip, loff_t loc,
s64 value;
int err = -EIO;
- if (gfs2_is_stuffed(ip)) {
- struct gfs2_alloc *al = NULL;
- al = gfs2_alloc_get(ip);
- /* just request 1 blk */
- al->al_requested = 1;
- gfs2_inplace_reserve(ip);
+ if (gfs2_is_stuffed(ip))
gfs2_unstuff_dinode(ip, NULL);
- gfs2_inplace_release(ip);
- gfs2_alloc_put(ip);
- }
+
page = grab_cache_page(mapping, index);
if (!page)
return -ENOMEM;
@@ -690,14 +683,14 @@ static int do_sync(unsigned int num_qd, struct gfs2_quota_data **qda)
unsigned int qx, x;
struct gfs2_quota_data *qd;
loff_t offset;
- unsigned int nalloc = 0;
+ unsigned int nalloc = 0, blocks;
struct gfs2_alloc *al = NULL;
int error;
gfs2_write_calc_reserv(ip, sizeof(struct gfs2_quota),
&data_blocks, &ind_blocks);
- ghs = kcalloc(num_qd, sizeof(struct gfs2_holder), GFP_KERNEL);
+ ghs = kcalloc(num_qd, sizeof(struct gfs2_holder), GFP_NOFS);
if (!ghs)
return -ENOMEM;
@@ -727,30 +720,33 @@ static int do_sync(unsigned int num_qd, struct gfs2_quota_data **qda)
nalloc++;
}
- if (nalloc) {
- al = gfs2_alloc_get(ip);
+ al = gfs2_alloc_get(ip);
+ if (!al) {
+ error = -ENOMEM;
+ goto out_gunlock;
+ }
+ /*
+ * 1 blk for unstuffing inode if stuffed. We add this extra
+ * block to the reservation unconditionally. If the inode
+ * doesn't need unstuffing, the block will be released to the
+ * rgrp since it won't be allocated during the transaction
+ */
+ al->al_requested = 1;
+ /* +1 in the end for block requested above for unstuffing */
+ blocks = num_qd * data_blocks + RES_DINODE + num_qd + 1;
- al->al_requested = nalloc * (data_blocks + ind_blocks);
+ if (nalloc)
+ al->al_requested += nalloc * (data_blocks + ind_blocks);
+ error = gfs2_inplace_reserve(ip);
+ if (error)
+ goto out_alloc;
- error = gfs2_inplace_reserve(ip);
- if (error)
- goto out_alloc;
-
- error = gfs2_trans_begin(sdp,
- al->al_rgd->rd_length +
- num_qd * data_blocks +
- nalloc * ind_blocks +
- RES_DINODE + num_qd +
- RES_STATFS, 0);
- if (error)
- goto out_ipres;
- } else {
- error = gfs2_trans_begin(sdp,
- num_qd * data_blocks +
- RES_DINODE + num_qd, 0);
- if (error)
- goto out_gunlock;
- }
+ if (nalloc)
+ blocks += al->al_rgd->rd_length + nalloc * ind_blocks + RES_STATFS;
+
+ error = gfs2_trans_begin(sdp, blocks, 0);
+ if (error)
+ goto out_ipres;
for (x = 0; x < num_qd; x++) {
qd = qda[x];
@@ -769,11 +765,9 @@ static int do_sync(unsigned int num_qd, struct gfs2_quota_data **qda)
out_end_trans:
gfs2_trans_end(sdp);
out_ipres:
- if (nalloc)
- gfs2_inplace_release(ip);
+ gfs2_inplace_release(ip);
out_alloc:
- if (nalloc)
- gfs2_alloc_put(ip);
+ gfs2_alloc_put(ip);
out_gunlock:
gfs2_glock_dq_uninit(&i_gh);
out:
@@ -1124,12 +1118,12 @@ int gfs2_quota_init(struct gfs2_sbd *sdp)
error = -ENOMEM;
sdp->sd_quota_bitmap = kcalloc(sdp->sd_quota_chunks,
- sizeof(unsigned char *), GFP_KERNEL);
+ sizeof(unsigned char *), GFP_NOFS);
if (!sdp->sd_quota_bitmap)
return error;
for (x = 0; x < sdp->sd_quota_chunks; x++) {
- sdp->sd_quota_bitmap[x] = kzalloc(PAGE_SIZE, GFP_KERNEL);
+ sdp->sd_quota_bitmap[x] = kzalloc(PAGE_SIZE, GFP_NOFS);
if (!sdp->sd_quota_bitmap[x])
goto fail;
}
diff --git a/fs/gfs2/quota.h b/fs/gfs2/quota.h
index a8be1417051..3b7f4b0e5df 100644
--- a/fs/gfs2/quota.h
+++ b/fs/gfs2/quota.h
@@ -32,4 +32,21 @@ int gfs2_quota_init(struct gfs2_sbd *sdp);
void gfs2_quota_scan(struct gfs2_sbd *sdp);
void gfs2_quota_cleanup(struct gfs2_sbd *sdp);
+static inline int gfs2_quota_lock_check(struct gfs2_inode *ip)
+{
+ struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+ int ret;
+ if (sdp->sd_args.ar_quota == GFS2_QUOTA_OFF)
+ return 0;
+ ret = gfs2_quota_lock(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
+ if (ret)
+ return ret;
+ if (sdp->sd_args.ar_quota != GFS2_QUOTA_ON)
+ return 0;
+ ret = gfs2_quota_check(ip, ip->i_inode.i_uid, ip->i_inode.i_gid);
+ if (ret)
+ gfs2_quota_unlock(ip);
+ return ret;
+}
+
#endif /* __QUOTA_DOT_H__ */
diff --git a/fs/gfs2/recovery.c b/fs/gfs2/recovery.c
index 6fb07d67ca8..2888e4b4b1c 100644
--- a/fs/gfs2/recovery.c
+++ b/fs/gfs2/recovery.c
@@ -20,7 +20,6 @@
#include "bmap.h"
#include "glock.h"
#include "glops.h"
-#include "lm.h"
#include "lops.h"
#include "meta_io.h"
#include "recovery.h"
@@ -69,7 +68,7 @@ int gfs2_revoke_add(struct gfs2_sbd *sdp, u64 blkno, unsigned int where)
return 0;
}
- rr = kmalloc(sizeof(struct gfs2_revoke_replay), GFP_KERNEL);
+ rr = kmalloc(sizeof(struct gfs2_revoke_replay), GFP_NOFS);
if (!rr)
return -ENOMEM;
@@ -150,7 +149,7 @@ static int get_log_header(struct gfs2_jdesc *jd, unsigned int blk,
struct gfs2_log_header_host *head)
{
struct buffer_head *bh;
- struct gfs2_log_header_host lh;
+ struct gfs2_log_header_host uninitialized_var(lh);
const u32 nothing = 0;
u32 hash;
int error;
@@ -425,6 +424,16 @@ static int clean_journal(struct gfs2_jdesc *jd, struct gfs2_log_header_host *hea
return error;
}
+
+static void gfs2_lm_recovery_done(struct gfs2_sbd *sdp, unsigned int jid,
+ unsigned int message)
+{
+ if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
+ sdp->sd_lockstruct.ls_ops->lm_recovery_done(
+ sdp->sd_lockstruct.ls_lockspace, jid, message);
+}
+
+
/**
* gfs2_recover_journal - recovery a given journal
* @jd: the struct gfs2_jdesc describing the journal
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index 3552110b2e5..7e8f0b1d6c6 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -1,6 +1,6 @@
/*
* Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
- * Copyright (C) 2004-2007 Red Hat, Inc. All rights reserved.
+ * Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved.
*
* This copyrighted material is made available to anyone wishing to use,
* modify, copy, or redistribute it subject to the terms and conditions
@@ -14,6 +14,7 @@
#include <linux/fs.h>
#include <linux/gfs2_ondisk.h>
#include <linux/lm_interface.h>
+#include <linux/prefetch.h>
#include "gfs2.h"
#include "incore.h"
@@ -33,6 +34,16 @@
#define BFITNOENT ((u32)~0)
#define NO_BLOCK ((u64)~0)
+#if BITS_PER_LONG == 32
+#define LBITMASK (0x55555555UL)
+#define LBITSKIP55 (0x55555555UL)
+#define LBITSKIP00 (0x00000000UL)
+#else
+#define LBITMASK (0x5555555555555555UL)
+#define LBITSKIP55 (0x5555555555555555UL)
+#define LBITSKIP00 (0x0000000000000000UL)
+#endif
+
/*
* These routines are used by the resource group routines (rgrp.c)
* to keep track of block allocation. Each block is represented by two
@@ -53,7 +64,8 @@ static const char valid_change[16] = {
};
static u32 rgblk_search(struct gfs2_rgrpd *rgd, u32 goal,
- unsigned char old_state, unsigned char new_state);
+ unsigned char old_state, unsigned char new_state,
+ unsigned int *n);
/**
* gfs2_setbit - Set a bit in the bitmaps
@@ -64,26 +76,32 @@ static u32 rgblk_search(struct gfs2_rgrpd *rgd, u32 goal,
*
*/
-static void gfs2_setbit(struct gfs2_rgrpd *rgd, unsigned char *buffer,
- unsigned int buflen, u32 block,
- unsigned char new_state)
+static inline void gfs2_setbit(struct gfs2_rgrpd *rgd, unsigned char *buf1,
+ unsigned char *buf2, unsigned int offset,
+ unsigned int buflen, u32 block,
+ unsigned char new_state)
{
- unsigned char *byte, *end, cur_state;
- unsigned int bit;
+ unsigned char *byte1, *byte2, *end, cur_state;
+ const unsigned int bit = (block % GFS2_NBBY) * GFS2_BIT_SIZE;
- byte = buffer + (block / GFS2_NBBY);
- bit = (block % GFS2_NBBY) * GFS2_BIT_SIZE;
- end = buffer + buflen;
+ byte1 = buf1 + offset + (block / GFS2_NBBY);
+ end = buf1 + offset + buflen;
- gfs2_assert(rgd->rd_sbd, byte < end);
+ BUG_ON(byte1 >= end);
- cur_state = (*byte >> bit) & GFS2_BIT_MASK;
+ cur_state = (*byte1 >> bit) & GFS2_BIT_MASK;
- if (valid_change[new_state * 4 + cur_state]) {
- *byte ^= cur_state << bit;
- *byte |= new_state << bit;
- } else
+ if (unlikely(!valid_change[new_state * 4 + cur_state])) {
gfs2_consist_rgrpd(rgd);
+ return;
+ }
+ *byte1 ^= (cur_state ^ new_state) << bit;
+
+ if (buf2) {
+ byte2 = buf2 + offset + (block / GFS2_NBBY);
+ cur_state = (*byte2 >> bit) & GFS2_BIT_MASK;
+ *byte2 ^= (cur_state ^ new_state) << bit;
+ }
}
/**
@@ -94,10 +112,12 @@ static void gfs2_setbit(struct gfs2_rgrpd *rgd, unsigned char *buffer,
*
*/
-static unsigned char gfs2_testbit(struct gfs2_rgrpd *rgd, unsigned char *buffer,
- unsigned int buflen, u32 block)
+static inline unsigned char gfs2_testbit(struct gfs2_rgrpd *rgd,
+ const unsigned char *buffer,
+ unsigned int buflen, u32 block)
{
- unsigned char *byte, *end, cur_state;
+ const unsigned char *byte, *end;
+ unsigned char cur_state;
unsigned int bit;
byte = buffer + (block / GFS2_NBBY);
@@ -126,47 +146,66 @@ static unsigned char gfs2_testbit(struct gfs2_rgrpd *rgd, unsigned char *buffer,
* Return: the block number (bitmap buffer scope) that was found
*/
-static u32 gfs2_bitfit(unsigned char *buffer, unsigned int buflen, u32 goal,
- unsigned char old_state)
+static u32 gfs2_bitfit(const u8 *buffer, unsigned int buflen, u32 goal,
+ u8 old_state)
{
- unsigned char *byte;
- u32 blk = goal;
- unsigned int bit, bitlong;
- unsigned long *plong, plong55;
-
- byte = buffer + (goal / GFS2_NBBY);
- plong = (unsigned long *)(buffer + (goal / GFS2_NBBY));
- bit = (goal % GFS2_NBBY) * GFS2_BIT_SIZE;
- bitlong = bit;
-#if BITS_PER_LONG == 32
- plong55 = 0x55555555;
-#else
- plong55 = 0x5555555555555555;
-#endif
- while (byte < buffer + buflen) {
-
- if (bitlong == 0 && old_state == 0 && *plong == plong55) {
- plong++;
- byte += sizeof(unsigned long);
- blk += sizeof(unsigned long) * GFS2_NBBY;
- continue;
+ const u8 *byte, *start, *end;
+ int bit, startbit;
+ u32 g1, g2, misaligned;
+ unsigned long *plong;
+ unsigned long lskipval;
+
+ lskipval = (old_state & GFS2_BLKST_USED) ? LBITSKIP00 : LBITSKIP55;
+ g1 = (goal / GFS2_NBBY);
+ start = buffer + g1;
+ byte = start;
+ end = buffer + buflen;
+ g2 = ALIGN(g1, sizeof(unsigned long));
+ plong = (unsigned long *)(buffer + g2);
+ startbit = bit = (goal % GFS2_NBBY) * GFS2_BIT_SIZE;
+ misaligned = g2 - g1;
+ if (!misaligned)
+ goto ulong_aligned;
+/* parse the bitmap a byte at a time */
+misaligned:
+ while (byte < end) {
+ if (((*byte >> bit) & GFS2_BIT_MASK) == old_state) {
+ return goal +
+ (((byte - start) * GFS2_NBBY) +
+ ((bit - startbit) >> 1));
}
- if (((*byte >> bit) & GFS2_BIT_MASK) == old_state)
- return blk;
bit += GFS2_BIT_SIZE;
- if (bit >= 8) {
+ if (bit >= GFS2_NBBY * GFS2_BIT_SIZE) {
bit = 0;
byte++;
+ misaligned--;
+ if (!misaligned) {
+ plong = (unsigned long *)byte;
+ goto ulong_aligned;
+ }
}
- bitlong += GFS2_BIT_SIZE;
- if (bitlong >= sizeof(unsigned long) * 8) {
- bitlong = 0;
- plong++;
- }
-
- blk++;
}
+ return BFITNOENT;
+/* parse the bitmap an unsigned long at a time */
+ulong_aligned:
+ /* Stop at "end - 1" or else prefetch can go past the end and segfault.
+ We could "if" it but we'd lose some of the performance gained.
+ This way will only slow down searching the very last 4/8 bytes
+ depending on architecture. I've experimented with several ways
+ of writing this section such as using an else before the goto
+ but this one seems to be the fastest. */
+ while ((unsigned char *)plong < end - 1) {
+ prefetch(plong + 1);
+ if (((*plong) & LBITMASK) != lskipval)
+ break;
+ plong++;
+ }
+ if ((unsigned char *)plong < end) {
+ byte = (const u8 *)plong;
+ misaligned += sizeof(unsigned long) - 1;
+ goto misaligned;
+ }
return BFITNOENT;
}
@@ -179,14 +218,14 @@ static u32 gfs2_bitfit(unsigned char *buffer, unsigned int buflen, u32 goal,
* Returns: The number of bits
*/
-static u32 gfs2_bitcount(struct gfs2_rgrpd *rgd, unsigned char *buffer,
- unsigned int buflen, unsigned char state)
+static u32 gfs2_bitcount(struct gfs2_rgrpd *rgd, const u8 *buffer,
+ unsigned int buflen, u8 state)
{
- unsigned char *byte = buffer;
- unsigned char *end = buffer + buflen;
- unsigned char state1 = state << 2;
- unsigned char state2 = state << 4;
- unsigned char state3 = state << 6;
+ const u8 *byte = buffer;
+ const u8 *end = buffer + buflen;
+ const u8 state1 = state << 2;
+ const u8 state2 = state << 4;
+ const u8 state3 = state << 6;
u32 count = 0;
for (; byte < end; byte++) {
@@ -353,7 +392,7 @@ static void clear_rgrpdi(struct gfs2_sbd *sdp)
}
kfree(rgd->rd_bits);
- kfree(rgd);
+ kmem_cache_free(gfs2_rgrpd_cachep, rgd);
}
}
@@ -516,7 +555,7 @@ static int read_rindex_entry(struct gfs2_inode *ip,
return error;
}
- rgd = kzalloc(sizeof(struct gfs2_rgrpd), GFP_NOFS);
+ rgd = kmem_cache_zalloc(gfs2_rgrpd_cachep, GFP_NOFS);
error = -ENOMEM;
if (!rgd)
return error;
@@ -539,7 +578,7 @@ static int read_rindex_entry(struct gfs2_inode *ip,
return error;
rgd->rd_gl->gl_object = rgd;
- rgd->rd_rg_vn = rgd->rd_gl->gl_vn - 1;
+ rgd->rd_flags &= ~GFS2_RDF_UPTODATE;
rgd->rd_flags |= GFS2_RDF_CHECK;
return error;
}
@@ -575,7 +614,7 @@ static int gfs2_ri_update(struct gfs2_inode *ip)
}
}
- sdp->sd_rindex_vn = ip->i_gl->gl_vn;
+ sdp->sd_rindex_uptodate = 1;
return 0;
}
@@ -609,7 +648,7 @@ static int gfs2_ri_update_special(struct gfs2_inode *ip)
}
}
- sdp->sd_rindex_vn = ip->i_gl->gl_vn;
+ sdp->sd_rindex_uptodate = 1;
return 0;
}
@@ -642,9 +681,9 @@ int gfs2_rindex_hold(struct gfs2_sbd *sdp, struct gfs2_holder *ri_gh)
return error;
/* Read new copy from disk if we don't have the latest */
- if (sdp->sd_rindex_vn != gl->gl_vn) {
+ if (!sdp->sd_rindex_uptodate) {
mutex_lock(&sdp->sd_rindex_mutex);
- if (sdp->sd_rindex_vn != gl->gl_vn) {
+ if (!sdp->sd_rindex_uptodate) {
error = gfs2_ri_update(ip);
if (error)
gfs2_glock_dq_uninit(ri_gh);
@@ -655,21 +694,31 @@ int gfs2_rindex_hold(struct gfs2_sbd *sdp, struct gfs2_holder *ri_gh)
return error;
}
-static void gfs2_rgrp_in(struct gfs2_rgrp_host *rg, const void *buf)
+static void gfs2_rgrp_in(struct gfs2_rgrpd *rgd, const void *buf)
{
const struct gfs2_rgrp *str = buf;
+ struct gfs2_rgrp_host *rg = &rgd->rd_rg;
+ u32 rg_flags;
- rg->rg_flags = be32_to_cpu(str->rg_flags);
+ rg_flags = be32_to_cpu(str->rg_flags);
+ if (rg_flags & GFS2_RGF_NOALLOC)
+ rgd->rd_flags |= GFS2_RDF_NOALLOC;
+ else
+ rgd->rd_flags &= ~GFS2_RDF_NOALLOC;
rg->rg_free = be32_to_cpu(str->rg_free);
rg->rg_dinodes = be32_to_cpu(str->rg_dinodes);
rg->rg_igeneration = be64_to_cpu(str->rg_igeneration);
}
-static void gfs2_rgrp_out(const struct gfs2_rgrp_host *rg, void *buf)
+static void gfs2_rgrp_out(struct gfs2_rgrpd *rgd, void *buf)
{
struct gfs2_rgrp *str = buf;
+ struct gfs2_rgrp_host *rg = &rgd->rd_rg;
+ u32 rg_flags = 0;
- str->rg_flags = cpu_to_be32(rg->rg_flags);
+ if (rgd->rd_flags & GFS2_RDF_NOALLOC)
+ rg_flags |= GFS2_RGF_NOALLOC;
+ str->rg_flags = cpu_to_be32(rg_flags);
str->rg_free = cpu_to_be32(rg->rg_free);
str->rg_dinodes = cpu_to_be32(rg->rg_dinodes);
str->__pad = cpu_to_be32(0);
@@ -726,9 +775,9 @@ int gfs2_rgrp_bh_get(struct gfs2_rgrpd *rgd)
}
}
- if (rgd->rd_rg_vn != gl->gl_vn) {
- gfs2_rgrp_in(&rgd->rd_rg, (rgd->rd_bits[0].bi_bh)->b_data);
- rgd->rd_rg_vn = gl->gl_vn;
+ if (!(rgd->rd_flags & GFS2_RDF_UPTODATE)) {
+ gfs2_rgrp_in(rgd, (rgd->rd_bits[0].bi_bh)->b_data);
+ rgd->rd_flags |= GFS2_RDF_UPTODATE;
}
spin_lock(&sdp->sd_rindex_spin);
@@ -840,7 +889,7 @@ static int try_rgrp_fit(struct gfs2_rgrpd *rgd, struct gfs2_alloc *al)
struct gfs2_sbd *sdp = rgd->rd_sbd;
int ret = 0;
- if (rgd->rd_rg.rg_flags & GFS2_RGF_NOALLOC)
+ if (rgd->rd_flags & GFS2_RDF_NOALLOC)
return 0;
spin_lock(&sdp->sd_rindex_spin);
@@ -866,13 +915,15 @@ static struct inode *try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked)
u32 goal = 0, block;
u64 no_addr;
struct gfs2_sbd *sdp = rgd->rd_sbd;
+ unsigned int n;
for(;;) {
if (goal >= rgd->rd_data)
break;
down_write(&sdp->sd_log_flush_lock);
+ n = 1;
block = rgblk_search(rgd, goal, GFS2_BLKST_UNLINKED,
- GFS2_BLKST_UNLINKED);
+ GFS2_BLKST_UNLINKED, &n);
up_write(&sdp->sd_log_flush_lock);
if (block == BFITNOENT)
break;
@@ -904,24 +955,20 @@ static struct inode *try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked)
static struct gfs2_rgrpd *recent_rgrp_first(struct gfs2_sbd *sdp,
u64 rglast)
{
- struct gfs2_rgrpd *rgd = NULL;
+ struct gfs2_rgrpd *rgd;
spin_lock(&sdp->sd_rindex_spin);
- if (list_empty(&sdp->sd_rindex_recent_list))
- goto out;
-
- if (!rglast)
- goto first;
-
- list_for_each_entry(rgd, &sdp->sd_rindex_recent_list, rd_recent) {
- if (rgd->rd_addr == rglast)
- goto out;
+ if (rglast) {
+ list_for_each_entry(rgd, &sdp->sd_rindex_recent_list, rd_recent) {
+ if (rgrp_contains_block(rgd, rglast))
+ goto out;
+ }
}
-
-first:
- rgd = list_entry(sdp->sd_rindex_recent_list.next, struct gfs2_rgrpd,
- rd_recent);
+ rgd = NULL;
+ if (!list_empty(&sdp->sd_rindex_recent_list))
+ rgd = list_entry(sdp->sd_rindex_recent_list.next,
+ struct gfs2_rgrpd, rd_recent);
out:
spin_unlock(&sdp->sd_rindex_spin);
return rgd;
@@ -1067,7 +1114,7 @@ static struct inode *get_local_rgrp(struct gfs2_inode *ip, u64 *last_unlinked)
/* Try recently successful rgrps */
- rgd = recent_rgrp_first(sdp, ip->i_last_rg_alloc);
+ rgd = recent_rgrp_first(sdp, ip->i_goal);
while (rgd) {
rg_locked = 0;
@@ -1151,8 +1198,6 @@ static struct inode *get_local_rgrp(struct gfs2_inode *ip, u64 *last_unlinked)
}
out:
- ip->i_last_rg_alloc = rgd->rd_addr;
-
if (begin) {
recent_rgrp_add(rgd);
rgd = gfs2_rgrpd_get_next(rgd);
@@ -1275,6 +1320,7 @@ unsigned char gfs2_get_block_type(struct gfs2_rgrpd *rgd, u64 block)
* @goal: the goal block within the RG (start here to search for avail block)
* @old_state: GFS2_BLKST_XXX the before-allocation state to find
* @new_state: GFS2_BLKST_XXX the after-allocation block state
+ * @n: The extent length
*
* Walk rgrp's bitmap to find bits that represent a block in @old_state.
* Add the found bitmap buffer to the transaction.
@@ -1290,13 +1336,17 @@ unsigned char gfs2_get_block_type(struct gfs2_rgrpd *rgd, u64 block)
*/
static u32 rgblk_search(struct gfs2_rgrpd *rgd, u32 goal,
- unsigned char old_state, unsigned char new_state)
+ unsigned char old_state, unsigned char new_state,
+ unsigned int *n)
{
struct gfs2_bitmap *bi = NULL;
- u32 length = rgd->rd_length;
+ const u32 length = rgd->rd_length;
u32 blk = 0;
unsigned int buf, x;
+ const unsigned int elen = *n;
+ const u8 *buffer;
+ *n = 0;
/* Find bitmap block that contains bits for goal block */
for (buf = 0; buf < length; buf++) {
bi = rgd->rd_bits + buf;
@@ -1317,12 +1367,11 @@ static u32 rgblk_search(struct gfs2_rgrpd *rgd, u32 goal,
for (x = 0; x <= length; x++) {
/* The GFS2_BLKST_UNLINKED state doesn't apply to the clone
bitmaps, so we must search the originals for that. */
+ buffer = bi->bi_bh->b_data + bi->bi_offset;
if (old_state != GFS2_BLKST_UNLINKED && bi->bi_clone)
- blk = gfs2_bitfit(bi->bi_clone + bi->bi_offset,
- bi->bi_len, goal, old_state);
- else
- blk = gfs2_bitfit(bi->bi_bh->b_data + bi->bi_offset,
- bi->bi_len, goal, old_state);
+ buffer = bi->bi_clone + bi->bi_offset;
+
+ blk = gfs2_bitfit(buffer, bi->bi_len, goal, old_state);
if (blk != BFITNOENT)
break;
@@ -1333,12 +1382,23 @@ static u32 rgblk_search(struct gfs2_rgrpd *rgd, u32 goal,
}
if (blk != BFITNOENT && old_state != new_state) {
+ *n = 1;
gfs2_trans_add_bh(rgd->rd_gl, bi->bi_bh, 1);
- gfs2_setbit(rgd, bi->bi_bh->b_data + bi->bi_offset,
+ gfs2_setbit(rgd, bi->bi_bh->b_data, bi->bi_clone, bi->bi_offset,
bi->bi_len, blk, new_state);
- if (bi->bi_clone)
- gfs2_setbit(rgd, bi->bi_clone + bi->bi_offset,
- bi->bi_len, blk, new_state);
+ goal = blk;
+ while (*n < elen) {
+ goal++;
+ if (goal >= (bi->bi_len * GFS2_NBBY))
+ break;
+ if (gfs2_testbit(rgd, buffer, bi->bi_len, goal) !=
+ GFS2_BLKST_FREE)
+ break;
+ gfs2_setbit(rgd, bi->bi_bh->b_data, bi->bi_clone,
+ bi->bi_offset, bi->bi_len, goal,
+ new_state);
+ (*n)++;
+ }
}
return (blk == BFITNOENT) ? blk : (bi->bi_start * GFS2_NBBY) + blk;
@@ -1393,7 +1453,7 @@ static struct gfs2_rgrpd *rgblk_free(struct gfs2_sbd *sdp, u64 bstart,
bi->bi_len);
}
gfs2_trans_add_bh(rgd->rd_gl, bi->bi_bh, 1);
- gfs2_setbit(rgd, bi->bi_bh->b_data + bi->bi_offset,
+ gfs2_setbit(rgd, bi->bi_bh->b_data, NULL, bi->bi_offset,
bi->bi_len, buf_blk, new_state);
}
@@ -1401,13 +1461,13 @@ static struct gfs2_rgrpd *rgblk_free(struct gfs2_sbd *sdp, u64 bstart,
}
/**
- * gfs2_alloc_data - Allocate a data block
- * @ip: the inode to allocate the data block for
+ * gfs2_alloc_block - Allocate a block
+ * @ip: the inode to allocate the block for
*
* Returns: the allocated block
*/
-u64 gfs2_alloc_data(struct gfs2_inode *ip)
+u64 gfs2_alloc_block(struct gfs2_inode *ip, unsigned int *n)
{
struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
struct gfs2_alloc *al = ip->i_alloc;
@@ -1415,77 +1475,31 @@ u64 gfs2_alloc_data(struct gfs2_inode *ip)
u32 goal, blk;
u64 block;
- if (rgrp_contains_block(rgd, ip->i_di.di_goal_data))
- goal = ip->i_di.di_goal_data - rgd->rd_data0;
+ if (rgrp_contains_block(rgd, ip->i_goal))
+ goal = ip->i_goal - rgd->rd_data0;
else
- goal = rgd->rd_last_alloc_data;
+ goal = rgd->rd_last_alloc;
- blk = rgblk_search(rgd, goal, GFS2_BLKST_FREE, GFS2_BLKST_USED);
+ blk = rgblk_search(rgd, goal, GFS2_BLKST_FREE, GFS2_BLKST_USED, n);
BUG_ON(blk == BFITNOENT);
- rgd->rd_last_alloc_data = blk;
+ rgd->rd_last_alloc = blk;
block = rgd->rd_data0 + blk;
- ip->i_di.di_goal_data = block;
+ ip->i_goal = block;
- gfs2_assert_withdraw(sdp, rgd->rd_rg.rg_free);
- rgd->rd_rg.rg_free--;
+ gfs2_assert_withdraw(sdp, rgd->rd_rg.rg_free >= *n);
+ rgd->rd_rg.rg_free -= *n;
gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1);
- gfs2_rgrp_out(&rgd->rd_rg, rgd->rd_bits[0].bi_bh->b_data);
+ gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data);
- al->al_alloced++;
+ al->al_alloced += *n;
- gfs2_statfs_change(sdp, 0, -1, 0);
- gfs2_quota_change(ip, +1, ip->i_inode.i_uid, ip->i_inode.i_gid);
+ gfs2_statfs_change(sdp, 0, -*n, 0);
+ gfs2_quota_change(ip, *n, ip->i_inode.i_uid, ip->i_inode.i_gid);
spin_lock(&sdp->sd_rindex_spin);
- rgd->rd_free_clone--;
- spin_unlock(&sdp->sd_rindex_spin);
-
- return block;
-}
-
-/**
- * gfs2_alloc_meta - Allocate a metadata block
- * @ip: the inode to allocate the metadata block for
- *
- * Returns: the allocated block
- */
-
-u64 gfs2_alloc_meta(struct gfs2_inode *ip)
-{
- struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
- struct gfs2_alloc *al = ip->i_alloc;
- struct gfs2_rgrpd *rgd = al->al_rgd;
- u32 goal, blk;
- u64 block;
-
- if (rgrp_contains_block(rgd, ip->i_di.di_goal_meta))
- goal = ip->i_di.di_goal_meta - rgd->rd_data0;
- else
- goal = rgd->rd_last_alloc_meta;
-
- blk = rgblk_search(rgd, goal, GFS2_BLKST_FREE, GFS2_BLKST_USED);
- BUG_ON(blk == BFITNOENT);
- rgd->rd_last_alloc_meta = blk;
-
- block = rgd->rd_data0 + blk;
- ip->i_di.di_goal_meta = block;
-
- gfs2_assert_withdraw(sdp, rgd->rd_rg.rg_free);
- rgd->rd_rg.rg_free--;
-
- gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1);
- gfs2_rgrp_out(&rgd->rd_rg, rgd->rd_bits[0].bi_bh->b_data);
-
- al->al_alloced++;
-
- gfs2_statfs_change(sdp, 0, -1, 0);
- gfs2_quota_change(ip, +1, ip->i_inode.i_uid, ip->i_inode.i_gid);
- gfs2_trans_add_unrevoke(sdp, block);
-
- spin_lock(&sdp->sd_rindex_spin);
- rgd->rd_free_clone--;
+ rgd->rd_free_clone -= *n;
spin_unlock(&sdp->sd_rindex_spin);
return block;
@@ -1505,12 +1519,13 @@ u64 gfs2_alloc_di(struct gfs2_inode *dip, u64 *generation)
struct gfs2_rgrpd *rgd = al->al_rgd;
u32 blk;
u64 block;
+ unsigned int n = 1;
- blk = rgblk_search(rgd, rgd->rd_last_alloc_meta,
- GFS2_BLKST_FREE, GFS2_BLKST_DINODE);
+ blk = rgblk_search(rgd, rgd->rd_last_alloc,
+ GFS2_BLKST_FREE, GFS2_BLKST_DINODE, &n);
BUG_ON(blk == BFITNOENT);
- rgd->rd_last_alloc_meta = blk;
+ rgd->rd_last_alloc = blk;
block = rgd->rd_data0 + blk;
@@ -1519,12 +1534,12 @@ u64 gfs2_alloc_di(struct gfs2_inode *dip, u64 *generation)
rgd->rd_rg.rg_dinodes++;
*generation = rgd->rd_rg.rg_igeneration++;
gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1);
- gfs2_rgrp_out(&rgd->rd_rg, rgd->rd_bits[0].bi_bh->b_data);
+ gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data);
al->al_alloced++;
gfs2_statfs_change(sdp, 0, -1, +1);
- gfs2_trans_add_unrevoke(sdp, block);
+ gfs2_trans_add_unrevoke(sdp, block, 1);
spin_lock(&sdp->sd_rindex_spin);
rgd->rd_free_clone--;
@@ -1553,7 +1568,7 @@ void gfs2_free_data(struct gfs2_inode *ip, u64 bstart, u32 blen)
rgd->rd_rg.rg_free += blen;
gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1);
- gfs2_rgrp_out(&rgd->rd_rg, rgd->rd_bits[0].bi_bh->b_data);
+ gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data);
gfs2_trans_add_rg(rgd);
@@ -1581,7 +1596,7 @@ void gfs2_free_meta(struct gfs2_inode *ip, u64 bstart, u32 blen)
rgd->rd_rg.rg_free += blen;
gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1);
- gfs2_rgrp_out(&rgd->rd_rg, rgd->rd_bits[0].bi_bh->b_data);
+ gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data);
gfs2_trans_add_rg(rgd);
@@ -1601,7 +1616,7 @@ void gfs2_unlink_di(struct inode *inode)
if (!rgd)
return;
gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1);
- gfs2_rgrp_out(&rgd->rd_rg, rgd->rd_bits[0].bi_bh->b_data);
+ gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data);
gfs2_trans_add_rg(rgd);
}
@@ -1621,7 +1636,7 @@ static void gfs2_free_uninit_di(struct gfs2_rgrpd *rgd, u64 blkno)
rgd->rd_rg.rg_free++;
gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1);
- gfs2_rgrp_out(&rgd->rd_rg, rgd->rd_bits[0].bi_bh->b_data);
+ gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data);
gfs2_statfs_change(sdp, 0, +1, -1);
gfs2_trans_add_rg(rgd);
@@ -1699,8 +1714,7 @@ void gfs2_rlist_add(struct gfs2_sbd *sdp, struct gfs2_rgrp_list *rlist,
*
*/
-void gfs2_rlist_alloc(struct gfs2_rgrp_list *rlist, unsigned int state,
- int flags)
+void gfs2_rlist_alloc(struct gfs2_rgrp_list *rlist, unsigned int state)
{
unsigned int x;
@@ -1708,7 +1722,7 @@ void gfs2_rlist_alloc(struct gfs2_rgrp_list *rlist, unsigned int state,
GFP_NOFS | __GFP_NOFAIL);
for (x = 0; x < rlist->rl_rgrps; x++)
gfs2_holder_init(rlist->rl_rgd[x]->rd_gl,
- state, flags,
+ state, 0,
&rlist->rl_ghs[x]);
}
diff --git a/fs/gfs2/rgrp.h b/fs/gfs2/rgrp.h
index 149bb161f4b..3181c7e624b 100644
--- a/fs/gfs2/rgrp.h
+++ b/fs/gfs2/rgrp.h
@@ -1,6 +1,6 @@
/*
* Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
- * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
+ * Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved.
*
* This copyrighted material is made available to anyone wishing to use,
* modify, copy, or redistribute it subject to the terms and conditions
@@ -46,8 +46,7 @@ void gfs2_inplace_release(struct gfs2_inode *ip);
unsigned char gfs2_get_block_type(struct gfs2_rgrpd *rgd, u64 block);
-u64 gfs2_alloc_data(struct gfs2_inode *ip);
-u64 gfs2_alloc_meta(struct gfs2_inode *ip);
+u64 gfs2_alloc_block(struct gfs2_inode *ip, unsigned int *n);
u64 gfs2_alloc_di(struct gfs2_inode *ip, u64 *generation);
void gfs2_free_data(struct gfs2_inode *ip, u64 bstart, u32 blen);
@@ -64,8 +63,7 @@ struct gfs2_rgrp_list {
void gfs2_rlist_add(struct gfs2_sbd *sdp, struct gfs2_rgrp_list *rlist,
u64 block);
-void gfs2_rlist_alloc(struct gfs2_rgrp_list *rlist, unsigned int state,
- int flags);
+void gfs2_rlist_alloc(struct gfs2_rgrp_list *rlist, unsigned int state);
void gfs2_rlist_free(struct gfs2_rgrp_list *rlist);
u64 gfs2_ri_total(struct gfs2_sbd *sdp);
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index ef0562c3bc7..7aeacbc65f3 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -210,7 +210,7 @@ int gfs2_read_super(struct gfs2_sbd *sdp, sector_t sector)
struct page *page;
struct bio *bio;
- page = alloc_page(GFP_KERNEL);
+ page = alloc_page(GFP_NOFS);
if (unlikely(!page))
return -ENOBUFS;
@@ -218,7 +218,7 @@ int gfs2_read_super(struct gfs2_sbd *sdp, sector_t sector)
ClearPageDirty(page);
lock_page(page);
- bio = bio_alloc(GFP_KERNEL, 1);
+ bio = bio_alloc(GFP_NOFS, 1);
if (unlikely(!bio)) {
__free_page(page);
return -ENOBUFS;
@@ -316,6 +316,7 @@ int gfs2_read_sb(struct gfs2_sbd *sdp, struct gfs2_glock *gl, int silent)
sdp->sd_heightsize[x] = space;
}
sdp->sd_max_height = x;
+ sdp->sd_heightsize[x] = ~0;
gfs2_assert(sdp, sdp->sd_max_height <= GFS2_MAX_META_HEIGHT);
sdp->sd_jheightsize[0] = sdp->sd_sb.sb_bsize -
@@ -334,6 +335,7 @@ int gfs2_read_sb(struct gfs2_sbd *sdp, struct gfs2_glock *gl, int silent)
sdp->sd_jheightsize[x] = space;
}
sdp->sd_max_jheight = x;
+ sdp->sd_jheightsize[x] = ~0;
gfs2_assert(sdp, sdp->sd_max_jheight <= GFS2_MAX_META_HEIGHT);
return 0;
diff --git a/fs/gfs2/super.h b/fs/gfs2/super.h
index 60a870e430b..44361ecc44f 100644
--- a/fs/gfs2/super.h
+++ b/fs/gfs2/super.h
@@ -17,6 +17,7 @@ void gfs2_tune_init(struct gfs2_tune *gt);
int gfs2_check_sb(struct gfs2_sbd *sdp, struct gfs2_sb_host *sb, int silent);
int gfs2_read_sb(struct gfs2_sbd *sdp, struct gfs2_glock *gl, int silent);
int gfs2_read_super(struct gfs2_sbd *sdp, sector_t sector);
+void gfs2_lm_unmount(struct gfs2_sbd *sdp);
static inline unsigned int gfs2_jindex_size(struct gfs2_sbd *sdp)
{
diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c
index eaa3b7b2f99..9ab9fc85ecd 100644
--- a/fs/gfs2/sys.c
+++ b/fs/gfs2/sys.c
@@ -20,7 +20,6 @@
#include "gfs2.h"
#include "incore.h"
-#include "lm.h"
#include "sys.h"
#include "super.h"
#include "glock.h"
@@ -328,15 +327,9 @@ static ssize_t name##_show(struct gfs2_sbd *sdp, char *buf) \
} \
static struct counters_attr counters_attr_##name = __ATTR_RO(name)
-COUNTERS_ATTR(glock_count, "%u\n");
-COUNTERS_ATTR(glock_held_count, "%u\n");
-COUNTERS_ATTR(inode_count, "%u\n");
COUNTERS_ATTR(reclaimed, "%u\n");
static struct attribute *counters_attrs[] = {
- &counters_attr_glock_count.attr,
- &counters_attr_glock_held_count.attr,
- &counters_attr_inode_count.attr,
&counters_attr_reclaimed.attr,
NULL,
};
diff --git a/fs/gfs2/trans.c b/fs/gfs2/trans.c
index 73e5d92a657..f677b8a83f0 100644
--- a/fs/gfs2/trans.c
+++ b/fs/gfs2/trans.c
@@ -146,30 +146,25 @@ void gfs2_trans_add_revoke(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd)
lops_add(sdp, &bd->bd_le);
}
-void gfs2_trans_add_unrevoke(struct gfs2_sbd *sdp, u64 blkno)
+void gfs2_trans_add_unrevoke(struct gfs2_sbd *sdp, u64 blkno, unsigned int len)
{
- struct gfs2_bufdata *bd;
- int found = 0;
+ struct gfs2_bufdata *bd, *tmp;
+ struct gfs2_trans *tr = current->journal_info;
+ unsigned int n = len;
gfs2_log_lock(sdp);
-
- list_for_each_entry(bd, &sdp->sd_log_le_revoke, bd_le.le_list) {
- if (bd->bd_blkno == blkno) {
+ list_for_each_entry_safe(bd, tmp, &sdp->sd_log_le_revoke, bd_le.le_list) {
+ if ((bd->bd_blkno >= blkno) && (bd->bd_blkno < (blkno + len))) {
list_del_init(&bd->bd_le.le_list);
gfs2_assert_withdraw(sdp, sdp->sd_log_num_revoke);
sdp->sd_log_num_revoke--;
- found = 1;
- break;
+ kmem_cache_free(gfs2_bufdata_cachep, bd);
+ tr->tr_num_revoke_rm++;
+ if (--n == 0)
+ break;
}
}
-
gfs2_log_unlock(sdp);
-
- if (found) {
- struct gfs2_trans *tr = current->journal_info;
- kmem_cache_free(gfs2_bufdata_cachep, bd);
- tr->tr_num_revoke_rm++;
- }
}
void gfs2_trans_add_rg(struct gfs2_rgrpd *rgd)
diff --git a/fs/gfs2/trans.h b/fs/gfs2/trans.h
index e826f0dab80..edf9d4bd908 100644
--- a/fs/gfs2/trans.h
+++ b/fs/gfs2/trans.h
@@ -32,7 +32,7 @@ void gfs2_trans_end(struct gfs2_sbd *sdp);
void gfs2_trans_add_bh(struct gfs2_glock *gl, struct buffer_head *bh, int meta);
void gfs2_trans_add_revoke(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd);
-void gfs2_trans_add_unrevoke(struct gfs2_sbd *sdp, u64 blkno);
+void gfs2_trans_add_unrevoke(struct gfs2_sbd *sdp, u64 blkno, unsigned int len);
void gfs2_trans_add_rg(struct gfs2_rgrpd *rgd);
#endif /* __TRANS_DOT_H__ */
diff --git a/fs/gfs2/util.c b/fs/gfs2/util.c
index 424a0774eda..d31e355c61f 100644
--- a/fs/gfs2/util.c
+++ b/fs/gfs2/util.c
@@ -19,12 +19,12 @@
#include "gfs2.h"
#include "incore.h"
#include "glock.h"
-#include "lm.h"
#include "util.h"
struct kmem_cache *gfs2_glock_cachep __read_mostly;
struct kmem_cache *gfs2_inode_cachep __read_mostly;
struct kmem_cache *gfs2_bufdata_cachep __read_mostly;
+struct kmem_cache *gfs2_rgrpd_cachep __read_mostly;
void gfs2_assert_i(struct gfs2_sbd *sdp)
{
@@ -32,6 +32,28 @@ void gfs2_assert_i(struct gfs2_sbd *sdp)
sdp->sd_fsname);
}
+int gfs2_lm_withdraw(struct gfs2_sbd *sdp, char *fmt, ...)
+{
+ va_list args;
+
+ if (test_and_set_bit(SDF_SHUTDOWN, &sdp->sd_flags))
+ return 0;
+
+ va_start(args, fmt);
+ vprintk(fmt, args);
+ va_end(args);
+
+ fs_err(sdp, "about to withdraw this file system\n");
+ BUG_ON(sdp->sd_args.ar_debug);
+
+ fs_err(sdp, "telling LM to withdraw\n");
+ gfs2_withdraw_lockproto(&sdp->sd_lockstruct);
+ fs_err(sdp, "withdrawn\n");
+ dump_stack();
+
+ return -1;
+}
+
/**
* gfs2_assert_withdraw_i - Cause the machine to withdraw if @assertion is false
* Returns: -1 if this call withdrew the machine,
diff --git a/fs/gfs2/util.h b/fs/gfs2/util.h
index 28938a46cf4..509c5d60bd8 100644
--- a/fs/gfs2/util.h
+++ b/fs/gfs2/util.h
@@ -147,6 +147,7 @@ gfs2_io_error_bh_i((sdp), (bh), __FUNCTION__, __FILE__, __LINE__);
extern struct kmem_cache *gfs2_glock_cachep;
extern struct kmem_cache *gfs2_inode_cachep;
extern struct kmem_cache *gfs2_bufdata_cachep;
+extern struct kmem_cache *gfs2_rgrpd_cachep;
static inline unsigned int gfs2_tune_get_i(struct gfs2_tune *gt,
unsigned int *p)
@@ -163,6 +164,7 @@ gfs2_tune_get_i(&(sdp)->sd_tune, &(sdp)->sd_tune.field)
void gfs2_icbit_munge(struct gfs2_sbd *sdp, unsigned char **bitmap,
unsigned int bit, int new_value);
+int gfs2_lm_withdraw(struct gfs2_sbd *sdp, char *fmt, ...);
#endif /* __UTIL_DOT_H__ */
diff --git a/fs/hfsplus/ioctl.c b/fs/hfsplus/ioctl.c
index b60c0affbec..f457d2ca51a 100644
--- a/fs/hfsplus/ioctl.c
+++ b/fs/hfsplus/ioctl.c
@@ -14,6 +14,7 @@
#include <linux/capability.h>
#include <linux/fs.h>
+#include <linux/mount.h>
#include <linux/sched.h>
#include <linux/xattr.h>
#include <asm/uaccess.h>
@@ -35,25 +36,32 @@ int hfsplus_ioctl(struct inode *inode, struct file *filp, unsigned int cmd,
flags |= FS_NODUMP_FL; /* EXT2_NODUMP_FL */
return put_user(flags, (int __user *)arg);
case HFSPLUS_IOC_EXT2_SETFLAGS: {
- if (IS_RDONLY(inode))
- return -EROFS;
-
- if (!is_owner_or_cap(inode))
- return -EACCES;
-
- if (get_user(flags, (int __user *)arg))
- return -EFAULT;
-
+ int err = 0;
+ err = mnt_want_write(filp->f_path.mnt);
+ if (err)
+ return err;
+
+ if (!is_owner_or_cap(inode)) {
+ err = -EACCES;
+ goto setflags_out;
+ }
+ if (get_user(flags, (int __user *)arg)) {
+ err = -EFAULT;
+ goto setflags_out;
+ }
if (flags & (FS_IMMUTABLE_FL|FS_APPEND_FL) ||
HFSPLUS_I(inode).rootflags & (HFSPLUS_FLG_IMMUTABLE|HFSPLUS_FLG_APPEND)) {
- if (!capable(CAP_LINUX_IMMUTABLE))
- return -EPERM;
+ if (!capable(CAP_LINUX_IMMUTABLE)) {
+ err = -EPERM;
+ goto setflags_out;
+ }
}
/* don't silently ignore unsupported ext2 flags */
- if (flags & ~(FS_IMMUTABLE_FL|FS_APPEND_FL|FS_NODUMP_FL))
- return -EOPNOTSUPP;
-
+ if (flags & ~(FS_IMMUTABLE_FL|FS_APPEND_FL|FS_NODUMP_FL)) {
+ err = -EOPNOTSUPP;
+ goto setflags_out;
+ }
if (flags & FS_IMMUTABLE_FL) { /* EXT2_IMMUTABLE_FL */
inode->i_flags |= S_IMMUTABLE;
HFSPLUS_I(inode).rootflags |= HFSPLUS_FLG_IMMUTABLE;
@@ -75,7 +83,9 @@ int hfsplus_ioctl(struct inode *inode, struct file *filp, unsigned int cmd,
inode->i_ctime = CURRENT_TIME_SEC;
mark_inode_dirty(inode);
- return 0;
+setflags_out:
+ mnt_drop_write(filp->f_path.mnt);
+ return err;
}
default:
return -ENOTTY;
diff --git a/fs/inode.c b/fs/inode.c
index 53245ffcf93..27ee1af50d0 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -1199,42 +1199,37 @@ void touch_atime(struct vfsmount *mnt, struct dentry *dentry)
struct inode *inode = dentry->d_inode;
struct timespec now;
- if (inode->i_flags & S_NOATIME)
+ if (mnt_want_write(mnt))
return;
+ if (inode->i_flags & S_NOATIME)
+ goto out;
if (IS_NOATIME(inode))
- return;
+ goto out;
if ((inode->i_sb->s_flags & MS_NODIRATIME) && S_ISDIR(inode->i_mode))
- return;
+ goto out;
- /*
- * We may have a NULL vfsmount when coming from NFSD
- */
- if (mnt) {
- if (mnt->mnt_flags & MNT_NOATIME)
- return;
- if ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode))
- return;
-
- if (mnt->mnt_flags & MNT_RELATIME) {
- /*
- * With relative atime, only update atime if the
- * previous atime is earlier than either the ctime or
- * mtime.
- */
- if (timespec_compare(&inode->i_mtime,
- &inode->i_atime) < 0 &&
- timespec_compare(&inode->i_ctime,
- &inode->i_atime) < 0)
- return;
- }
+ if (mnt->mnt_flags & MNT_NOATIME)
+ goto out;
+ if ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode))
+ goto out;
+ if (mnt->mnt_flags & MNT_RELATIME) {
+ /*
+ * With relative atime, only update atime if the previous
+ * atime is earlier than either the ctime or mtime.
+ */
+ if (timespec_compare(&inode->i_mtime, &inode->i_atime) < 0 &&
+ timespec_compare(&inode->i_ctime, &inode->i_atime) < 0)
+ goto out;
}
now = current_fs_time(inode->i_sb);
if (timespec_equal(&inode->i_atime, &now))
- return;
+ goto out;
inode->i_atime = now;
mark_inode_dirty_sync(inode);
+out:
+ mnt_drop_write(mnt);
}
EXPORT_SYMBOL(touch_atime);
@@ -1255,10 +1250,13 @@ void file_update_time(struct file *file)
struct inode *inode = file->f_path.dentry->d_inode;
struct timespec now;
int sync_it = 0;
+ int err;
if (IS_NOCMTIME(inode))
return;
- if (IS_RDONLY(inode))
+
+ err = mnt_want_write(file->f_path.mnt);
+ if (err)
return;
now = current_fs_time(inode->i_sb);
@@ -1279,6 +1277,7 @@ void file_update_time(struct file *file)
if (sync_it)
mark_inode_dirty_sync(inode);
+ mnt_drop_write(file->f_path.mnt);
}
EXPORT_SYMBOL(file_update_time);
diff --git a/fs/internal.h b/fs/internal.h
index 392e8ccd6fc..80aa9a02337 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -43,3 +43,14 @@ extern void __init chrdev_init(void);
* namespace.c
*/
extern int copy_mount_options(const void __user *, unsigned long *);
+
+extern void free_vfsmnt(struct vfsmount *);
+extern struct vfsmount *alloc_vfsmnt(const char *);
+extern struct vfsmount *__lookup_mnt(struct vfsmount *, struct dentry *, int);
+extern void mnt_set_mountpoint(struct vfsmount *, struct dentry *,
+ struct vfsmount *);
+extern void release_mounts(struct list_head *);
+extern void umount_tree(struct vfsmount *, int, struct list_head *);
+extern struct vfsmount *copy_tree(struct vfsmount *, struct dentry *, int);
+
+extern void __init mnt_init(void);
diff --git a/fs/jffs2/jffs2_fs_i.h b/fs/jffs2/jffs2_fs_i.h
index 0b78fdc9773..a841f4973a7 100644
--- a/fs/jffs2/jffs2_fs_i.h
+++ b/fs/jffs2/jffs2_fs_i.h
@@ -15,7 +15,7 @@
#include <linux/version.h>
#include <linux/rbtree.h>
#include <linux/posix_acl.h>
-#include <asm/semaphore.h>
+#include <linux/semaphore.h>
struct jffs2_inode_info {
/* We need an internal mutex similar to inode->i_mutex.
diff --git a/fs/jffs2/jffs2_fs_sb.h b/fs/jffs2/jffs2_fs_sb.h
index 3a2197f3c81..18fca2b9e53 100644
--- a/fs/jffs2/jffs2_fs_sb.h
+++ b/fs/jffs2/jffs2_fs_sb.h
@@ -16,7 +16,7 @@
#include <linux/spinlock.h>
#include <linux/workqueue.h>
#include <linux/completion.h>
-#include <asm/semaphore.h>
+#include <linux/semaphore.h>
#include <linux/timer.h>
#include <linux/wait.h>
#include <linux/list.h>
diff --git a/fs/jfs/ioctl.c b/fs/jfs/ioctl.c
index a1f8e375ad2..afe222bf300 100644
--- a/fs/jfs/ioctl.c
+++ b/fs/jfs/ioctl.c
@@ -8,6 +8,7 @@
#include <linux/fs.h>
#include <linux/ctype.h>
#include <linux/capability.h>
+#include <linux/mount.h>
#include <linux/time.h>
#include <linux/sched.h>
#include <asm/current.h>
@@ -65,23 +66,30 @@ long jfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
return put_user(flags, (int __user *) arg);
case JFS_IOC_SETFLAGS: {
unsigned int oldflags;
+ int err;
- if (IS_RDONLY(inode))
- return -EROFS;
+ err = mnt_want_write(filp->f_path.mnt);
+ if (err)
+ return err;
- if (!is_owner_or_cap(inode))
- return -EACCES;
-
- if (get_user(flags, (int __user *) arg))
- return -EFAULT;
+ if (!is_owner_or_cap(inode)) {
+ err = -EACCES;
+ goto setflags_out;
+ }
+ if (get_user(flags, (int __user *) arg)) {
+ err = -EFAULT;
+ goto setflags_out;
+ }
flags = jfs_map_ext2(flags, 1);
if (!S_ISDIR(inode->i_mode))
flags &= ~JFS_DIRSYNC_FL;
/* Is it quota file? Do not allow user to mess with it */
- if (IS_NOQUOTA(inode))
- return -EPERM;
+ if (IS_NOQUOTA(inode)) {
+ err = -EPERM;
+ goto setflags_out;
+ }
/* Lock against other parallel changes of flags */
mutex_lock(&inode->i_mutex);
@@ -98,7 +106,8 @@ long jfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
(JFS_APPEND_FL | JFS_IMMUTABLE_FL))) {
if (!capable(CAP_LINUX_IMMUTABLE)) {
mutex_unlock(&inode->i_mutex);
- return -EPERM;
+ err = -EPERM;
+ goto setflags_out;
}
}
@@ -110,7 +119,9 @@ long jfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
mutex_unlock(&inode->i_mutex);
inode->i_ctime = CURRENT_TIME_SEC;
mark_inode_dirty(inode);
- return 0;
+setflags_out:
+ mnt_drop_write(filp->f_path.mnt);
+ return err;
}
default:
return -ENOTTY;
diff --git a/fs/jfs/jfs_dmap.c b/fs/jfs/jfs_dmap.c
index e1985066b1c..2bc7d8aa574 100644
--- a/fs/jfs/jfs_dmap.c
+++ b/fs/jfs/jfs_dmap.c
@@ -2172,7 +2172,7 @@ static void dbAllocBits(struct bmap * bmp, struct dmap * dp, s64 blkno,
}
/* update the free count for this dmap */
- dp->nfree = cpu_to_le32(le32_to_cpu(dp->nfree) - nblocks);
+ le32_add_cpu(&dp->nfree, -nblocks);
BMAP_LOCK(bmp);
@@ -2316,7 +2316,7 @@ static int dbFreeBits(struct bmap * bmp, struct dmap * dp, s64 blkno,
/* update the free count for this dmap.
*/
- dp->nfree = cpu_to_le32(le32_to_cpu(dp->nfree) + nblocks);
+ le32_add_cpu(&dp->nfree, nblocks);
BMAP_LOCK(bmp);
@@ -3226,7 +3226,7 @@ static int dbAllocDmapBU(struct bmap * bmp, struct dmap * dp, s64 blkno,
}
/* update the free count for this dmap */
- dp->nfree = cpu_to_le32(le32_to_cpu(dp->nfree) - nblocks);
+ le32_add_cpu(&dp->nfree, -nblocks);
/* reconstruct summary tree */
dbInitDmapTree(dp);
@@ -3660,9 +3660,8 @@ static int dbInitDmap(struct dmap * dp, s64 Blkno, int nblocks)
goto initTree;
}
} else {
- dp->nblocks =
- cpu_to_le32(le32_to_cpu(dp->nblocks) + nblocks);
- dp->nfree = cpu_to_le32(le32_to_cpu(dp->nfree) + nblocks);
+ le32_add_cpu(&dp->nblocks, nblocks);
+ le32_add_cpu(&dp->nfree, nblocks);
}
/* word number containing start block number */
diff --git a/fs/jfs/jfs_dmap.h b/fs/jfs/jfs_dmap.h
index 11e6d471b36..1a6eb41569b 100644
--- a/fs/jfs/jfs_dmap.h
+++ b/fs/jfs/jfs_dmap.h
@@ -61,7 +61,7 @@
* determine the maximum free string for four (lower level) nodes
* of the tree.
*/
-static __inline signed char TREEMAX(signed char *cp)
+static inline signed char TREEMAX(signed char *cp)
{
signed char tmp1, tmp2;
diff --git a/fs/jfs/jfs_imap.c b/fs/jfs/jfs_imap.c
index 9bf29f77173..734ec916bea 100644
--- a/fs/jfs/jfs_imap.c
+++ b/fs/jfs/jfs_imap.c
@@ -1019,8 +1019,7 @@ int diFree(struct inode *ip)
/* update the free inode counts at the iag, ag and
* map level.
*/
- iagp->nfreeinos =
- cpu_to_le32(le32_to_cpu(iagp->nfreeinos) + 1);
+ le32_add_cpu(&iagp->nfreeinos, 1);
imap->im_agctl[agno].numfree += 1;
atomic_inc(&imap->im_numfree);
@@ -1219,9 +1218,8 @@ int diFree(struct inode *ip)
/* update the number of free inodes and number of free extents
* for the iag.
*/
- iagp->nfreeinos = cpu_to_le32(le32_to_cpu(iagp->nfreeinos) -
- (INOSPEREXT - 1));
- iagp->nfreeexts = cpu_to_le32(le32_to_cpu(iagp->nfreeexts) + 1);
+ le32_add_cpu(&iagp->nfreeinos, -(INOSPEREXT - 1));
+ le32_add_cpu(&iagp->nfreeexts, 1);
/* update the number of free inodes and backed inodes
* at the ag and inode map level.
@@ -2124,7 +2122,7 @@ static int diAllocBit(struct inomap * imap, struct iag * iagp, int ino)
/* update the free inode count at the iag, ag, inode
* map levels.
*/
- iagp->nfreeinos = cpu_to_le32(le32_to_cpu(iagp->nfreeinos) - 1);
+ le32_add_cpu(&iagp->nfreeinos, -1);
imap->im_agctl[agno].numfree -= 1;
atomic_dec(&imap->im_numfree);
@@ -2378,9 +2376,8 @@ static int diNewExt(struct inomap * imap, struct iag * iagp, int extno)
/* update the free inode and free extent counts for the
* iag.
*/
- iagp->nfreeinos = cpu_to_le32(le32_to_cpu(iagp->nfreeinos) +
- (INOSPEREXT - 1));
- iagp->nfreeexts = cpu_to_le32(le32_to_cpu(iagp->nfreeexts) - 1);
+ le32_add_cpu(&iagp->nfreeinos, (INOSPEREXT - 1));
+ le32_add_cpu(&iagp->nfreeexts, -1);
/* update the free and backed inode counts for the ag.
*/
diff --git a/fs/jfs/jfs_xtree.c b/fs/jfs/jfs_xtree.c
index a000aaa7513..5a61ebf2cbc 100644
--- a/fs/jfs/jfs_xtree.c
+++ b/fs/jfs/jfs_xtree.c
@@ -905,8 +905,7 @@ int xtInsert(tid_t tid, /* transaction id */
XT_PUTENTRY(xad, xflag, xoff, xlen, xaddr);
/* advance next available entry index */
- p->header.nextindex =
- cpu_to_le16(le16_to_cpu(p->header.nextindex) + 1);
+ le16_add_cpu(&p->header.nextindex, 1);
/* Don't log it if there are no links to the file */
if (!test_cflag(COMMIT_Nolink, ip)) {
@@ -997,8 +996,7 @@ xtSplitUp(tid_t tid,
split->addr);
/* advance next available entry index */
- sp->header.nextindex =
- cpu_to_le16(le16_to_cpu(sp->header.nextindex) + 1);
+ le16_add_cpu(&sp->header.nextindex, 1);
/* Don't log it if there are no links to the file */
if (!test_cflag(COMMIT_Nolink, ip)) {
@@ -1167,9 +1165,7 @@ xtSplitUp(tid_t tid,
JFS_SBI(ip->i_sb)->nbperpage, rcbn);
/* advance next available entry index. */
- sp->header.nextindex =
- cpu_to_le16(le16_to_cpu(sp->header.nextindex) +
- 1);
+ le16_add_cpu(&sp->header.nextindex, 1);
/* Don't log it if there are no links to the file */
if (!test_cflag(COMMIT_Nolink, ip)) {
@@ -1738,8 +1734,7 @@ int xtExtend(tid_t tid, /* transaction id */
XT_PUTENTRY(xad, XAD_NEW, xoff, len, xaddr);
/* advance next available entry index */
- p->header.nextindex =
- cpu_to_le16(le16_to_cpu(p->header.nextindex) + 1);
+ le16_add_cpu(&p->header.nextindex, 1);
}
/* get back old entry */
@@ -1905,8 +1900,7 @@ printf("xtTailgate: xoff:0x%lx xlen:0x%x xaddr:0x%lx\n",
XT_PUTENTRY(xad, XAD_NEW, xoff, xlen, xaddr);
/* advance next available entry index */
- p->header.nextindex =
- cpu_to_le16(le16_to_cpu(p->header.nextindex) + 1);
+ le16_add_cpu(&p->header.nextindex, 1);
}
/* get back old XAD */
@@ -2567,8 +2561,7 @@ int xtAppend(tid_t tid, /* transaction id */
XT_PUTENTRY(xad, xflag, xoff, xlen, xaddr);
/* advance next available entry index */
- p->header.nextindex =
- cpu_to_le16(le16_to_cpu(p->header.nextindex) + 1);
+ le16_add_cpu(&p->header.nextindex, 1);
xtlck->lwm.offset =
(xtlck->lwm.offset) ? min(index,(int) xtlck->lwm.offset) : index;
@@ -2631,8 +2624,7 @@ int xtDelete(tid_t tid, struct inode *ip, s64 xoff, s32 xlen, int flag)
* delete the entry from the leaf page
*/
nextindex = le16_to_cpu(p->header.nextindex);
- p->header.nextindex =
- cpu_to_le16(le16_to_cpu(p->header.nextindex) - 1);
+ le16_add_cpu(&p->header.nextindex, -1);
/*
* if the leaf page bocome empty, free the page
@@ -2795,9 +2787,7 @@ xtDeleteUp(tid_t tid, struct inode *ip,
(nextindex - index -
1) << L2XTSLOTSIZE);
- p->header.nextindex =
- cpu_to_le16(le16_to_cpu(p->header.nextindex) -
- 1);
+ le16_add_cpu(&p->header.nextindex, -1);
jfs_info("xtDeleteUp(entry): 0x%lx[%d]",
(ulong) parent->bn, index);
}
diff --git a/fs/locks.c b/fs/locks.c
index 43c0af21a0c..592faadbcec 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -127,7 +127,6 @@
#include <linux/rcupdate.h>
#include <linux/pid_namespace.h>
-#include <asm/semaphore.h>
#include <asm/uaccess.h>
#define IS_POSIX(fl) (fl->fl_flags & FL_POSIX)
diff --git a/fs/namei.c b/fs/namei.c
index 8cf9bb9c2fc..e179f71bfcb 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1623,8 +1623,7 @@ int may_open(struct nameidata *nd, int acc_mode, int flag)
return -EACCES;
flag &= ~O_TRUNC;
- } else if (IS_RDONLY(inode) && (acc_mode & MAY_WRITE))
- return -EROFS;
+ }
error = vfs_permission(nd, acc_mode);
if (error)
@@ -1677,7 +1676,12 @@ int may_open(struct nameidata *nd, int acc_mode, int flag)
return 0;
}
-static int open_namei_create(struct nameidata *nd, struct path *path,
+/*
+ * Be careful about ever adding any more callers of this
+ * function. Its flags must be in the namei format, not
+ * what gets passed to sys_open().
+ */
+static int __open_namei_create(struct nameidata *nd, struct path *path,
int flag, int mode)
{
int error;
@@ -1696,26 +1700,56 @@ static int open_namei_create(struct nameidata *nd, struct path *path,
}
/*
- * open_namei()
+ * Note that while the flag value (low two bits) for sys_open means:
+ * 00 - read-only
+ * 01 - write-only
+ * 10 - read-write
+ * 11 - special
+ * it is changed into
+ * 00 - no permissions needed
+ * 01 - read-permission
+ * 10 - write-permission
+ * 11 - read-write
+ * for the internal routines (ie open_namei()/follow_link() etc)
+ * This is more logical, and also allows the 00 "no perm needed"
+ * to be used for symlinks (where the permissions are checked
+ * later).
*
- * namei for open - this is in fact almost the whole open-routine.
- *
- * Note that the low bits of "flag" aren't the same as in the open
- * system call - they are 00 - no permissions needed
- * 01 - read permission needed
- * 10 - write permission needed
- * 11 - read/write permissions needed
- * which is a lot more logical, and also allows the "no perm" needed
- * for symlinks (where the permissions are checked later).
- * SMP-safe
+*/
+static inline int open_to_namei_flags(int flag)
+{
+ if ((flag+1) & O_ACCMODE)
+ flag++;
+ return flag;
+}
+
+static int open_will_write_to_fs(int flag, struct inode *inode)
+{
+ /*
+ * We'll never write to the fs underlying
+ * a device file.
+ */
+ if (special_file(inode->i_mode))
+ return 0;
+ return (flag & O_TRUNC);
+}
+
+/*
+ * Note that the low bits of the passed in "open_flag"
+ * are not the same as in the local variable "flag". See
+ * open_to_namei_flags() for more details.
*/
-int open_namei(int dfd, const char *pathname, int flag,
- int mode, struct nameidata *nd)
+struct file *do_filp_open(int dfd, const char *pathname,
+ int open_flag, int mode)
{
+ struct file *filp;
+ struct nameidata nd;
int acc_mode, error;
struct path path;
struct dentry *dir;
int count = 0;
+ int will_write;
+ int flag = open_to_namei_flags(open_flag);
acc_mode = ACC_MODE(flag);
@@ -1733,18 +1767,19 @@ int open_namei(int dfd, const char *pathname, int flag,
*/
if (!(flag & O_CREAT)) {
error = path_lookup_open(dfd, pathname, lookup_flags(flag),
- nd, flag);
+ &nd, flag);
if (error)
- return error;
+ return ERR_PTR(error);
goto ok;
}
/*
* Create - we need to know the parent.
*/
- error = path_lookup_create(dfd,pathname,LOOKUP_PARENT,nd,flag,mode);
+ error = path_lookup_create(dfd, pathname, LOOKUP_PARENT,
+ &nd, flag, mode);
if (error)
- return error;
+ return ERR_PTR(error);
/*
* We have the parent and last component. First of all, check
@@ -1752,14 +1787,14 @@ int open_namei(int dfd, const char *pathname, int flag,
* will not do.
*/
error = -EISDIR;
- if (nd->last_type != LAST_NORM || nd->last.name[nd->last.len])
+ if (nd.last_type != LAST_NORM || nd.last.name[nd.last.len])
goto exit;
- dir = nd->path.dentry;
- nd->flags &= ~LOOKUP_PARENT;
+ dir = nd.path.dentry;
+ nd.flags &= ~LOOKUP_PARENT;
mutex_lock(&dir->d_inode->i_mutex);
- path.dentry = lookup_hash(nd);
- path.mnt = nd->path.mnt;
+ path.dentry = lookup_hash(&nd);
+ path.mnt = nd.path.mnt;
do_last:
error = PTR_ERR(path.dentry);
@@ -1768,18 +1803,31 @@ do_last:
goto exit;
}
- if (IS_ERR(nd->intent.open.file)) {
- mutex_unlock(&dir->d_inode->i_mutex);
- error = PTR_ERR(nd->intent.open.file);
- goto exit_dput;
+ if (IS_ERR(nd.intent.open.file)) {
+ error = PTR_ERR(nd.intent.open.file);
+ goto exit_mutex_unlock;
}
/* Negative dentry, just create the file */
if (!path.dentry->d_inode) {
- error = open_namei_create(nd, &path, flag, mode);
+ /*
+ * This write is needed to ensure that a
+ * ro->rw transition does not occur between
+ * the time when the file is created and when
+ * a permanent write count is taken through
+ * the 'struct file' in nameidata_to_filp().
+ */
+ error = mnt_want_write(nd.path.mnt);
if (error)
+ goto exit_mutex_unlock;
+ error = __open_namei_create(&nd, &path, flag, mode);
+ if (error) {
+ mnt_drop_write(nd.path.mnt);
goto exit;
- return 0;
+ }
+ filp = nameidata_to_filp(&nd, open_flag);
+ mnt_drop_write(nd.path.mnt);
+ return filp;
}
/*
@@ -1804,23 +1852,52 @@ do_last:
if (path.dentry->d_inode->i_op && path.dentry->d_inode->i_op->follow_link)
goto do_link;
- path_to_nameidata(&path, nd);
+ path_to_nameidata(&path, &nd);
error = -EISDIR;
if (path.dentry->d_inode && S_ISDIR(path.dentry->d_inode->i_mode))
goto exit;
ok:
- error = may_open(nd, acc_mode, flag);
- if (error)
+ /*
+ * Consider:
+ * 1. may_open() truncates a file
+ * 2. a rw->ro mount transition occurs
+ * 3. nameidata_to_filp() fails due to
+ * the ro mount.
+ * That would be inconsistent, and should
+ * be avoided. Taking this mnt write here
+ * ensures that (2) can not occur.
+ */
+ will_write = open_will_write_to_fs(flag, nd.path.dentry->d_inode);
+ if (will_write) {
+ error = mnt_want_write(nd.path.mnt);
+ if (error)
+ goto exit;
+ }
+ error = may_open(&nd, acc_mode, flag);
+ if (error) {
+ if (will_write)
+ mnt_drop_write(nd.path.mnt);
goto exit;
- return 0;
+ }
+ filp = nameidata_to_filp(&nd, open_flag);
+ /*
+ * It is now safe to drop the mnt write
+ * because the filp has had a write taken
+ * on its behalf.
+ */
+ if (will_write)
+ mnt_drop_write(nd.path.mnt);
+ return filp;
+exit_mutex_unlock:
+ mutex_unlock(&dir->d_inode->i_mutex);
exit_dput:
- path_put_conditional(&path, nd);
+ path_put_conditional(&path, &nd);
exit:
- if (!IS_ERR(nd->intent.open.file))
- release_open_intent(nd);
- path_put(&nd->path);
- return error;
+ if (!IS_ERR(nd.intent.open.file))
+ release_open_intent(&nd);
+ path_put(&nd.path);
+ return ERR_PTR(error);
do_link:
error = -ELOOP;
@@ -1836,43 +1913,60 @@ do_link:
* stored in nd->last.name and we will have to putname() it when we
* are done. Procfs-like symlinks just set LAST_BIND.
*/
- nd->flags |= LOOKUP_PARENT;
- error = security_inode_follow_link(path.dentry, nd);
+ nd.flags |= LOOKUP_PARENT;
+ error = security_inode_follow_link(path.dentry, &nd);
if (error)
goto exit_dput;
- error = __do_follow_link(&path, nd);
+ error = __do_follow_link(&path, &nd);
if (error) {
/* Does someone understand code flow here? Or it is only
* me so stupid? Anathema to whoever designed this non-sense
* with "intent.open".
*/
- release_open_intent(nd);
- return error;
+ release_open_intent(&nd);
+ return ERR_PTR(error);
}
- nd->flags &= ~LOOKUP_PARENT;
- if (nd->last_type == LAST_BIND)
+ nd.flags &= ~LOOKUP_PARENT;
+ if (nd.last_type == LAST_BIND)
goto ok;
error = -EISDIR;
- if (nd->last_type != LAST_NORM)
+ if (nd.last_type != LAST_NORM)
goto exit;
- if (nd->last.name[nd->last.len]) {
- __putname(nd->last.name);
+ if (nd.last.name[nd.last.len]) {
+ __putname(nd.last.name);
goto exit;
}
error = -ELOOP;
if (count++==32) {
- __putname(nd->last.name);
+ __putname(nd.last.name);
goto exit;
}
- dir = nd->path.dentry;
+ dir = nd.path.dentry;
mutex_lock(&dir->d_inode->i_mutex);
- path.dentry = lookup_hash(nd);
- path.mnt = nd->path.mnt;
- __putname(nd->last.name);
+ path.dentry = lookup_hash(&nd);
+ path.mnt = nd.path.mnt;
+ __putname(nd.last.name);
goto do_last;
}
/**
+ * filp_open - open file and return file pointer
+ *
+ * @filename: path to open
+ * @flags: open flags as per the open(2) second argument
+ * @mode: mode for the new file if O_CREAT is set, else ignored
+ *
+ * This is the helper to open a file from kernelspace if you really
+ * have to. But in general you should not do this, so please move
+ * along, nothing to see here..
+ */
+struct file *filp_open(const char *filename, int flags, int mode)
+{
+ return do_filp_open(AT_FDCWD, filename, flags, mode);
+}
+EXPORT_SYMBOL(filp_open);
+
+/**
* lookup_create - lookup a dentry, creating it if it doesn't exist
* @nd: nameidata info
* @is_dir: directory flag
@@ -1945,6 +2039,23 @@ int vfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev)
return error;
}
+static int may_mknod(mode_t mode)
+{
+ switch (mode & S_IFMT) {
+ case S_IFREG:
+ case S_IFCHR:
+ case S_IFBLK:
+ case S_IFIFO:
+ case S_IFSOCK:
+ case 0: /* zero mode translates to S_IFREG */
+ return 0;
+ case S_IFDIR:
+ return -EPERM;
+ default:
+ return -EINVAL;
+ }
+}
+
asmlinkage long sys_mknodat(int dfd, const char __user *filename, int mode,
unsigned dev)
{
@@ -1963,12 +2074,19 @@ asmlinkage long sys_mknodat(int dfd, const char __user *filename, int mode,
if (error)
goto out;
dentry = lookup_create(&nd, 0);
- error = PTR_ERR(dentry);
-
+ if (IS_ERR(dentry)) {
+ error = PTR_ERR(dentry);
+ goto out_unlock;
+ }
if (!IS_POSIXACL(nd.path.dentry->d_inode))
mode &= ~current->fs->umask;
- if (!IS_ERR(dentry)) {
- switch (mode & S_IFMT) {
+ error = may_mknod(mode);
+ if (error)
+ goto out_dput;
+ error = mnt_want_write(nd.path.mnt);
+ if (error)
+ goto out_dput;
+ switch (mode & S_IFMT) {
case 0: case S_IFREG:
error = vfs_create(nd.path.dentry->d_inode,dentry,mode,&nd);
break;
@@ -1979,14 +2097,11 @@ asmlinkage long sys_mknodat(int dfd, const char __user *filename, int mode,
case S_IFIFO: case S_IFSOCK:
error = vfs_mknod(nd.path.dentry->d_inode,dentry,mode,0);
break;
- case S_IFDIR:
- error = -EPERM;
- break;
- default:
- error = -EINVAL;
- }
- dput(dentry);
}
+ mnt_drop_write(nd.path.mnt);
+out_dput:
+ dput(dentry);
+out_unlock:
mutex_unlock(&nd.path.dentry->d_inode->i_mutex);
path_put(&nd.path);
out:
@@ -2044,7 +2159,12 @@ asmlinkage long sys_mkdirat(int dfd, const char __user *pathname, int mode)
if (!IS_POSIXACL(nd.path.dentry->d_inode))
mode &= ~current->fs->umask;
+ error = mnt_want_write(nd.path.mnt);
+ if (error)
+ goto out_dput;
error = vfs_mkdir(nd.path.dentry->d_inode, dentry, mode);
+ mnt_drop_write(nd.path.mnt);
+out_dput:
dput(dentry);
out_unlock:
mutex_unlock(&nd.path.dentry->d_inode->i_mutex);
@@ -2151,7 +2271,12 @@ static long do_rmdir(int dfd, const char __user *pathname)
error = PTR_ERR(dentry);
if (IS_ERR(dentry))
goto exit2;
+ error = mnt_want_write(nd.path.mnt);
+ if (error)
+ goto exit3;
error = vfs_rmdir(nd.path.dentry->d_inode, dentry);
+ mnt_drop_write(nd.path.mnt);
+exit3:
dput(dentry);
exit2:
mutex_unlock(&nd.path.dentry->d_inode->i_mutex);
@@ -2232,7 +2357,11 @@ static long do_unlinkat(int dfd, const char __user *pathname)
inode = dentry->d_inode;
if (inode)
atomic_inc(&inode->i_count);
+ error = mnt_want_write(nd.path.mnt);
+ if (error)
+ goto exit2;
error = vfs_unlink(nd.path.dentry->d_inode, dentry);
+ mnt_drop_write(nd.path.mnt);
exit2:
dput(dentry);
}
@@ -2313,7 +2442,12 @@ asmlinkage long sys_symlinkat(const char __user *oldname,
if (IS_ERR(dentry))
goto out_unlock;
+ error = mnt_want_write(nd.path.mnt);
+ if (error)
+ goto out_dput;
error = vfs_symlink(nd.path.dentry->d_inode, dentry, from, S_IALLUGO);
+ mnt_drop_write(nd.path.mnt);
+out_dput:
dput(dentry);
out_unlock:
mutex_unlock(&nd.path.dentry->d_inode->i_mutex);
@@ -2408,7 +2542,12 @@ asmlinkage long sys_linkat(int olddfd, const char __user *oldname,
error = PTR_ERR(new_dentry);
if (IS_ERR(new_dentry))
goto out_unlock;
+ error = mnt_want_write(nd.path.mnt);
+ if (error)
+ goto out_dput;
error = vfs_link(old_nd.path.dentry, nd.path.dentry->d_inode, new_dentry);
+ mnt_drop_write(nd.path.mnt);
+out_dput:
dput(new_dentry);
out_unlock:
mutex_unlock(&nd.path.dentry->d_inode->i_mutex);
@@ -2634,8 +2773,12 @@ static int do_rename(int olddfd, const char *oldname,
if (new_dentry == trap)
goto exit5;
+ error = mnt_want_write(oldnd.path.mnt);
+ if (error)
+ goto exit5;
error = vfs_rename(old_dir->d_inode, old_dentry,
new_dir->d_inode, new_dentry);
+ mnt_drop_write(oldnd.path.mnt);
exit5:
dput(new_dentry);
exit4:
diff --git a/fs/namespace.c b/fs/namespace.c
index 94f026ec990..0505fb61aa7 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -17,6 +17,7 @@
#include <linux/quotaops.h>
#include <linux/acct.h>
#include <linux/capability.h>
+#include <linux/cpumask.h>
#include <linux/module.h>
#include <linux/sysfs.h>
#include <linux/seq_file.h>
@@ -26,6 +27,7 @@
#include <linux/mount.h>
#include <linux/ramfs.h>
#include <linux/log2.h>
+#include <linux/idr.h>
#include <asm/uaccess.h>
#include <asm/unistd.h>
#include "pnode.h"
@@ -38,6 +40,8 @@
__cacheline_aligned_in_smp DEFINE_SPINLOCK(vfsmount_lock);
static int event;
+static DEFINE_IDA(mnt_id_ida);
+static DEFINE_IDA(mnt_group_ida);
static struct list_head *mount_hashtable __read_mostly;
static struct kmem_cache *mnt_cache __read_mostly;
@@ -55,10 +59,65 @@ static inline unsigned long hash(struct vfsmount *mnt, struct dentry *dentry)
return tmp & (HASH_SIZE - 1);
}
+#define MNT_WRITER_UNDERFLOW_LIMIT -(1<<16)
+
+/* allocation is serialized by namespace_sem */
+static int mnt_alloc_id(struct vfsmount *mnt)
+{
+ int res;
+
+retry:
+ ida_pre_get(&mnt_id_ida, GFP_KERNEL);
+ spin_lock(&vfsmount_lock);
+ res = ida_get_new(&mnt_id_ida, &mnt->mnt_id);
+ spin_unlock(&vfsmount_lock);
+ if (res == -EAGAIN)
+ goto retry;
+
+ return res;
+}
+
+static void mnt_free_id(struct vfsmount *mnt)
+{
+ spin_lock(&vfsmount_lock);
+ ida_remove(&mnt_id_ida, mnt->mnt_id);
+ spin_unlock(&vfsmount_lock);
+}
+
+/*
+ * Allocate a new peer group ID
+ *
+ * mnt_group_ida is protected by namespace_sem
+ */
+static int mnt_alloc_group_id(struct vfsmount *mnt)
+{
+ if (!ida_pre_get(&mnt_group_ida, GFP_KERNEL))
+ return -ENOMEM;
+
+ return ida_get_new_above(&mnt_group_ida, 1, &mnt->mnt_group_id);
+}
+
+/*
+ * Release a peer group ID
+ */
+void mnt_release_group_id(struct vfsmount *mnt)
+{
+ ida_remove(&mnt_group_ida, mnt->mnt_group_id);
+ mnt->mnt_group_id = 0;
+}
+
struct vfsmount *alloc_vfsmnt(const char *name)
{
struct vfsmount *mnt = kmem_cache_zalloc(mnt_cache, GFP_KERNEL);
if (mnt) {
+ int err;
+
+ err = mnt_alloc_id(mnt);
+ if (err) {
+ kmem_cache_free(mnt_cache, mnt);
+ return NULL;
+ }
+
atomic_set(&mnt->mnt_count, 1);
INIT_LIST_HEAD(&mnt->mnt_hash);
INIT_LIST_HEAD(&mnt->mnt_child);
@@ -68,6 +127,7 @@ struct vfsmount *alloc_vfsmnt(const char *name)
INIT_LIST_HEAD(&mnt->mnt_share);
INIT_LIST_HEAD(&mnt->mnt_slave_list);
INIT_LIST_HEAD(&mnt->mnt_slave);
+ atomic_set(&mnt->__mnt_writers, 0);
if (name) {
int size = strlen(name) + 1;
char *newname = kmalloc(size, GFP_KERNEL);
@@ -80,6 +140,263 @@ struct vfsmount *alloc_vfsmnt(const char *name)
return mnt;
}
+/*
+ * Most r/o checks on a fs are for operations that take
+ * discrete amounts of time, like a write() or unlink().
+ * We must keep track of when those operations start
+ * (for permission checks) and when they end, so that
+ * we can determine when writes are able to occur to
+ * a filesystem.
+ */
+/*
+ * __mnt_is_readonly: check whether a mount is read-only
+ * @mnt: the mount to check for its write status
+ *
+ * Returns 1 if either the mount itself (MNT_READONLY) or its
+ * superblock (MS_RDONLY) is flagged read-only, 0 otherwise.
+ *
+ * This shouldn't be used directly outside of the VFS.
+ * It does not guarantee that the filesystem will stay
+ * r/w, just that it is right *now*.  This can not and
+ * should not be used in place of IS_RDONLY(inode).
+ * mnt_want/drop_write() will _keep_ the filesystem
+ * r/w.
+ */
+int __mnt_is_readonly(struct vfsmount *mnt)
+{
+	return (mnt->mnt_flags & MNT_READONLY) ||
+	       (mnt->mnt_sb->s_flags & MS_RDONLY);
+}
+EXPORT_SYMBOL_GPL(__mnt_is_readonly);
+
+/*
+ * Per-cpu slot used by mnt_want_write()/mnt_drop_write() to count
+ * writers of one mount at a time without touching the shared
+ * mnt->__mnt_writers counter on every call.
+ */
+struct mnt_writer {
+ /*
+ * If holding multiple instances of this lock, they
+ * must be ordered by cpu number.
+ */
+ spinlock_t lock;
+ struct lock_class_key lock_class; /* compiles out with !lockdep */
+ /* writers counted on this cpu, not yet added to ->mnt's __mnt_writers */
+ unsigned long count;
+ /* mount that ->count belongs to; NULL when the slot is unused */
+ struct vfsmount *mnt;
+} ____cacheline_aligned_in_smp;
+static DEFINE_PER_CPU(struct mnt_writer, mnt_writers);
+
+/*
+ * Initialise the writer slot of every possible cpu: set up its
+ * spinlock, give the lock its own lockdep class and zero the count.
+ */
+static int __init init_mnt_writers(void)
+{
+	struct mnt_writer *slot;
+	int i;
+
+	for_each_possible_cpu(i) {
+		slot = &per_cpu(mnt_writers, i);
+		spin_lock_init(&slot->lock);
+		lockdep_set_class(&slot->lock, &slot->lock_class);
+		slot->count = 0;
+	}
+	return 0;
+}
+fs_initcall(init_mnt_writers);
+
+/*
+ * Drop the writer lock of every possible cpu.
+ */
+static void unlock_mnt_writers(void)
+{
+	int i;
+
+	for_each_possible_cpu(i)
+		spin_unlock(&per_cpu(mnt_writers, i).lock);
+}
+
+/*
+ * Fold the count held in @cpu_writer into the __mnt_writers counter of
+ * the mount it refers to, then zero the per-cpu count.
+ */
+static inline void __clear_mnt_count(struct mnt_writer *cpu_writer)
+{
+	struct vfsmount *owner = cpu_writer->mnt;
+
+	/*
+	 * Nothing to fold without a mount, or when somebody left an
+	 * invalid, old ->mnt behind together with a count of 0.
+	 */
+	if (!owner || !cpu_writer->count)
+		return;
+	atomic_add(cpu_writer->count, &owner->__mnt_writers);
+	cpu_writer->count = 0;
+}
+/*
+ * Make @cpu_writer count for @mnt.  If the slot currently refers to a
+ * different mount, its count is folded back first.
+ *
+ * must hold cpu_writer->lock
+ */
+static inline void use_cpu_writer_for_mount(struct mnt_writer *cpu_writer,
+					    struct vfsmount *mnt)
+{
+	if (cpu_writer->mnt != mnt) {
+		__clear_mnt_count(cpu_writer);
+		cpu_writer->mnt = mnt;
+	}
+}
+
+/*
+ * Most r/o checks on a fs are for operations that take
+ * discrete amounts of time, like a write() or unlink().
+ * We must keep track of when those operations start
+ * (for permission checks) and when they end, so that
+ * we can determine when writes are able to occur to
+ * a filesystem.
+ */
+/**
+ * mnt_want_write - get write access to a mount
+ * @mnt: the mount on which to take a write
+ *
+ * Announces that a write to the low-level filesystem is about
+ * to start and checks that writes are currently allowed.
+ * Returns 0 on success and -EROFS if the mount is read-only.
+ * Every successful call must be paired with mnt_drop_write()
+ * once the write is finished.  This is effectively a refcount.
+ */
+int mnt_want_write(struct vfsmount *mnt)
+{
+	struct mnt_writer *writer = &get_cpu_var(mnt_writers);
+	int err = -EROFS;
+
+	spin_lock(&writer->lock);
+	if (!__mnt_is_readonly(mnt)) {
+		use_cpu_writer_for_mount(writer, mnt);
+		writer->count++;
+		err = 0;
+	}
+	spin_unlock(&writer->lock);
+	put_cpu_var(mnt_writers);
+	return err;
+}
+EXPORT_SYMBOL_GPL(mnt_want_write);
+
+/*
+ * Take the writer lock of every possible cpu, in cpu iteration order.
+ * Each slot's count is folded into its mount and the slot is detached
+ * from that mount.
+ */
+static void lock_mnt_writers(void)
+{
+	struct mnt_writer *slot;
+	int i;
+
+	for_each_possible_cpu(i) {
+		slot = &per_cpu(mnt_writers, i);
+		spin_lock(&slot->lock);
+		__clear_mnt_count(slot);
+		slot->mnt = NULL;
+	}
+}
+
+/*
+ * The per-cpu write counts need not see matching increments
+ * and decrements on the same cpu: a file open()ed for write
+ * on one cpu and close()d on another unbalances them.  Once
+ * the shared count has dropped below the underflow limit,
+ * pull all per-cpu counts back into it so it does not drift
+ * too far.
+ */
+static void handle_write_count_underflow(struct vfsmount *mnt)
+{
+	int writers;
+
+	if (atomic_read(&mnt->__mnt_writers) >= MNT_WRITER_UNDERFLOW_LIMIT)
+		return;
+	/*
+	 * Holding every lock at once is more than is strictly
+	 * needed, but it lets us reuse lock_mnt_writers().
+	 */
+	lock_mnt_writers();
+	/* vfsmount_lock is for mnt_flags. */
+	spin_lock(&vfsmount_lock);
+	/*
+	 * After coalescing, a count that is still negative means
+	 * we have a bug.
+	 */
+	writers = atomic_read(&mnt->__mnt_writers);
+	if (writers < 0 && !(mnt->mnt_flags & MNT_IMBALANCED_WRITE_COUNT)) {
+		printk(KERN_DEBUG "leak detected on mount(%p) writers "
+				"count: %d\n",
+			mnt, writers);
+		WARN_ON(1);
+		/* use the flag to keep the dmesg spam down */
+		mnt->mnt_flags |= MNT_IMBALANCED_WRITE_COUNT;
+	}
+	spin_unlock(&vfsmount_lock);
+	unlock_mnt_writers();
+}
+
+/**
+ * mnt_drop_write - give up write access to a mount
+ * @mnt: the mount on which to give up write access
+ *
+ * Signals that a write to the low-level filesystem has
+ * finished.  Must be matched with an earlier successful
+ * mnt_want_write() call.
+ */
+void mnt_drop_write(struct vfsmount *mnt)
+{
+	struct mnt_writer *writer = &get_cpu_var(mnt_writers);
+	int underflowed;
+
+	spin_lock(&writer->lock);
+	use_cpu_writer_for_mount(writer, mnt);
+	/*
+	 * With nothing left to take from this cpu's count, the
+	 * decrement goes to the shared counter instead.
+	 */
+	underflowed = (writer->count == 0);
+	if (underflowed)
+		atomic_dec(&mnt->__mnt_writers);
+	else
+		writer->count--;
+	spin_unlock(&writer->lock);
+
+	/*
+	 * Checking on every call would be correct too, but the
+	 * __mnt_writers cacheline tends to be cold, which makes
+	 * the check expensive.
+	 */
+	if (underflowed)
+		handle_write_count_underflow(mnt);
+	/*
+	 * put_cpu_var() could come right after taking the spinlock,
+	 * since the lock keeps us on the cpu and disables preemption.
+	 * Doing it only here bounds how far __mnt_writers can
+	 * underflow; otherwise it could theoretically wrap.
+	 */
+	put_cpu_var(mnt_writers);
+}
+EXPORT_SYMBOL_GPL(mnt_drop_write);
+
+/*
+ * Try to switch @mnt to read-only.
+ *
+ * Returns 0 after setting MNT_READONLY, or -EBUSY when the mount still
+ * has writers.
+ */
+static int mnt_make_readonly(struct vfsmount *mnt)
+{
+	int ret = 0;
+
+	lock_mnt_writers();
+	/*
+	 * With all the locks held, this value is stable
+	 */
+	if (atomic_read(&mnt->__mnt_writers) > 0) {
+		ret = -EBUSY;
+		goto out;
+	}
+	/*
+	 * mnt_want_write() needs one of the per-cpu locks we hold, so
+	 * no new writer can get in before the flag is set.
+	 */
+	spin_lock(&vfsmount_lock);
+	mnt->mnt_flags |= MNT_READONLY;
+	spin_unlock(&vfsmount_lock);
+out:
+	unlock_mnt_writers();
+	return ret;
+}
+
+/*
+ * Clear MNT_READONLY on @mnt; mnt_flags is updated under vfsmount_lock.
+ */
+static void __mnt_unmake_readonly(struct vfsmount *mnt)
+{
+	spin_lock(&vfsmount_lock);
+	mnt->mnt_flags = mnt->mnt_flags & ~MNT_READONLY;
+	spin_unlock(&vfsmount_lock);
+}
+
int simple_set_mnt(struct vfsmount *mnt, struct super_block *sb)
{
mnt->mnt_sb = sb;
@@ -92,6 +409,7 @@ EXPORT_SYMBOL(simple_set_mnt);
void free_vfsmnt(struct vfsmount *mnt)
{
kfree(mnt->mnt_devname);
+ mnt_free_id(mnt);
kmem_cache_free(mnt_cache, mnt);
}
@@ -238,6 +556,17 @@ static struct vfsmount *clone_mnt(struct vfsmount *old, struct dentry *root,
struct vfsmount *mnt = alloc_vfsmnt(old->mnt_devname);
if (mnt) {
+ if (flag & (CL_SLAVE | CL_PRIVATE))
+ mnt->mnt_group_id = 0; /* not a peer of original */
+ else
+ mnt->mnt_group_id = old->mnt_group_id;
+
+ if ((flag & CL_MAKE_SHARED) && !mnt->mnt_group_id) {
+ int err = mnt_alloc_group_id(mnt);
+ if (err)
+ goto out_free;
+ }
+
mnt->mnt_flags = old->mnt_flags;
atomic_inc(&sb->s_active);
mnt->mnt_sb = sb;
@@ -267,11 +596,44 @@ static struct vfsmount *clone_mnt(struct vfsmount *old, struct dentry *root,
}
}
return mnt;
+
+ out_free:
+ free_vfsmnt(mnt);
+ return NULL;
}
static inline void __mntput(struct vfsmount *mnt)
{
+ int cpu;
struct super_block *sb = mnt->mnt_sb;
+ /*
+ * We don't have to hold all of the locks at the
+ * same time here because we know that we're the
+ * last reference to mnt and that no new writers
+ * can come in.
+ */
+ for_each_possible_cpu(cpu) {
+ struct mnt_writer *cpu_writer = &per_cpu(mnt_writers, cpu);
+ if (cpu_writer->mnt != mnt)
+ continue;
+ spin_lock(&cpu_writer->lock);
+ atomic_add(cpu_writer->count, &mnt->__mnt_writers);
+ cpu_writer->count = 0;
+ /*
+ * Might as well do this so that no one
+ * ever sees the pointer and expects
+ * it to be valid.
+ */
+ cpu_writer->mnt = NULL;
+ spin_unlock(&cpu_writer->lock);
+ }
+ /*
+ * This probably indicates that somebody messed
+ * up a mnt_want/drop_write() pair. If this
+ * happens, the filesystem was probably unable
+ * to make r/w->r/o transitions.
+ */
+ WARN_ON(atomic_read(&mnt->__mnt_writers));
dput(mnt->mnt_root);
free_vfsmnt(mnt);
deactivate_super(sb);
@@ -362,20 +724,21 @@ void save_mount_options(struct super_block *sb, char *options)
}
EXPORT_SYMBOL(save_mount_options);
+#ifdef CONFIG_PROC_FS
/* iterator */
static void *m_start(struct seq_file *m, loff_t *pos)
{
- struct mnt_namespace *n = m->private;
+ struct proc_mounts *p = m->private;
down_read(&namespace_sem);
- return seq_list_start(&n->list, *pos);
+ return seq_list_start(&p->ns->list, *pos);
}
static void *m_next(struct seq_file *m, void *v, loff_t *pos)
{
- struct mnt_namespace *n = m->private;
+ struct proc_mounts *p = m->private;
- return seq_list_next(v, &n->list, pos);
+ return seq_list_next(v, &p->ns->list, pos);
}
static void m_stop(struct seq_file *m, void *v)
@@ -383,20 +746,30 @@ static void m_stop(struct seq_file *m, void *v)
up_read(&namespace_sem);
}
-static int show_vfsmnt(struct seq_file *m, void *v)
+struct proc_fs_info {
+ int flag;
+ const char *str;
+};
+
+static void show_sb_opts(struct seq_file *m, struct super_block *sb)
{
- struct vfsmount *mnt = list_entry(v, struct vfsmount, mnt_list);
- int err = 0;
- static struct proc_fs_info {
- int flag;
- char *str;
- } fs_info[] = {
+ static const struct proc_fs_info fs_info[] = {
{ MS_SYNCHRONOUS, ",sync" },
{ MS_DIRSYNC, ",dirsync" },
{ MS_MANDLOCK, ",mand" },
{ 0, NULL }
};
- static struct proc_fs_info mnt_info[] = {
+ const struct proc_fs_info *fs_infop;
+
+ for (fs_infop = fs_info; fs_infop->flag; fs_infop++) {
+ if (sb->s_flags & fs_infop->flag)
+ seq_puts(m, fs_infop->str);
+ }
+}
+
+static void show_mnt_opts(struct seq_file *m, struct vfsmount *mnt)
+{
+ static const struct proc_fs_info mnt_info[] = {
{ MNT_NOSUID, ",nosuid" },
{ MNT_NODEV, ",nodev" },
{ MNT_NOEXEC, ",noexec" },
@@ -405,40 +778,108 @@ static int show_vfsmnt(struct seq_file *m, void *v)
{ MNT_RELATIME, ",relatime" },
{ 0, NULL }
};
- struct proc_fs_info *fs_infop;
+ const struct proc_fs_info *fs_infop;
+
+ for (fs_infop = mnt_info; fs_infop->flag; fs_infop++) {
+ if (mnt->mnt_flags & fs_infop->flag)
+ seq_puts(m, fs_infop->str);
+ }
+}
+
+/*
+ * Print the filesystem type of @sb to @m, followed by ".<subtype>"
+ * when the superblock carries a non-empty subtype.
+ */
+static void show_type(struct seq_file *m, struct super_block *sb)
+{
+	const char *subtype;
+
+	mangle(m, sb->s_type->name);
+	subtype = sb->s_subtype;
+	if (!subtype || !subtype[0])
+		return;
+	seq_putc(m, '.');
+	mangle(m, subtype);
+}
+
+static int show_vfsmnt(struct seq_file *m, void *v)
+{
+ struct vfsmount *mnt = list_entry(v, struct vfsmount, mnt_list);
+ int err = 0;
struct path mnt_path = { .dentry = mnt->mnt_root, .mnt = mnt };
mangle(m, mnt->mnt_devname ? mnt->mnt_devname : "none");
seq_putc(m, ' ');
seq_path(m, &mnt_path, " \t\n\\");
seq_putc(m, ' ');
- mangle(m, mnt->mnt_sb->s_type->name);
- if (mnt->mnt_sb->s_subtype && mnt->mnt_sb->s_subtype[0]) {
- seq_putc(m, '.');
- mangle(m, mnt->mnt_sb->s_subtype);
- }
- seq_puts(m, mnt->mnt_sb->s_flags & MS_RDONLY ? " ro" : " rw");
- for (fs_infop = fs_info; fs_infop->flag; fs_infop++) {
- if (mnt->mnt_sb->s_flags & fs_infop->flag)
- seq_puts(m, fs_infop->str);
- }
- for (fs_infop = mnt_info; fs_infop->flag; fs_infop++) {
- if (mnt->mnt_flags & fs_infop->flag)
- seq_puts(m, fs_infop->str);
- }
+ show_type(m, mnt->mnt_sb);
+ seq_puts(m, __mnt_is_readonly(mnt) ? " ro" : " rw");
+ show_sb_opts(m, mnt->mnt_sb);
+ show_mnt_opts(m, mnt);
if (mnt->mnt_sb->s_op->show_options)
err = mnt->mnt_sb->s_op->show_options(m, mnt);
seq_puts(m, " 0 0\n");
return err;
}
-struct seq_operations mounts_op = {
+const struct seq_operations mounts_op = {
.start = m_start,
.next = m_next,
.stop = m_stop,
.show = show_vfsmnt
};
+/*
+ * seq_file ->show callback that prints one record for the mount @v.
+ *
+ * Fields, in order: mount ID, parent mount ID, major:minor of the
+ * superblock device, the mount's root dentry, the mount point relative
+ * to p->root, " ro"/" rw" plus per-mount options, optional tagged
+ * fields, a " - " separator, filesystem type, device name (or "none"),
+ * " ro"/" rw" plus superblock options, and whatever ->show_options()
+ * adds.
+ *
+ * Returns SEQ_SKIP when seq_path_root() changed the local root copy
+ * (mount point outside p->root); otherwise the result of
+ * ->show_options(), or 0 if the filesystem has none.
+ */
+static int show_mountinfo(struct seq_file *m, void *v)
+{
+ struct proc_mounts *p = m->private;
+ struct vfsmount *mnt = list_entry(v, struct vfsmount, mnt_list);
+ struct super_block *sb = mnt->mnt_sb;
+ struct path mnt_path = { .dentry = mnt->mnt_root, .mnt = mnt };
+ /* local copy, compared against p->root after seq_path_root() */
+ struct path root = p->root;
+ int err = 0;
+
+ seq_printf(m, "%i %i %u:%u ", mnt->mnt_id, mnt->mnt_parent->mnt_id,
+ MAJOR(sb->s_dev), MINOR(sb->s_dev));
+ seq_dentry(m, mnt->mnt_root, " \t\n\\");
+ seq_putc(m, ' ');
+ seq_path_root(m, &mnt_path, &root, " \t\n\\");
+ if (root.mnt != p->root.mnt || root.dentry != p->root.dentry) {
+ /*
+ * Mountpoint is outside root, discard that one. Ugly,
+ * but less so than trying to do that in iterator in a
+ * race-free way (due to renames).
+ */
+ return SEQ_SKIP;
+ }
+ /* per-mount read-only state only; the superblock's follows " - " */
+ seq_puts(m, mnt->mnt_flags & MNT_READONLY ? " ro" : " rw");
+ show_mnt_opts(m, mnt);
+
+ /* Tagged fields ("foo:X" or "bar") */
+ if (IS_MNT_SHARED(mnt))
+ seq_printf(m, " shared:%i", mnt->mnt_group_id);
+ if (IS_MNT_SLAVE(mnt)) {
+ int master = mnt->mnt_master->mnt_group_id;
+ int dom = get_dominating_id(mnt, &p->root);
+ seq_printf(m, " master:%i", master);
+ /* printed only when it differs from the immediate master */
+ if (dom && dom != master)
+ seq_printf(m, " propagate_from:%i", dom);
+ }
+ if (IS_MNT_UNBINDABLE(mnt))
+ seq_puts(m, " unbindable");
+
+ /* Filesystem specific data */
+ seq_puts(m, " - ");
+ show_type(m, sb);
+ seq_putc(m, ' ');
+ mangle(m, mnt->mnt_devname ? mnt->mnt_devname : "none");
+ seq_puts(m, sb->s_flags & MS_RDONLY ? " ro" : " rw");
+ show_sb_opts(m, sb);
+ if (sb->s_op->show_options)
+ err = sb->s_op->show_options(m, mnt);
+ seq_putc(m, '\n');
+ return err;
+}
+
+/*
+ * seq_file operations for the mountinfo view: uses the m_start/m_next/
+ * m_stop iterator callbacks and show_mountinfo() as the formatter.
+ */
+const struct seq_operations mountinfo_op = {
+ .start = m_start,
+ .next = m_next,
+ .stop = m_stop,
+ .show = show_mountinfo,
+};
+
static int show_vfsstat(struct seq_file *m, void *v)
{
struct vfsmount *mnt = list_entry(v, struct vfsmount, mnt_list);
@@ -459,7 +900,7 @@ static int show_vfsstat(struct seq_file *m, void *v)
/* file system type */
seq_puts(m, "with fstype ");
- mangle(m, mnt->mnt_sb->s_type->name);
+ show_type(m, mnt->mnt_sb);
/* optional statistics */
if (mnt->mnt_sb->s_op->show_stats) {
@@ -471,12 +912,13 @@ static int show_vfsstat(struct seq_file *m, void *v)
return err;
}
-struct seq_operations mountstats_op = {
+const struct seq_operations mountstats_op = {
.start = m_start,
.next = m_next,
.stop = m_stop,
.show = show_vfsstat,
};
+#endif /* CONFIG_PROC_FS */
/**
* may_umount_tree - check if a mount tree is busy
@@ -801,23 +1243,50 @@ Enomem:
struct vfsmount *collect_mounts(struct vfsmount *mnt, struct dentry *dentry)
{
struct vfsmount *tree;
- down_read(&namespace_sem);
+ down_write(&namespace_sem);
tree = copy_tree(mnt, dentry, CL_COPY_ALL | CL_PRIVATE);
- up_read(&namespace_sem);
+ up_write(&namespace_sem);
return tree;
}
void drop_collected_mounts(struct vfsmount *mnt)
{
LIST_HEAD(umount_list);
- down_read(&namespace_sem);
+ down_write(&namespace_sem);
spin_lock(&vfsmount_lock);
umount_tree(mnt, 0, &umount_list);
spin_unlock(&vfsmount_lock);
- up_read(&namespace_sem);
+ up_write(&namespace_sem);
release_mounts(&umount_list);
}
+/*
+ * Walk the mount tree rooted at @mnt, stopping before @end, and release
+ * the peer group ID of every mount that has one but is not shared.
+ */
+static void cleanup_group_ids(struct vfsmount *mnt, struct vfsmount *end)
+{
+	struct vfsmount *cur = mnt;
+
+	while (cur != end) {
+		if (cur->mnt_group_id && !IS_MNT_SHARED(cur))
+			mnt_release_group_id(cur);
+		cur = next_mnt(cur, mnt);
+	}
+}
+
+/*
+ * Give a peer group ID to @mnt (and, with @recurse, to every mount in
+ * its tree) where the mount has no ID yet and is not shared.
+ *
+ * On an allocation failure the mounts visited before the failing one
+ * are passed to cleanup_group_ids() and the error is returned;
+ * returns 0 on success.
+ */
+static int invent_group_ids(struct vfsmount *mnt, bool recurse)
+{
+	struct vfsmount *cur = mnt;
+	int err;
+
+	while (cur) {
+		if (!cur->mnt_group_id && !IS_MNT_SHARED(cur)) {
+			err = mnt_alloc_group_id(cur);
+			if (err) {
+				cleanup_group_ids(mnt, cur);
+				return err;
+			}
+		}
+		cur = recurse ? next_mnt(cur, mnt) : NULL;
+	}
+
+	return 0;
+}
+
/*
* @source_mnt : mount tree to be attached
* @nd : place the mount tree @source_mnt is attached
@@ -888,9 +1357,16 @@ static int attach_recursive_mnt(struct vfsmount *source_mnt,
struct vfsmount *dest_mnt = path->mnt;
struct dentry *dest_dentry = path->dentry;
struct vfsmount *child, *p;
+ int err;
- if (propagate_mnt(dest_mnt, dest_dentry, source_mnt, &tree_list))
- return -EINVAL;
+ if (IS_MNT_SHARED(dest_mnt)) {
+ err = invent_group_ids(source_mnt, true);
+ if (err)
+ goto out;
+ }
+ err = propagate_mnt(dest_mnt, dest_dentry, source_mnt, &tree_list);
+ if (err)
+ goto out_cleanup_ids;
if (IS_MNT_SHARED(dest_mnt)) {
for (p = source_mnt; p; p = next_mnt(p, source_mnt))
@@ -913,34 +1389,40 @@ static int attach_recursive_mnt(struct vfsmount *source_mnt,
}
spin_unlock(&vfsmount_lock);
return 0;
+
+ out_cleanup_ids:
+ if (IS_MNT_SHARED(dest_mnt))
+ cleanup_group_ids(source_mnt, NULL);
+ out:
+ return err;
}
-static int graft_tree(struct vfsmount *mnt, struct nameidata *nd)
+static int graft_tree(struct vfsmount *mnt, struct path *path)
{
int err;
if (mnt->mnt_sb->s_flags & MS_NOUSER)
return -EINVAL;
- if (S_ISDIR(nd->path.dentry->d_inode->i_mode) !=
+ if (S_ISDIR(path->dentry->d_inode->i_mode) !=
S_ISDIR(mnt->mnt_root->d_inode->i_mode))
return -ENOTDIR;
err = -ENOENT;
- mutex_lock(&nd->path.dentry->d_inode->i_mutex);
- if (IS_DEADDIR(nd->path.dentry->d_inode))
+ mutex_lock(&path->dentry->d_inode->i_mutex);
+ if (IS_DEADDIR(path->dentry->d_inode))
goto out_unlock;
- err = security_sb_check_sb(mnt, nd);
+ err = security_sb_check_sb(mnt, path);
if (err)
goto out_unlock;
err = -ENOENT;
- if (IS_ROOT(nd->path.dentry) || !d_unhashed(nd->path.dentry))
- err = attach_recursive_mnt(mnt, &nd->path, NULL);
+ if (IS_ROOT(path->dentry) || !d_unhashed(path->dentry))
+ err = attach_recursive_mnt(mnt, path, NULL);
out_unlock:
- mutex_unlock(&nd->path.dentry->d_inode->i_mutex);
+ mutex_unlock(&path->dentry->d_inode->i_mutex);
if (!err)
- security_sb_post_addmount(mnt, nd);
+ security_sb_post_addmount(mnt, path);
return err;
}
@@ -953,6 +1435,7 @@ static noinline int do_change_type(struct nameidata *nd, int flag)
struct vfsmount *m, *mnt = nd->path.mnt;
int recurse = flag & MS_REC;
int type = flag & ~MS_REC;
+ int err = 0;
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
@@ -961,12 +1444,20 @@ static noinline int do_change_type(struct nameidata *nd, int flag)
return -EINVAL;
down_write(&namespace_sem);
+ if (type == MS_SHARED) {
+ err = invent_group_ids(mnt, recurse);
+ if (err)
+ goto out_unlock;
+ }
+
spin_lock(&vfsmount_lock);
for (m = mnt; m; m = (recurse ? next_mnt(m, mnt) : NULL))
change_mnt_propagation(m, type);
spin_unlock(&vfsmount_lock);
+
+ out_unlock:
up_write(&namespace_sem);
- return 0;
+ return err;
}
/*
@@ -1004,7 +1495,7 @@ static noinline int do_loopback(struct nameidata *nd, char *old_name,
if (!mnt)
goto out;
- err = graft_tree(mnt, nd);
+ err = graft_tree(mnt, &nd->path);
if (err) {
LIST_HEAD(umount_list);
spin_lock(&vfsmount_lock);
@@ -1019,6 +1510,23 @@ out:
return err;
}
+/*
+ * Bring the read-only state of @mnt in line with MS_RDONLY in
+ * @ms_flags.  Returns 0 when nothing has to change or the mount was
+ * made writable again, otherwise the result of mnt_make_readonly().
+ */
+static int change_mount_flags(struct vfsmount *mnt, int ms_flags)
+{
+	int want_ro = (ms_flags & MS_RDONLY) ? 1 : 0;
+
+	if (want_ro == __mnt_is_readonly(mnt))
+		return 0;
+
+	if (want_ro)
+		return mnt_make_readonly(mnt);
+
+	__mnt_unmake_readonly(mnt);
+	return 0;
+}
+
/*
* change filesystem flags. dir should be a physical root of filesystem.
* If you've mounted a non-root directory somewhere and want to do remount
@@ -1041,7 +1549,10 @@ static noinline int do_remount(struct nameidata *nd, int flags, int mnt_flags,
return -EINVAL;
down_write(&sb->s_umount);
- err = do_remount_sb(sb, flags, data, 0);
+ if (flags & MS_BIND)
+ err = change_mount_flags(nd->path.mnt, flags);
+ else
+ err = do_remount_sb(sb, flags, data, 0);
if (!err)
nd->path.mnt->mnt_flags = mnt_flags;
up_write(&sb->s_umount);
@@ -1191,7 +1702,7 @@ int do_add_mount(struct vfsmount *newmnt, struct nameidata *nd,
goto unlock;
newmnt->mnt_flags = mnt_flags;
- if ((err = graft_tree(newmnt, nd)))
+ if ((err = graft_tree(newmnt, &nd->path)))
goto unlock;
if (fslist) /* add to the specified expiration list */
@@ -1425,6 +1936,8 @@ long do_mount(char *dev_name, char *dir_name, char *type_page,
mnt_flags |= MNT_NODIRATIME;
if (flags & MS_RELATIME)
mnt_flags |= MNT_RELATIME;
+ if (flags & MS_RDONLY)
+ mnt_flags |= MNT_READONLY;
flags &= ~(MS_NOSUID | MS_NOEXEC | MS_NODEV | MS_ACTIVE |
MS_NOATIME | MS_NODIRATIME | MS_RELATIME| MS_KERNMOUNT);
@@ -1434,7 +1947,8 @@ long do_mount(char *dev_name, char *dir_name, char *type_page,
if (retval)
return retval;
- retval = security_sb_mount(dev_name, &nd, type_page, flags, data_page);
+ retval = security_sb_mount(dev_name, &nd.path,
+ type_page, flags, data_page);
if (retval)
goto dput_out;
@@ -1674,15 +2188,13 @@ asmlinkage long sys_pivot_root(const char __user * new_root,
const char __user * put_old)
{
struct vfsmount *tmp;
- struct nameidata new_nd, old_nd, user_nd;
- struct path parent_path, root_parent;
+ struct nameidata new_nd, old_nd;
+ struct path parent_path, root_parent, root;
int error;
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
- lock_kernel();
-
error = __user_walk(new_root, LOOKUP_FOLLOW | LOOKUP_DIRECTORY,
&new_nd);
if (error)
@@ -1695,14 +2207,14 @@ asmlinkage long sys_pivot_root(const char __user * new_root,
if (error)
goto out1;
- error = security_sb_pivotroot(&old_nd, &new_nd);
+ error = security_sb_pivotroot(&old_nd.path, &new_nd.path);
if (error) {
path_put(&old_nd.path);
goto out1;
}
read_lock(&current->fs->lock);
- user_nd.path = current->fs->root;
+ root = current->fs->root;
path_get(&current->fs->root);
read_unlock(&current->fs->lock);
down_write(&namespace_sem);
@@ -1710,9 +2222,9 @@ asmlinkage long sys_pivot_root(const char __user * new_root,
error = -EINVAL;
if (IS_MNT_SHARED(old_nd.path.mnt) ||
IS_MNT_SHARED(new_nd.path.mnt->mnt_parent) ||
- IS_MNT_SHARED(user_nd.path.mnt->mnt_parent))
+ IS_MNT_SHARED(root.mnt->mnt_parent))
goto out2;
- if (!check_mnt(user_nd.path.mnt))
+ if (!check_mnt(root.mnt))
goto out2;
error = -ENOENT;
if (IS_DEADDIR(new_nd.path.dentry->d_inode))
@@ -1722,13 +2234,13 @@ asmlinkage long sys_pivot_root(const char __user * new_root,
if (d_unhashed(old_nd.path.dentry) && !IS_ROOT(old_nd.path.dentry))
goto out2;
error = -EBUSY;
- if (new_nd.path.mnt == user_nd.path.mnt ||
- old_nd.path.mnt == user_nd.path.mnt)
+ if (new_nd.path.mnt == root.mnt ||
+ old_nd.path.mnt == root.mnt)
goto out2; /* loop, on the same file system */
error = -EINVAL;
- if (user_nd.path.mnt->mnt_root != user_nd.path.dentry)
+ if (root.mnt->mnt_root != root.dentry)
goto out2; /* not a mountpoint */
- if (user_nd.path.mnt->mnt_parent == user_nd.path.mnt)
+ if (root.mnt->mnt_parent == root.mnt)
goto out2; /* not attached */
if (new_nd.path.mnt->mnt_root != new_nd.path.dentry)
goto out2; /* not a mountpoint */
@@ -1750,27 +2262,26 @@ asmlinkage long sys_pivot_root(const char __user * new_root,
} else if (!is_subdir(old_nd.path.dentry, new_nd.path.dentry))
goto out3;
detach_mnt(new_nd.path.mnt, &parent_path);
- detach_mnt(user_nd.path.mnt, &root_parent);
+ detach_mnt(root.mnt, &root_parent);
/* mount old root on put_old */
- attach_mnt(user_nd.path.mnt, &old_nd.path);
+ attach_mnt(root.mnt, &old_nd.path);
/* mount new_root on / */
attach_mnt(new_nd.path.mnt, &root_parent);
touch_mnt_namespace(current->nsproxy->mnt_ns);
spin_unlock(&vfsmount_lock);
- chroot_fs_refs(&user_nd.path, &new_nd.path);
- security_sb_post_pivotroot(&user_nd, &new_nd);
+ chroot_fs_refs(&root, &new_nd.path);
+ security_sb_post_pivotroot(&root, &new_nd.path);
error = 0;
path_put(&root_parent);
path_put(&parent_path);
out2:
mutex_unlock(&old_nd.path.dentry->d_inode->i_mutex);
up_write(&namespace_sem);
- path_put(&user_nd.path);
+ path_put(&root);
path_put(&old_nd.path);
out1:
path_put(&new_nd.path);
out0:
- unlock_kernel();
return error;
out3:
spin_unlock(&vfsmount_lock);
diff --git a/fs/ncpfs/ioctl.c b/fs/ncpfs/ioctl.c
index c67b4bdcf71..ad8f167e54b 100644
--- a/fs/ncpfs/ioctl.c
+++ b/fs/ncpfs/ioctl.c
@@ -14,6 +14,7 @@
#include <linux/ioctl.h>
#include <linux/time.h>
#include <linux/mm.h>
+#include <linux/mount.h>
#include <linux/highuid.h>
#include <linux/smp_lock.h>
#include <linux/vmalloc.h>
@@ -261,7 +262,7 @@ ncp_get_charsets(struct ncp_server* server, struct ncp_nls_ioctl __user *arg)
}
#endif /* CONFIG_NCPFS_NLS */
-int ncp_ioctl(struct inode *inode, struct file *filp,
+static int __ncp_ioctl(struct inode *inode, struct file *filp,
unsigned int cmd, unsigned long arg)
{
struct ncp_server *server = NCP_SERVER(inode);
@@ -822,6 +823,57 @@ outrel:
return -EINVAL;
}
+/*
+ * Tell ncp_ioctl() whether @cmd must hold write access to the mount.
+ * Returns 1 for the commands in the first group and for any command
+ * not listed here, 0 for the commands in the second group.
+ */
+static int ncp_ioctl_need_write(unsigned int cmd)
+{
+ switch (cmd) {
+ case NCP_IOC_GET_FS_INFO:
+ case NCP_IOC_GET_FS_INFO_V2:
+ case NCP_IOC_NCPREQUEST:
+ case NCP_IOC_SETDENTRYTTL:
+ case NCP_IOC_SIGN_INIT:
+ case NCP_IOC_LOCKUNLOCK:
+ case NCP_IOC_SET_SIGN_WANTED:
+ return 1;
+ case NCP_IOC_GETOBJECTNAME:
+ case NCP_IOC_SETOBJECTNAME:
+ case NCP_IOC_GETPRIVATEDATA:
+ case NCP_IOC_SETPRIVATEDATA:
+ case NCP_IOC_SETCHARSETS:
+ case NCP_IOC_GETCHARSETS:
+ case NCP_IOC_CONN_LOGGED_IN:
+ case NCP_IOC_GETDENTRYTTL:
+ case NCP_IOC_GETMOUNTUID2:
+ case NCP_IOC_SIGN_WANTED:
+ case NCP_IOC_GETROOT:
+ case NCP_IOC_SETROOT:
+ return 0;
+ default:
+ /* unknown IOCTL command, assume write */
+ return 1;
+ }
+}
+
+/*
+ * Entry point for ncpfs ioctls.  Commands that ncp_ioctl_need_write()
+ * classifies as writes are bracketed by mnt_want_write() and
+ * mnt_drop_write() around the real handler, __ncp_ioctl().
+ */
+int ncp_ioctl(struct inode *inode, struct file *filp,
+	      unsigned int cmd, unsigned long arg)
+{
+	int need_write = ncp_ioctl_need_write(cmd);
+	int ret;
+
+	/*
+	 * Failures inside the ioctl that stem from file_permission()
+	 * are reported as -EACCES, so a refused write access uses the
+	 * same error for consistency.
+	 */
+	if (need_write && mnt_want_write(filp->f_path.mnt))
+		return -EACCES;
+
+	ret = __ncp_ioctl(inode, filp, cmd, arg);
+
+	if (need_write)
+		mnt_drop_write(filp->f_path.mnt);
+	return ret;
+}
+
#ifdef CONFIG_COMPAT
long ncp_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
{
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 6cea7479c5b..d9e30ac2798 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -967,7 +967,8 @@ static int is_atomic_open(struct inode *dir, struct nameidata *nd)
if (nd->flags & LOOKUP_DIRECTORY)
return 0;
/* Are we trying to write to a read only partition? */
- if (IS_RDONLY(dir) && (nd->intent.open.flags & (O_CREAT|O_TRUNC|FMODE_WRITE)))
+ if (__mnt_is_readonly(nd->path.mnt) &&
+ (nd->intent.open.flags & (O_CREAT|O_TRUNC|FMODE_WRITE)))
return 0;
return 1;
}
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index c593db047d8..c309c881bd4 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -658,14 +658,19 @@ nfsd4_setattr(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
return status;
}
}
+ status = mnt_want_write(cstate->current_fh.fh_export->ex_path.mnt);
+ if (status)
+ return status;
status = nfs_ok;
if (setattr->sa_acl != NULL)
status = nfsd4_set_nfs4_acl(rqstp, &cstate->current_fh,
setattr->sa_acl);
if (status)
- return status;
+ goto out;
status = nfsd_setattr(rqstp, &cstate->current_fh, &setattr->sa_iattr,
0, (time_t)0);
+out:
+ mnt_drop_write(cstate->current_fh.fh_export->ex_path.mnt);
return status;
}
diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c
index 1ff90625860..145b3c877a2 100644
--- a/fs/nfsd/nfs4recover.c
+++ b/fs/nfsd/nfs4recover.c
@@ -46,6 +46,7 @@
#include <linux/scatterlist.h>
#include <linux/crypto.h>
#include <linux/sched.h>
+#include <linux/mount.h>
#define NFSDDBG_FACILITY NFSDDBG_PROC
@@ -154,7 +155,11 @@ nfsd4_create_clid_dir(struct nfs4_client *clp)
dprintk("NFSD: nfsd4_create_clid_dir: DIRECTORY EXISTS\n");
goto out_put;
}
+ status = mnt_want_write(rec_dir.path.mnt);
+ if (status)
+ goto out_put;
status = vfs_mkdir(rec_dir.path.dentry->d_inode, dentry, S_IRWXU);
+ mnt_drop_write(rec_dir.path.mnt);
out_put:
dput(dentry);
out_unlock:
@@ -313,12 +318,17 @@ nfsd4_remove_clid_dir(struct nfs4_client *clp)
if (!rec_dir_init || !clp->cl_firststate)
return;
+ status = mnt_want_write(rec_dir.path.mnt);
+ if (status)
+ goto out;
clp->cl_firststate = 0;
nfs4_save_user(&uid, &gid);
status = nfsd4_unlink_clid_dir(clp->cl_recdir, HEXDIR_LEN-1);
nfs4_reset_user(uid, gid);
if (status == 0)
nfsd4_sync_rec_dir();
+ mnt_drop_write(rec_dir.path.mnt);
+out:
if (status)
printk("NFSD: Failed to remove expired client state directory"
" %.*s\n", HEXDIR_LEN, clp->cl_recdir);
@@ -347,13 +357,17 @@ nfsd4_recdir_purge_old(void) {
if (!rec_dir_init)
return;
+ status = mnt_want_write(rec_dir.path.mnt);
+ if (status)
+ goto out;
status = nfsd4_list_rec_dir(rec_dir.path.dentry, purge_old);
if (status == 0)
nfsd4_sync_rec_dir();
+ mnt_drop_write(rec_dir.path.mnt);
+out:
if (status)
printk("nfsd4: failed to purge old clients from recovery"
" directory %s\n", rec_dir.path.dentry->d_name.name);
- return;
}
static int
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index bcb97d8e8b8..81a75f3081f 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -41,6 +41,7 @@
#include <linux/sunrpc/svc.h>
#include <linux/nfsd/nfsd.h>
#include <linux/nfsd/cache.h>
+#include <linux/file.h>
#include <linux/mount.h>
#include <linux/workqueue.h>
#include <linux/smp_lock.h>
@@ -1239,7 +1240,7 @@ static inline void
nfs4_file_downgrade(struct file *filp, unsigned int share_access)
{
if (share_access & NFS4_SHARE_ACCESS_WRITE) {
- put_write_access(filp->f_path.dentry->d_inode);
+ drop_file_write_access(filp);
filp->f_mode = (filp->f_mode | FMODE_READ) & ~FMODE_WRITE;
}
}
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 46f59d5365a..304bf5f643c 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -1255,23 +1255,35 @@ nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp,
err = 0;
switch (type) {
case S_IFREG:
+ host_err = mnt_want_write(fhp->fh_export->ex_path.mnt);
+ if (host_err)
+ goto out_nfserr;
host_err = vfs_create(dirp, dchild, iap->ia_mode, NULL);
break;
case S_IFDIR:
+ host_err = mnt_want_write(fhp->fh_export->ex_path.mnt);
+ if (host_err)
+ goto out_nfserr;
host_err = vfs_mkdir(dirp, dchild, iap->ia_mode);
break;
case S_IFCHR:
case S_IFBLK:
case S_IFIFO:
case S_IFSOCK:
+ host_err = mnt_want_write(fhp->fh_export->ex_path.mnt);
+ if (host_err)
+ goto out_nfserr;
host_err = vfs_mknod(dirp, dchild, iap->ia_mode, rdev);
break;
default:
printk("nfsd: bad file type %o in nfsd_create\n", type);
host_err = -EINVAL;
+ goto out_nfserr;
}
- if (host_err < 0)
+ if (host_err < 0) {
+ mnt_drop_write(fhp->fh_export->ex_path.mnt);
goto out_nfserr;
+ }
if (EX_ISSYNC(fhp->fh_export)) {
err = nfserrno(nfsd_sync_dir(dentry));
@@ -1282,6 +1294,7 @@ nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp,
err2 = nfsd_create_setattr(rqstp, resfhp, iap);
if (err2)
err = err2;
+ mnt_drop_write(fhp->fh_export->ex_path.mnt);
/*
* Update the file handle to get the new inode info.
*/
@@ -1359,6 +1372,9 @@ nfsd_create_v3(struct svc_rqst *rqstp, struct svc_fh *fhp,
v_atime = verifier[1]&0x7fffffff;
}
+ host_err = mnt_want_write(fhp->fh_export->ex_path.mnt);
+ if (host_err)
+ goto out_nfserr;
if (dchild->d_inode) {
err = 0;
@@ -1390,12 +1406,15 @@ nfsd_create_v3(struct svc_rqst *rqstp, struct svc_fh *fhp,
case NFS3_CREATE_GUARDED:
err = nfserr_exist;
}
+ mnt_drop_write(fhp->fh_export->ex_path.mnt);
goto out;
}
host_err = vfs_create(dirp, dchild, iap->ia_mode, NULL);
- if (host_err < 0)
+ if (host_err < 0) {
+ mnt_drop_write(fhp->fh_export->ex_path.mnt);
goto out_nfserr;
+ }
if (created)
*created = 1;
@@ -1420,6 +1439,7 @@ nfsd_create_v3(struct svc_rqst *rqstp, struct svc_fh *fhp,
if (err2)
err = err2;
+ mnt_drop_write(fhp->fh_export->ex_path.mnt);
/*
* Update the filehandle to get the new inode info.
*/
@@ -1522,6 +1542,10 @@ nfsd_symlink(struct svc_rqst *rqstp, struct svc_fh *fhp,
if (iap && (iap->ia_valid & ATTR_MODE))
mode = iap->ia_mode & S_IALLUGO;
+ host_err = mnt_want_write(fhp->fh_export->ex_path.mnt);
+ if (host_err)
+ goto out_nfserr;
+
if (unlikely(path[plen] != 0)) {
char *path_alloced = kmalloc(plen+1, GFP_KERNEL);
if (path_alloced == NULL)
@@ -1542,6 +1566,8 @@ nfsd_symlink(struct svc_rqst *rqstp, struct svc_fh *fhp,
err = nfserrno(host_err);
fh_unlock(fhp);
+ mnt_drop_write(fhp->fh_export->ex_path.mnt);
+
cerr = fh_compose(resfhp, fhp->fh_export, dnew, fhp);
dput(dnew);
if (err==0) err = cerr;
@@ -1592,6 +1618,11 @@ nfsd_link(struct svc_rqst *rqstp, struct svc_fh *ffhp,
dold = tfhp->fh_dentry;
dest = dold->d_inode;
+ host_err = mnt_want_write(tfhp->fh_export->ex_path.mnt);
+ if (host_err) {
+ err = nfserrno(host_err);
+ goto out_dput;
+ }
host_err = vfs_link(dold, dirp, dnew);
if (!host_err) {
if (EX_ISSYNC(ffhp->fh_export)) {
@@ -1605,7 +1636,8 @@ nfsd_link(struct svc_rqst *rqstp, struct svc_fh *ffhp,
else
err = nfserrno(host_err);
}
-
+ mnt_drop_write(tfhp->fh_export->ex_path.mnt);
+out_dput:
dput(dnew);
out_unlock:
fh_unlock(ffhp);
@@ -1678,13 +1710,20 @@ nfsd_rename(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *fname, int flen,
if (ndentry == trap)
goto out_dput_new;
-#ifdef MSNFS
- if ((ffhp->fh_export->ex_flags & NFSEXP_MSNFS) &&
+ if (svc_msnfs(ffhp) &&
((atomic_read(&odentry->d_count) > 1)
|| (atomic_read(&ndentry->d_count) > 1))) {
host_err = -EPERM;
- } else
-#endif
+ goto out_dput_new;
+ }
+
+ host_err = -EXDEV;
+ if (ffhp->fh_export->ex_path.mnt != tfhp->fh_export->ex_path.mnt)
+ goto out_dput_new;
+ host_err = mnt_want_write(ffhp->fh_export->ex_path.mnt);
+ if (host_err)
+ goto out_dput_new;
+
host_err = vfs_rename(fdir, odentry, tdir, ndentry);
if (!host_err && EX_ISSYNC(tfhp->fh_export)) {
host_err = nfsd_sync_dir(tdentry);
@@ -1692,6 +1731,8 @@ nfsd_rename(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *fname, int flen,
host_err = nfsd_sync_dir(fdentry);
}
+ mnt_drop_write(ffhp->fh_export->ex_path.mnt);
+
out_dput_new:
dput(ndentry);
out_dput_old:
@@ -1750,6 +1791,10 @@ nfsd_unlink(struct svc_rqst *rqstp, struct svc_fh *fhp, int type,
if (!type)
type = rdentry->d_inode->i_mode & S_IFMT;
+ host_err = mnt_want_write(fhp->fh_export->ex_path.mnt);
+ if (host_err)
+ goto out_nfserr;
+
if (type != S_IFDIR) { /* It's UNLINK */
#ifdef MSNFS
if ((fhp->fh_export->ex_flags & NFSEXP_MSNFS) &&
@@ -1765,10 +1810,12 @@ nfsd_unlink(struct svc_rqst *rqstp, struct svc_fh *fhp, int type,
dput(rdentry);
if (host_err)
- goto out_nfserr;
+ goto out_drop;
if (EX_ISSYNC(fhp->fh_export))
host_err = nfsd_sync_dir(dentry);
+out_drop:
+ mnt_drop_write(fhp->fh_export->ex_path.mnt);
out_nfserr:
err = nfserrno(host_err);
out:
@@ -1865,7 +1912,7 @@ nfsd_permission(struct svc_rqst *rqstp, struct svc_export *exp,
inode->i_mode,
IS_IMMUTABLE(inode)? " immut" : "",
IS_APPEND(inode)? " append" : "",
- IS_RDONLY(inode)? " ro" : "");
+ __mnt_is_readonly(exp->ex_path.mnt)? " ro" : "");
dprintk(" owner %d/%d user %d/%d\n",
inode->i_uid, inode->i_gid, current->fsuid, current->fsgid);
#endif
@@ -1876,7 +1923,8 @@ nfsd_permission(struct svc_rqst *rqstp, struct svc_export *exp,
*/
if (!(acc & MAY_LOCAL_ACCESS))
if (acc & (MAY_WRITE | MAY_SATTR | MAY_TRUNC)) {
- if (exp_rdonly(rqstp, exp) || IS_RDONLY(inode))
+ if (exp_rdonly(rqstp, exp) ||
+ __mnt_is_readonly(exp->ex_path.mnt))
return nfserr_rofs;
if (/* (acc & MAY_WRITE) && */ IS_IMMUTABLE(inode))
return nfserr_perm;
@@ -2039,6 +2087,9 @@ nfsd_set_posix_acl(struct svc_fh *fhp, int type, struct posix_acl *acl)
} else
size = 0;
+ error = mnt_want_write(fhp->fh_export->ex_path.mnt);
+ if (error)
+ goto getout;
if (size)
error = vfs_setxattr(fhp->fh_dentry, name, value, size, 0);
else {
@@ -2050,6 +2101,7 @@ nfsd_set_posix_acl(struct svc_fh *fhp, int type, struct posix_acl *acl)
error = 0;
}
}
+ mnt_drop_write(fhp->fh_export->ex_path.mnt);
getout:
kfree(value);
diff --git a/fs/ocfs2/Makefile b/fs/ocfs2/Makefile
index 4d4ce48bb42..f6956de56fd 100644
--- a/fs/ocfs2/Makefile
+++ b/fs/ocfs2/Makefile
@@ -2,7 +2,12 @@ EXTRA_CFLAGS += -Ifs/ocfs2
EXTRA_CFLAGS += -DCATCH_BH_JBD_RACES
-obj-$(CONFIG_OCFS2_FS) += ocfs2.o
+obj-$(CONFIG_OCFS2_FS) += \
+ ocfs2.o \
+ ocfs2_stackglue.o
+
+obj-$(CONFIG_OCFS2_FS_O2CB) += ocfs2_stack_o2cb.o
+obj-$(CONFIG_OCFS2_FS_USERSPACE_CLUSTER) += ocfs2_stack_user.o
ocfs2-objs := \
alloc.o \
@@ -31,5 +36,10 @@ ocfs2-objs := \
uptodate.o \
ver.o
+ocfs2_stackglue-objs := stackglue.o
+ocfs2_stack_o2cb-objs := stack_o2cb.o
+ocfs2_stack_user-objs := stack_user.o
+
+# cluster/ is always needed when OCFS2_FS for masklog support
obj-$(CONFIG_OCFS2_FS) += cluster/
-obj-$(CONFIG_OCFS2_FS) += dlm/
+obj-$(CONFIG_OCFS2_FS_O2CB) += dlm/
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index 447206eb5c2..41f84c92094 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -1029,8 +1029,7 @@ static void ocfs2_rotate_leaf(struct ocfs2_extent_list *el,
BUG_ON(!next_free);
/* The tree code before us didn't allow enough room in the leaf. */
- if (el->l_next_free_rec == el->l_count && !has_empty)
- BUG();
+ BUG_ON(el->l_next_free_rec == el->l_count && !has_empty);
/*
* The easiest way to approach this is to just remove the
@@ -1450,6 +1449,8 @@ static void ocfs2_adjust_root_records(struct ocfs2_extent_list *root_el,
* - When our insert into the right path leaf is at the leftmost edge
* and requires an update of the path immediately to it's left. This
* can occur at the end of some types of rotation and appending inserts.
+ * - When we've adjusted the last extent record in the left path leaf and the
+ * 1st extent record in the right path leaf during cross extent block merge.
*/
static void ocfs2_complete_edge_insert(struct inode *inode, handle_t *handle,
struct ocfs2_path *left_path,
@@ -2712,24 +2713,147 @@ static void ocfs2_cleanup_merge(struct ocfs2_extent_list *el,
}
}
+static int ocfs2_get_right_path(struct inode *inode,
+ struct ocfs2_path *left_path,
+ struct ocfs2_path **ret_right_path)
+{
+ int ret;
+ u32 right_cpos;
+ struct ocfs2_path *right_path = NULL;
+ struct ocfs2_extent_list *left_el;
+
+ *ret_right_path = NULL;
+
+ /* This function shouldn't be called for non-trees. */
+ BUG_ON(left_path->p_tree_depth == 0);
+
+ left_el = path_leaf_el(left_path);
+ BUG_ON(left_el->l_next_free_rec != left_el->l_count);
+
+ ret = ocfs2_find_cpos_for_right_leaf(inode->i_sb, left_path,
+ &right_cpos);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ /* This function shouldn't be called for the rightmost leaf. */
+ BUG_ON(right_cpos == 0);
+
+ right_path = ocfs2_new_path(path_root_bh(left_path),
+ path_root_el(left_path));
+ if (!right_path) {
+ ret = -ENOMEM;
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ret = ocfs2_find_path(inode, right_path, right_cpos);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ *ret_right_path = right_path;
+out:
+ if (ret)
+ ocfs2_free_path(right_path);
+ return ret;
+}
+
/*
* Remove split_rec clusters from the record at index and merge them
- * onto the beginning of the record at index + 1.
+ * onto the beginning of the record "next" to it.
+ * For index < l_count - 1, the "next" means the extent rec at index + 1.
+ * For index == l_count - 1, the "next" means the 1st extent rec of the
+ * next extent block.
*/
-static int ocfs2_merge_rec_right(struct inode *inode, struct buffer_head *bh,
- handle_t *handle,
- struct ocfs2_extent_rec *split_rec,
- struct ocfs2_extent_list *el, int index)
+static int ocfs2_merge_rec_right(struct inode *inode,
+ struct ocfs2_path *left_path,
+ handle_t *handle,
+ struct ocfs2_extent_rec *split_rec,
+ int index)
{
- int ret;
+ int ret, next_free, i;
unsigned int split_clusters = le16_to_cpu(split_rec->e_leaf_clusters);
struct ocfs2_extent_rec *left_rec;
struct ocfs2_extent_rec *right_rec;
+ struct ocfs2_extent_list *right_el;
+ struct ocfs2_path *right_path = NULL;
+ int subtree_index = 0;
+ struct ocfs2_extent_list *el = path_leaf_el(left_path);
+ struct buffer_head *bh = path_leaf_bh(left_path);
+ struct buffer_head *root_bh = NULL;
BUG_ON(index >= le16_to_cpu(el->l_next_free_rec));
-
left_rec = &el->l_recs[index];
- right_rec = &el->l_recs[index + 1];
+
+ if (index == le16_to_cpu(el->l_next_free_rec - 1) &&
+ le16_to_cpu(el->l_next_free_rec) == le16_to_cpu(el->l_count)) {
+ /* This is a cross extent block merge. */
+ ret = ocfs2_get_right_path(inode, left_path, &right_path);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ right_el = path_leaf_el(right_path);
+ next_free = le16_to_cpu(right_el->l_next_free_rec);
+ BUG_ON(next_free <= 0);
+ right_rec = &right_el->l_recs[0];
+ if (ocfs2_is_empty_extent(right_rec)) {
+ BUG_ON(le16_to_cpu(next_free) <= 1);
+ right_rec = &right_el->l_recs[1];
+ }
+
+ BUG_ON(le32_to_cpu(left_rec->e_cpos) +
+ le16_to_cpu(left_rec->e_leaf_clusters) !=
+ le32_to_cpu(right_rec->e_cpos));
+
+ subtree_index = ocfs2_find_subtree_root(inode,
+ left_path, right_path);
+
+ ret = ocfs2_extend_rotate_transaction(handle, subtree_index,
+ handle->h_buffer_credits,
+ right_path);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ root_bh = left_path->p_node[subtree_index].bh;
+ BUG_ON(root_bh != right_path->p_node[subtree_index].bh);
+
+ ret = ocfs2_journal_access(handle, inode, root_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ for (i = subtree_index + 1;
+ i < path_num_items(right_path); i++) {
+ ret = ocfs2_journal_access(handle, inode,
+ right_path->p_node[i].bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ret = ocfs2_journal_access(handle, inode,
+ left_path->p_node[i].bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+ }
+
+ } else {
+ BUG_ON(index == le16_to_cpu(el->l_next_free_rec) - 1);
+ right_rec = &el->l_recs[index + 1];
+ }
ret = ocfs2_journal_access(handle, inode, bh,
OCFS2_JOURNAL_ACCESS_WRITE);
@@ -2751,30 +2875,156 @@ static int ocfs2_merge_rec_right(struct inode *inode, struct buffer_head *bh,
if (ret)
mlog_errno(ret);
+ if (right_path) {
+ ret = ocfs2_journal_dirty(handle, path_leaf_bh(right_path));
+ if (ret)
+ mlog_errno(ret);
+
+ ocfs2_complete_edge_insert(inode, handle, left_path,
+ right_path, subtree_index);
+ }
+out:
+ if (right_path)
+ ocfs2_free_path(right_path);
+ return ret;
+}
+
+static int ocfs2_get_left_path(struct inode *inode,
+ struct ocfs2_path *right_path,
+ struct ocfs2_path **ret_left_path)
+{
+ int ret;
+ u32 left_cpos;
+ struct ocfs2_path *left_path = NULL;
+
+ *ret_left_path = NULL;
+
+ /* This function shouldn't be called for non-trees. */
+ BUG_ON(right_path->p_tree_depth == 0);
+
+ ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb,
+ right_path, &left_cpos);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ /* This function shouldn't be called for the leftmost leaf. */
+ BUG_ON(left_cpos == 0);
+
+ left_path = ocfs2_new_path(path_root_bh(right_path),
+ path_root_el(right_path));
+ if (!left_path) {
+ ret = -ENOMEM;
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ret = ocfs2_find_path(inode, left_path, left_cpos);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ *ret_left_path = left_path;
out:
+ if (ret)
+ ocfs2_free_path(left_path);
return ret;
}
/*
* Remove split_rec clusters from the record at index and merge them
- * onto the tail of the record at index - 1.
+ * onto the tail of the record "before" it.
+ * For index > 0, the "before" means the extent rec at index - 1.
+ *
+ * For index == 0, the "before" means the last record of the previous
+ * extent block. And there is also a situation that we may need to
+ * remove the rightmost leaf extent block in the right_path and change
+ * the right path to indicate the new rightmost path.
*/
-static int ocfs2_merge_rec_left(struct inode *inode, struct buffer_head *bh,
+static int ocfs2_merge_rec_left(struct inode *inode,
+ struct ocfs2_path *right_path,
handle_t *handle,
struct ocfs2_extent_rec *split_rec,
- struct ocfs2_extent_list *el, int index)
+ struct ocfs2_cached_dealloc_ctxt *dealloc,
+ int index)
{
- int ret, has_empty_extent = 0;
+ int ret, i, subtree_index = 0, has_empty_extent = 0;
unsigned int split_clusters = le16_to_cpu(split_rec->e_leaf_clusters);
struct ocfs2_extent_rec *left_rec;
struct ocfs2_extent_rec *right_rec;
+ struct ocfs2_extent_list *el = path_leaf_el(right_path);
+ struct buffer_head *bh = path_leaf_bh(right_path);
+ struct buffer_head *root_bh = NULL;
+ struct ocfs2_path *left_path = NULL;
+ struct ocfs2_extent_list *left_el;
- BUG_ON(index <= 0);
+ BUG_ON(index < 0);
- left_rec = &el->l_recs[index - 1];
right_rec = &el->l_recs[index];
- if (ocfs2_is_empty_extent(&el->l_recs[0]))
- has_empty_extent = 1;
+ if (index == 0) {
+ /* This is a cross extent block merge. */
+ ret = ocfs2_get_left_path(inode, right_path, &left_path);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ left_el = path_leaf_el(left_path);
+ BUG_ON(le16_to_cpu(left_el->l_next_free_rec) !=
+ le16_to_cpu(left_el->l_count));
+
+ left_rec = &left_el->l_recs[
+ le16_to_cpu(left_el->l_next_free_rec) - 1];
+ BUG_ON(le32_to_cpu(left_rec->e_cpos) +
+ le16_to_cpu(left_rec->e_leaf_clusters) !=
+ le32_to_cpu(split_rec->e_cpos));
+
+ subtree_index = ocfs2_find_subtree_root(inode,
+ left_path, right_path);
+
+ ret = ocfs2_extend_rotate_transaction(handle, subtree_index,
+ handle->h_buffer_credits,
+ left_path);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ root_bh = left_path->p_node[subtree_index].bh;
+ BUG_ON(root_bh != right_path->p_node[subtree_index].bh);
+
+ ret = ocfs2_journal_access(handle, inode, root_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ for (i = subtree_index + 1;
+ i < path_num_items(right_path); i++) {
+ ret = ocfs2_journal_access(handle, inode,
+ right_path->p_node[i].bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ret = ocfs2_journal_access(handle, inode,
+ left_path->p_node[i].bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+ }
+ } else {
+ left_rec = &el->l_recs[index - 1];
+ if (ocfs2_is_empty_extent(&el->l_recs[0]))
+ has_empty_extent = 1;
+ }
ret = ocfs2_journal_access(handle, inode, bh,
OCFS2_JOURNAL_ACCESS_WRITE);
@@ -2790,9 +3040,8 @@ static int ocfs2_merge_rec_left(struct inode *inode, struct buffer_head *bh,
*left_rec = *split_rec;
has_empty_extent = 0;
- } else {
+ } else
le16_add_cpu(&left_rec->e_leaf_clusters, split_clusters);
- }
le32_add_cpu(&right_rec->e_cpos, split_clusters);
le64_add_cpu(&right_rec->e_blkno,
@@ -2805,13 +3054,44 @@ static int ocfs2_merge_rec_left(struct inode *inode, struct buffer_head *bh,
if (ret)
mlog_errno(ret);
+ if (left_path) {
+ ret = ocfs2_journal_dirty(handle, path_leaf_bh(left_path));
+ if (ret)
+ mlog_errno(ret);
+
+ /*
+ * In the situation that the right_rec is empty and the extent
+ * block is empty also, ocfs2_complete_edge_insert can't handle
+ * it and we need to delete the right extent block.
+ */
+ if (le16_to_cpu(right_rec->e_leaf_clusters) == 0 &&
+ le16_to_cpu(el->l_next_free_rec) == 1) {
+
+ ret = ocfs2_remove_rightmost_path(inode, handle,
+ right_path, dealloc);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ /* Now the rightmost extent block has been deleted.
+ * So we use the new rightmost path.
+ */
+ ocfs2_mv_path(right_path, left_path);
+ left_path = NULL;
+ } else
+ ocfs2_complete_edge_insert(inode, handle, left_path,
+ right_path, subtree_index);
+ }
out:
+ if (left_path)
+ ocfs2_free_path(left_path);
return ret;
}
static int ocfs2_try_to_merge_extent(struct inode *inode,
handle_t *handle,
- struct ocfs2_path *left_path,
+ struct ocfs2_path *path,
int split_index,
struct ocfs2_extent_rec *split_rec,
struct ocfs2_cached_dealloc_ctxt *dealloc,
@@ -2819,7 +3099,7 @@ static int ocfs2_try_to_merge_extent(struct inode *inode,
{
int ret = 0;
- struct ocfs2_extent_list *el = path_leaf_el(left_path);
+ struct ocfs2_extent_list *el = path_leaf_el(path);
struct ocfs2_extent_rec *rec = &el->l_recs[split_index];
BUG_ON(ctxt->c_contig_type == CONTIG_NONE);
@@ -2832,7 +3112,7 @@ static int ocfs2_try_to_merge_extent(struct inode *inode,
* extents - having more than one in a leaf is
* illegal.
*/
- ret = ocfs2_rotate_tree_left(inode, handle, left_path,
+ ret = ocfs2_rotate_tree_left(inode, handle, path,
dealloc);
if (ret) {
mlog_errno(ret);
@@ -2847,7 +3127,6 @@ static int ocfs2_try_to_merge_extent(struct inode *inode,
* Left-right contig implies this.
*/
BUG_ON(!ctxt->c_split_covers_rec);
- BUG_ON(split_index == 0);
/*
* Since the leftright insert always covers the entire
@@ -2858,9 +3137,14 @@ static int ocfs2_try_to_merge_extent(struct inode *inode,
* Since the adding of an empty extent shifts
* everything back to the right, there's no need to
* update split_index here.
+ *
+ * When the split_index is zero, we need to merge it to the
+ * previous extent block. It is more efficient and easier
+ * if we do merge_right first and merge_left later.
*/
- ret = ocfs2_merge_rec_left(inode, path_leaf_bh(left_path),
- handle, split_rec, el, split_index);
+ ret = ocfs2_merge_rec_right(inode, path,
+ handle, split_rec,
+ split_index);
if (ret) {
mlog_errno(ret);
goto out;
@@ -2871,32 +3155,30 @@ static int ocfs2_try_to_merge_extent(struct inode *inode,
*/
BUG_ON(!ocfs2_is_empty_extent(&el->l_recs[0]));
- /*
- * The left merge left us with an empty extent, remove
- * it.
- */
- ret = ocfs2_rotate_tree_left(inode, handle, left_path, dealloc);
+ /* The merge left us with an empty extent, remove it. */
+ ret = ocfs2_rotate_tree_left(inode, handle, path, dealloc);
if (ret) {
mlog_errno(ret);
goto out;
}
- split_index--;
+
rec = &el->l_recs[split_index];
/*
* Note that we don't pass split_rec here on purpose -
- * we've merged it into the left side.
+ * we've merged it into the rec already.
*/
- ret = ocfs2_merge_rec_right(inode, path_leaf_bh(left_path),
- handle, rec, el, split_index);
+ ret = ocfs2_merge_rec_left(inode, path,
+ handle, rec,
+ dealloc,
+ split_index);
+
if (ret) {
mlog_errno(ret);
goto out;
}
- BUG_ON(!ocfs2_is_empty_extent(&el->l_recs[0]));
-
- ret = ocfs2_rotate_tree_left(inode, handle, left_path,
+ ret = ocfs2_rotate_tree_left(inode, handle, path,
dealloc);
/*
* Error from this last rotate is not critical, so
@@ -2915,8 +3197,9 @@ static int ocfs2_try_to_merge_extent(struct inode *inode,
*/
if (ctxt->c_contig_type == CONTIG_RIGHT) {
ret = ocfs2_merge_rec_left(inode,
- path_leaf_bh(left_path),
- handle, split_rec, el,
+ path,
+ handle, split_rec,
+ dealloc,
split_index);
if (ret) {
mlog_errno(ret);
@@ -2924,8 +3207,8 @@ static int ocfs2_try_to_merge_extent(struct inode *inode,
}
} else {
ret = ocfs2_merge_rec_right(inode,
- path_leaf_bh(left_path),
- handle, split_rec, el,
+ path,
+ handle, split_rec,
split_index);
if (ret) {
mlog_errno(ret);
@@ -2938,7 +3221,7 @@ static int ocfs2_try_to_merge_extent(struct inode *inode,
* The merge may have left an empty extent in
* our leaf. Try to rotate it away.
*/
- ret = ocfs2_rotate_tree_left(inode, handle, left_path,
+ ret = ocfs2_rotate_tree_left(inode, handle, path,
dealloc);
if (ret)
mlog_errno(ret);
@@ -3498,20 +3781,57 @@ out:
}
static enum ocfs2_contig_type
-ocfs2_figure_merge_contig_type(struct inode *inode,
+ocfs2_figure_merge_contig_type(struct inode *inode, struct ocfs2_path *path,
struct ocfs2_extent_list *el, int index,
struct ocfs2_extent_rec *split_rec)
{
- struct ocfs2_extent_rec *rec;
+ int status;
enum ocfs2_contig_type ret = CONTIG_NONE;
+ u32 left_cpos, right_cpos;
+ struct ocfs2_extent_rec *rec = NULL;
+ struct ocfs2_extent_list *new_el;
+ struct ocfs2_path *left_path = NULL, *right_path = NULL;
+ struct buffer_head *bh;
+ struct ocfs2_extent_block *eb;
+
+ if (index > 0) {
+ rec = &el->l_recs[index - 1];
+ } else if (path->p_tree_depth > 0) {
+ status = ocfs2_find_cpos_for_left_leaf(inode->i_sb,
+ path, &left_cpos);
+ if (status)
+ goto out;
+
+ if (left_cpos != 0) {
+ left_path = ocfs2_new_path(path_root_bh(path),
+ path_root_el(path));
+ if (!left_path)
+ goto out;
+
+ status = ocfs2_find_path(inode, left_path, left_cpos);
+ if (status)
+ goto out;
+
+ new_el = path_leaf_el(left_path);
+
+ if (le16_to_cpu(new_el->l_next_free_rec) !=
+ le16_to_cpu(new_el->l_count)) {
+ bh = path_leaf_bh(left_path);
+ eb = (struct ocfs2_extent_block *)bh->b_data;
+ OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb,
+ eb);
+ goto out;
+ }
+ rec = &new_el->l_recs[
+ le16_to_cpu(new_el->l_next_free_rec) - 1];
+ }
+ }
/*
* We're careful to check for an empty extent record here -
* the merge code will know what to do if it sees one.
*/
-
- if (index > 0) {
- rec = &el->l_recs[index - 1];
+ if (rec) {
if (index == 1 && ocfs2_is_empty_extent(rec)) {
if (split_rec->e_cpos == el->l_recs[index].e_cpos)
ret = CONTIG_RIGHT;
@@ -3520,10 +3840,45 @@ ocfs2_figure_merge_contig_type(struct inode *inode,
}
}
- if (index < (le16_to_cpu(el->l_next_free_rec) - 1)) {
+ rec = NULL;
+ if (index < (le16_to_cpu(el->l_next_free_rec) - 1))
+ rec = &el->l_recs[index + 1];
+ else if (le16_to_cpu(el->l_next_free_rec) == le16_to_cpu(el->l_count) &&
+ path->p_tree_depth > 0) {
+ status = ocfs2_find_cpos_for_right_leaf(inode->i_sb,
+ path, &right_cpos);
+ if (status)
+ goto out;
+
+ if (right_cpos == 0)
+ goto out;
+
+ right_path = ocfs2_new_path(path_root_bh(path),
+ path_root_el(path));
+ if (!right_path)
+ goto out;
+
+ status = ocfs2_find_path(inode, right_path, right_cpos);
+ if (status)
+ goto out;
+
+ new_el = path_leaf_el(right_path);
+ rec = &new_el->l_recs[0];
+ if (ocfs2_is_empty_extent(rec)) {
+ if (le16_to_cpu(new_el->l_next_free_rec) <= 1) {
+ bh = path_leaf_bh(right_path);
+ eb = (struct ocfs2_extent_block *)bh->b_data;
+ OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb,
+ eb);
+ goto out;
+ }
+ rec = &new_el->l_recs[1];
+ }
+ }
+
+ if (rec) {
enum ocfs2_contig_type contig_type;
- rec = &el->l_recs[index + 1];
contig_type = ocfs2_extent_contig(inode, rec, split_rec);
if (contig_type == CONTIG_LEFT && ret == CONTIG_RIGHT)
@@ -3532,6 +3887,12 @@ ocfs2_figure_merge_contig_type(struct inode *inode,
ret = contig_type;
}
+out:
+ if (left_path)
+ ocfs2_free_path(left_path);
+ if (right_path)
+ ocfs2_free_path(right_path);
+
return ret;
}
@@ -3994,7 +4355,7 @@ static int __ocfs2_mark_extent_written(struct inode *inode,
goto out;
}
- ctxt.c_contig_type = ocfs2_figure_merge_contig_type(inode, el,
+ ctxt.c_contig_type = ocfs2_figure_merge_contig_type(inode, path, el,
split_index,
split_rec);
@@ -4788,6 +5149,8 @@ static void ocfs2_truncate_log_worker(struct work_struct *work)
status = ocfs2_flush_truncate_log(osb);
if (status < 0)
mlog_errno(status);
+ else
+ ocfs2_init_inode_steal_slot(osb);
mlog_exit(status);
}
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 90383ed6100..17964c0505a 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -467,11 +467,11 @@ handle_t *ocfs2_start_walk_page_trans(struct inode *inode,
unsigned to)
{
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
- handle_t *handle = NULL;
+ handle_t *handle;
int ret = 0;
handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
- if (!handle) {
+ if (IS_ERR(handle)) {
ret = -ENOMEM;
mlog_errno(ret);
goto out;
@@ -487,7 +487,7 @@ handle_t *ocfs2_start_walk_page_trans(struct inode *inode,
}
out:
if (ret) {
- if (handle)
+ if (!IS_ERR(handle))
ocfs2_commit_trans(osb, handle);
handle = ERR_PTR(ret);
}
diff --git a/fs/ocfs2/cluster/Makefile b/fs/ocfs2/cluster/Makefile
index cdd162f1365..bc8c5e7d860 100644
--- a/fs/ocfs2/cluster/Makefile
+++ b/fs/ocfs2/cluster/Makefile
@@ -1,4 +1,4 @@
obj-$(CONFIG_OCFS2_FS) += ocfs2_nodemanager.o
ocfs2_nodemanager-objs := heartbeat.o masklog.o sys.o nodemanager.o \
- quorum.o tcp.o ver.o
+ quorum.o tcp.o netdebug.o ver.o
diff --git a/fs/ocfs2/cluster/netdebug.c b/fs/ocfs2/cluster/netdebug.c
new file mode 100644
index 00000000000..7bf3c0ea7bd
--- /dev/null
+++ b/fs/ocfs2/cluster/netdebug.c
@@ -0,0 +1,441 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * netdebug.c
+ *
+ * debug functionality for o2net
+ *
+ * Copyright (C) 2005, 2008 Oracle. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ *
+ */
+
+#ifdef CONFIG_DEBUG_FS
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/idr.h>
+#include <linux/kref.h>
+#include <linux/seq_file.h>
+#include <linux/debugfs.h>
+
+#include <linux/uaccess.h>
+
+#include "tcp.h"
+#include "nodemanager.h"
+#define MLOG_MASK_PREFIX ML_TCP
+#include "masklog.h"
+
+#include "tcp_internal.h"
+
+#define O2NET_DEBUG_DIR "o2net"
+#define SC_DEBUG_NAME "sock_containers"
+#define NST_DEBUG_NAME "send_tracking"
+
+static struct dentry *o2net_dentry;
+static struct dentry *sc_dentry;
+static struct dentry *nst_dentry;
+
+static DEFINE_SPINLOCK(o2net_debug_lock);
+
+static LIST_HEAD(sock_containers);
+static LIST_HEAD(send_tracking);
+
+void o2net_debug_add_nst(struct o2net_send_tracking *nst)
+{
+ spin_lock(&o2net_debug_lock);
+ list_add(&nst->st_net_debug_item, &send_tracking);
+ spin_unlock(&o2net_debug_lock);
+}
+
+void o2net_debug_del_nst(struct o2net_send_tracking *nst)
+{
+ spin_lock(&o2net_debug_lock);
+ if (!list_empty(&nst->st_net_debug_item))
+ list_del_init(&nst->st_net_debug_item);
+ spin_unlock(&o2net_debug_lock);
+}
+
+static struct o2net_send_tracking
+ *next_nst(struct o2net_send_tracking *nst_start)
+{
+ struct o2net_send_tracking *nst, *ret = NULL;
+
+ assert_spin_locked(&o2net_debug_lock);
+
+ list_for_each_entry(nst, &nst_start->st_net_debug_item,
+ st_net_debug_item) {
+ /* discover the head of the list */
+ if (&nst->st_net_debug_item == &send_tracking)
+ break;
+
+ /* use st_task to detect real nsts in the list */
+ if (nst->st_task != NULL) {
+ ret = nst;
+ break;
+ }
+ }
+
+ return ret;
+}
+
+static void *nst_seq_start(struct seq_file *seq, loff_t *pos)
+{
+ struct o2net_send_tracking *nst, *dummy_nst = seq->private;
+
+ spin_lock(&o2net_debug_lock);
+ nst = next_nst(dummy_nst);
+ spin_unlock(&o2net_debug_lock);
+
+ return nst;
+}
+
+static void *nst_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+ struct o2net_send_tracking *nst, *dummy_nst = seq->private;
+
+ spin_lock(&o2net_debug_lock);
+ nst = next_nst(dummy_nst);
+ list_del_init(&dummy_nst->st_net_debug_item);
+ if (nst)
+ list_add(&dummy_nst->st_net_debug_item,
+ &nst->st_net_debug_item);
+ spin_unlock(&o2net_debug_lock);
+
+ return nst; /* unused, just needs to be null when done */
+}
+
+static int nst_seq_show(struct seq_file *seq, void *v)
+{
+ struct o2net_send_tracking *nst, *dummy_nst = seq->private;
+
+ spin_lock(&o2net_debug_lock);
+ nst = next_nst(dummy_nst);
+
+ if (nst != NULL) {
+ /* get_task_comm isn't exported. oh well. */
+ seq_printf(seq, "%p:\n"
+ " pid: %lu\n"
+ " tgid: %lu\n"
+ " process name: %s\n"
+ " node: %u\n"
+ " sc: %p\n"
+ " message id: %d\n"
+ " message type: %u\n"
+ " message key: 0x%08x\n"
+ " sock acquiry: %lu.%lu\n"
+ " send start: %lu.%lu\n"
+ " wait start: %lu.%lu\n",
+ nst, (unsigned long)nst->st_task->pid,
+ (unsigned long)nst->st_task->tgid,
+ nst->st_task->comm, nst->st_node,
+ nst->st_sc, nst->st_id, nst->st_msg_type,
+ nst->st_msg_key,
+ nst->st_sock_time.tv_sec, nst->st_sock_time.tv_usec,
+ nst->st_send_time.tv_sec, nst->st_send_time.tv_usec,
+ nst->st_status_time.tv_sec,
+ nst->st_status_time.tv_usec);
+ }
+
+ spin_unlock(&o2net_debug_lock);
+
+ return 0;
+}
+
+static void nst_seq_stop(struct seq_file *seq, void *v)
+{
+}
+
+static struct seq_operations nst_seq_ops = {
+ .start = nst_seq_start,
+ .next = nst_seq_next,
+ .stop = nst_seq_stop,
+ .show = nst_seq_show,
+};
+
+static int nst_fop_open(struct inode *inode, struct file *file)
+{
+ struct o2net_send_tracking *dummy_nst;
+ struct seq_file *seq;
+ int ret;
+
+ dummy_nst = kmalloc(sizeof(struct o2net_send_tracking), GFP_KERNEL);
+ if (dummy_nst == NULL) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ dummy_nst->st_task = NULL;
+
+ ret = seq_open(file, &nst_seq_ops);
+ if (ret)
+ goto out;
+
+ seq = file->private_data;
+ seq->private = dummy_nst;
+ o2net_debug_add_nst(dummy_nst);
+
+ dummy_nst = NULL;
+
+out:
+ kfree(dummy_nst);
+ return ret;
+}
+
+static int nst_fop_release(struct inode *inode, struct file *file)
+{
+ struct seq_file *seq = file->private_data;
+ struct o2net_send_tracking *dummy_nst = seq->private;
+
+ o2net_debug_del_nst(dummy_nst);
+ return seq_release_private(inode, file);
+}
+
+static struct file_operations nst_seq_fops = {
+ .open = nst_fop_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = nst_fop_release,
+};
+
+void o2net_debug_add_sc(struct o2net_sock_container *sc)
+{
+ spin_lock(&o2net_debug_lock);
+ list_add(&sc->sc_net_debug_item, &sock_containers);
+ spin_unlock(&o2net_debug_lock);
+}
+
+void o2net_debug_del_sc(struct o2net_sock_container *sc)
+{
+ spin_lock(&o2net_debug_lock);
+ list_del_init(&sc->sc_net_debug_item);
+ spin_unlock(&o2net_debug_lock);
+}
+
+static struct o2net_sock_container
+ *next_sc(struct o2net_sock_container *sc_start)
+{
+ struct o2net_sock_container *sc, *ret = NULL;
+
+ assert_spin_locked(&o2net_debug_lock);
+
+ list_for_each_entry(sc, &sc_start->sc_net_debug_item,
+ sc_net_debug_item) {
+ /* discover the head of the list miscast as a sc */
+ if (&sc->sc_net_debug_item == &sock_containers)
+ break;
+
+ /* use sc_page to detect real scs in the list */
+ if (sc->sc_page != NULL) {
+ ret = sc;
+ break;
+ }
+ }
+
+ return ret;
+}
+
+static void *sc_seq_start(struct seq_file *seq, loff_t *pos)
+{
+ struct o2net_sock_container *sc, *dummy_sc = seq->private;
+
+ spin_lock(&o2net_debug_lock);
+ sc = next_sc(dummy_sc);
+ spin_unlock(&o2net_debug_lock);
+
+ return sc;
+}
+
+static void *sc_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+ struct o2net_sock_container *sc, *dummy_sc = seq->private;
+
+ spin_lock(&o2net_debug_lock);
+ sc = next_sc(dummy_sc);
+ list_del_init(&dummy_sc->sc_net_debug_item);
+ if (sc)
+ list_add(&dummy_sc->sc_net_debug_item, &sc->sc_net_debug_item);
+ spin_unlock(&o2net_debug_lock);
+
+ return sc; /* unused, just needs to be null when done */
+}
+
+#define TV_SEC_USEC(TV) TV.tv_sec, TV.tv_usec
+
+static int sc_seq_show(struct seq_file *seq, void *v)
+{
+ struct o2net_sock_container *sc, *dummy_sc = seq->private;
+
+ spin_lock(&o2net_debug_lock);
+ sc = next_sc(dummy_sc);
+
+ if (sc != NULL) {
+ struct inet_sock *inet = NULL;
+
+ __be32 saddr = 0, daddr = 0;
+ __be16 sport = 0, dport = 0;
+
+ if (sc->sc_sock) {
+ inet = inet_sk(sc->sc_sock->sk);
+ /* the stack's structs aren't sparse endian clean */
+ saddr = (__force __be32)inet->saddr;
+ daddr = (__force __be32)inet->daddr;
+ sport = (__force __be16)inet->sport;
+ dport = (__force __be16)inet->dport;
+ }
+
+ /* XXX sigh, inet-> doesn't have sparse annotation so any
+ * use of it here generates a warning with -Wbitwise */
+ seq_printf(seq, "%p:\n"
+ " krefs: %d\n"
+ " sock: %u.%u.%u.%u:%u -> "
+ "%u.%u.%u.%u:%u\n"
+ " remote node: %s\n"
+ " page off: %zu\n"
+ " handshake ok: %u\n"
+ " timer: %lu.%lu\n"
+ " data ready: %lu.%lu\n"
+ " advance start: %lu.%lu\n"
+ " advance stop: %lu.%lu\n"
+ " func start: %lu.%lu\n"
+ " func stop: %lu.%lu\n"
+ " func key: %u\n"
+ " func type: %u\n",
+ sc,
+ atomic_read(&sc->sc_kref.refcount),
+ NIPQUAD(saddr), inet ? ntohs(sport) : 0,
+ NIPQUAD(daddr), inet ? ntohs(dport) : 0,
+ sc->sc_node->nd_name,
+ sc->sc_page_off,
+ sc->sc_handshake_ok,
+ TV_SEC_USEC(sc->sc_tv_timer),
+ TV_SEC_USEC(sc->sc_tv_data_ready),
+ TV_SEC_USEC(sc->sc_tv_advance_start),
+ TV_SEC_USEC(sc->sc_tv_advance_stop),
+ TV_SEC_USEC(sc->sc_tv_func_start),
+ TV_SEC_USEC(sc->sc_tv_func_stop),
+ sc->sc_msg_key,
+ sc->sc_msg_type);
+ }
+
+
+ spin_unlock(&o2net_debug_lock);
+
+ return 0;
+}
+
+static void sc_seq_stop(struct seq_file *seq, void *v)
+{
+}
+
+static struct seq_operations sc_seq_ops = {
+ .start = sc_seq_start,
+ .next = sc_seq_next,
+ .stop = sc_seq_stop,
+ .show = sc_seq_show,
+};
+
+static int sc_fop_open(struct inode *inode, struct file *file)
+{
+ struct o2net_sock_container *dummy_sc;
+ struct seq_file *seq;
+ int ret;
+
+ dummy_sc = kmalloc(sizeof(struct o2net_sock_container), GFP_KERNEL);
+ if (dummy_sc == NULL) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ dummy_sc->sc_page = NULL;
+
+ ret = seq_open(file, &sc_seq_ops);
+ if (ret)
+ goto out;
+
+ seq = file->private_data;
+ seq->private = dummy_sc;
+ o2net_debug_add_sc(dummy_sc);
+
+ dummy_sc = NULL;
+
+out:
+ kfree(dummy_sc);
+ return ret;
+}
+
+static int sc_fop_release(struct inode *inode, struct file *file)
+{
+ struct seq_file *seq = file->private_data;
+ struct o2net_sock_container *dummy_sc = seq->private;
+
+ o2net_debug_del_sc(dummy_sc);
+ return seq_release_private(inode, file);
+}
+
+static struct file_operations sc_seq_fops = {
+ .open = sc_fop_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = sc_fop_release,
+};
+
+int o2net_debugfs_init(void)
+{
+ o2net_dentry = debugfs_create_dir(O2NET_DEBUG_DIR, NULL);
+ if (!o2net_dentry) {
+ mlog_errno(-ENOMEM);
+ goto bail;
+ }
+
+ nst_dentry = debugfs_create_file(NST_DEBUG_NAME, S_IFREG|S_IRUSR,
+ o2net_dentry, NULL,
+ &nst_seq_fops);
+ if (!nst_dentry) {
+ mlog_errno(-ENOMEM);
+ goto bail;
+ }
+
+ sc_dentry = debugfs_create_file(SC_DEBUG_NAME, S_IFREG|S_IRUSR,
+ o2net_dentry, NULL,
+ &sc_seq_fops);
+ if (!sc_dentry) {
+ mlog_errno(-ENOMEM);
+ goto bail;
+ }
+
+ return 0;
+bail:
+ if (sc_dentry)
+ debugfs_remove(sc_dentry);
+ if (nst_dentry)
+ debugfs_remove(nst_dentry);
+ if (o2net_dentry)
+ debugfs_remove(o2net_dentry);
+ return -ENOMEM;
+}
+
+void o2net_debugfs_exit(void)
+{
+ if (sc_dentry)
+ debugfs_remove(sc_dentry);
+ if (nst_dentry)
+ debugfs_remove(nst_dentry);
+ if (o2net_dentry)
+ debugfs_remove(o2net_dentry);
+}
+
+#endif /* CONFIG_DEBUG_FS */
diff --git a/fs/ocfs2/cluster/nodemanager.c b/fs/ocfs2/cluster/nodemanager.c
index 709fba25bf7..cf9401e8cd0 100644
--- a/fs/ocfs2/cluster/nodemanager.c
+++ b/fs/ocfs2/cluster/nodemanager.c
@@ -959,7 +959,10 @@ static int __init init_o2nm(void)
cluster_print_version();
o2hb_init();
- o2net_init();
+
+ ret = o2net_init();
+ if (ret)
+ goto out;
ocfs2_table_header = register_sysctl_table(ocfs2_root_table);
if (!ocfs2_table_header) {
diff --git a/fs/ocfs2/cluster/sys.c b/fs/ocfs2/cluster/sys.c
index 0c095ce7723..98429fd6849 100644
--- a/fs/ocfs2/cluster/sys.c
+++ b/fs/ocfs2/cluster/sys.c
@@ -57,6 +57,7 @@ static struct kset *o2cb_kset;
void o2cb_sys_shutdown(void)
{
mlog_sys_shutdown();
+ sysfs_remove_link(NULL, "o2cb");
kset_unregister(o2cb_kset);
}
@@ -68,6 +69,14 @@ int o2cb_sys_init(void)
if (!o2cb_kset)
return -ENOMEM;
+ /*
+ * Create this symlink for backwards compatibility with old
+ * versions of ocfs2-tools which look for things in /sys/o2cb.
+ */
+ ret = sysfs_create_link(NULL, &o2cb_kset->kobj, "o2cb");
+ if (ret)
+ goto error;
+
ret = sysfs_create_group(&o2cb_kset->kobj, &o2cb_attr_group);
if (ret)
goto error;
diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c
index b8057c51b20..1e44ad14881 100644
--- a/fs/ocfs2/cluster/tcp.c
+++ b/fs/ocfs2/cluster/tcp.c
@@ -142,23 +142,65 @@ static void o2net_idle_timer(unsigned long data);
static void o2net_sc_postpone_idle(struct o2net_sock_container *sc);
static void o2net_sc_reset_idle_timer(struct o2net_sock_container *sc);
-/*
- * FIXME: These should use to_o2nm_cluster_from_node(), but we end up
- * losing our parent link to the cluster during shutdown. This can be
- * solved by adding a pre-removal callback to configfs, or passing
- * around the cluster with the node. -jeffm
- */
-static inline int o2net_reconnect_delay(struct o2nm_node *node)
+/*
+ * Fill in the identifying fields of a send-tracking record (message type,
+ * key, sending task, target node) and initialise its list linkage.
+ * Compiles to an empty function without CONFIG_DEBUG_FS.
+ */
+static void o2net_init_nst(struct o2net_send_tracking *nst, u32 msgtype,
+			   u32 msgkey, struct task_struct *task, u8 node)
+{
+#if defined(CONFIG_DEBUG_FS)
+	nst->st_node = node;
+	nst->st_msg_key = msgkey;
+	nst->st_msg_type = msgtype;
+	nst->st_task = task;
+	INIT_LIST_HEAD(&nst->st_net_debug_item);
+#endif
+}
+
+/* Stamp nst->st_sock_time with the current time (CONFIG_DEBUG_FS only). */
+static void o2net_set_nst_sock_time(struct o2net_send_tracking *nst)
+{
+#if defined(CONFIG_DEBUG_FS)
+	do_gettimeofday(&nst->st_sock_time);
+#endif
+}
+
+/* Stamp nst->st_send_time with the current time (CONFIG_DEBUG_FS only). */
+static void o2net_set_nst_send_time(struct o2net_send_tracking *nst)
+{
+#if defined(CONFIG_DEBUG_FS)
+	do_gettimeofday(&nst->st_send_time);
+#endif
+}
+
+/* Stamp nst->st_status_time with the current time (CONFIG_DEBUG_FS only). */
+static void o2net_set_nst_status_time(struct o2net_send_tracking *nst)
+{
+#if defined(CONFIG_DEBUG_FS)
+	do_gettimeofday(&nst->st_status_time);
+#endif
+}
+
+/* Remember which sock container the tracked message uses (CONFIG_DEBUG_FS only). */
+static void o2net_set_nst_sock_container(struct o2net_send_tracking *nst,
+					 struct o2net_sock_container *sc)
+{
+#if defined(CONFIG_DEBUG_FS)
+	nst->st_sc = sc;
+#endif
+}
+
+/* Record the id assigned to the tracked message (CONFIG_DEBUG_FS only). */
+static void o2net_set_nst_msg_id(struct o2net_send_tracking *nst, u32 msg_id)
+{
+#if defined(CONFIG_DEBUG_FS)
+	nst->st_id = msg_id;
+#endif
+}
+
+static inline int o2net_reconnect_delay(void)
{
return o2nm_single_cluster->cl_reconnect_delay_ms;
}
-static inline int o2net_keepalive_delay(struct o2nm_node *node)
+static inline int o2net_keepalive_delay(void)
{
return o2nm_single_cluster->cl_keepalive_delay_ms;
}
-static inline int o2net_idle_timeout(struct o2nm_node *node)
+static inline int o2net_idle_timeout(void)
{
return o2nm_single_cluster->cl_idle_timeout_ms;
}
@@ -296,6 +338,7 @@ static void sc_kref_release(struct kref *kref)
o2nm_node_put(sc->sc_node);
sc->sc_node = NULL;
+ o2net_debug_del_sc(sc);
kfree(sc);
}
@@ -336,6 +379,7 @@ static struct o2net_sock_container *sc_alloc(struct o2nm_node *node)
ret = sc;
sc->sc_page = page;
+ o2net_debug_add_sc(sc);
sc = NULL;
page = NULL;
@@ -399,8 +443,6 @@ static void o2net_set_nn_state(struct o2net_node *nn,
mlog_bug_on_msg(err && valid, "err %d valid %u\n", err, valid);
mlog_bug_on_msg(valid && !sc, "valid %u sc %p\n", valid, sc);
- /* we won't reconnect after our valid conn goes away for
- * this hb iteration.. here so it shows up in the logs */
if (was_valid && !valid && err == 0)
err = -ENOTCONN;
@@ -430,11 +472,6 @@ static void o2net_set_nn_state(struct o2net_node *nn,
if (!was_valid && valid) {
o2quo_conn_up(o2net_num_from_nn(nn));
- /* this is a bit of a hack. we only try reconnecting
- * when heartbeating starts until we get a connection.
- * if that connection then dies we don't try reconnecting.
- * the only way to start connecting again is to down
- * heartbeat and bring it back up. */
cancel_delayed_work(&nn->nn_connect_expired);
printk(KERN_INFO "o2net: %s " SC_NODEF_FMT "\n",
o2nm_this_node() > sc->sc_node->nd_num ?
@@ -451,12 +488,24 @@ static void o2net_set_nn_state(struct o2net_node *nn,
/* delay if we're withing a RECONNECT_DELAY of the
* last attempt */
delay = (nn->nn_last_connect_attempt +
- msecs_to_jiffies(o2net_reconnect_delay(NULL)))
+ msecs_to_jiffies(o2net_reconnect_delay()))
- jiffies;
- if (delay > msecs_to_jiffies(o2net_reconnect_delay(NULL)))
+ if (delay > msecs_to_jiffies(o2net_reconnect_delay()))
delay = 0;
mlog(ML_CONN, "queueing conn attempt in %lu jiffies\n", delay);
queue_delayed_work(o2net_wq, &nn->nn_connect_work, delay);
+
+ /*
+ * Delay the expired work after idle timeout.
+ *
+ * We might have lots of failed connection attempts that run
+ * through here but we only cancel the connect_expired work when
+ * a connection attempt succeeds. So only the first enqueue of
+ * the connect_expired work will do anything. The rest will see
+ * that it's already queued and do nothing.
+ */
+ delay += msecs_to_jiffies(o2net_idle_timeout());
+ queue_delayed_work(o2net_wq, &nn->nn_connect_expired, delay);
}
/* keep track of the nn's sc ref for the caller */
@@ -914,6 +963,9 @@ int o2net_send_message_vec(u32 msg_type, u32 key, struct kvec *caller_vec,
struct o2net_status_wait nsw = {
.ns_node_item = LIST_HEAD_INIT(nsw.ns_node_item),
};
+ struct o2net_send_tracking nst;
+
+ o2net_init_nst(&nst, msg_type, key, current, target_node);
if (o2net_wq == NULL) {
mlog(0, "attempt to tx without o2netd running\n");
@@ -939,6 +991,10 @@ int o2net_send_message_vec(u32 msg_type, u32 key, struct kvec *caller_vec,
goto out;
}
+ o2net_debug_add_nst(&nst);
+
+ o2net_set_nst_sock_time(&nst);
+
ret = wait_event_interruptible(nn->nn_sc_wq,
o2net_tx_can_proceed(nn, &sc, &error));
if (!ret && error)
@@ -946,6 +1002,8 @@ int o2net_send_message_vec(u32 msg_type, u32 key, struct kvec *caller_vec,
if (ret)
goto out;
+ o2net_set_nst_sock_container(&nst, sc);
+
veclen = caller_veclen + 1;
vec = kmalloc(sizeof(struct kvec) * veclen, GFP_ATOMIC);
if (vec == NULL) {
@@ -972,6 +1030,9 @@ int o2net_send_message_vec(u32 msg_type, u32 key, struct kvec *caller_vec,
goto out;
msg->msg_num = cpu_to_be32(nsw.ns_id);
+ o2net_set_nst_msg_id(&nst, nsw.ns_id);
+
+ o2net_set_nst_send_time(&nst);
/* finally, convert the message header to network byte-order
* and send */
@@ -986,6 +1047,7 @@ int o2net_send_message_vec(u32 msg_type, u32 key, struct kvec *caller_vec,
}
/* wait on other node's handler */
+ o2net_set_nst_status_time(&nst);
wait_event(nsw.ns_wq, o2net_nsw_completed(nn, &nsw));
/* Note that we avoid overwriting the callers status return
@@ -998,6 +1060,7 @@ int o2net_send_message_vec(u32 msg_type, u32 key, struct kvec *caller_vec,
mlog(0, "woken, returning system status %d, user status %d\n",
ret, nsw.ns_status);
out:
+ o2net_debug_del_nst(&nst); /* must be before dropping sc and node */
if (sc)
sc_put(sc);
if (vec)
@@ -1154,23 +1217,23 @@ static int o2net_check_handshake(struct o2net_sock_container *sc)
* but isn't. This can ultimately cause corruption.
*/
if (be32_to_cpu(hand->o2net_idle_timeout_ms) !=
- o2net_idle_timeout(sc->sc_node)) {
+ o2net_idle_timeout()) {
mlog(ML_NOTICE, SC_NODEF_FMT " uses a network idle timeout of "
"%u ms, but we use %u ms locally. disconnecting\n",
SC_NODEF_ARGS(sc),
be32_to_cpu(hand->o2net_idle_timeout_ms),
- o2net_idle_timeout(sc->sc_node));
+ o2net_idle_timeout());
o2net_ensure_shutdown(nn, sc, -ENOTCONN);
return -1;
}
if (be32_to_cpu(hand->o2net_keepalive_delay_ms) !=
- o2net_keepalive_delay(sc->sc_node)) {
+ o2net_keepalive_delay()) {
mlog(ML_NOTICE, SC_NODEF_FMT " uses a keepalive delay of "
"%u ms, but we use %u ms locally. disconnecting\n",
SC_NODEF_ARGS(sc),
be32_to_cpu(hand->o2net_keepalive_delay_ms),
- o2net_keepalive_delay(sc->sc_node));
+ o2net_keepalive_delay());
o2net_ensure_shutdown(nn, sc, -ENOTCONN);
return -1;
}
@@ -1193,6 +1256,7 @@ static int o2net_check_handshake(struct o2net_sock_container *sc)
* shut down already */
if (nn->nn_sc == sc) {
o2net_sc_reset_idle_timer(sc);
+ atomic_set(&nn->nn_timeout, 0);
o2net_set_nn_state(nn, sc, 1, 0);
}
spin_unlock(&nn->nn_lock);
@@ -1347,12 +1411,11 @@ static void o2net_initialize_handshake(void)
{
o2net_hand->o2hb_heartbeat_timeout_ms = cpu_to_be32(
O2HB_MAX_WRITE_TIMEOUT_MS);
- o2net_hand->o2net_idle_timeout_ms = cpu_to_be32(
- o2net_idle_timeout(NULL));
+ o2net_hand->o2net_idle_timeout_ms = cpu_to_be32(o2net_idle_timeout());
o2net_hand->o2net_keepalive_delay_ms = cpu_to_be32(
- o2net_keepalive_delay(NULL));
+ o2net_keepalive_delay());
o2net_hand->o2net_reconnect_delay_ms = cpu_to_be32(
- o2net_reconnect_delay(NULL));
+ o2net_reconnect_delay());
}
/* ------------------------------------------------------------ */
@@ -1391,14 +1454,15 @@ static void o2net_sc_send_keep_req(struct work_struct *work)
static void o2net_idle_timer(unsigned long data)
{
struct o2net_sock_container *sc = (struct o2net_sock_container *)data;
+ struct o2net_node *nn = o2net_nn_from_num(sc->sc_node->nd_num);
struct timeval now;
do_gettimeofday(&now);
printk(KERN_INFO "o2net: connection to " SC_NODEF_FMT " has been idle for %u.%u "
"seconds, shutting it down.\n", SC_NODEF_ARGS(sc),
- o2net_idle_timeout(sc->sc_node) / 1000,
- o2net_idle_timeout(sc->sc_node) % 1000);
+ o2net_idle_timeout() / 1000,
+ o2net_idle_timeout() % 1000);
mlog(ML_NOTICE, "here are some times that might help debug the "
"situation: (tmr %ld.%ld now %ld.%ld dr %ld.%ld adv "
"%ld.%ld:%ld.%ld func (%08x:%u) %ld.%ld:%ld.%ld)\n",
@@ -1413,6 +1477,12 @@ static void o2net_idle_timer(unsigned long data)
sc->sc_tv_func_start.tv_sec, (long) sc->sc_tv_func_start.tv_usec,
sc->sc_tv_func_stop.tv_sec, (long) sc->sc_tv_func_stop.tv_usec);
+ /*
+ * Initialize the nn_timeout so that the next connection attempt
+ * will continue in o2net_start_connect.
+ */
+ atomic_set(&nn->nn_timeout, 1);
+
o2net_sc_queue_work(sc, &sc->sc_shutdown_work);
}
@@ -1420,10 +1490,10 @@ static void o2net_sc_reset_idle_timer(struct o2net_sock_container *sc)
{
o2net_sc_cancel_delayed_work(sc, &sc->sc_keepalive_work);
o2net_sc_queue_delayed_work(sc, &sc->sc_keepalive_work,
- msecs_to_jiffies(o2net_keepalive_delay(sc->sc_node)));
+ msecs_to_jiffies(o2net_keepalive_delay()));
do_gettimeofday(&sc->sc_tv_timer);
mod_timer(&sc->sc_idle_timeout,
- jiffies + msecs_to_jiffies(o2net_idle_timeout(sc->sc_node)));
+ jiffies + msecs_to_jiffies(o2net_idle_timeout()));
}
static void o2net_sc_postpone_idle(struct o2net_sock_container *sc)
@@ -1447,6 +1517,7 @@ static void o2net_start_connect(struct work_struct *work)
struct socket *sock = NULL;
struct sockaddr_in myaddr = {0, }, remoteaddr = {0, };
int ret = 0, stop;
+ unsigned int timeout;
/* if we're greater we initiate tx, otherwise we accept */
if (o2nm_this_node() <= o2net_num_from_nn(nn))
@@ -1466,8 +1537,17 @@ static void o2net_start_connect(struct work_struct *work)
}
spin_lock(&nn->nn_lock);
- /* see if we already have one pending or have given up */
- stop = (nn->nn_sc || nn->nn_persistent_error);
+ /*
+ * see if we already have one pending or have given up.
+ * For nn_timeout, it is set when we close the connection
+ * because of the idle time out. So it means that we have
+ * at least connected to that node successfully once,
+ * now try to connect to it again.
+ */
+ timeout = atomic_read(&nn->nn_timeout);
+ stop = (nn->nn_sc ||
+ (nn->nn_persistent_error &&
+ (nn->nn_persistent_error != -ENOTCONN || timeout == 0)));
spin_unlock(&nn->nn_lock);
if (stop)
goto out;
@@ -1555,8 +1635,8 @@ static void o2net_connect_expired(struct work_struct *work)
mlog(ML_ERROR, "no connection established with node %u after "
"%u.%u seconds, giving up and returning errors.\n",
o2net_num_from_nn(nn),
- o2net_idle_timeout(NULL) / 1000,
- o2net_idle_timeout(NULL) % 1000);
+ o2net_idle_timeout() / 1000,
+ o2net_idle_timeout() % 1000);
o2net_set_nn_state(nn, NULL, 0, -ENOTCONN);
}
@@ -1579,6 +1659,7 @@ void o2net_disconnect_node(struct o2nm_node *node)
/* don't reconnect until it's heartbeating again */
spin_lock(&nn->nn_lock);
+ atomic_set(&nn->nn_timeout, 0);
o2net_set_nn_state(nn, NULL, 0, -ENOTCONN);
spin_unlock(&nn->nn_lock);
@@ -1610,20 +1691,15 @@ static void o2net_hb_node_up_cb(struct o2nm_node *node, int node_num,
/* ensure an immediate connect attempt */
nn->nn_last_connect_attempt = jiffies -
- (msecs_to_jiffies(o2net_reconnect_delay(node)) + 1);
+ (msecs_to_jiffies(o2net_reconnect_delay()) + 1);
if (node_num != o2nm_this_node()) {
- /* heartbeat doesn't work unless a local node number is
- * configured and doing so brings up the o2net_wq, so we can
- * use it.. */
- queue_delayed_work(o2net_wq, &nn->nn_connect_expired,
- msecs_to_jiffies(o2net_idle_timeout(node)));
-
/* believe it or not, accept and node hearbeating testing
* can succeed for this node before we got here.. so
* only use set_nn_state to clear the persistent error
* if that hasn't already happened */
spin_lock(&nn->nn_lock);
+ atomic_set(&nn->nn_timeout, 0);
if (nn->nn_persistent_error)
o2net_set_nn_state(nn, NULL, 0, 0);
spin_unlock(&nn->nn_lock);
@@ -1747,6 +1823,7 @@ static int o2net_accept_one(struct socket *sock)
new_sock = NULL;
spin_lock(&nn->nn_lock);
+ atomic_set(&nn->nn_timeout, 0);
o2net_set_nn_state(nn, sc, 0, 0);
spin_unlock(&nn->nn_lock);
@@ -1922,6 +1999,9 @@ int o2net_init(void)
o2quo_init();
+ if (o2net_debugfs_init())
+ return -ENOMEM;
+
o2net_hand = kzalloc(sizeof(struct o2net_handshake), GFP_KERNEL);
o2net_keep_req = kzalloc(sizeof(struct o2net_msg), GFP_KERNEL);
o2net_keep_resp = kzalloc(sizeof(struct o2net_msg), GFP_KERNEL);
@@ -1941,6 +2021,7 @@ int o2net_init(void)
for (i = 0; i < ARRAY_SIZE(o2net_nodes); i++) {
struct o2net_node *nn = o2net_nn_from_num(i);
+ atomic_set(&nn->nn_timeout, 0);
spin_lock_init(&nn->nn_lock);
INIT_DELAYED_WORK(&nn->nn_connect_work, o2net_start_connect);
INIT_DELAYED_WORK(&nn->nn_connect_expired,
@@ -1962,4 +2043,5 @@ void o2net_exit(void)
kfree(o2net_hand);
kfree(o2net_keep_req);
kfree(o2net_keep_resp);
+ o2net_debugfs_exit();
}
diff --git a/fs/ocfs2/cluster/tcp.h b/fs/ocfs2/cluster/tcp.h
index f36f66aab3d..a705d5d1903 100644
--- a/fs/ocfs2/cluster/tcp.h
+++ b/fs/ocfs2/cluster/tcp.h
@@ -117,4 +117,36 @@ int o2net_num_connected_peers(void);
int o2net_init(void);
void o2net_exit(void);
+struct o2net_send_tracking;
+struct o2net_sock_container;
+
+#ifdef CONFIG_DEBUG_FS
+int o2net_debugfs_init(void);
+void o2net_debugfs_exit(void);
+void o2net_debug_add_nst(struct o2net_send_tracking *nst);
+void o2net_debug_del_nst(struct o2net_send_tracking *nst);
+void o2net_debug_add_sc(struct o2net_sock_container *sc);
+void o2net_debug_del_sc(struct o2net_sock_container *sc);
+#else
+/*
+ * No-op replacements used when CONFIG_DEBUG_FS is disabled.  They are
+ * "static inline" because this is a header: a plain "static" definition
+ * would be emitted in every file that includes it and trigger
+ * defined-but-not-used warnings wherever a stub is not called.
+ */
+static inline int o2net_debugfs_init(void)
+{
+	return 0;
+}
+static inline void o2net_debugfs_exit(void)
+{
+}
+static inline void o2net_debug_add_nst(struct o2net_send_tracking *nst)
+{
+}
+static inline void o2net_debug_del_nst(struct o2net_send_tracking *nst)
+{
+}
+static inline void o2net_debug_add_sc(struct o2net_sock_container *sc)
+{
+}
+static inline void o2net_debug_del_sc(struct o2net_sock_container *sc)
+{
+}
+#endif /* CONFIG_DEBUG_FS */
+
#endif /* O2CLUSTER_TCP_H */
diff --git a/fs/ocfs2/cluster/tcp_internal.h b/fs/ocfs2/cluster/tcp_internal.h
index d25b9af2850..8d58cfe410b 100644
--- a/fs/ocfs2/cluster/tcp_internal.h
+++ b/fs/ocfs2/cluster/tcp_internal.h
@@ -95,6 +95,8 @@ struct o2net_node {
unsigned nn_sc_valid:1;
/* if this is set tx just returns it */
int nn_persistent_error;
+ /* It is only set to 1 after the idle time out. */
+ atomic_t nn_timeout;
/* threads waiting for an sc to arrive wait on the wq for generation
* to increase. it is increased when a connecting socket succeeds
@@ -164,7 +166,9 @@ struct o2net_sock_container {
/* original handlers for the sockets */
void (*sc_state_change)(struct sock *sk);
void (*sc_data_ready)(struct sock *sk, int bytes);
-
+#ifdef CONFIG_DEBUG_FS
+ struct list_head sc_net_debug_item;
+#endif
struct timeval sc_tv_timer;
struct timeval sc_tv_data_ready;
struct timeval sc_tv_advance_start;
@@ -206,4 +210,24 @@ struct o2net_status_wait {
struct list_head ns_node_item;
};
+#ifdef CONFIG_DEBUG_FS
+/* just for state dumps */
+struct o2net_send_tracking { /* per-message send bookkeeping, filled in by the o2net_*_nst helpers in tcp.c */
+ struct list_head st_net_debug_item; /* list linkage; initialised empty by o2net_init_nst() */
+ struct task_struct *st_task; /* task handed to o2net_init_nst() (the send path passes current) */
+ struct o2net_sock_container *st_sc; /* set by o2net_set_nst_sock_container() */
+ u32 st_id; /* message id, set by o2net_set_nst_msg_id() */
+ u32 st_msg_type; /* msgtype argument of o2net_init_nst() */
+ u32 st_msg_key; /* msgkey argument of o2net_init_nst() */
+ u8 st_node; /* node argument of o2net_init_nst() */
+ struct timeval st_sock_time; /* stamped by o2net_set_nst_sock_time() */
+ struct timeval st_send_time; /* stamped by o2net_set_nst_send_time() */
+ struct timeval st_status_time; /* stamped by o2net_set_nst_status_time() */
+};
+#else
+struct o2net_send_tracking { /* stub used when CONFIG_DEBUG_FS is off; the nst helpers do nothing then */
+ u32 dummy; /* placeholder member so the struct is not empty */
+};
+#endif /* CONFIG_DEBUG_FS */
+
#endif /* O2CLUSTER_TCP_INTERNAL_H */
diff --git a/fs/ocfs2/dlm/Makefile b/fs/ocfs2/dlm/Makefile
index ce3f7c29d27..19036137570 100644
--- a/fs/ocfs2/dlm/Makefile
+++ b/fs/ocfs2/dlm/Makefile
@@ -1,6 +1,6 @@
EXTRA_CFLAGS += -Ifs/ocfs2
-obj-$(CONFIG_OCFS2_FS) += ocfs2_dlm.o ocfs2_dlmfs.o
+obj-$(CONFIG_OCFS2_FS_O2CB) += ocfs2_dlm.o ocfs2_dlmfs.o
ocfs2_dlm-objs := dlmdomain.o dlmdebug.o dlmthread.o dlmrecovery.o \
dlmmaster.o dlmast.o dlmconvert.o dlmlock.o dlmunlock.o dlmver.o
diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h
index dc8ea666efd..d5a86fb81a4 100644
--- a/fs/ocfs2/dlm/dlmcommon.h
+++ b/fs/ocfs2/dlm/dlmcommon.h
@@ -49,6 +49,41 @@
/* Intended to make it easier for us to switch out hash functions */
#define dlm_lockid_hash(_n, _l) full_name_hash(_n, _l)
+enum dlm_mle_type { /* kind of a dlm_master_list_entry; decides which member of its union is used */
+ DLM_MLE_BLOCK, /* printed as "BLK" by dump_mle(); entry carries a lock name */
+ DLM_MLE_MASTER, /* printed as "MAS" by dump_mle(); entry points at a lock resource */
+ DLM_MLE_MIGRATION /* printed as "MIG" by dump_mle(); entry carries a lock name */
+};
+
+struct dlm_lock_name { /* lock name stored inline with an explicit length */
+ u8 len; /* number of bytes of name[] in use (passed as the name length by dump_mle()) */
+ u8 name[DLM_LOCKID_NAME_MAX]; /* name bytes; may hold binary data, see stringify_lockname() */
+};
+
+struct dlm_master_list_entry { /* one entry of a domain's master list (iterated through dlm->master_list) */
+ struct list_head list; /* linkage used when walking dlm->master_list */
+ struct list_head hb_events; /* dump_mle() reports a non-empty list as evt=1 */
+ struct dlm_ctxt *dlm;
+ spinlock_t spinlock;
+ wait_queue_head_t wq;
+ atomic_t woken;
+ struct kref mle_refs; /* reference count; shown as ref= by dump_mle() */
+ int inuse; /* shown as use= (normalised to 0/1) by dump_mle() */
+ unsigned long maybe_map[BITS_TO_LONGS(O2NM_MAX_NODES)]; /* node bitmap dumped as "Maybe=" */
+ unsigned long vote_map[BITS_TO_LONGS(O2NM_MAX_NODES)]; /* node bitmap dumped as "Vote=" */
+ unsigned long response_map[BITS_TO_LONGS(O2NM_MAX_NODES)]; /* node bitmap dumped as "Response=" */
+ unsigned long node_map[BITS_TO_LONGS(O2NM_MAX_NODES)]; /* node bitmap dumped as "Node=" */
+ u8 master; /* shown as mas= by dump_mle() */
+ u8 new_master; /* shown as new= by dump_mle() */
+ enum dlm_mle_type type; /* selects the valid member of u below */
+ struct o2hb_callback_func mle_hb_up;
+ struct o2hb_callback_func mle_hb_down;
+ union {
+ struct dlm_lock_resource *res; /* used when type == DLM_MLE_MASTER */
+ struct dlm_lock_name name; /* used for every other type */
+ } u;
+};
+
enum dlm_ast_type {
DLM_AST = 0,
DLM_BAST,
@@ -101,6 +136,7 @@ struct dlm_ctxt
struct list_head purge_list;
struct list_head pending_asts;
struct list_head pending_basts;
+ struct list_head tracking_list;
unsigned int purge_count;
spinlock_t spinlock;
spinlock_t ast_lock;
@@ -122,6 +158,9 @@ struct dlm_ctxt
atomic_t remote_resources;
atomic_t unknown_resources;
+ struct dlm_debug_ctxt *dlm_debug_ctxt;
+ struct dentry *dlm_debugfs_subroot;
+
/* NOTE: Next three are protected by dlm_domain_lock */
struct kref dlm_refs;
enum dlm_ctxt_state dlm_state;
@@ -270,6 +309,9 @@ struct dlm_lock_resource
struct list_head dirty;
struct list_head recovering; // dlm_recovery_ctxt.resources list
+ /* Added during init and removed during release */
+ struct list_head tracking; /* dlm->tracking_list */
+
/* unused lock resources have their last_used stamped and are
* put on a list for the dlm thread to run. */
unsigned long last_used;
@@ -963,9 +1005,16 @@ static inline void __dlm_wait_on_lockres(struct dlm_lock_resource *res)
DLM_LOCK_RES_MIGRATING));
}
+/* create/destroy slab caches */
+int dlm_init_master_caches(void);
+void dlm_destroy_master_caches(void);
+
+int dlm_init_lock_cache(void);
+void dlm_destroy_lock_cache(void);
int dlm_init_mle_cache(void);
void dlm_destroy_mle_cache(void);
+
void dlm_hb_event_notify_attached(struct dlm_ctxt *dlm, int idx, int node_up);
int dlm_drop_lockres_ref(struct dlm_ctxt *dlm,
struct dlm_lock_resource *res);
diff --git a/fs/ocfs2/dlm/dlmdebug.c b/fs/ocfs2/dlm/dlmdebug.c
index 64239b37e5d..5f6d858770a 100644
--- a/fs/ocfs2/dlm/dlmdebug.c
+++ b/fs/ocfs2/dlm/dlmdebug.c
@@ -5,7 +5,7 @@
*
* debug functionality for the dlm
*
- * Copyright (C) 2004 Oracle. All rights reserved.
+ * Copyright (C) 2004, 2008 Oracle. All rights reserved.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
@@ -30,6 +30,7 @@
#include <linux/utsname.h>
#include <linux/sysctl.h>
#include <linux/spinlock.h>
+#include <linux/debugfs.h>
#include "cluster/heartbeat.h"
#include "cluster/nodemanager.h"
@@ -37,17 +38,16 @@
#include "dlmapi.h"
#include "dlmcommon.h"
-
#include "dlmdomain.h"
+#include "dlmdebug.h"
#define MLOG_MASK_PREFIX ML_DLM
#include "cluster/masklog.h"
+int stringify_lockname(const char *lockname, int locklen, char *buf, int len);
+
void dlm_print_one_lock_resource(struct dlm_lock_resource *res)
{
- mlog(ML_NOTICE, "lockres: %.*s, owner=%u, state=%u\n",
- res->lockname.len, res->lockname.name,
- res->owner, res->state);
spin_lock(&res->spinlock);
__dlm_print_one_lock_resource(res);
spin_unlock(&res->spinlock);
@@ -58,7 +58,7 @@ static void dlm_print_lockres_refmap(struct dlm_lock_resource *res)
int bit;
assert_spin_locked(&res->spinlock);
- mlog(ML_NOTICE, " refmap nodes: [ ");
+ printk(" refmap nodes: [ ");
bit = 0;
while (1) {
bit = find_next_bit(res->refmap, O2NM_MAX_NODES, bit);
@@ -70,63 +70,66 @@ static void dlm_print_lockres_refmap(struct dlm_lock_resource *res)
printk("], inflight=%u\n", res->inflight_locks);
}
+/*
+ * Print a one-line summary of @lock (mode, convert mode, node, cookie,
+ * refcount and the ast/bast/pending flags) via printk.  The lock's own
+ * spinlock is held while its fields are read.
+ */
+static void __dlm_print_lock(struct dlm_lock *lock)
+{
+	u64 cookie;
+
+	spin_lock(&lock->spinlock);
+
+	/* decode the big-endian cookie once; node and sequence both come from it */
+	cookie = be64_to_cpu(lock->ml.cookie);
+
+	printk(" type=%d, conv=%d, node=%u, cookie=%u:%llu, "
+	       "ref=%u, ast=(empty=%c,pend=%c), bast=(empty=%c,pend=%c), "
+	       "pending=(conv=%c,lock=%c,cancel=%c,unlock=%c)\n",
+	       lock->ml.type, lock->ml.convert_type, lock->ml.node,
+	       dlm_get_lock_cookie_node(cookie),
+	       dlm_get_lock_cookie_seq(cookie),
+	       atomic_read(&lock->lock_refs.refcount),
+	       list_empty(&lock->ast_list) ? 'y' : 'n',
+	       lock->ast_pending ? 'y' : 'n',
+	       list_empty(&lock->bast_list) ? 'y' : 'n',
+	       lock->bast_pending ? 'y' : 'n',
+	       lock->convert_pending ? 'y' : 'n',
+	       lock->lock_pending ? 'y' : 'n',
+	       lock->cancel_pending ? 'y' : 'n',
+	       lock->unlock_pending ? 'y' : 'n');
+
+	spin_unlock(&lock->spinlock);
+}
+
void __dlm_print_one_lock_resource(struct dlm_lock_resource *res)
{
struct list_head *iter2;
struct dlm_lock *lock;
+ char buf[DLM_LOCKID_NAME_MAX];
assert_spin_locked(&res->spinlock);
- mlog(ML_NOTICE, "lockres: %.*s, owner=%u, state=%u\n",
- res->lockname.len, res->lockname.name,
- res->owner, res->state);
- mlog(ML_NOTICE, " last used: %lu, on purge list: %s\n",
- res->last_used, list_empty(&res->purge) ? "no" : "yes");
+ stringify_lockname(res->lockname.name, res->lockname.len,
+ buf, sizeof(buf) - 1);
+ printk("lockres: %s, owner=%u, state=%u\n",
+ buf, res->owner, res->state);
+ printk(" last used: %lu, refcnt: %u, on purge list: %s\n",
+ res->last_used, atomic_read(&res->refs.refcount),
+ list_empty(&res->purge) ? "no" : "yes");
+ printk(" on dirty list: %s, on reco list: %s, "
+ "migrating pending: %s\n",
+ list_empty(&res->dirty) ? "no" : "yes",
+ list_empty(&res->recovering) ? "no" : "yes",
+ res->migration_pending ? "yes" : "no");
+ printk(" inflight locks: %d, asts reserved: %d\n",
+ res->inflight_locks, atomic_read(&res->asts_reserved));
dlm_print_lockres_refmap(res);
- mlog(ML_NOTICE, " granted queue: \n");
+ printk(" granted queue:\n");
list_for_each(iter2, &res->granted) {
lock = list_entry(iter2, struct dlm_lock, list);
- spin_lock(&lock->spinlock);
- mlog(ML_NOTICE, " type=%d, conv=%d, node=%u, "
- "cookie=%u:%llu, ast=(empty=%c,pend=%c), bast=(empty=%c,pend=%c)\n",
- lock->ml.type, lock->ml.convert_type, lock->ml.node,
- dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)),
- dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)),
- list_empty(&lock->ast_list) ? 'y' : 'n',
- lock->ast_pending ? 'y' : 'n',
- list_empty(&lock->bast_list) ? 'y' : 'n',
- lock->bast_pending ? 'y' : 'n');
- spin_unlock(&lock->spinlock);
+ __dlm_print_lock(lock);
}
- mlog(ML_NOTICE, " converting queue: \n");
+ printk(" converting queue:\n");
list_for_each(iter2, &res->converting) {
lock = list_entry(iter2, struct dlm_lock, list);
- spin_lock(&lock->spinlock);
- mlog(ML_NOTICE, " type=%d, conv=%d, node=%u, "
- "cookie=%u:%llu, ast=(empty=%c,pend=%c), bast=(empty=%c,pend=%c)\n",
- lock->ml.type, lock->ml.convert_type, lock->ml.node,
- dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)),
- dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)),
- list_empty(&lock->ast_list) ? 'y' : 'n',
- lock->ast_pending ? 'y' : 'n',
- list_empty(&lock->bast_list) ? 'y' : 'n',
- lock->bast_pending ? 'y' : 'n');
- spin_unlock(&lock->spinlock);
+ __dlm_print_lock(lock);
}
- mlog(ML_NOTICE, " blocked queue: \n");
+ printk(" blocked queue:\n");
list_for_each(iter2, &res->blocked) {
lock = list_entry(iter2, struct dlm_lock, list);
- spin_lock(&lock->spinlock);
- mlog(ML_NOTICE, " type=%d, conv=%d, node=%u, "
- "cookie=%u:%llu, ast=(empty=%c,pend=%c), bast=(empty=%c,pend=%c)\n",
- lock->ml.type, lock->ml.convert_type, lock->ml.node,
- dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)),
- dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)),
- list_empty(&lock->ast_list) ? 'y' : 'n',
- lock->ast_pending ? 'y' : 'n',
- list_empty(&lock->bast_list) ? 'y' : 'n',
- lock->bast_pending ? 'y' : 'n');
- spin_unlock(&lock->spinlock);
+ __dlm_print_lock(lock);
}
}
@@ -136,31 +139,6 @@ void dlm_print_one_lock(struct dlm_lock *lockid)
}
EXPORT_SYMBOL_GPL(dlm_print_one_lock);
-#if 0
-void dlm_dump_lock_resources(struct dlm_ctxt *dlm)
-{
- struct dlm_lock_resource *res;
- struct hlist_node *iter;
- struct hlist_head *bucket;
- int i;
-
- mlog(ML_NOTICE, "struct dlm_ctxt: %s, node=%u, key=%u\n",
- dlm->name, dlm->node_num, dlm->key);
- if (!dlm || !dlm->name) {
- mlog(ML_ERROR, "dlm=%p\n", dlm);
- return;
- }
-
- spin_lock(&dlm->spinlock);
- for (i=0; i<DLM_HASH_BUCKETS; i++) {
- bucket = dlm_lockres_hash(dlm, i);
- hlist_for_each_entry(res, iter, bucket, hash_node)
- dlm_print_one_lock_resource(res);
- }
- spin_unlock(&dlm->spinlock);
-}
-#endif /* 0 */
-
static const char *dlm_errnames[] = {
[DLM_NORMAL] = "DLM_NORMAL",
[DLM_GRANTED] = "DLM_GRANTED",
@@ -266,3 +244,792 @@ const char *dlm_errname(enum dlm_status err)
return dlm_errnames[err];
}
EXPORT_SYMBOL_GPL(dlm_errname);
+
+/* NOTE: This function converts a lockname into a string. It uses knowledge
+ * of the format of the lockname that should be outside the purview of the dlm.
+ * We are adding it only to make dlm debugging slightly easier.
+ *
+ * For more on lockname formats, please refer to dlmglue.c and ocfs2_lockid.h.
+ */
+/*
+ * Render a lock name into @buf as printable text.
+ *
+ * @lockname: name bytes (need not be NUL terminated)
+ * @locklen:  number of valid bytes in @lockname
+ * @buf:      destination buffer
+ * @len:      space available in @buf
+ *
+ * Names starting with 'N' carry a big-endian 64-bit block number at offset
+ * OCFS2_DENTRY_LOCK_INO_START; for those the leading text is printed
+ * followed by the low 32 bits of that number in hex.  All other names are
+ * printed verbatim.  The binary form is only decoded when @locklen covers
+ * the whole 8-byte field, so no more than @locklen bytes are ever read.
+ *
+ * Returns the number of characters actually stored (excluding the NUL).
+ * scnprintf() is used rather than snprintf() so the result can never
+ * exceed @len and callers may safely accumulate it into an offset.
+ */
+int stringify_lockname(const char *lockname, int locklen, char *buf, int len)
+{
+	int out = 0;
+	__be64 inode_blkno_be;
+
+#define OCFS2_DENTRY_LOCK_INO_START 18
+	if (locklen >= (int)(OCFS2_DENTRY_LOCK_INO_START + sizeof(__be64)) &&
+	    *lockname == 'N') {
+		/* copy out byte-wise: the field is not naturally aligned */
+		memcpy(&inode_blkno_be,
+		       &lockname[OCFS2_DENTRY_LOCK_INO_START],
+		       sizeof(__be64));
+		out += scnprintf(buf + out, len - out, "%.*s%08x",
+				 OCFS2_DENTRY_LOCK_INO_START - 1, lockname,
+				 (unsigned int)be64_to_cpu(inode_blkno_be));
+	} else
+		out += scnprintf(buf + out, len - out, "%.*s",
+				 locklen, lockname);
+	return out;
+}
+
+/*
+ * Append the index of every set bit in @nodemap (bits 0..@maxnodes-1) to
+ * @buf, each followed by a space.
+ *
+ * Returns the number of characters actually stored.  scnprintf() keeps
+ * that count within @len, so "buf + out" / "len - out" stay valid even
+ * when the map has more bits set than fit in the buffer.
+ */
+static int stringify_nodemap(unsigned long *nodemap, int maxnodes,
+			     char *buf, int len)
+{
+	int out = 0;
+	int i = -1;
+
+	while ((i = find_next_bit(nodemap, maxnodes, i + 1)) < maxnodes)
+		out += scnprintf(buf + out, len - out, "%d ", i);
+
+	return out;
+}
+
+/*
+ * Format one master list entry into @buf: a header line (lock name, type
+ * tag, master, new master, heartbeat-event flag, in-use flag, refcount)
+ * followed by one line for each of the four node bitmaps and a blank line.
+ *
+ * Returns the number of characters actually stored.  Every append goes
+ * through scnprintf(), so the running offset never passes @len even when
+ * the node maps are too large for the remaining space; the output is
+ * simply truncated.
+ */
+static int dump_mle(struct dlm_master_list_entry *mle, char *buf, int len)
+{
+	int out = 0;
+	unsigned int namelen;
+	const char *name;
+	char *mle_type;
+
+	/* only MASTER entries reference a lock resource; others embed the name */
+	if (mle->type != DLM_MLE_MASTER) {
+		namelen = mle->u.name.len;
+		name = mle->u.name.name;
+	} else {
+		namelen = mle->u.res->lockname.len;
+		name = mle->u.res->lockname.name;
+	}
+
+	if (mle->type == DLM_MLE_BLOCK)
+		mle_type = "BLK";
+	else if (mle->type == DLM_MLE_MASTER)
+		mle_type = "MAS";
+	else
+		mle_type = "MIG";
+
+	out += stringify_lockname(name, namelen, buf + out, len - out);
+	out += scnprintf(buf + out, len - out,
+			 "\t%3s\tmas=%3u\tnew=%3u\tevt=%1d\tuse=%1d\tref=%3d\n",
+			 mle_type, mle->master, mle->new_master,
+			 !list_empty(&mle->hb_events),
+			 !!mle->inuse,
+			 atomic_read(&mle->mle_refs.refcount));
+
+	out += scnprintf(buf + out, len - out, "Maybe=");
+	out += stringify_nodemap(mle->maybe_map, O2NM_MAX_NODES,
+				 buf + out, len - out);
+	out += scnprintf(buf + out, len - out, "\n");
+
+	out += scnprintf(buf + out, len - out, "Vote=");
+	out += stringify_nodemap(mle->vote_map, O2NM_MAX_NODES,
+				 buf + out, len - out);
+	out += scnprintf(buf + out, len - out, "\n");
+
+	out += scnprintf(buf + out, len - out, "Response=");
+	out += stringify_nodemap(mle->response_map, O2NM_MAX_NODES,
+				 buf + out, len - out);
+	out += scnprintf(buf + out, len - out, "\n");
+
+	out += scnprintf(buf + out, len - out, "Node=");
+	out += stringify_nodemap(mle->node_map, O2NM_MAX_NODES,
+				 buf + out, len - out);
+	out += scnprintf(buf + out, len - out, "\n");
+
+	out += scnprintf(buf + out, len - out, "\n");
+
+	return out;
+}
+
+/*
+ * Print one master list entry to the kernel log.
+ *
+ * The entry is formatted by dump_mle() into a temporary zeroed page and
+ * then emitted with printk; previously the page was freed without ever
+ * being printed.  If the page cannot be allocated nothing is printed.
+ */
+void dlm_print_one_mle(struct dlm_master_list_entry *mle)
+{
+	char *buf;
+
+	buf = (char *) get_zeroed_page(GFP_NOFS);
+	if (buf) {
+		/* PAGE_SIZE - 1 leaves the final zeroed byte as a terminator */
+		dump_mle(mle, buf, PAGE_SIZE - 1);
+		printk("%s", buf);
+		free_page((unsigned long)buf);
+	}
+}
+
+#ifdef CONFIG_DEBUG_FS
+
+static struct dentry *dlm_debugfs_root = NULL;
+
+#define DLM_DEBUGFS_DIR "o2dlm"
+#define DLM_DEBUGFS_DLM_STATE "dlm_state"
+#define DLM_DEBUGFS_LOCKING_STATE "locking_state"
+#define DLM_DEBUGFS_MLE_STATE "mle_state"
+#define DLM_DEBUGFS_PURGE_LIST "purge_list"
+
+/* begin - utils funcs */
+/* kref release callback: frees the dlm_debug_ctxt that embeds @kref. */
+static void dlm_debug_free(struct kref *kref)
+{
+	kfree(container_of(kref, struct dlm_debug_ctxt, debug_refcnt));
+}
+
+/*
+ * Drop one reference on @dc; the context is freed by dlm_debug_free()
+ * when the last reference goes away.  A NULL @dc is ignored.
+ */
+void dlm_debug_put(struct dlm_debug_ctxt *dc)
+{
+	if (!dc)
+		return;
+
+	kref_put(&dc->debug_refcnt, dlm_debug_free);
+}
+
+static void dlm_debug_get(struct dlm_debug_ctxt *dc) /* take one extra reference on @dc; unlike dlm_debug_put(), @dc must not be NULL */
+{
+ kref_get(&dc->debug_refcnt);
+}
+
+/*
+ * Allocate a debug_buffer together with a PAGE_SIZE text buffer.
+ * Returns the new buffer, or NULL if either allocation fails (nothing is
+ * leaked in that case).
+ */
+static struct debug_buffer *debug_buffer_allocate(void)
+{
+	struct debug_buffer *dbuf;
+
+	dbuf = kzalloc(sizeof(*dbuf), GFP_KERNEL);
+	if (dbuf == NULL)
+		return NULL;
+
+	dbuf->len = PAGE_SIZE;
+	dbuf->buf = kmalloc(dbuf->len, GFP_KERNEL);
+	if (dbuf->buf == NULL) {
+		kfree(dbuf);
+		return NULL;
+	}
+
+	return dbuf;
+}
+
+/*
+ * read() handler shared by the buffer-backed debugfs files: copies from
+ * the text prepared at open time (the first db->len bytes of db->buf).
+ */
+static ssize_t debug_buffer_read(struct file *file, char __user *buf,
+				 size_t nbytes, loff_t *ppos)
+{
+	struct debug_buffer *snapshot = file->private_data;
+
+	return simple_read_from_buffer(buf, nbytes, ppos,
+				       snapshot->buf, snapshot->len);
+}
+
+/*
+ * llseek() handler shared by the buffer-backed debugfs files.
+ *
+ * Only whence 0 (absolute) and 1 (relative to the current position) are
+ * accepted; any other whence, or a resulting position outside
+ * [0, db->len], yields -EINVAL.  On success the new position is stored in
+ * file->f_pos and returned.
+ */
+static loff_t debug_buffer_llseek(struct file *file, loff_t off, int whence)
+{
+	struct debug_buffer *db = file->private_data;
+	loff_t target;
+
+	if (whence == 0)
+		target = off;
+	else if (whence == 1)
+		target = file->f_pos + off;
+	else
+		return -EINVAL;
+
+	if (target < 0 || target > db->len)
+		return -EINVAL;
+
+	file->f_pos = target;
+	return target;
+}
+
+/*
+ * release() handler shared by the buffer-backed debugfs files: frees the
+ * text buffer and the debug_buffer hung off file->private_data, if any.
+ */
+static int debug_buffer_release(struct inode *inode, struct file *file)
+{
+	struct debug_buffer *db = file->private_data;
+
+	if (db) {
+		kfree(db->buf);
+		kfree(db);
+	}
+
+	return 0;
+}
+/* end - util funcs */
+
+/* begin - purge list funcs */
+/*
+ * Format the domain's purge list into @db: a header line, one line per
+ * lock resource (name and seconds since it was last used) and a trailing
+ * total.
+ *
+ * Every entry is counted, but entries are only printed while at least
+ * 100 bytes remain in the buffer.  Returns the number of characters
+ * stored; scnprintf() guarantees this never exceeds db->len, which matters
+ * because the caller uses the result as the readable length of the file.
+ */
+static int debug_purgelist_print(struct dlm_ctxt *dlm, struct debug_buffer *db)
+{
+	struct dlm_lock_resource *res;
+	int out = 0;
+	unsigned long total = 0;
+
+	out += scnprintf(db->buf + out, db->len - out,
+			 "Dumping Purgelist for Domain: %s\n", dlm->name);
+
+	spin_lock(&dlm->spinlock);
+	list_for_each_entry(res, &dlm->purge_list, purge) {
+		++total;
+		/* keep counting, but stop printing once space runs low */
+		if (db->len - out < 100)
+			continue;
+		spin_lock(&res->spinlock);
+		out += stringify_lockname(res->lockname.name,
+					  res->lockname.len,
+					  db->buf + out, db->len - out);
+		/* both operands are unsigned long, hence %lu */
+		out += scnprintf(db->buf + out, db->len - out, "\t%lu\n",
+				 (jiffies - res->last_used)/HZ);
+		spin_unlock(&res->spinlock);
+	}
+	spin_unlock(&dlm->spinlock);
+
+	out += scnprintf(db->buf + out, db->len - out,
+			 "Total on list: %lu\n", total);
+
+	return out;
+}
+
+/*
+ * open() handler of the purge_list debugfs file: formats the purge list
+ * of the domain found in inode->i_private into a fresh debug_buffer and
+ * attaches it to the file.  Returns 0, or -ENOMEM if no buffer could be
+ * allocated.
+ */
+static int debug_purgelist_open(struct inode *inode, struct file *file)
+{
+	struct dlm_ctxt *dlm = inode->i_private;
+	struct debug_buffer *snapshot = debug_buffer_allocate();
+
+	if (!snapshot)
+		return -ENOMEM;
+
+	/* shrink len from the buffer capacity to the amount of text produced */
+	snapshot->len = debug_purgelist_print(dlm, snapshot);
+	file->private_data = snapshot;
+
+	return 0;
+}
+
+/*
+ * File operations for the purge list dump: open renders a snapshot,
+ * read/llseek serve it from the buffer, release frees it.
+ */
+static struct file_operations debug_purgelist_fops = {
+ .open = debug_purgelist_open,
+ .release = debug_buffer_release,
+ .read = debug_buffer_read,
+ .llseek = debug_buffer_llseek,
+};
+/* end - purge list funcs */
+
+/* begin - debug mle funcs */
+/*
+ * Render the domain's master list entries (MLEs) into db->buf.
+ *
+ * Entries are dumped via dump_mle() while at least 200 bytes remain;
+ * entries beyond that are only counted.  A trailing line reports the
+ * total number of entries.
+ *
+ * Returns the number of bytes placed in db->buf, never more than db->len.
+ *
+ * scnprintf() replaces snprintf() so that the running offset reflects
+ * what was actually stored rather than what would have been stored,
+ * keeping "db->len - out" from going negative.
+ */
+static int debug_mle_print(struct dlm_ctxt *dlm, struct debug_buffer *db)
+{
+	struct dlm_master_list_entry *mle;
+	int out = 0;
+	unsigned long total = 0;
+
+	out += scnprintf(db->buf + out, db->len - out,
+			 "Dumping MLEs for Domain: %s\n", dlm->name);
+
+	spin_lock(&dlm->master_lock);
+	list_for_each_entry(mle, &dlm->master_list, list) {
+		++total;
+		/* keep counting, but stop printing once space runs low */
+		if (db->len - out < 200)
+			continue;
+		out += dump_mle(mle, db->buf + out, db->len - out);
+	}
+	spin_unlock(&dlm->master_lock);
+
+	/* 'total' is unsigned long, so print it with %lu */
+	out += scnprintf(db->buf + out, db->len - out,
+			 "Total on list: %lu\n", total);
+
+	/*
+	 * dump_mle() is defined outside this function; in case it reports
+	 * more than it stored, never return more than the buffer holds.
+	 */
+	if (out > db->len)
+		out = db->len;
+
+	return out;
+}
+
+/*
+ * open() handler for the MLE file: allocates a snapshot buffer, renders
+ * the master list of the domain stored in inode->i_private into it and
+ * attaches it to the file.  Returns 0, or -ENOMEM if the buffer cannot
+ * be allocated.
+ */
+static int debug_mle_open(struct inode *inode, struct file *file)
+{
+	struct dlm_ctxt *dlm = inode->i_private;
+	struct debug_buffer *snapshot = debug_buffer_allocate();
+
+	if (snapshot == NULL)
+		return -ENOMEM;
+
+	snapshot->len = debug_mle_print(dlm, snapshot);
+	file->private_data = snapshot;
+
+	return 0;
+}
+
+/*
+ * File operations for the MLE dump: open renders a snapshot,
+ * read/llseek serve it from the buffer, release frees it.
+ */
+static struct file_operations debug_mle_fops = {
+ .open = debug_mle_open,
+ .release = debug_buffer_release,
+ .read = debug_buffer_read,
+ .llseek = debug_buffer_llseek,
+};
+
+/* end - debug mle funcs */
+
+/* begin - debug lockres funcs */
+/*
+ * Format one lock as a single "LOCK:" record into buf (at most len
+ * bytes, NUL included).  list_type tells which queue of the resource the
+ * lock was found on (0 granted, 1 converting, 2 blocked, as passed by
+ * dump_lockres()).  lock->spinlock is taken around the formatting.
+ *
+ * Returns the number of characters actually stored (excluding the NUL).
+ * scnprintf() is used so that a truncated record cannot push the
+ * caller's running offset beyond the end of the buffer.
+ */
+static int dump_lock(struct dlm_lock *lock, int list_type, char *buf, int len)
+{
+	int out;
+
+#define DEBUG_LOCK_VERSION 1
+	spin_lock(&lock->spinlock);
+	out = scnprintf(buf, len, "LOCK:%d,%d,%d,%d,%d,%d:%lld,%d,%d,%d,%d,%d,"
+			"%d,%d,%d,%d\n",
+			DEBUG_LOCK_VERSION,
+			list_type, lock->ml.type, lock->ml.convert_type,
+			lock->ml.node,
+			dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)),
+			dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)),
+			!list_empty(&lock->ast_list),
+			!list_empty(&lock->bast_list),
+			lock->ast_pending, lock->bast_pending,
+			lock->convert_pending, lock->lock_pending,
+			lock->cancel_pending, lock->unlock_pending,
+			atomic_read(&lock->lock_refs.refcount));
+	spin_unlock(&lock->spinlock);
+
+	return out;
+}
+
+/*
+ * Format a lock resource into buf (at most len bytes): a NAME: line, an
+ * LRES: record, the RMAP: refmap, the LVBX: value block in hex, one
+ * LOCK: record per lock on the granted, converting and blocked queues,
+ * and a terminating blank line.
+ *
+ * The visible caller holds res->spinlock around this call.
+ *
+ * Returns the number of characters stored.  The number of locks on a
+ * resource is not bounded by the buffer size, so every append uses
+ * scnprintf(): once the buffer is full further appends store nothing and
+ * add 0, instead of driving "len - out" negative as accumulating
+ * snprintf() return values would.
+ */
+static int dump_lockres(struct dlm_lock_resource *res, char *buf, int len)
+{
+	struct dlm_lock *lock;
+	int i;
+	int out = 0;
+
+	out += scnprintf(buf + out, len - out, "NAME:");
+	out += stringify_lockname(res->lockname.name, res->lockname.len,
+				  buf + out, len - out);
+	out += scnprintf(buf + out, len - out, "\n");
+
+#define DEBUG_LRES_VERSION 1
+	out += scnprintf(buf + out, len - out,
+			 "LRES:%d,%d,%d,%ld,%d,%d,%d,%d,%d,%d,%d\n",
+			 DEBUG_LRES_VERSION,
+			 res->owner, res->state, res->last_used,
+			 !list_empty(&res->purge),
+			 !list_empty(&res->dirty),
+			 !list_empty(&res->recovering),
+			 res->inflight_locks, res->migration_pending,
+			 atomic_read(&res->asts_reserved),
+			 atomic_read(&res->refs.refcount));
+
+	/* refmap */
+	out += scnprintf(buf + out, len - out, "RMAP:");
+	out += stringify_nodemap(res->refmap, O2NM_MAX_NODES,
+				 buf + out, len - out);
+	out += scnprintf(buf + out, len - out, "\n");
+
+	/* lvb */
+	out += scnprintf(buf + out, len - out, "LVBX:");
+	for (i = 0; i < DLM_LVB_LEN; i++)
+		out += scnprintf(buf + out, len - out,
+				 "%02x", (unsigned char)res->lvb[i]);
+	out += scnprintf(buf + out, len - out, "\n");
+
+	/* granted */
+	list_for_each_entry(lock, &res->granted, list)
+		out += dump_lock(lock, 0, buf + out, len - out);
+
+	/* converting */
+	list_for_each_entry(lock, &res->converting, list)
+		out += dump_lock(lock, 1, buf + out, len - out);
+
+	/* blocked */
+	list_for_each_entry(lock, &res->blocked, list)
+		out += dump_lock(lock, 2, buf + out, len - out);
+
+	out += scnprintf(buf + out, len - out, "\n");
+
+	return out;
+}
+
+/*
+ * seq_file start(): advance the per-open cursor (dl->dl_res) to the next
+ * lock resource on dlm->tracking_list and render it into dl->dl_buf.
+ *
+ * *pos is not consulted; the position is carried entirely by dl->dl_res,
+ * on which a reference is held between calls.  Returns dl when a
+ * resource was rendered, or NULL when the list is empty or its end has
+ * been reached.  Runs under dlm->spinlock.
+ */
+static void *lockres_seq_start(struct seq_file *m, loff_t *pos)
+{
+ struct debug_lockres *dl = m->private;
+ struct dlm_ctxt *dlm = dl->dl_ctxt;
+ struct dlm_lock_resource *res = NULL;
+
+ spin_lock(&dlm->spinlock);
+
+ if (dl->dl_res) {
+ /*
+  * Resume after the previously shown resource.  The loop starts at
+  * that resource's list node, so its first candidate is the following
+  * entry; every path through the body ends in break, so at most one
+  * step is taken.
+  */
+ list_for_each_entry(res, &dl->dl_res->tracking, tracking) {
+ /* drop the reference held on the previous resource */
+ if (dl->dl_res) {
+ dlm_lockres_put(dl->dl_res);
+ dl->dl_res = NULL;
+ }
+ /* the candidate is the list head itself: nothing left to show */
+ if (&res->tracking == &dlm->tracking_list) {
+ mlog(0, "End of list found, %p\n", res);
+ dl = NULL;
+ break;
+ }
+ /* pin the new current resource until the next call or release */
+ dlm_lockres_get(res);
+ dl->dl_res = res;
+ break;
+ }
+ } else {
+ /* first call for this open: begin with the first tracked resource */
+ if (!list_empty(&dlm->tracking_list)) {
+ list_for_each_entry(res, &dlm->tracking_list, tracking)
+ break;
+ dlm_lockres_get(res);
+ dl->dl_res = res;
+ } else
+ dl = NULL;
+ }
+
+ if (dl) {
+ /* render under the resource lock; one byte of dl_buf is held back */
+ spin_lock(&dl->dl_res->spinlock);
+ dump_lockres(dl->dl_res, dl->dl_buf, dl->dl_len - 1);
+ spin_unlock(&dl->dl_res->spinlock);
+ }
+
+ spin_unlock(&dlm->spinlock);
+
+ return dl;
+}
+
+/*
+ * seq_file stop(): nothing to undo here; start() drops its lock before
+ * returning and the reference on dl->dl_res is kept for the next start().
+ */
+static void lockres_seq_stop(struct seq_file *m, void *v)
+{
+}
+
+/*
+ * seq_file next(): always returns NULL, so each start() yields exactly
+ * one record; advancing to the following resource is done by start().
+ */
+static void *lockres_seq_next(struct seq_file *m, void *v, loff_t *pos)
+{
+ return NULL;
+}
+
+/*
+ * seq_file show(): emit the text that start() rendered into dl_buf for
+ * the current lock resource.  v is the debug_lockres returned by start().
+ */
+static int lockres_seq_show(struct seq_file *s, void *v)
+{
+	struct debug_lockres *cursor = v;
+	char *text = cursor->dl_buf;
+
+	seq_printf(s, "%s", text);
+
+	return 0;
+}
+
+/* seq_file iterator used by the lock resource dump (see debug_lockres_open) */
+static struct seq_operations debug_lockres_ops = {
+ .start = lockres_seq_start,
+ .stop = lockres_seq_stop,
+ .next = lockres_seq_next,
+ .show = lockres_seq_show,
+};
+
+/*
+ * open() handler for the lock resource dump.
+ *
+ * Allocates the per-open cursor (debug_lockres) and its PAGE_SIZE render
+ * buffer, opens the seq_file with debug_lockres_ops, stores the cursor in
+ * seq->private and grabs the domain found in inode->i_private.
+ *
+ * Returns 0 on success, -ENOMEM on allocation failure, or the error from
+ * seq_open(); everything allocated here is freed on failure.
+ */
+static int debug_lockres_open(struct inode *inode, struct file *file)
+{
+	struct dlm_ctxt *dlm = inode->i_private;
+	struct debug_lockres *cursor;
+	struct seq_file *seq;
+	int ret = -ENOMEM;
+
+	cursor = kzalloc(sizeof(*cursor), GFP_KERNEL);
+	if (cursor == NULL) {
+		mlog_errno(ret);
+		return ret;
+	}
+
+	cursor->dl_len = PAGE_SIZE;
+	cursor->dl_buf = kmalloc(cursor->dl_len, GFP_KERNEL);
+	if (cursor->dl_buf == NULL) {
+		mlog_errno(ret);
+		goto free_cursor;
+	}
+
+	ret = seq_open(file, &debug_lockres_ops);
+	if (ret) {
+		mlog_errno(ret);
+		goto free_buf;
+	}
+
+	/* seq_open() left its seq_file in file->private_data */
+	seq = file->private_data;
+	seq->private = cursor;
+
+	dlm_grab(dlm);
+	cursor->dl_ctxt = dlm;
+
+	return 0;
+
+free_buf:
+	kfree(cursor->dl_buf);
+free_cursor:
+	kfree(cursor);
+	return ret;
+}
+
+/*
+ * release() handler for the lock resource dump: drops the reference on
+ * the resource the cursor still points at (if any), puts the domain
+ * taken at open, frees the render buffer and lets seq_release_private()
+ * dispose of the cursor and the seq_file.
+ */
+static int debug_lockres_release(struct inode *inode, struct file *file)
+{
+	struct seq_file *seq = file->private_data;
+	struct debug_lockres *cursor = seq->private;
+	struct dlm_lock_resource *held = cursor->dl_res;
+
+	if (held != NULL)
+		dlm_lockres_put(held);
+
+	dlm_put(cursor->dl_ctxt);
+	kfree(cursor->dl_buf);
+
+	return seq_release_private(inode, file);
+}
+
+/*
+ * File operations for the lock resource dump; unlike the snapshot files
+ * this one is served through the seq_file helpers.
+ */
+static struct file_operations debug_lockres_fops = {
+ .open = debug_lockres_open,
+ .release = debug_lockres_release,
+ .read = seq_read,
+ .llseek = seq_lseek,
+};
+/* end - debug lockres funcs */
+
+/* begin - debug state funcs */
+/*
+ * Render a summary of the domain (dlm_ctxt) into db->buf: name and key,
+ * thread pid, node number and state, join information, the domain, live
+ * and recovery node maps, resource counters, list occupancy, purge count
+ * and refcount, and the recovery state including one line per entry on
+ * dlm->reco.node_data.
+ *
+ * The three resource counters are sampled before dlm->spinlock is taken;
+ * everything else is read under that lock.
+ *
+ * Returns the number of bytes placed in db->buf, never more than db->len.
+ *
+ * Every append uses scnprintf(): snprintf() returns the would-be length,
+ * so accumulating it could move 'out' past db->len (the recovery node
+ * list has no fixed size), making "db->len - out" negative and letting
+ * the caller publish a length larger than the allocation.
+ */
+static int debug_state_print(struct dlm_ctxt *dlm, struct debug_buffer *db)
+{
+	int out = 0;
+	struct dlm_reco_node_data *node;
+	char *state;
+	int lres, rres, ures, tres;
+
+	lres = atomic_read(&dlm->local_resources);
+	rres = atomic_read(&dlm->remote_resources);
+	ures = atomic_read(&dlm->unknown_resources);
+	tres = lres + rres + ures;
+
+	spin_lock(&dlm->spinlock);
+
+	switch (dlm->dlm_state) {
+	case DLM_CTXT_NEW:
+		state = "NEW"; break;
+	case DLM_CTXT_JOINED:
+		state = "JOINED"; break;
+	case DLM_CTXT_IN_SHUTDOWN:
+		state = "SHUTDOWN"; break;
+	case DLM_CTXT_LEAVING:
+		state = "LEAVING"; break;
+	default:
+		state = "UNKNOWN"; break;
+	}
+
+	/* Domain: xxxxxxxxxx Key: 0xdfbac769 */
+	out += scnprintf(db->buf + out, db->len - out,
+			 "Domain: %s Key: 0x%08x\n", dlm->name, dlm->key);
+
+	/* Thread Pid: xxx Node: xxx State: xxxxx */
+	out += scnprintf(db->buf + out, db->len - out,
+			 "Thread Pid: %d Node: %d State: %s\n",
+			 dlm->dlm_thread_task->pid, dlm->node_num, state);
+
+	/* Number of Joins: xxx Joining Node: xxx */
+	out += scnprintf(db->buf + out, db->len - out,
+			 "Number of Joins: %d Joining Node: %d\n",
+			 dlm->num_joins, dlm->joining_node);
+
+	/* Domain Map: xx xx xx */
+	out += scnprintf(db->buf + out, db->len - out, "Domain Map: ");
+	out += stringify_nodemap(dlm->domain_map, O2NM_MAX_NODES,
+				 db->buf + out, db->len - out);
+	out += scnprintf(db->buf + out, db->len - out, "\n");
+
+	/* Live Map: xx xx xx */
+	out += scnprintf(db->buf + out, db->len - out, "Live Map: ");
+	out += stringify_nodemap(dlm->live_nodes_map, O2NM_MAX_NODES,
+				 db->buf + out, db->len - out);
+	out += scnprintf(db->buf + out, db->len - out, "\n");
+
+	/* Mastered Resources Total: xxx Locally: xxx Remotely: ... */
+	out += scnprintf(db->buf + out, db->len - out,
+			 "Mastered Resources Total: %d Locally: %d "
+			 "Remotely: %d Unknown: %d\n",
+			 tres, lres, rres, ures);
+
+	/* Lists: Dirty=Empty Purge=InUse PendingASTs=Empty ... */
+	out += scnprintf(db->buf + out, db->len - out,
+			 "Lists: Dirty=%s Purge=%s PendingASTs=%s "
+			 "PendingBASTs=%s Master=%s\n",
+			 (list_empty(&dlm->dirty_list) ? "Empty" : "InUse"),
+			 (list_empty(&dlm->purge_list) ? "Empty" : "InUse"),
+			 (list_empty(&dlm->pending_asts) ? "Empty" : "InUse"),
+			 (list_empty(&dlm->pending_basts) ? "Empty" : "InUse"),
+			 (list_empty(&dlm->master_list) ? "Empty" : "InUse"));
+
+	/* Purge Count: xxx Refs: xxx */
+	out += scnprintf(db->buf + out, db->len - out,
+			 "Purge Count: %d Refs: %d\n", dlm->purge_count,
+			 atomic_read(&dlm->dlm_refs.refcount));
+
+	/* Dead Node: xxx */
+	out += scnprintf(db->buf + out, db->len - out,
+			 "Dead Node: %d\n", dlm->reco.dead_node);
+
+	/* What about DLM_RECO_STATE_FINALIZE? */
+	if (dlm->reco.state == DLM_RECO_STATE_ACTIVE)
+		state = "ACTIVE";
+	else
+		state = "INACTIVE";
+
+	/* Recovery Pid: xxxx Master: xxx State: xxxx */
+	out += scnprintf(db->buf + out, db->len - out,
+			 "Recovery Pid: %d Master: %d State: %s\n",
+			 dlm->dlm_reco_thread_task->pid,
+			 dlm->reco.new_master, state);
+
+	/* Recovery Map: xx xx */
+	out += scnprintf(db->buf + out, db->len - out, "Recovery Map: ");
+	out += stringify_nodemap(dlm->recovery_map, O2NM_MAX_NODES,
+				 db->buf + out, db->len - out);
+	out += scnprintf(db->buf + out, db->len - out, "\n");
+
+	/* Recovery Node State: */
+	out += scnprintf(db->buf + out, db->len - out,
+			 "Recovery Node State:\n");
+	list_for_each_entry(node, &dlm->reco.node_data, list) {
+		switch (node->state) {
+		case DLM_RECO_NODE_DATA_INIT:
+			state = "INIT";
+			break;
+		case DLM_RECO_NODE_DATA_REQUESTING:
+			state = "REQUESTING";
+			break;
+		case DLM_RECO_NODE_DATA_DEAD:
+			state = "DEAD";
+			break;
+		case DLM_RECO_NODE_DATA_RECEIVING:
+			state = "RECEIVING";
+			break;
+		case DLM_RECO_NODE_DATA_REQUESTED:
+			state = "REQUESTED";
+			break;
+		case DLM_RECO_NODE_DATA_DONE:
+			state = "DONE";
+			break;
+		case DLM_RECO_NODE_DATA_FINALIZE_SENT:
+			state = "FINALIZE-SENT";
+			break;
+		default:
+			state = "BAD";
+			break;
+		}
+		out += scnprintf(db->buf + out, db->len - out, "\t%u - %s\n",
+				 node->node_num, state);
+	}
+
+	spin_unlock(&dlm->spinlock);
+
+	/*
+	 * stringify_nodemap() is defined outside this function; in case it
+	 * reports more than it stored, never return more than the buffer
+	 * holds.
+	 */
+	if (out > db->len)
+		out = db->len;
+
+	return out;
+}
+
+/*
+ * open() handler for the domain state file: allocates a snapshot buffer,
+ * renders the state of the domain stored in inode->i_private into it and
+ * attaches it to the file.  Returns 0, or -ENOMEM if the buffer cannot
+ * be allocated.
+ */
+static int debug_state_open(struct inode *inode, struct file *file)
+{
+	struct dlm_ctxt *dlm = inode->i_private;
+	struct debug_buffer *snapshot = debug_buffer_allocate();
+
+	if (snapshot == NULL)
+		return -ENOMEM;
+
+	snapshot->len = debug_state_print(dlm, snapshot);
+	file->private_data = snapshot;
+
+	return 0;
+}
+
+/*
+ * File operations for the domain state dump: open renders a snapshot,
+ * read/llseek serve it from the buffer, release frees it.
+ */
+static struct file_operations debug_state_fops = {
+ .open = debug_state_open,
+ .release = debug_buffer_release,
+ .read = debug_buffer_read,
+ .llseek = debug_buffer_llseek,
+};
+/* end - debug state funcs */
+
+/* files in subroot */
+/*
+ * Create the four per-domain debugfs files (state, lock resources, MLEs,
+ * purge list) inside dlm->dlm_debugfs_subroot, each owner-readable and
+ * carrying the domain as private data.
+ *
+ * On success an extra reference is taken on the debug context and 0 is
+ * returned.  If any file cannot be created the failure is logged,
+ * dlm_debug_shutdown() is invoked and -ENOMEM is returned.
+ *
+ * NOTE(review): the failure path here already runs dlm_debug_shutdown();
+ * a caller that also shuts down on error would do so twice - confirm
+ * that is safe.
+ */
+int dlm_debug_init(struct dlm_ctxt *dlm)
+{
+	struct dlm_debug_ctxt *dc = dlm->dlm_debug_ctxt;
+	struct dentry *dir = dlm->dlm_debugfs_subroot;
+	struct dentry *created;
+
+	/* for dumping dlm_ctxt */
+	created = debugfs_create_file(DLM_DEBUGFS_DLM_STATE, S_IFREG|S_IRUSR,
+				      dir, dlm, &debug_state_fops);
+	dc->debug_state_dentry = created;
+	if (created == NULL) {
+		mlog_errno(-ENOMEM);
+		goto bail;
+	}
+
+	/* for dumping lockres */
+	created = debugfs_create_file(DLM_DEBUGFS_LOCKING_STATE,
+				      S_IFREG|S_IRUSR,
+				      dir, dlm, &debug_lockres_fops);
+	dc->debug_lockres_dentry = created;
+	if (created == NULL) {
+		mlog_errno(-ENOMEM);
+		goto bail;
+	}
+
+	/* for dumping mles */
+	created = debugfs_create_file(DLM_DEBUGFS_MLE_STATE, S_IFREG|S_IRUSR,
+				      dir, dlm, &debug_mle_fops);
+	dc->debug_mle_dentry = created;
+	if (created == NULL) {
+		mlog_errno(-ENOMEM);
+		goto bail;
+	}
+
+	/* for dumping lockres on the purge list */
+	created = debugfs_create_file(DLM_DEBUGFS_PURGE_LIST, S_IFREG|S_IRUSR,
+				      dir, dlm, &debug_purgelist_fops);
+	dc->debug_purgelist_dentry = created;
+	if (created == NULL) {
+		mlog_errno(-ENOMEM);
+		goto bail;
+	}
+
+	dlm_debug_get(dc);
+	return 0;
+
+bail:
+	dlm_debug_shutdown(dlm);
+	return -ENOMEM;
+}
+
+/*
+ * Remove whichever per-domain debugfs files exist (in reverse order of
+ * creation) and drop one reference on the debug context.  Does nothing
+ * when the domain has no debug context.
+ */
+void dlm_debug_shutdown(struct dlm_ctxt *dlm)
+{
+	struct dlm_debug_ctxt *dc = dlm->dlm_debug_ctxt;
+
+	if (dc == NULL)
+		return;
+
+	if (dc->debug_purgelist_dentry != NULL)
+		debugfs_remove(dc->debug_purgelist_dentry);
+
+	if (dc->debug_mle_dentry != NULL)
+		debugfs_remove(dc->debug_mle_dentry);
+
+	if (dc->debug_lockres_dentry != NULL)
+		debugfs_remove(dc->debug_lockres_dentry);
+
+	if (dc->debug_state_dentry != NULL)
+		debugfs_remove(dc->debug_state_dentry);
+
+	dlm_debug_put(dc);
+}
+
+/* subroot - domain dir */
+/*
+ * Create the per-domain debugfs directory (named after the domain, under
+ * dlm_debugfs_root) and allocate the domain's debug context with its
+ * refcount initialised.
+ *
+ * Returns 0 on success.  On failure the error is logged, the directory
+ * (if it was created) is removed again and -ENOMEM is returned.
+ */
+int dlm_create_debugfs_subroot(struct dlm_ctxt *dlm)
+{
+	struct dlm_debug_ctxt *dc;
+
+	dlm->dlm_debugfs_subroot = debugfs_create_dir(dlm->name,
+						      dlm_debugfs_root);
+	if (dlm->dlm_debugfs_subroot == NULL) {
+		mlog_errno(-ENOMEM);
+		goto fail;
+	}
+
+	dc = kzalloc(sizeof(*dc), GFP_KERNEL);
+	dlm->dlm_debug_ctxt = dc;
+	if (dc == NULL) {
+		mlog_errno(-ENOMEM);
+		goto fail;
+	}
+	kref_init(&dc->debug_refcnt);
+
+	return 0;
+
+fail:
+	dlm_destroy_debugfs_subroot(dlm);
+	return -ENOMEM;
+}
+
+/* Remove the per-domain debugfs directory, if one was created. */
+void dlm_destroy_debugfs_subroot(struct dlm_ctxt *dlm)
+{
+	struct dentry *dir = dlm->dlm_debugfs_subroot;
+
+	if (dir == NULL)
+		return;
+
+	debugfs_remove(dir);
+}
+
+/* debugfs root */
+/*
+ * Create the top-level o2dlm debugfs directory (DLM_DEBUGFS_DIR).
+ * Returns 0 on success; logs and returns -ENOMEM on failure.
+ */
+int dlm_create_debugfs_root(void)
+{
+	dlm_debugfs_root = debugfs_create_dir(DLM_DEBUGFS_DIR, NULL);
+	if (dlm_debugfs_root != NULL)
+		return 0;
+
+	mlog_errno(-ENOMEM);
+	return -ENOMEM;
+}
+
+/* Remove the top-level o2dlm debugfs directory, if it was created. */
+void dlm_destroy_debugfs_root(void)
+{
+	if (dlm_debugfs_root == NULL)
+		return;
+
+	debugfs_remove(dlm_debugfs_root);
+}
+#endif /* CONFIG_DEBUG_FS */
diff --git a/fs/ocfs2/dlm/dlmdebug.h b/fs/ocfs2/dlm/dlmdebug.h
new file mode 100644
index 00000000000..d34a62a3a62
--- /dev/null
+++ b/fs/ocfs2/dlm/dlmdebug.h
@@ -0,0 +1,86 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * dlmdebug.h
+ *
+ * Copyright (C) 2008 Oracle. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ *
+ */
+
+#ifndef DLMDEBUG_H
+#define DLMDEBUG_H
+
+void dlm_print_one_mle(struct dlm_master_list_entry *mle);
+
+#ifdef CONFIG_DEBUG_FS
+
+/*
+ * Per-domain debugfs bookkeeping: a refcount plus the dentries of the
+ * four files created by dlm_debug_init().
+ */
+struct dlm_debug_ctxt {
+ struct kref debug_refcnt;
+ struct dentry *debug_state_dentry;
+ struct dentry *debug_lockres_dentry;
+ struct dentry *debug_mle_dentry;
+ struct dentry *debug_purgelist_dentry;
+};
+
+/*
+ * Text snapshot attached to an open debugfs file.  len starts out as the
+ * size of buf and is replaced by the number of bytes rendered into it.
+ */
+struct debug_buffer {
+ int len;
+ char *buf;
+};
+
+/*
+ * Per-open cursor for the lock resource dump: the render buffer and its
+ * size, the domain being walked, and the resource currently referenced
+ * (NULL before the first record has been produced).
+ */
+struct debug_lockres {
+ int dl_len;
+ char *dl_buf;
+ struct dlm_ctxt *dl_ctxt;
+ struct dlm_lock_resource *dl_res;
+};
+
+int dlm_debug_init(struct dlm_ctxt *dlm);
+void dlm_debug_shutdown(struct dlm_ctxt *dlm);
+
+int dlm_create_debugfs_subroot(struct dlm_ctxt *dlm);
+void dlm_destroy_debugfs_subroot(struct dlm_ctxt *dlm);
+
+int dlm_create_debugfs_root(void);
+void dlm_destroy_debugfs_root(void);
+
+#else
+
+/*
+ * CONFIG_DEBUG_FS is disabled: provide no-op replacements so callers need
+ * no #ifdefs.  They are "static inline" because this header is included
+ * by several .c files; plain static definitions would be emitted in each
+ * of them and trigger "defined but not used" warnings wherever a stub is
+ * not called.
+ */
+static inline int dlm_debug_init(struct dlm_ctxt *dlm)
+{
+	return 0;
+}
+static inline void dlm_debug_shutdown(struct dlm_ctxt *dlm)
+{
+}
+static inline int dlm_create_debugfs_subroot(struct dlm_ctxt *dlm)
+{
+	return 0;
+}
+static inline void dlm_destroy_debugfs_subroot(struct dlm_ctxt *dlm)
+{
+}
+static inline int dlm_create_debugfs_root(void)
+{
+	return 0;
+}
+static inline void dlm_destroy_debugfs_root(void)
+{
+}
+
+#endif /* CONFIG_DEBUG_FS */
+#endif /* DLMDEBUG_H */
diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c
index 0879d86113e..63f8125824e 100644
--- a/fs/ocfs2/dlm/dlmdomain.c
+++ b/fs/ocfs2/dlm/dlmdomain.c
@@ -33,6 +33,7 @@
#include <linux/spinlock.h>
#include <linux/delay.h>
#include <linux/err.h>
+#include <linux/debugfs.h>
#include "cluster/heartbeat.h"
#include "cluster/nodemanager.h"
@@ -40,8 +41,8 @@
#include "dlmapi.h"
#include "dlmcommon.h"
-
#include "dlmdomain.h"
+#include "dlmdebug.h"
#include "dlmver.h"
@@ -298,6 +299,8 @@ static int dlm_wait_on_domain_helper(const char *domain)
static void dlm_free_ctxt_mem(struct dlm_ctxt *dlm)
{
+ dlm_destroy_debugfs_subroot(dlm);
+
if (dlm->lockres_hash)
dlm_free_pagevec((void **)dlm->lockres_hash, DLM_HASH_PAGES);
@@ -395,6 +398,7 @@ static void dlm_destroy_dlm_worker(struct dlm_ctxt *dlm)
static void dlm_complete_dlm_shutdown(struct dlm_ctxt *dlm)
{
dlm_unregister_domain_handlers(dlm);
+ dlm_debug_shutdown(dlm);
dlm_complete_thread(dlm);
dlm_complete_recovery_thread(dlm);
dlm_destroy_dlm_worker(dlm);
@@ -644,6 +648,7 @@ int dlm_shutting_down(struct dlm_ctxt *dlm)
void dlm_unregister_domain(struct dlm_ctxt *dlm)
{
int leave = 0;
+ struct dlm_lock_resource *res;
spin_lock(&dlm_domain_lock);
BUG_ON(dlm->dlm_state != DLM_CTXT_JOINED);
@@ -673,6 +678,15 @@ void dlm_unregister_domain(struct dlm_ctxt *dlm)
msleep(500);
mlog(0, "%s: more migration to do\n", dlm->name);
}
+
+ /* This list should be empty. If not, print remaining lockres */
+ if (!list_empty(&dlm->tracking_list)) {
+ mlog(ML_ERROR, "Following lockres' are still on the "
+ "tracking list:\n");
+ list_for_each_entry(res, &dlm->tracking_list, tracking)
+ dlm_print_one_lock_resource(res);
+ }
+
dlm_mark_domain_leaving(dlm);
dlm_leave_domain(dlm);
dlm_complete_dlm_shutdown(dlm);
@@ -1405,6 +1419,12 @@ static int dlm_join_domain(struct dlm_ctxt *dlm)
goto bail;
}
+ status = dlm_debug_init(dlm);
+ if (status < 0) {
+ mlog_errno(status);
+ goto bail;
+ }
+
status = dlm_launch_thread(dlm);
if (status < 0) {
mlog_errno(status);
@@ -1472,6 +1492,7 @@ bail:
if (status) {
dlm_unregister_domain_handlers(dlm);
+ dlm_debug_shutdown(dlm);
dlm_complete_thread(dlm);
dlm_complete_recovery_thread(dlm);
dlm_destroy_dlm_worker(dlm);
@@ -1484,6 +1505,7 @@ static struct dlm_ctxt *dlm_alloc_ctxt(const char *domain,
u32 key)
{
int i;
+ int ret;
struct dlm_ctxt *dlm = NULL;
dlm = kzalloc(sizeof(*dlm), GFP_KERNEL);
@@ -1516,6 +1538,15 @@ static struct dlm_ctxt *dlm_alloc_ctxt(const char *domain,
dlm->key = key;
dlm->node_num = o2nm_this_node();
+ ret = dlm_create_debugfs_subroot(dlm);
+ if (ret < 0) {
+ dlm_free_pagevec((void **)dlm->lockres_hash, DLM_HASH_PAGES);
+ kfree(dlm->name);
+ kfree(dlm);
+ dlm = NULL;
+ goto leave;
+ }
+
spin_lock_init(&dlm->spinlock);
spin_lock_init(&dlm->master_lock);
spin_lock_init(&dlm->ast_lock);
@@ -1526,6 +1557,7 @@ static struct dlm_ctxt *dlm_alloc_ctxt(const char *domain,
INIT_LIST_HEAD(&dlm->reco.node_data);
INIT_LIST_HEAD(&dlm->purge_list);
INIT_LIST_HEAD(&dlm->dlm_domain_handlers);
+ INIT_LIST_HEAD(&dlm->tracking_list);
dlm->reco.state = 0;
INIT_LIST_HEAD(&dlm->pending_asts);
@@ -1816,21 +1848,49 @@ static int __init dlm_init(void)
dlm_print_version();
status = dlm_init_mle_cache();
- if (status)
- return -1;
+ if (status) {
+ mlog(ML_ERROR, "Could not create o2dlm_mle slabcache\n");
+ goto error;
+ }
+
+ status = dlm_init_master_caches();
+ if (status) {
+ mlog(ML_ERROR, "Could not create o2dlm_lockres and "
+ "o2dlm_lockname slabcaches\n");
+ goto error;
+ }
+
+ status = dlm_init_lock_cache();
+ if (status) {
+ mlog(ML_ERROR, "Count not create o2dlm_lock slabcache\n");
+ goto error;
+ }
status = dlm_register_net_handlers();
if (status) {
- dlm_destroy_mle_cache();
- return -1;
+ mlog(ML_ERROR, "Unable to register network handlers\n");
+ goto error;
}
+ status = dlm_create_debugfs_root();
+ if (status)
+ goto error;
+
return 0;
+error:
+ dlm_unregister_net_handlers();
+ dlm_destroy_lock_cache();
+ dlm_destroy_master_caches();
+ dlm_destroy_mle_cache();
+ return -1;
}
static void __exit dlm_exit (void)
{
+ dlm_destroy_debugfs_root();
dlm_unregister_net_handlers();
+ dlm_destroy_lock_cache();
+ dlm_destroy_master_caches();
dlm_destroy_mle_cache();
}
diff --git a/fs/ocfs2/dlm/dlmlock.c b/fs/ocfs2/dlm/dlmlock.c
index 52578d907d9..83a9f2972ac 100644
--- a/fs/ocfs2/dlm/dlmlock.c
+++ b/fs/ocfs2/dlm/dlmlock.c
@@ -53,6 +53,8 @@
#define MLOG_MASK_PREFIX ML_DLM
#include "cluster/masklog.h"
+static struct kmem_cache *dlm_lock_cache = NULL;
+
static DEFINE_SPINLOCK(dlm_cookie_lock);
static u64 dlm_next_cookie = 1;
@@ -64,6 +66,22 @@ static void dlm_init_lock(struct dlm_lock *newlock, int type,
static void dlm_lock_release(struct kref *kref);
static void dlm_lock_detach_lockres(struct dlm_lock *lock);
+/*
+ * Create the "o2dlm_lock" slab cache used for struct dlm_lock.
+ * Returns 0 on success or -ENOMEM if the cache cannot be created.
+ */
+int dlm_init_lock_cache(void)
+{
+	dlm_lock_cache = kmem_cache_create("o2dlm_lock",
+					   sizeof(struct dlm_lock),
+					   0, SLAB_HWCACHE_ALIGN, NULL);
+
+	return dlm_lock_cache ? 0 : -ENOMEM;
+}
+
+/* Destroy the "o2dlm_lock" slab cache if it was created. */
+void dlm_destroy_lock_cache(void)
+{
+	if (dlm_lock_cache == NULL)
+		return;
+
+	kmem_cache_destroy(dlm_lock_cache);
+}
+
/* Tell us whether we can grant a new lock request.
* locking:
* caller needs: res->spinlock
@@ -353,7 +371,7 @@ static void dlm_lock_release(struct kref *kref)
mlog(0, "freeing kernel-allocated lksb\n");
kfree(lock->lksb);
}
- kfree(lock);
+ kmem_cache_free(dlm_lock_cache, lock);
}
/* associate a lock with it's lockres, getting a ref on the lockres */
@@ -412,7 +430,7 @@ struct dlm_lock * dlm_new_lock(int type, u8 node, u64 cookie,
struct dlm_lock *lock;
int kernel_allocated = 0;
- lock = kzalloc(sizeof(*lock), GFP_NOFS);
+ lock = (struct dlm_lock *) kmem_cache_zalloc(dlm_lock_cache, GFP_NOFS);
if (!lock)
return NULL;
diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index ea6b8957786..efc015c6128 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -48,47 +48,11 @@
#include "dlmapi.h"
#include "dlmcommon.h"
#include "dlmdomain.h"
+#include "dlmdebug.h"
#define MLOG_MASK_PREFIX (ML_DLM|ML_DLM_MASTER)
#include "cluster/masklog.h"
-enum dlm_mle_type {
- DLM_MLE_BLOCK,
- DLM_MLE_MASTER,
- DLM_MLE_MIGRATION
-};
-
-struct dlm_lock_name
-{
- u8 len;
- u8 name[DLM_LOCKID_NAME_MAX];
-};
-
-struct dlm_master_list_entry
-{
- struct list_head list;
- struct list_head hb_events;
- struct dlm_ctxt *dlm;
- spinlock_t spinlock;
- wait_queue_head_t wq;
- atomic_t woken;
- struct kref mle_refs;
- int inuse;
- unsigned long maybe_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
- unsigned long vote_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
- unsigned long response_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
- unsigned long node_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
- u8 master;
- u8 new_master;
- enum dlm_mle_type type;
- struct o2hb_callback_func mle_hb_up;
- struct o2hb_callback_func mle_hb_down;
- union {
- struct dlm_lock_resource *res;
- struct dlm_lock_name name;
- } u;
-};
-
static void dlm_mle_node_down(struct dlm_ctxt *dlm,
struct dlm_master_list_entry *mle,
struct o2nm_node *node,
@@ -128,98 +92,10 @@ static inline int dlm_mle_equal(struct dlm_ctxt *dlm,
return 1;
}
-#define dlm_print_nodemap(m) _dlm_print_nodemap(m,#m)
-static void _dlm_print_nodemap(unsigned long *map, const char *mapname)
-{
- int i;
- printk("%s=[ ", mapname);
- for (i=0; i<O2NM_MAX_NODES; i++)
- if (test_bit(i, map))
- printk("%d ", i);
- printk("]");
-}
-
-static void dlm_print_one_mle(struct dlm_master_list_entry *mle)
-{
- int refs;
- char *type;
- char attached;
- u8 master;
- unsigned int namelen;
- const char *name;
- struct kref *k;
- unsigned long *maybe = mle->maybe_map,
- *vote = mle->vote_map,
- *resp = mle->response_map,
- *node = mle->node_map;
-
- k = &mle->mle_refs;
- if (mle->type == DLM_MLE_BLOCK)
- type = "BLK";
- else if (mle->type == DLM_MLE_MASTER)
- type = "MAS";
- else
- type = "MIG";
- refs = atomic_read(&k->refcount);
- master = mle->master;
- attached = (list_empty(&mle->hb_events) ? 'N' : 'Y');
-
- if (mle->type != DLM_MLE_MASTER) {
- namelen = mle->u.name.len;
- name = mle->u.name.name;
- } else {
- namelen = mle->u.res->lockname.len;
- name = mle->u.res->lockname.name;
- }
-
- mlog(ML_NOTICE, "%.*s: %3s refs=%3d mas=%3u new=%3u evt=%c inuse=%d ",
- namelen, name, type, refs, master, mle->new_master, attached,
- mle->inuse);
- dlm_print_nodemap(maybe);
- printk(", ");
- dlm_print_nodemap(vote);
- printk(", ");
- dlm_print_nodemap(resp);
- printk(", ");
- dlm_print_nodemap(node);
- printk(", ");
- printk("\n");
-}
-
-#if 0
-/* Code here is included but defined out as it aids debugging */
-
-static void dlm_dump_mles(struct dlm_ctxt *dlm)
-{
- struct dlm_master_list_entry *mle;
-
- mlog(ML_NOTICE, "dumping all mles for domain %s:\n", dlm->name);
- spin_lock(&dlm->master_lock);
- list_for_each_entry(mle, &dlm->master_list, list)
- dlm_print_one_mle(mle);
- spin_unlock(&dlm->master_lock);
-}
-
-int dlm_dump_all_mles(const char __user *data, unsigned int len)
-{
- struct dlm_ctxt *dlm;
-
- spin_lock(&dlm_domain_lock);
- list_for_each_entry(dlm, &dlm_domains, list) {
- mlog(ML_NOTICE, "found dlm: %p, name=%s\n", dlm, dlm->name);
- dlm_dump_mles(dlm);
- }
- spin_unlock(&dlm_domain_lock);
- return len;
-}
-EXPORT_SYMBOL_GPL(dlm_dump_all_mles);
-
-#endif /* 0 */
-
-
+static struct kmem_cache *dlm_lockres_cache = NULL;
+static struct kmem_cache *dlm_lockname_cache = NULL;
static struct kmem_cache *dlm_mle_cache = NULL;
-
static void dlm_mle_release(struct kref *kref);
static void dlm_init_mle(struct dlm_master_list_entry *mle,
enum dlm_mle_type type,
@@ -507,7 +383,7 @@ static void dlm_mle_node_up(struct dlm_ctxt *dlm,
int dlm_init_mle_cache(void)
{
- dlm_mle_cache = kmem_cache_create("dlm_mle_cache",
+ dlm_mle_cache = kmem_cache_create("o2dlm_mle",
sizeof(struct dlm_master_list_entry),
0, SLAB_HWCACHE_ALIGN,
NULL);
@@ -560,6 +436,35 @@ static void dlm_mle_release(struct kref *kref)
* LOCK RESOURCE FUNCTIONS
*/
+/*
+ * Create the "o2dlm_lockres" and "o2dlm_lockname" slab caches.
+ *
+ * Returns 0 when both exist.  If either creation fails, whatever was
+ * created is torn down through dlm_destroy_master_caches() and -ENOMEM
+ * is returned.
+ */
+int dlm_init_master_caches(void)
+{
+	dlm_lockres_cache = kmem_cache_create("o2dlm_lockres",
+					      sizeof(struct dlm_lock_resource),
+					      0, SLAB_HWCACHE_ALIGN, NULL);
+	if (dlm_lockres_cache) {
+		dlm_lockname_cache = kmem_cache_create("o2dlm_lockname",
+						       DLM_LOCKID_NAME_MAX, 0,
+						       SLAB_HWCACHE_ALIGN,
+						       NULL);
+		if (dlm_lockname_cache)
+			return 0;
+	}
+
+	dlm_destroy_master_caches();
+	return -ENOMEM;
+}
+
+/*
+ * Destroy the lock name and lock resource slab caches, if present.
+ *
+ * Each pointer is cleared after its cache is destroyed so the function
+ * is safe to call more than once: dlm_init_master_caches() calls it on
+ * failure and the module init error path calls it again, which would
+ * otherwise destroy an already-destroyed cache.
+ */
+void dlm_destroy_master_caches(void)
+{
+	if (dlm_lockname_cache) {
+		kmem_cache_destroy(dlm_lockname_cache);
+		dlm_lockname_cache = NULL;
+	}
+
+	if (dlm_lockres_cache) {
+		kmem_cache_destroy(dlm_lockres_cache);
+		dlm_lockres_cache = NULL;
+	}
+}
+
static void dlm_set_lockres_owner(struct dlm_ctxt *dlm,
struct dlm_lock_resource *res,
u8 owner)
@@ -610,6 +515,14 @@ static void dlm_lockres_release(struct kref *kref)
mlog(0, "destroying lockres %.*s\n", res->lockname.len,
res->lockname.name);
+ if (!list_empty(&res->tracking))
+ list_del_init(&res->tracking);
+ else {
+ mlog(ML_ERROR, "Resource %.*s not on the Tracking list\n",
+ res->lockname.len, res->lockname.name);
+ dlm_print_one_lock_resource(res);
+ }
+
if (!hlist_unhashed(&res->hash_node) ||
!list_empty(&res->granted) ||
!list_empty(&res->converting) ||
@@ -642,9 +555,9 @@ static void dlm_lockres_release(struct kref *kref)
BUG_ON(!list_empty(&res->recovering));
BUG_ON(!list_empty(&res->purge));
- kfree(res->lockname.name);
+ kmem_cache_free(dlm_lockname_cache, (void *)res->lockname.name);
- kfree(res);
+ kmem_cache_free(dlm_lockres_cache, res);
}
void dlm_lockres_put(struct dlm_lock_resource *res)
@@ -677,6 +590,7 @@ static void dlm_init_lockres(struct dlm_ctxt *dlm,
INIT_LIST_HEAD(&res->dirty);
INIT_LIST_HEAD(&res->recovering);
INIT_LIST_HEAD(&res->purge);
+ INIT_LIST_HEAD(&res->tracking);
atomic_set(&res->asts_reserved, 0);
res->migration_pending = 0;
res->inflight_locks = 0;
@@ -692,6 +606,8 @@ static void dlm_init_lockres(struct dlm_ctxt *dlm,
res->last_used = 0;
+ list_add_tail(&res->tracking, &dlm->tracking_list);
+
memset(res->lvb, 0, DLM_LVB_LEN);
memset(res->refmap, 0, sizeof(res->refmap));
}
@@ -700,20 +616,28 @@ struct dlm_lock_resource *dlm_new_lockres(struct dlm_ctxt *dlm,
const char *name,
unsigned int namelen)
{
- struct dlm_lock_resource *res;
+ struct dlm_lock_resource *res = NULL;
- res = kmalloc(sizeof(struct dlm_lock_resource), GFP_NOFS);
+ res = (struct dlm_lock_resource *)
+ kmem_cache_zalloc(dlm_lockres_cache, GFP_NOFS);
if (!res)
- return NULL;
+ goto error;
- res->lockname.name = kmalloc(namelen, GFP_NOFS);
- if (!res->lockname.name) {
- kfree(res);
- return NULL;
- }
+ res->lockname.name = (char *)
+ kmem_cache_zalloc(dlm_lockname_cache, GFP_NOFS);
+ if (!res->lockname.name)
+ goto error;
dlm_init_lockres(dlm, res, name, namelen);
return res;
+
+error:
+ if (res && res->lockname.name)
+ kmem_cache_free(dlm_lockname_cache, (void *)res->lockname.name);
+
+ if (res)
+ kmem_cache_free(dlm_lockres_cache, res);
+ return NULL;
}
void __dlm_lockres_grab_inflight_ref(struct dlm_ctxt *dlm,
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index 1f1873bf41f..394d25a131a 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -27,18 +27,11 @@
#include <linux/slab.h>
#include <linux/highmem.h>
#include <linux/mm.h>
-#include <linux/crc32.h>
#include <linux/kthread.h>
#include <linux/pagemap.h>
#include <linux/debugfs.h>
#include <linux/seq_file.h>
-#include <cluster/heartbeat.h>
-#include <cluster/nodemanager.h>
-#include <cluster/tcp.h>
-
-#include <dlm/dlmapi.h>
-
#define MLOG_MASK_PREFIX ML_DLM_GLUE
#include <cluster/masklog.h>
@@ -53,6 +46,7 @@
#include "heartbeat.h"
#include "inode.h"
#include "journal.h"
+#include "stackglue.h"
#include "slot_map.h"
#include "super.h"
#include "uptodate.h"
@@ -113,7 +107,8 @@ static void ocfs2_dump_meta_lvb_info(u64 level,
unsigned int line,
struct ocfs2_lock_res *lockres)
{
- struct ocfs2_meta_lvb *lvb = (struct ocfs2_meta_lvb *) lockres->l_lksb.lvb;
+ struct ocfs2_meta_lvb *lvb =
+ (struct ocfs2_meta_lvb *)ocfs2_dlm_lvb(&lockres->l_lksb);
mlog(level, "LVB information for %s (called from %s:%u):\n",
lockres->l_name, function, line);
@@ -259,31 +254,6 @@ static struct ocfs2_lock_res_ops ocfs2_flock_lops = {
.flags = 0,
};
-/*
- * This is the filesystem locking protocol version.
- *
- * Whenever the filesystem does new things with locks (adds or removes a
- * lock, orders them differently, does different things underneath a lock),
- * the version must be changed. The protocol is negotiated when joining
- * the dlm domain. A node may join the domain if its major version is
- * identical to all other nodes and its minor version is greater than
- * or equal to all other nodes. When its minor version is greater than
- * the other nodes, it will run at the minor version specified by the
- * other nodes.
- *
- * If a locking change is made that will not be compatible with older
- * versions, the major number must be increased and the minor version set
- * to zero. If a change merely adds a behavior that can be disabled when
- * speaking to older versions, the minor version must be increased. If a
- * change adds a fully backwards compatible change (eg, LVB changes that
- * are just ignored by older versions), the version does not need to be
- * updated.
- */
-const struct dlm_protocol_version ocfs2_locking_protocol = {
- .pv_major = OCFS2_LOCKING_PROTOCOL_MAJOR,
- .pv_minor = OCFS2_LOCKING_PROTOCOL_MINOR,
-};
-
static inline int ocfs2_is_inode_lock(struct ocfs2_lock_res *lockres)
{
return lockres->l_type == OCFS2_LOCK_TYPE_META ||
@@ -316,7 +286,7 @@ static inline struct ocfs2_super *ocfs2_get_lockres_osb(struct ocfs2_lock_res *l
static int ocfs2_lock_create(struct ocfs2_super *osb,
struct ocfs2_lock_res *lockres,
int level,
- int dlm_flags);
+ u32 dlm_flags);
static inline int ocfs2_may_continue_on_blocked_lock(struct ocfs2_lock_res *lockres,
int wanted);
static void ocfs2_cluster_unlock(struct ocfs2_super *osb,
@@ -330,10 +300,9 @@ static void ocfs2_schedule_blocked_lock(struct ocfs2_super *osb,
struct ocfs2_lock_res *lockres);
static inline void ocfs2_recover_from_dlm_error(struct ocfs2_lock_res *lockres,
int convert);
-#define ocfs2_log_dlm_error(_func, _stat, _lockres) do { \
- mlog(ML_ERROR, "Dlm error \"%s\" while calling %s on " \
- "resource %s: %s\n", dlm_errname(_stat), _func, \
- _lockres->l_name, dlm_errmsg(_stat)); \
+#define ocfs2_log_dlm_error(_func, _err, _lockres) do { \
+ mlog(ML_ERROR, "DLM error %d while calling %s on resource %s\n", \
+ _err, _func, _lockres->l_name); \
} while (0)
static int ocfs2_downconvert_thread(void *arg);
static void ocfs2_downconvert_on_unlock(struct ocfs2_super *osb,
@@ -342,12 +311,13 @@ static int ocfs2_inode_lock_update(struct inode *inode,
struct buffer_head **bh);
static void ocfs2_drop_osb_locks(struct ocfs2_super *osb);
static inline int ocfs2_highest_compat_lock_level(int level);
-static void ocfs2_prepare_downconvert(struct ocfs2_lock_res *lockres,
- int new_level);
+static unsigned int ocfs2_prepare_downconvert(struct ocfs2_lock_res *lockres,
+ int new_level);
static int ocfs2_downconvert_lock(struct ocfs2_super *osb,
struct ocfs2_lock_res *lockres,
int new_level,
- int lvb);
+ int lvb,
+ unsigned int generation);
static int ocfs2_prepare_cancel_convert(struct ocfs2_super *osb,
struct ocfs2_lock_res *lockres);
static int ocfs2_cancel_convert(struct ocfs2_super *osb,
@@ -406,9 +376,9 @@ static void ocfs2_lock_res_init_common(struct ocfs2_super *osb,
res->l_ops = ops;
res->l_priv = priv;
- res->l_level = LKM_IVMODE;
- res->l_requested = LKM_IVMODE;
- res->l_blocking = LKM_IVMODE;
+ res->l_level = DLM_LOCK_IV;
+ res->l_requested = DLM_LOCK_IV;
+ res->l_blocking = DLM_LOCK_IV;
res->l_action = OCFS2_AST_INVALID;
res->l_unlock_action = OCFS2_UNLOCK_INVALID;
@@ -604,10 +574,10 @@ static inline void ocfs2_inc_holders(struct ocfs2_lock_res *lockres,
BUG_ON(!lockres);
switch(level) {
- case LKM_EXMODE:
+ case DLM_LOCK_EX:
lockres->l_ex_holders++;
break;
- case LKM_PRMODE:
+ case DLM_LOCK_PR:
lockres->l_ro_holders++;
break;
default:
@@ -625,11 +595,11 @@ static inline void ocfs2_dec_holders(struct ocfs2_lock_res *lockres,
BUG_ON(!lockres);
switch(level) {
- case LKM_EXMODE:
+ case DLM_LOCK_EX:
BUG_ON(!lockres->l_ex_holders);
lockres->l_ex_holders--;
break;
- case LKM_PRMODE:
+ case DLM_LOCK_PR:
BUG_ON(!lockres->l_ro_holders);
lockres->l_ro_holders--;
break;
@@ -644,12 +614,12 @@ static inline void ocfs2_dec_holders(struct ocfs2_lock_res *lockres,
* lock types are added. */
static inline int ocfs2_highest_compat_lock_level(int level)
{
- int new_level = LKM_EXMODE;
+ int new_level = DLM_LOCK_EX;
- if (level == LKM_EXMODE)
- new_level = LKM_NLMODE;
- else if (level == LKM_PRMODE)
- new_level = LKM_PRMODE;
+ if (level == DLM_LOCK_EX)
+ new_level = DLM_LOCK_NL;
+ else if (level == DLM_LOCK_PR)
+ new_level = DLM_LOCK_PR;
return new_level;
}
@@ -688,12 +658,12 @@ static inline void ocfs2_generic_handle_downconvert_action(struct ocfs2_lock_res
BUG_ON(!(lockres->l_flags & OCFS2_LOCK_BUSY));
BUG_ON(!(lockres->l_flags & OCFS2_LOCK_ATTACHED));
BUG_ON(!(lockres->l_flags & OCFS2_LOCK_BLOCKED));
- BUG_ON(lockres->l_blocking <= LKM_NLMODE);
+ BUG_ON(lockres->l_blocking <= DLM_LOCK_NL);
lockres->l_level = lockres->l_requested;
if (lockres->l_level <=
ocfs2_highest_compat_lock_level(lockres->l_blocking)) {
- lockres->l_blocking = LKM_NLMODE;
+ lockres->l_blocking = DLM_LOCK_NL;
lockres_clear_flags(lockres, OCFS2_LOCK_BLOCKED);
}
lockres_clear_flags(lockres, OCFS2_LOCK_BUSY);
@@ -712,7 +682,7 @@ static inline void ocfs2_generic_handle_convert_action(struct ocfs2_lock_res *lo
* information is already up to date. Convert from NL to
* *anything* however should mark ourselves as needing an
* update */
- if (lockres->l_level == LKM_NLMODE &&
+ if (lockres->l_level == DLM_LOCK_NL &&
lockres->l_ops->flags & LOCK_TYPE_REQUIRES_REFRESH)
lockres_or_flags(lockres, OCFS2_LOCK_NEEDS_REFRESH);
@@ -729,7 +699,7 @@ static inline void ocfs2_generic_handle_attach_action(struct ocfs2_lock_res *loc
BUG_ON((!(lockres->l_flags & OCFS2_LOCK_BUSY)));
BUG_ON(lockres->l_flags & OCFS2_LOCK_ATTACHED);
- if (lockres->l_requested > LKM_NLMODE &&
+ if (lockres->l_requested > DLM_LOCK_NL &&
!(lockres->l_flags & OCFS2_LOCK_LOCAL) &&
lockres->l_ops->flags & LOCK_TYPE_REQUIRES_REFRESH)
lockres_or_flags(lockres, OCFS2_LOCK_NEEDS_REFRESH);
@@ -767,6 +737,113 @@ static int ocfs2_generic_handle_bast(struct ocfs2_lock_res *lockres,
return needs_downconvert;
}
+/*
+ * OCFS2_LOCK_PENDING and l_pending_gen.
+ *
+ * Why does OCFS2_LOCK_PENDING exist? To close a race between setting
+ * OCFS2_LOCK_BUSY and calling ocfs2_dlm_lock(). See ocfs2_unblock_lock()
+ * for more details on the race.
+ *
+ * OCFS2_LOCK_PENDING closes the race quite nicely. However, it introduces
+ * a race on itself. In o2dlm, we can get the ast before ocfs2_dlm_lock()
+ * returns. The ast clears OCFS2_LOCK_BUSY, and must therefore clear
+ * OCFS2_LOCK_PENDING at the same time. When ocfs2_dlm_lock() returns,
+ * the caller is going to try to clear PENDING again. If nothing else is
+ * happening, __lockres_clear_pending() sees PENDING is unset and does
+ * nothing.
+ *
+ * But what if another path (eg downconvert thread) has just started a
+ * new locking action? The other path has re-set PENDING. Our path
+ * cannot clear PENDING, because that will re-open the original race
+ * window.
+ *
+ * [Example]
+ *
+ * ocfs2_meta_lock()
+ *  ocfs2_cluster_lock()
+ *   set BUSY
+ *   set PENDING
+ *   drop l_lock
+ *   ocfs2_dlm_lock()
+ *    ocfs2_locking_ast()              ocfs2_downconvert_thread()
+ *     clear PENDING                    ocfs2_unblock_lock()
+ *                                       take l_lock
+ *                                       !BUSY
+ *                                       ocfs2_prepare_downconvert()
+ *                                        set BUSY
+ *                                        set PENDING
+ *                                       drop l_lock
+ *   take l_lock
+ *   clear PENDING
+ *   drop l_lock
+ *                   <window>
+ *                                       ocfs2_dlm_lock()
+ *
+ * So as you can see, we now have a window where l_lock is not held,
+ * PENDING is not set, and ocfs2_dlm_lock() has not been called.
+ *
+ * The core problem is that ocfs2_cluster_lock() has cleared the PENDING
+ * set by ocfs2_prepare_downconvert(). That wasn't nice.
+ *
+ * To solve this we introduce l_pending_gen. A call to
+ * lockres_clear_pending() will only do so when it is passed a generation
+ * number that matches the lockres. lockres_set_pending() will return the
+ * current generation number. When ocfs2_cluster_lock() goes to clear
+ * PENDING, it passes the generation it got from set_pending(). In our
+ * example above, the generation numbers will *not* match. Thus,
+ * ocfs2_cluster_lock() will not clear the PENDING set by
+ * ocfs2_prepare_downconvert().
+ */
+
+/* Unlocked version for ocfs2_locking_ast() */
+static void __lockres_clear_pending(struct ocfs2_lock_res *lockres,
+ unsigned int generation,
+ struct ocfs2_super *osb)
+{
+ assert_spin_locked(&lockres->l_lock);
+
+ /*
+ * The ast and locking functions can race us here. The winner
+ * will clear pending, the loser will not.
+ */
+ if (!(lockres->l_flags & OCFS2_LOCK_PENDING) ||
+ (lockres->l_pending_gen != generation))
+ return;
+
+ lockres_clear_flags(lockres, OCFS2_LOCK_PENDING);
+ lockres->l_pending_gen++;
+
+ /*
+ * The downconvert thread may have skipped us because we
+ * were PENDING. Wake it up.
+ */
+ if (lockres->l_flags & OCFS2_LOCK_BLOCKED)
+ ocfs2_wake_downconvert_thread(osb);
+}
+
+/* Locked version for callers of ocfs2_dlm_lock() */
+static void lockres_clear_pending(struct ocfs2_lock_res *lockres,
+ unsigned int generation,
+ struct ocfs2_super *osb)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&lockres->l_lock, flags);
+ __lockres_clear_pending(lockres, generation, osb);
+ spin_unlock_irqrestore(&lockres->l_lock, flags);
+}
+
+static unsigned int lockres_set_pending(struct ocfs2_lock_res *lockres)
+{
+ assert_spin_locked(&lockres->l_lock);
+ BUG_ON(!(lockres->l_flags & OCFS2_LOCK_BUSY));
+
+ lockres_or_flags(lockres, OCFS2_LOCK_PENDING);
+
+ return lockres->l_pending_gen;
+}
+
+
static void ocfs2_blocking_ast(void *opaque, int level)
{
struct ocfs2_lock_res *lockres = opaque;
@@ -774,7 +851,7 @@ static void ocfs2_blocking_ast(void *opaque, int level)
int needs_downconvert;
unsigned long flags;
- BUG_ON(level <= LKM_NLMODE);
+ BUG_ON(level <= DLM_LOCK_NL);
mlog(0, "BAST fired for lockres %s, blocking %d, level %d type %s\n",
lockres->l_name, level, lockres->l_level,
@@ -801,14 +878,22 @@ static void ocfs2_blocking_ast(void *opaque, int level)
static void ocfs2_locking_ast(void *opaque)
{
struct ocfs2_lock_res *lockres = opaque;
- struct dlm_lockstatus *lksb = &lockres->l_lksb;
+ struct ocfs2_super *osb = ocfs2_get_lockres_osb(lockres);
unsigned long flags;
+ int status;
spin_lock_irqsave(&lockres->l_lock, flags);
- if (lksb->status != DLM_NORMAL) {
- mlog(ML_ERROR, "lockres %s: lksb status value of %u!\n",
- lockres->l_name, lksb->status);
+ status = ocfs2_dlm_lock_status(&lockres->l_lksb);
+
+ if (status == -EAGAIN) {
+ lockres_clear_flags(lockres, OCFS2_LOCK_BUSY);
+ goto out;
+ }
+
+ if (status) {
+ mlog(ML_ERROR, "lockres %s: lksb status value of %d!\n",
+ lockres->l_name, status);
spin_unlock_irqrestore(&lockres->l_lock, flags);
return;
}
@@ -831,11 +916,23 @@ static void ocfs2_locking_ast(void *opaque)
lockres->l_unlock_action);
BUG();
}
-
+out:
/* set it to something invalid so if we get called again we
* can catch it. */
lockres->l_action = OCFS2_AST_INVALID;
+ /* Did we try to cancel this lock? Clear that state */
+ if (lockres->l_unlock_action == OCFS2_UNLOCK_CANCEL_CONVERT)
+ lockres->l_unlock_action = OCFS2_UNLOCK_INVALID;
+
+ /*
+ * We may have beaten the locking functions here. We certainly
+ * know that dlm_lock() has been called :-)
+ * Because we can't have two lock calls in flight at once, we
+ * can use lockres->l_pending_gen.
+ */
+ __lockres_clear_pending(lockres, lockres->l_pending_gen, osb);
+
wake_up(&lockres->l_event);
spin_unlock_irqrestore(&lockres->l_lock, flags);
}
@@ -865,15 +962,15 @@ static inline void ocfs2_recover_from_dlm_error(struct ocfs2_lock_res *lockres,
static int ocfs2_lock_create(struct ocfs2_super *osb,
struct ocfs2_lock_res *lockres,
int level,
- int dlm_flags)
+ u32 dlm_flags)
{
int ret = 0;
- enum dlm_status status = DLM_NORMAL;
unsigned long flags;
+ unsigned int gen;
mlog_entry_void();
- mlog(0, "lock %s, level = %d, flags = %d\n", lockres->l_name, level,
+ mlog(0, "lock %s, level = %d, flags = %u\n", lockres->l_name, level,
dlm_flags);
spin_lock_irqsave(&lockres->l_lock, flags);
@@ -886,24 +983,23 @@ static int ocfs2_lock_create(struct ocfs2_super *osb,
lockres->l_action = OCFS2_AST_ATTACH;
lockres->l_requested = level;
lockres_or_flags(lockres, OCFS2_LOCK_BUSY);
+ gen = lockres_set_pending(lockres);
spin_unlock_irqrestore(&lockres->l_lock, flags);
- status = dlmlock(osb->dlm,
- level,
- &lockres->l_lksb,
- dlm_flags,
- lockres->l_name,
- OCFS2_LOCK_ID_MAX_LEN - 1,
- ocfs2_locking_ast,
- lockres,
- ocfs2_blocking_ast);
- if (status != DLM_NORMAL) {
- ocfs2_log_dlm_error("dlmlock", status, lockres);
- ret = -EINVAL;
+ ret = ocfs2_dlm_lock(osb->cconn,
+ level,
+ &lockres->l_lksb,
+ dlm_flags,
+ lockres->l_name,
+ OCFS2_LOCK_ID_MAX_LEN - 1,
+ lockres);
+ lockres_clear_pending(lockres, gen, osb);
+ if (ret) {
+ ocfs2_log_dlm_error("ocfs2_dlm_lock", ret, lockres);
ocfs2_recover_from_dlm_error(lockres, 1);
}
- mlog(0, "lock %s, successfull return from dlmlock\n", lockres->l_name);
+ mlog(0, "lock %s, return from ocfs2_dlm_lock\n", lockres->l_name);
bail:
mlog_exit(ret);
@@ -1016,21 +1112,22 @@ static int ocfs2_wait_for_mask_interruptible(struct ocfs2_mask_waiter *mw,
static int ocfs2_cluster_lock(struct ocfs2_super *osb,
struct ocfs2_lock_res *lockres,
int level,
- int lkm_flags,
+ u32 lkm_flags,
int arg_flags)
{
struct ocfs2_mask_waiter mw;
- enum dlm_status status;
int wait, catch_signals = !(osb->s_mount_opt & OCFS2_MOUNT_NOINTR);
int ret = 0; /* gcc doesn't realize wait = 1 guarantees ret is set */
unsigned long flags;
+ unsigned int gen;
+ int noqueue_attempted = 0;
mlog_entry_void();
ocfs2_init_mask_waiter(&mw);
if (lockres->l_ops->flags & LOCK_TYPE_USES_LVB)
- lkm_flags |= LKM_VALBLK;
+ lkm_flags |= DLM_LKF_VALBLK;
again:
wait = 0;
@@ -1068,52 +1165,56 @@ again:
}
if (level > lockres->l_level) {
+ if (noqueue_attempted > 0) {
+ ret = -EAGAIN;
+ goto unlock;
+ }
+ if (lkm_flags & DLM_LKF_NOQUEUE)
+ noqueue_attempted = 1;
+
if (lockres->l_action != OCFS2_AST_INVALID)
mlog(ML_ERROR, "lockres %s has action %u pending\n",
lockres->l_name, lockres->l_action);
if (!(lockres->l_flags & OCFS2_LOCK_ATTACHED)) {
lockres->l_action = OCFS2_AST_ATTACH;
- lkm_flags &= ~LKM_CONVERT;
+ lkm_flags &= ~DLM_LKF_CONVERT;
} else {
lockres->l_action = OCFS2_AST_CONVERT;
- lkm_flags |= LKM_CONVERT;
+ lkm_flags |= DLM_LKF_CONVERT;
}
lockres->l_requested = level;
lockres_or_flags(lockres, OCFS2_LOCK_BUSY);
+ gen = lockres_set_pending(lockres);
spin_unlock_irqrestore(&lockres->l_lock, flags);
- BUG_ON(level == LKM_IVMODE);
- BUG_ON(level == LKM_NLMODE);
+ BUG_ON(level == DLM_LOCK_IV);
+ BUG_ON(level == DLM_LOCK_NL);
mlog(0, "lock %s, convert from %d to level = %d\n",
lockres->l_name, lockres->l_level, level);
/* call dlm_lock to upgrade lock now */
- status = dlmlock(osb->dlm,
- level,
- &lockres->l_lksb,
- lkm_flags,
- lockres->l_name,
- OCFS2_LOCK_ID_MAX_LEN - 1,
- ocfs2_locking_ast,
- lockres,
- ocfs2_blocking_ast);
- if (status != DLM_NORMAL) {
- if ((lkm_flags & LKM_NOQUEUE) &&
- (status == DLM_NOTQUEUED))
- ret = -EAGAIN;
- else {
- ocfs2_log_dlm_error("dlmlock", status,
- lockres);
- ret = -EINVAL;
+ ret = ocfs2_dlm_lock(osb->cconn,
+ level,
+ &lockres->l_lksb,
+ lkm_flags,
+ lockres->l_name,
+ OCFS2_LOCK_ID_MAX_LEN - 1,
+ lockres);
+ lockres_clear_pending(lockres, gen, osb);
+ if (ret) {
+ if (!(lkm_flags & DLM_LKF_NOQUEUE) ||
+ (ret != -EAGAIN)) {
+ ocfs2_log_dlm_error("ocfs2_dlm_lock",
+ ret, lockres);
}
ocfs2_recover_from_dlm_error(lockres, 1);
goto out;
}
- mlog(0, "lock %s, successfull return from dlmlock\n",
+ mlog(0, "lock %s, successfull return from ocfs2_dlm_lock\n",
lockres->l_name);
/* At this point we've gone inside the dlm and need to
@@ -1177,9 +1278,9 @@ static int ocfs2_create_new_lock(struct ocfs2_super *osb,
int ex,
int local)
{
- int level = ex ? LKM_EXMODE : LKM_PRMODE;
+ int level = ex ? DLM_LOCK_EX : DLM_LOCK_PR;
unsigned long flags;
- int lkm_flags = local ? LKM_LOCAL : 0;
+ u32 lkm_flags = local ? DLM_LKF_LOCAL : 0;
spin_lock_irqsave(&lockres->l_lock, flags);
BUG_ON(lockres->l_flags & OCFS2_LOCK_ATTACHED);
@@ -1222,7 +1323,7 @@ int ocfs2_create_new_inode_locks(struct inode *inode)
}
/*
- * We don't want to use LKM_LOCAL on a meta data lock as they
+ * We don't want to use DLM_LKF_LOCAL on a meta data lock as they
* don't use a generation in their lock names.
*/
ret = ocfs2_create_new_lock(osb, &OCFS2_I(inode)->ip_inode_lockres, 1, 0);
@@ -1261,7 +1362,7 @@ int ocfs2_rw_lock(struct inode *inode, int write)
lockres = &OCFS2_I(inode)->ip_rw_lockres;
- level = write ? LKM_EXMODE : LKM_PRMODE;
+ level = write ? DLM_LOCK_EX : DLM_LOCK_PR;
status = ocfs2_cluster_lock(OCFS2_SB(inode->i_sb), lockres, level, 0,
0);
@@ -1274,7 +1375,7 @@ int ocfs2_rw_lock(struct inode *inode, int write)
void ocfs2_rw_unlock(struct inode *inode, int write)
{
- int level = write ? LKM_EXMODE : LKM_PRMODE;
+ int level = write ? DLM_LOCK_EX : DLM_LOCK_PR;
struct ocfs2_lock_res *lockres = &OCFS2_I(inode)->ip_rw_lockres;
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
@@ -1312,7 +1413,7 @@ int ocfs2_open_lock(struct inode *inode)
lockres = &OCFS2_I(inode)->ip_open_lockres;
status = ocfs2_cluster_lock(OCFS2_SB(inode->i_sb), lockres,
- LKM_PRMODE, 0, 0);
+ DLM_LOCK_PR, 0, 0);
if (status < 0)
mlog_errno(status);
@@ -1340,16 +1441,16 @@ int ocfs2_try_open_lock(struct inode *inode, int write)
lockres = &OCFS2_I(inode)->ip_open_lockres;
- level = write ? LKM_EXMODE : LKM_PRMODE;
+ level = write ? DLM_LOCK_EX : DLM_LOCK_PR;
/*
* The file system may already be holding a PRMODE/EXMODE open lock.
- * Since we pass LKM_NOQUEUE, the request won't block waiting on
+ * Since we pass DLM_LKF_NOQUEUE, the request won't block waiting on
* other nodes and the -EAGAIN will indicate to the caller that
* this inode is still in use.
*/
status = ocfs2_cluster_lock(OCFS2_SB(inode->i_sb), lockres,
- level, LKM_NOQUEUE, 0);
+ level, DLM_LKF_NOQUEUE, 0);
out:
mlog_exit(status);
@@ -1374,10 +1475,10 @@ void ocfs2_open_unlock(struct inode *inode)
if(lockres->l_ro_holders)
ocfs2_cluster_unlock(OCFS2_SB(inode->i_sb), lockres,
- LKM_PRMODE);
+ DLM_LOCK_PR);
if(lockres->l_ex_holders)
ocfs2_cluster_unlock(OCFS2_SB(inode->i_sb), lockres,
- LKM_EXMODE);
+ DLM_LOCK_EX);
out:
mlog_exit_void();
@@ -1464,7 +1565,7 @@ int ocfs2_file_lock(struct file *file, int ex, int trylock)
ocfs2_init_mask_waiter(&mw);
if ((lockres->l_flags & OCFS2_LOCK_BUSY) ||
- (lockres->l_level > LKM_NLMODE)) {
+ (lockres->l_level > DLM_LOCK_NL)) {
mlog(ML_ERROR,
"File lock \"%s\" has busy or locked state: flags: 0x%lx, "
"level: %u\n", lockres->l_name, lockres->l_flags,
@@ -1503,14 +1604,12 @@ int ocfs2_file_lock(struct file *file, int ex, int trylock)
lockres_add_mask_waiter(lockres, &mw, OCFS2_LOCK_BUSY, 0);
spin_unlock_irqrestore(&lockres->l_lock, flags);
- ret = dlmlock(osb->dlm, level, &lockres->l_lksb, lkm_flags,
- lockres->l_name, OCFS2_LOCK_ID_MAX_LEN - 1,
- ocfs2_locking_ast, lockres, ocfs2_blocking_ast);
- if (ret != DLM_NORMAL) {
- if (trylock && ret == DLM_NOTQUEUED)
- ret = -EAGAIN;
- else {
- ocfs2_log_dlm_error("dlmlock", ret, lockres);
+ ret = ocfs2_dlm_lock(osb->cconn, level, &lockres->l_lksb, lkm_flags,
+ lockres->l_name, OCFS2_LOCK_ID_MAX_LEN - 1,
+ lockres);
+ if (ret) {
+ if (!trylock || (ret != -EAGAIN)) {
+ ocfs2_log_dlm_error("ocfs2_dlm_lock", ret, lockres);
ret = -EINVAL;
}
@@ -1537,6 +1636,10 @@ int ocfs2_file_lock(struct file *file, int ex, int trylock)
* to just bubble success back up to the user.
*/
ret = ocfs2_flock_handle_signal(lockres, level);
+ } else if (!ret && (level > lockres->l_level)) {
+ /* Trylock failed asynchronously */
+ BUG_ON(!trylock);
+ ret = -EAGAIN;
}
out:
@@ -1549,6 +1652,7 @@ out:
void ocfs2_file_unlock(struct file *file)
{
int ret;
+ unsigned int gen;
unsigned long flags;
struct ocfs2_file_private *fp = file->private_data;
struct ocfs2_lock_res *lockres = &fp->fp_flock;
@@ -1572,13 +1676,13 @@ void ocfs2_file_unlock(struct file *file)
* Fake a blocking ast for the downconvert code.
*/
lockres_or_flags(lockres, OCFS2_LOCK_BLOCKED);
- lockres->l_blocking = LKM_EXMODE;
+ lockres->l_blocking = DLM_LOCK_EX;
- ocfs2_prepare_downconvert(lockres, LKM_NLMODE);
+ gen = ocfs2_prepare_downconvert(lockres, LKM_NLMODE);
lockres_add_mask_waiter(lockres, &mw, OCFS2_LOCK_BUSY, 0);
spin_unlock_irqrestore(&lockres->l_lock, flags);
- ret = ocfs2_downconvert_lock(osb, lockres, LKM_NLMODE, 0);
+ ret = ocfs2_downconvert_lock(osb, lockres, LKM_NLMODE, 0, gen);
if (ret) {
mlog_errno(ret);
return;
@@ -1601,11 +1705,11 @@ static void ocfs2_downconvert_on_unlock(struct ocfs2_super *osb,
* condition. */
if (lockres->l_flags & OCFS2_LOCK_BLOCKED) {
switch(lockres->l_blocking) {
- case LKM_EXMODE:
+ case DLM_LOCK_EX:
if (!lockres->l_ex_holders && !lockres->l_ro_holders)
kick = 1;
break;
- case LKM_PRMODE:
+ case DLM_LOCK_PR:
if (!lockres->l_ex_holders)
kick = 1;
break;
@@ -1648,7 +1752,7 @@ static void __ocfs2_stuff_meta_lvb(struct inode *inode)
mlog_entry_void();
- lvb = (struct ocfs2_meta_lvb *) lockres->l_lksb.lvb;
+ lvb = (struct ocfs2_meta_lvb *)ocfs2_dlm_lvb(&lockres->l_lksb);
/*
* Invalidate the LVB of a deleted inode - this way other
@@ -1700,7 +1804,7 @@ static void ocfs2_refresh_inode_from_lvb(struct inode *inode)
mlog_meta_lvb(0, lockres);
- lvb = (struct ocfs2_meta_lvb *) lockres->l_lksb.lvb;
+ lvb = (struct ocfs2_meta_lvb *)ocfs2_dlm_lvb(&lockres->l_lksb);
/* We're safe here without the lockres lock... */
spin_lock(&oi->ip_lock);
@@ -1735,7 +1839,8 @@ static void ocfs2_refresh_inode_from_lvb(struct inode *inode)
static inline int ocfs2_meta_lvb_is_trustable(struct inode *inode,
struct ocfs2_lock_res *lockres)
{
- struct ocfs2_meta_lvb *lvb = (struct ocfs2_meta_lvb *) lockres->l_lksb.lvb;
+ struct ocfs2_meta_lvb *lvb =
+ (struct ocfs2_meta_lvb *)ocfs2_dlm_lvb(&lockres->l_lksb);
if (lvb->lvb_version == OCFS2_LVB_VERSION
&& be32_to_cpu(lvb->lvb_igeneration) == inode->i_generation)
@@ -1923,7 +2028,8 @@ int ocfs2_inode_lock_full(struct inode *inode,
int ex,
int arg_flags)
{
- int status, level, dlm_flags, acquired;
+ int status, level, acquired;
+ u32 dlm_flags;
struct ocfs2_lock_res *lockres = NULL;
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
struct buffer_head *local_bh = NULL;
@@ -1950,14 +2056,13 @@ int ocfs2_inode_lock_full(struct inode *inode,
goto local;
if (!(arg_flags & OCFS2_META_LOCK_RECOVERY))
- wait_event(osb->recovery_event,
- ocfs2_node_map_is_empty(osb, &osb->recovery_map));
+ ocfs2_wait_for_recovery(osb);
lockres = &OCFS2_I(inode)->ip_inode_lockres;
- level = ex ? LKM_EXMODE : LKM_PRMODE;
+ level = ex ? DLM_LOCK_EX : DLM_LOCK_PR;
dlm_flags = 0;
if (arg_flags & OCFS2_META_LOCK_NOQUEUE)
- dlm_flags |= LKM_NOQUEUE;
+ dlm_flags |= DLM_LKF_NOQUEUE;
status = ocfs2_cluster_lock(osb, lockres, level, dlm_flags, arg_flags);
if (status < 0) {
@@ -1974,8 +2079,7 @@ int ocfs2_inode_lock_full(struct inode *inode,
* committed to owning this lock so we don't allow signals to
* abort the operation. */
if (!(arg_flags & OCFS2_META_LOCK_RECOVERY))
- wait_event(osb->recovery_event,
- ocfs2_node_map_is_empty(osb, &osb->recovery_map));
+ ocfs2_wait_for_recovery(osb);
local:
/*
@@ -2109,7 +2213,7 @@ int ocfs2_inode_lock_atime(struct inode *inode,
void ocfs2_inode_unlock(struct inode *inode,
int ex)
{
- int level = ex ? LKM_EXMODE : LKM_PRMODE;
+ int level = ex ? DLM_LOCK_EX : DLM_LOCK_PR;
struct ocfs2_lock_res *lockres = &OCFS2_I(inode)->ip_inode_lockres;
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
@@ -2130,10 +2234,8 @@ int ocfs2_super_lock(struct ocfs2_super *osb,
int ex)
{
int status = 0;
- int level = ex ? LKM_EXMODE : LKM_PRMODE;
+ int level = ex ? DLM_LOCK_EX : DLM_LOCK_PR;
struct ocfs2_lock_res *lockres = &osb->osb_super_lockres;
- struct buffer_head *bh;
- struct ocfs2_slot_info *si = osb->slot_info;
mlog_entry_void();
@@ -2159,11 +2261,7 @@ int ocfs2_super_lock(struct ocfs2_super *osb,
goto bail;
}
if (status) {
- bh = si->si_bh;
- status = ocfs2_read_block(osb, bh->b_blocknr, &bh, 0,
- si->si_inode);
- if (status == 0)
- ocfs2_update_slot_info(si);
+ status = ocfs2_refresh_slot_info(osb);
ocfs2_complete_lock_res_refresh(lockres, status);
@@ -2178,7 +2276,7 @@ bail:
void ocfs2_super_unlock(struct ocfs2_super *osb,
int ex)
{
- int level = ex ? LKM_EXMODE : LKM_PRMODE;
+ int level = ex ? DLM_LOCK_EX : DLM_LOCK_PR;
struct ocfs2_lock_res *lockres = &osb->osb_super_lockres;
if (!ocfs2_mount_local(osb))
@@ -2196,7 +2294,7 @@ int ocfs2_rename_lock(struct ocfs2_super *osb)
if (ocfs2_mount_local(osb))
return 0;
- status = ocfs2_cluster_lock(osb, lockres, LKM_EXMODE, 0, 0);
+ status = ocfs2_cluster_lock(osb, lockres, DLM_LOCK_EX, 0, 0);
if (status < 0)
mlog_errno(status);
@@ -2208,13 +2306,13 @@ void ocfs2_rename_unlock(struct ocfs2_super *osb)
struct ocfs2_lock_res *lockres = &osb->osb_rename_lockres;
if (!ocfs2_mount_local(osb))
- ocfs2_cluster_unlock(osb, lockres, LKM_EXMODE);
+ ocfs2_cluster_unlock(osb, lockres, DLM_LOCK_EX);
}
int ocfs2_dentry_lock(struct dentry *dentry, int ex)
{
int ret;
- int level = ex ? LKM_EXMODE : LKM_PRMODE;
+ int level = ex ? DLM_LOCK_EX : DLM_LOCK_PR;
struct ocfs2_dentry_lock *dl = dentry->d_fsdata;
struct ocfs2_super *osb = OCFS2_SB(dentry->d_sb);
@@ -2235,7 +2333,7 @@ int ocfs2_dentry_lock(struct dentry *dentry, int ex)
void ocfs2_dentry_unlock(struct dentry *dentry, int ex)
{
- int level = ex ? LKM_EXMODE : LKM_PRMODE;
+ int level = ex ? DLM_LOCK_EX : DLM_LOCK_PR;
struct ocfs2_dentry_lock *dl = dentry->d_fsdata;
struct ocfs2_super *osb = OCFS2_SB(dentry->d_sb);
@@ -2400,7 +2498,7 @@ static int ocfs2_dlm_seq_show(struct seq_file *m, void *v)
lockres->l_blocking);
/* Dump the raw LVB */
- lvb = lockres->l_lksb.lvb;
+ lvb = ocfs2_dlm_lvb(&lockres->l_lksb);
for(i = 0; i < DLM_LVB_LEN; i++)
seq_printf(m, "0x%x\t", lvb[i]);
@@ -2504,13 +2602,14 @@ static void ocfs2_dlm_shutdown_debug(struct ocfs2_super *osb)
int ocfs2_dlm_init(struct ocfs2_super *osb)
{
int status = 0;
- u32 dlm_key;
- struct dlm_ctxt *dlm = NULL;
+ struct ocfs2_cluster_connection *conn = NULL;
mlog_entry_void();
- if (ocfs2_mount_local(osb))
+ if (ocfs2_mount_local(osb)) {
+ osb->node_num = 0;
goto local;
+ }
status = ocfs2_dlm_init_debug(osb);
if (status < 0) {
@@ -2527,26 +2626,31 @@ int ocfs2_dlm_init(struct ocfs2_super *osb)
goto bail;
}
- /* used by the dlm code to make message headers unique, each
- * node in this domain must agree on this. */
- dlm_key = crc32_le(0, osb->uuid_str, strlen(osb->uuid_str));
-
/* for now, uuid == domain */
- dlm = dlm_register_domain(osb->uuid_str, dlm_key,
- &osb->osb_locking_proto);
- if (IS_ERR(dlm)) {
- status = PTR_ERR(dlm);
+ status = ocfs2_cluster_connect(osb->osb_cluster_stack,
+ osb->uuid_str,
+ strlen(osb->uuid_str),
+ ocfs2_do_node_down, osb,
+ &conn);
+ if (status) {
mlog_errno(status);
goto bail;
}
- dlm_register_eviction_cb(dlm, &osb->osb_eviction_cb);
+ status = ocfs2_cluster_this_node(&osb->node_num);
+ if (status < 0) {
+ mlog_errno(status);
+ mlog(ML_ERROR,
+ "could not find this host's node number\n");
+ ocfs2_cluster_disconnect(conn, 0);
+ goto bail;
+ }
local:
ocfs2_super_lock_res_init(&osb->osb_super_lockres, osb);
ocfs2_rename_lock_res_init(&osb->osb_rename_lockres, osb);
- osb->dlm = dlm;
+ osb->cconn = conn;
status = 0;
bail:
@@ -2560,14 +2664,19 @@ bail:
return status;
}
-void ocfs2_dlm_shutdown(struct ocfs2_super *osb)
+void ocfs2_dlm_shutdown(struct ocfs2_super *osb,
+ int hangup_pending)
{
mlog_entry_void();
- dlm_unregister_eviction_cb(&osb->osb_eviction_cb);
-
ocfs2_drop_osb_locks(osb);
+ /*
+ * Now that we have dropped all locks and ocfs2_dismount_volume()
+ * has disabled recovery, the DLM won't be talking to us. It's
+ * safe to tear things down before disconnecting the cluster.
+ */
+
if (osb->dc_task) {
kthread_stop(osb->dc_task);
osb->dc_task = NULL;
@@ -2576,15 +2685,15 @@ void ocfs2_dlm_shutdown(struct ocfs2_super *osb)
ocfs2_lock_res_free(&osb->osb_super_lockres);
ocfs2_lock_res_free(&osb->osb_rename_lockres);
- dlm_unregister_domain(osb->dlm);
- osb->dlm = NULL;
+ ocfs2_cluster_disconnect(osb->cconn, hangup_pending);
+ osb->cconn = NULL;
ocfs2_dlm_shutdown_debug(osb);
mlog_exit_void();
}
-static void ocfs2_unlock_ast(void *opaque, enum dlm_status status)
+static void ocfs2_unlock_ast(void *opaque, int error)
{
struct ocfs2_lock_res *lockres = opaque;
unsigned long flags;
@@ -2595,24 +2704,9 @@ static void ocfs2_unlock_ast(void *opaque, enum dlm_status status)
lockres->l_unlock_action);
spin_lock_irqsave(&lockres->l_lock, flags);
- /* We tried to cancel a convert request, but it was already
- * granted. All we want to do here is clear our unlock
- * state. The wake_up call done at the bottom is redundant
- * (ocfs2_prepare_cancel_convert doesn't sleep on this) but doesn't
- * hurt anything anyway */
- if (status == DLM_CANCELGRANT &&
- lockres->l_unlock_action == OCFS2_UNLOCK_CANCEL_CONVERT) {
- mlog(0, "Got cancelgrant for %s\n", lockres->l_name);
-
- /* We don't clear the busy flag in this case as it
- * should have been cleared by the ast which the dlm
- * has called. */
- goto complete_unlock;
- }
-
- if (status != DLM_NORMAL) {
- mlog(ML_ERROR, "Dlm passes status %d for lock %s, "
- "unlock_action %d\n", status, lockres->l_name,
+ if (error) {
+ mlog(ML_ERROR, "Dlm passes error %d for lock %s, "
+ "unlock_action %d\n", error, lockres->l_name,
lockres->l_unlock_action);
spin_unlock_irqrestore(&lockres->l_lock, flags);
return;
@@ -2624,14 +2718,13 @@ static void ocfs2_unlock_ast(void *opaque, enum dlm_status status)
lockres->l_action = OCFS2_AST_INVALID;
break;
case OCFS2_UNLOCK_DROP_LOCK:
- lockres->l_level = LKM_IVMODE;
+ lockres->l_level = DLM_LOCK_IV;
break;
default:
BUG();
}
lockres_clear_flags(lockres, OCFS2_LOCK_BUSY);
-complete_unlock:
lockres->l_unlock_action = OCFS2_UNLOCK_INVALID;
spin_unlock_irqrestore(&lockres->l_lock, flags);
@@ -2643,16 +2736,16 @@ complete_unlock:
static int ocfs2_drop_lock(struct ocfs2_super *osb,
struct ocfs2_lock_res *lockres)
{
- enum dlm_status status;
+ int ret;
unsigned long flags;
- int lkm_flags = 0;
+ u32 lkm_flags = 0;
/* We didn't get anywhere near actually using this lockres. */
if (!(lockres->l_flags & OCFS2_LOCK_INITIALIZED))
goto out;
if (lockres->l_ops->flags & LOCK_TYPE_USES_LVB)
- lkm_flags |= LKM_VALBLK;
+ lkm_flags |= DLM_LKF_VALBLK;
spin_lock_irqsave(&lockres->l_lock, flags);
@@ -2678,7 +2771,7 @@ static int ocfs2_drop_lock(struct ocfs2_super *osb,
if (lockres->l_ops->flags & LOCK_TYPE_USES_LVB) {
if (lockres->l_flags & OCFS2_LOCK_ATTACHED &&
- lockres->l_level == LKM_EXMODE &&
+ lockres->l_level == DLM_LOCK_EX &&
!(lockres->l_flags & OCFS2_LOCK_NEEDS_REFRESH))
lockres->l_ops->set_lvb(lockres);
}
@@ -2707,15 +2800,15 @@ static int ocfs2_drop_lock(struct ocfs2_super *osb,
mlog(0, "lock %s\n", lockres->l_name);
- status = dlmunlock(osb->dlm, &lockres->l_lksb, lkm_flags,
- ocfs2_unlock_ast, lockres);
- if (status != DLM_NORMAL) {
- ocfs2_log_dlm_error("dlmunlock", status, lockres);
+ ret = ocfs2_dlm_unlock(osb->cconn, &lockres->l_lksb, lkm_flags,
+ lockres);
+ if (ret) {
+ ocfs2_log_dlm_error("ocfs2_dlm_unlock", ret, lockres);
mlog(ML_ERROR, "lockres flags: %lu\n", lockres->l_flags);
- dlm_print_one_lock(lockres->l_lksb.lockid);
+ ocfs2_dlm_dump_lksb(&lockres->l_lksb);
BUG();
}
- mlog(0, "lock %s, successfull return from dlmunlock\n",
+ mlog(0, "lock %s, successfull return from ocfs2_dlm_unlock\n",
lockres->l_name);
ocfs2_wait_on_busy_lock(lockres);
@@ -2806,15 +2899,15 @@ int ocfs2_drop_inode_locks(struct inode *inode)
return status;
}
-static void ocfs2_prepare_downconvert(struct ocfs2_lock_res *lockres,
- int new_level)
+static unsigned int ocfs2_prepare_downconvert(struct ocfs2_lock_res *lockres,
+ int new_level)
{
assert_spin_locked(&lockres->l_lock);
- BUG_ON(lockres->l_blocking <= LKM_NLMODE);
+ BUG_ON(lockres->l_blocking <= DLM_LOCK_NL);
if (lockres->l_level <= new_level) {
- mlog(ML_ERROR, "lockres->l_level (%u) <= new_level (%u)\n",
+ mlog(ML_ERROR, "lockres->l_level (%d) <= new_level (%d)\n",
lockres->l_level, new_level);
BUG();
}
@@ -2825,33 +2918,33 @@ static void ocfs2_prepare_downconvert(struct ocfs2_lock_res *lockres,
lockres->l_action = OCFS2_AST_DOWNCONVERT;
lockres->l_requested = new_level;
lockres_or_flags(lockres, OCFS2_LOCK_BUSY);
+ return lockres_set_pending(lockres);
}
static int ocfs2_downconvert_lock(struct ocfs2_super *osb,
struct ocfs2_lock_res *lockres,
int new_level,
- int lvb)
+ int lvb,
+ unsigned int generation)
{
- int ret, dlm_flags = LKM_CONVERT;
- enum dlm_status status;
+ int ret;
+ u32 dlm_flags = DLM_LKF_CONVERT;
mlog_entry_void();
if (lvb)
- dlm_flags |= LKM_VALBLK;
-
- status = dlmlock(osb->dlm,
- new_level,
- &lockres->l_lksb,
- dlm_flags,
- lockres->l_name,
- OCFS2_LOCK_ID_MAX_LEN - 1,
- ocfs2_locking_ast,
- lockres,
- ocfs2_blocking_ast);
- if (status != DLM_NORMAL) {
- ocfs2_log_dlm_error("dlmlock", status, lockres);
- ret = -EINVAL;
+ dlm_flags |= DLM_LKF_VALBLK;
+
+ ret = ocfs2_dlm_lock(osb->cconn,
+ new_level,
+ &lockres->l_lksb,
+ dlm_flags,
+ lockres->l_name,
+ OCFS2_LOCK_ID_MAX_LEN - 1,
+ lockres);
+ lockres_clear_pending(lockres, generation, osb);
+ if (ret) {
+ ocfs2_log_dlm_error("ocfs2_dlm_lock", ret, lockres);
ocfs2_recover_from_dlm_error(lockres, 1);
goto bail;
}
@@ -2862,7 +2955,7 @@ bail:
return ret;
}
-/* returns 1 when the caller should unlock and call dlmunlock */
+/* returns 1 when the caller should unlock and call ocfs2_dlm_unlock */
static int ocfs2_prepare_cancel_convert(struct ocfs2_super *osb,
struct ocfs2_lock_res *lockres)
{
@@ -2898,24 +2991,18 @@ static int ocfs2_cancel_convert(struct ocfs2_super *osb,
struct ocfs2_lock_res *lockres)
{
int ret;
- enum dlm_status status;
mlog_entry_void();
mlog(0, "lock %s\n", lockres->l_name);
- ret = 0;
- status = dlmunlock(osb->dlm,
- &lockres->l_lksb,
- LKM_CANCEL,
- ocfs2_unlock_ast,
- lockres);
- if (status != DLM_NORMAL) {
- ocfs2_log_dlm_error("dlmunlock", status, lockres);
- ret = -EINVAL;
+ ret = ocfs2_dlm_unlock(osb->cconn, &lockres->l_lksb,
+ DLM_LKF_CANCEL, lockres);
+ if (ret) {
+ ocfs2_log_dlm_error("ocfs2_dlm_unlock", ret, lockres);
ocfs2_recover_from_dlm_error(lockres, 0);
}
- mlog(0, "lock %s return from dlmunlock\n", lockres->l_name);
+ mlog(0, "lock %s return from ocfs2_dlm_unlock\n", lockres->l_name);
mlog_exit(ret);
return ret;
@@ -2930,6 +3017,7 @@ static int ocfs2_unblock_lock(struct ocfs2_super *osb,
int new_level;
int ret = 0;
int set_lvb = 0;
+ unsigned int gen;
mlog_entry_void();
@@ -2939,6 +3027,32 @@ static int ocfs2_unblock_lock(struct ocfs2_super *osb,
recheck:
if (lockres->l_flags & OCFS2_LOCK_BUSY) {
+ /* XXX
+ * This is a *big* race. The OCFS2_LOCK_PENDING flag
+ * exists entirely for one reason - another thread has set
+ * OCFS2_LOCK_BUSY, but has *NOT* yet called dlm_lock().
+ *
+ * If we do ocfs2_cancel_convert() before the other thread
+ * calls dlm_lock(), our cancel will do nothing. We will
+ * get no ast, and we will have no way of knowing the
+ * cancel failed. Meanwhile, the other thread will call
+ * into dlm_lock() and wait...forever.
+ *
+ * Why forever? Because another node has asked for the
+ * lock first; that's why we're here in unblock_lock().
+ *
+ * The solution is OCFS2_LOCK_PENDING. When PENDING is
+ * set, we just requeue the unblock. Only when the other
+ * thread has called dlm_lock() and cleared PENDING will
+ * we then cancel their request.
+ *
+ * All callers of dlm_lock() must set OCFS2_LOCK_PENDING
+ * at the same time they set OCFS2_LOCK_BUSY. They must
+ * clear OCFS2_LOCK_PENDING after dlm_lock() returns.
+ */
+ if (lockres->l_flags & OCFS2_LOCK_PENDING)
+ goto leave_requeue;
+
ctl->requeue = 1;
ret = ocfs2_prepare_cancel_convert(osb, lockres);
spin_unlock_irqrestore(&lockres->l_lock, flags);
@@ -2952,13 +3066,13 @@ recheck:
/* if we're blocking an exclusive and we have *any* holders,
* then requeue. */
- if ((lockres->l_blocking == LKM_EXMODE)
+ if ((lockres->l_blocking == DLM_LOCK_EX)
&& (lockres->l_ex_holders || lockres->l_ro_holders))
goto leave_requeue;
/* If it's a PR we're blocking, then only
* requeue if we've got any EX holders */
- if (lockres->l_blocking == LKM_PRMODE &&
+ if (lockres->l_blocking == DLM_LOCK_PR &&
lockres->l_ex_holders)
goto leave_requeue;
@@ -3005,7 +3119,7 @@ downconvert:
ctl->requeue = 0;
if (lockres->l_ops->flags & LOCK_TYPE_USES_LVB) {
- if (lockres->l_level == LKM_EXMODE)
+ if (lockres->l_level == DLM_LOCK_EX)
set_lvb = 1;
/*
@@ -3018,9 +3132,11 @@ downconvert:
lockres->l_ops->set_lvb(lockres);
}
- ocfs2_prepare_downconvert(lockres, new_level);
+ gen = ocfs2_prepare_downconvert(lockres, new_level);
spin_unlock_irqrestore(&lockres->l_lock, flags);
- ret = ocfs2_downconvert_lock(osb, lockres, new_level, set_lvb);
+ ret = ocfs2_downconvert_lock(osb, lockres, new_level, set_lvb,
+ gen);
+
leave:
mlog_exit(ret);
return ret;
@@ -3059,7 +3175,7 @@ static int ocfs2_data_convert_worker(struct ocfs2_lock_res *lockres,
(unsigned long long)OCFS2_I(inode)->ip_blkno);
}
sync_mapping_buffers(mapping);
- if (blocking == LKM_EXMODE) {
+ if (blocking == DLM_LOCK_EX) {
truncate_inode_pages(mapping, 0);
} else {
/* We only need to wait on the I/O if we're not also
@@ -3080,8 +3196,8 @@ static int ocfs2_check_meta_downconvert(struct ocfs2_lock_res *lockres,
struct inode *inode = ocfs2_lock_res_inode(lockres);
int checkpointed = ocfs2_inode_fully_checkpointed(inode);
- BUG_ON(new_level != LKM_NLMODE && new_level != LKM_PRMODE);
- BUG_ON(lockres->l_level != LKM_EXMODE && !checkpointed);
+ BUG_ON(new_level != DLM_LOCK_NL && new_level != DLM_LOCK_PR);
+ BUG_ON(lockres->l_level != DLM_LOCK_EX && !checkpointed);
if (checkpointed)
return 1;
@@ -3145,7 +3261,7 @@ static int ocfs2_dentry_convert_worker(struct ocfs2_lock_res *lockres,
* valid. The downconvert code will retain a PR for this node,
* so there's no further work to do.
*/
- if (blocking == LKM_PRMODE)
+ if (blocking == DLM_LOCK_PR)
return UNBLOCK_CONTINUE;
/*
@@ -3219,6 +3335,45 @@ static int ocfs2_dentry_convert_worker(struct ocfs2_lock_res *lockres,
return UNBLOCK_CONTINUE_POST;
}
+/*
+ * This is the filesystem locking protocol. It provides the lock handling
+ * hooks for the underlying DLM. It has a maximum version number.
+ * The version number allows interoperability with systems running at
+ * the same major number and an equal or smaller minor number.
+ *
+ * Whenever the filesystem does new things with locks (adds or removes a
+ * lock, orders them differently, does different things underneath a lock),
+ * the version must be changed. The protocol is negotiated when joining
+ * the dlm domain. A node may join the domain if its major version is
+ * identical to all other nodes and its minor version is greater than
+ * or equal to all other nodes. When its minor version is greater than
+ * the other nodes, it will run at the minor version specified by the
+ * other nodes.
+ *
+ * If a locking change is made that will not be compatible with older
+ * versions, the major number must be increased and the minor version set
+ * to zero. If a change merely adds a behavior that can be disabled when
+ * speaking to older versions, the minor version must be increased. If a
+ * change adds a fully backwards compatible change (eg, LVB changes that
+ * are just ignored by older versions), the version does not need to be
+ * updated.
+ */
+static struct ocfs2_locking_protocol lproto = {
+ .lp_max_version = {
+ .pv_major = OCFS2_LOCKING_PROTOCOL_MAJOR,
+ .pv_minor = OCFS2_LOCKING_PROTOCOL_MINOR,
+ },
+ .lp_lock_ast = ocfs2_locking_ast,
+ .lp_blocking_ast = ocfs2_blocking_ast,
+ .lp_unlock_ast = ocfs2_unlock_ast,
+};
+
+void ocfs2_set_locking_protocol(void)
+{
+ ocfs2_stack_glue_set_locking_protocol(&lproto);
+}
+
+
static void ocfs2_process_blocked_lock(struct ocfs2_super *osb,
struct ocfs2_lock_res *lockres)
{
diff --git a/fs/ocfs2/dlmglue.h b/fs/ocfs2/dlmglue.h
index e3cf902404b..2bb01f09c1b 100644
--- a/fs/ocfs2/dlmglue.h
+++ b/fs/ocfs2/dlmglue.h
@@ -58,7 +58,7 @@ struct ocfs2_meta_lvb {
#define OCFS2_LOCK_NONBLOCK (0x04)
int ocfs2_dlm_init(struct ocfs2_super *osb);
-void ocfs2_dlm_shutdown(struct ocfs2_super *osb);
+void ocfs2_dlm_shutdown(struct ocfs2_super *osb, int hangup_pending);
void ocfs2_lock_res_init_once(struct ocfs2_lock_res *res);
void ocfs2_inode_lock_res_init(struct ocfs2_lock_res *res,
enum ocfs2_lock_type type,
@@ -114,5 +114,6 @@ void ocfs2_wake_downconvert_thread(struct ocfs2_super *osb);
struct ocfs2_dlm_debug *ocfs2_new_dlm_debug(void);
void ocfs2_put_dlm_debug(struct ocfs2_dlm_debug *dlm_debug);
-extern const struct dlm_protocol_version ocfs2_locking_protocol;
+/* To set the locking protocol on module initialization */
+void ocfs2_set_locking_protocol(void);
#endif /* DLMGLUE_H */
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index ed5d5232e85..9154c82d325 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -2242,7 +2242,7 @@ const struct file_operations ocfs2_fops = {
.open = ocfs2_file_open,
.aio_read = ocfs2_file_aio_read,
.aio_write = ocfs2_file_aio_write,
- .ioctl = ocfs2_ioctl,
+ .unlocked_ioctl = ocfs2_ioctl,
#ifdef CONFIG_COMPAT
.compat_ioctl = ocfs2_compat_ioctl,
#endif
@@ -2258,7 +2258,7 @@ const struct file_operations ocfs2_dops = {
.fsync = ocfs2_sync_file,
.release = ocfs2_dir_release,
.open = ocfs2_dir_open,
- .ioctl = ocfs2_ioctl,
+ .unlocked_ioctl = ocfs2_ioctl,
#ifdef CONFIG_COMPAT
.compat_ioctl = ocfs2_compat_ioctl,
#endif
diff --git a/fs/ocfs2/heartbeat.c b/fs/ocfs2/heartbeat.c
index 0758daf64da..c6e7213db86 100644
--- a/fs/ocfs2/heartbeat.c
+++ b/fs/ocfs2/heartbeat.c
@@ -28,9 +28,6 @@
#include <linux/types.h>
#include <linux/slab.h>
#include <linux/highmem.h>
-#include <linux/kmod.h>
-
-#include <dlm/dlmapi.h>
#define MLOG_MASK_PREFIX ML_SUPER
#include <cluster/masklog.h>
@@ -48,7 +45,6 @@ static inline void __ocfs2_node_map_set_bit(struct ocfs2_node_map *map,
int bit);
static inline void __ocfs2_node_map_clear_bit(struct ocfs2_node_map *map,
int bit);
-static inline int __ocfs2_node_map_is_empty(struct ocfs2_node_map *map);
/* special case -1 for now
* TODO: should *really* make sure the calling func never passes -1!! */
@@ -62,23 +58,23 @@ static void ocfs2_node_map_init(struct ocfs2_node_map *map)
void ocfs2_init_node_maps(struct ocfs2_super *osb)
{
spin_lock_init(&osb->node_map_lock);
- ocfs2_node_map_init(&osb->recovery_map);
ocfs2_node_map_init(&osb->osb_recovering_orphan_dirs);
}
-static void ocfs2_do_node_down(int node_num,
- struct ocfs2_super *osb)
+void ocfs2_do_node_down(int node_num, void *data)
{
+ struct ocfs2_super *osb = data;
+
BUG_ON(osb->node_num == node_num);
mlog(0, "ocfs2: node down event for %d\n", node_num);
- if (!osb->dlm) {
+ if (!osb->cconn) {
/*
- * No DLM means we're not even ready to participate yet.
- * We check the slots after the DLM comes up, so we will
- * notice the node death then. We can safely ignore it
- * here.
+ * No cluster connection means we're not even ready to
+ * participate yet. We check the slots after the cluster
+ * comes up, so we will notice the node death then. We
+ * can safely ignore it here.
*/
return;
}
@@ -86,61 +82,6 @@ static void ocfs2_do_node_down(int node_num,
ocfs2_recovery_thread(osb, node_num);
}
-/* Called from the dlm when it's about to evict a node. We may also
- * get a heartbeat callback later. */
-static void ocfs2_dlm_eviction_cb(int node_num,
- void *data)
-{
- struct ocfs2_super *osb = (struct ocfs2_super *) data;
- struct super_block *sb = osb->sb;
-
- mlog(ML_NOTICE, "device (%u,%u): dlm has evicted node %d\n",
- MAJOR(sb->s_dev), MINOR(sb->s_dev), node_num);
-
- ocfs2_do_node_down(node_num, osb);
-}
-
-void ocfs2_setup_hb_callbacks(struct ocfs2_super *osb)
-{
- /* Not exactly a heartbeat callback, but leads to essentially
- * the same path so we set it up here. */
- dlm_setup_eviction_cb(&osb->osb_eviction_cb,
- ocfs2_dlm_eviction_cb,
- osb);
-}
-
-void ocfs2_stop_heartbeat(struct ocfs2_super *osb)
-{
- int ret;
- char *argv[5], *envp[3];
-
- if (ocfs2_mount_local(osb))
- return;
-
- if (!osb->uuid_str) {
- /* This can happen if we don't get far enough in mount... */
- mlog(0, "No UUID with which to stop heartbeat!\n\n");
- return;
- }
-
- argv[0] = (char *)o2nm_get_hb_ctl_path();
- argv[1] = "-K";
- argv[2] = "-u";
- argv[3] = osb->uuid_str;
- argv[4] = NULL;
-
- mlog(0, "Run: %s %s %s %s\n", argv[0], argv[1], argv[2], argv[3]);
-
- /* minimal command environment taken from cpu_run_sbin_hotplug */
- envp[0] = "HOME=/";
- envp[1] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin";
- envp[2] = NULL;
-
- ret = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC);
- if (ret < 0)
- mlog_errno(ret);
-}
-
static inline void __ocfs2_node_map_set_bit(struct ocfs2_node_map *map,
int bit)
{
@@ -192,112 +133,3 @@ int ocfs2_node_map_test_bit(struct ocfs2_super *osb,
return ret;
}
-static inline int __ocfs2_node_map_is_empty(struct ocfs2_node_map *map)
-{
- int bit;
- bit = find_next_bit(map->map, map->num_nodes, 0);
- if (bit < map->num_nodes)
- return 0;
- return 1;
-}
-
-int ocfs2_node_map_is_empty(struct ocfs2_super *osb,
- struct ocfs2_node_map *map)
-{
- int ret;
- BUG_ON(map->num_nodes == 0);
- spin_lock(&osb->node_map_lock);
- ret = __ocfs2_node_map_is_empty(map);
- spin_unlock(&osb->node_map_lock);
- return ret;
-}
-
-#if 0
-
-static void __ocfs2_node_map_dup(struct ocfs2_node_map *target,
- struct ocfs2_node_map *from)
-{
- BUG_ON(from->num_nodes == 0);
- ocfs2_node_map_init(target);
- __ocfs2_node_map_set(target, from);
-}
-
-/* returns 1 if bit is the only bit set in target, 0 otherwise */
-int ocfs2_node_map_is_only(struct ocfs2_super *osb,
- struct ocfs2_node_map *target,
- int bit)
-{
- struct ocfs2_node_map temp;
- int ret;
-
- spin_lock(&osb->node_map_lock);
- __ocfs2_node_map_dup(&temp, target);
- __ocfs2_node_map_clear_bit(&temp, bit);
- ret = __ocfs2_node_map_is_empty(&temp);
- spin_unlock(&osb->node_map_lock);
-
- return ret;
-}
-
-static void __ocfs2_node_map_set(struct ocfs2_node_map *target,
- struct ocfs2_node_map *from)
-{
- int num_longs, i;
-
- BUG_ON(target->num_nodes != from->num_nodes);
- BUG_ON(target->num_nodes == 0);
-
- num_longs = BITS_TO_LONGS(target->num_nodes);
- for (i = 0; i < num_longs; i++)
- target->map[i] = from->map[i];
-}
-
-#endif /* 0 */
-
-/* Returns whether the recovery bit was actually set - it may not be
- * if a node is still marked as needing recovery */
-int ocfs2_recovery_map_set(struct ocfs2_super *osb,
- int num)
-{
- int set = 0;
-
- spin_lock(&osb->node_map_lock);
-
- if (!test_bit(num, osb->recovery_map.map)) {
- __ocfs2_node_map_set_bit(&osb->recovery_map, num);
- set = 1;
- }
-
- spin_unlock(&osb->node_map_lock);
-
- return set;
-}
-
-void ocfs2_recovery_map_clear(struct ocfs2_super *osb,
- int num)
-{
- ocfs2_node_map_clear_bit(osb, &osb->recovery_map, num);
-}
-
-int ocfs2_node_map_iterate(struct ocfs2_super *osb,
- struct ocfs2_node_map *map,
- int idx)
-{
- int i = idx;
-
- idx = O2NM_INVALID_NODE_NUM;
- spin_lock(&osb->node_map_lock);
- if ((i != O2NM_INVALID_NODE_NUM) &&
- (i >= 0) &&
- (i < map->num_nodes)) {
- while(i < map->num_nodes) {
- if (test_bit(i, map->map)) {
- idx = i;
- break;
- }
- i++;
- }
- }
- spin_unlock(&osb->node_map_lock);
- return idx;
-}
diff --git a/fs/ocfs2/heartbeat.h b/fs/ocfs2/heartbeat.h
index eac63aed761..74b9c5dda28 100644
--- a/fs/ocfs2/heartbeat.h
+++ b/fs/ocfs2/heartbeat.h
@@ -28,13 +28,10 @@
void ocfs2_init_node_maps(struct ocfs2_super *osb);
-void ocfs2_setup_hb_callbacks(struct ocfs2_super *osb);
-void ocfs2_stop_heartbeat(struct ocfs2_super *osb);
+void ocfs2_do_node_down(int node_num, void *data);
/* node map functions - used to keep track of mounted and in-recovery
* nodes. */
-int ocfs2_node_map_is_empty(struct ocfs2_super *osb,
- struct ocfs2_node_map *map);
void ocfs2_node_map_set_bit(struct ocfs2_super *osb,
struct ocfs2_node_map *map,
int bit);
@@ -44,17 +41,5 @@ void ocfs2_node_map_clear_bit(struct ocfs2_super *osb,
int ocfs2_node_map_test_bit(struct ocfs2_super *osb,
struct ocfs2_node_map *map,
int bit);
-int ocfs2_node_map_iterate(struct ocfs2_super *osb,
- struct ocfs2_node_map *map,
- int idx);
-static inline int ocfs2_node_map_first_set_bit(struct ocfs2_super *osb,
- struct ocfs2_node_map *map)
-{
- return ocfs2_node_map_iterate(osb, map, 0);
-}
-int ocfs2_recovery_map_set(struct ocfs2_super *osb,
- int num);
-void ocfs2_recovery_map_clear(struct ocfs2_super *osb,
- int num);
#endif /* OCFS2_HEARTBEAT_H */
diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c
index 5177fba5162..7b142f0ce99 100644
--- a/fs/ocfs2/ioctl.c
+++ b/fs/ocfs2/ioctl.c
@@ -7,6 +7,7 @@
#include <linux/fs.h>
#include <linux/mount.h>
+#include <linux/smp_lock.h>
#define MLOG_MASK_PREFIX ML_INODE
#include <cluster/masklog.h>
@@ -59,10 +60,6 @@ static int ocfs2_set_inode_attr(struct inode *inode, unsigned flags,
goto bail;
}
- status = -EROFS;
- if (IS_RDONLY(inode))
- goto bail_unlock;
-
status = -EACCES;
if (!is_owner_or_cap(inode))
goto bail_unlock;
@@ -112,9 +109,9 @@ bail:
return status;
}
-int ocfs2_ioctl(struct inode * inode, struct file * filp,
- unsigned int cmd, unsigned long arg)
+long ocfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
{
+ struct inode *inode = filp->f_path.dentry->d_inode;
unsigned int flags;
int new_clusters;
int status;
@@ -133,8 +130,13 @@ int ocfs2_ioctl(struct inode * inode, struct file * filp,
if (get_user(flags, (int __user *) arg))
return -EFAULT;
- return ocfs2_set_inode_attr(inode, flags,
+ status = mnt_want_write(filp->f_path.mnt);
+ if (status)
+ return status;
+ status = ocfs2_set_inode_attr(inode, flags,
OCFS2_FL_MODIFIABLE);
+ mnt_drop_write(filp->f_path.mnt);
+ return status;
case OCFS2_IOC_RESVSP:
case OCFS2_IOC_RESVSP64:
case OCFS2_IOC_UNRESVSP:
@@ -168,9 +170,6 @@ int ocfs2_ioctl(struct inode * inode, struct file * filp,
#ifdef CONFIG_COMPAT
long ocfs2_compat_ioctl(struct file *file, unsigned cmd, unsigned long arg)
{
- struct inode *inode = file->f_path.dentry->d_inode;
- int ret;
-
switch (cmd) {
case OCFS2_IOC32_GETFLAGS:
cmd = OCFS2_IOC_GETFLAGS;
@@ -190,9 +189,6 @@ long ocfs2_compat_ioctl(struct file *file, unsigned cmd, unsigned long arg)
return -ENOIOCTLCMD;
}
- lock_kernel();
- ret = ocfs2_ioctl(inode, file, cmd, arg);
- unlock_kernel();
- return ret;
+ return ocfs2_ioctl(file, cmd, arg);
}
#endif
diff --git a/fs/ocfs2/ioctl.h b/fs/ocfs2/ioctl.h
index 4d6c4f430d0..cf9a5ee30fe 100644
--- a/fs/ocfs2/ioctl.h
+++ b/fs/ocfs2/ioctl.h
@@ -10,8 +10,7 @@
#ifndef OCFS2_IOCTL_H
#define OCFS2_IOCTL_H
-int ocfs2_ioctl(struct inode * inode, struct file * filp,
- unsigned int cmd, unsigned long arg);
+long ocfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg);
long ocfs2_compat_ioctl(struct file *file, unsigned cmd, unsigned long arg);
#endif /* OCFS2_IOCTL_H */
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index f31c7e8c19c..9698338adc3 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -64,6 +64,137 @@ static int ocfs2_recover_orphans(struct ocfs2_super *osb,
int slot);
static int ocfs2_commit_thread(void *arg);
+
+/*
+ * The recovery map is a simple array of node numbers to recover.
+ * It is protected by the osb_lock.
+ */
+
+struct ocfs2_recovery_map {
+ unsigned int rm_used;
+ unsigned int *rm_entries;
+};
+
+int ocfs2_recovery_init(struct ocfs2_super *osb)
+{
+ struct ocfs2_recovery_map *rm;
+
+ mutex_init(&osb->recovery_lock);
+ osb->disable_recovery = 0;
+ osb->recovery_thread_task = NULL;
+ init_waitqueue_head(&osb->recovery_event);
+
+ rm = kzalloc(sizeof(struct ocfs2_recovery_map) +
+ osb->max_slots * sizeof(unsigned int),
+ GFP_KERNEL);
+ if (!rm) {
+ mlog_errno(-ENOMEM);
+ return -ENOMEM;
+ }
+
+ rm->rm_entries = (unsigned int *)((char *)rm +
+ sizeof(struct ocfs2_recovery_map));
+ osb->recovery_map = rm;
+
+ return 0;
+}
+
+/* we can't grab the goofy sem lock from inside wait_event, so we use
+ * memory barriers to make sure that we'll see the null task before
+ * being woken up */
+static int ocfs2_recovery_thread_running(struct ocfs2_super *osb)
+{
+ mb();
+ return osb->recovery_thread_task != NULL;
+}
+
+void ocfs2_recovery_exit(struct ocfs2_super *osb)
+{
+ struct ocfs2_recovery_map *rm;
+
+ /* disable any new recovery threads and wait for any currently
+ * running ones to exit. Do this before setting the vol_state. */
+ mutex_lock(&osb->recovery_lock);
+ osb->disable_recovery = 1;
+ mutex_unlock(&osb->recovery_lock);
+ wait_event(osb->recovery_event, !ocfs2_recovery_thread_running(osb));
+
+ /* At this point, we know that no more recovery threads can be
+ * launched, so wait for any recovery completion work to
+ * complete. */
+ flush_workqueue(ocfs2_wq);
+
+ /*
+ * Now that recovery is shut down, and the osb is about to be
+ * freed, the osb_lock is not taken here.
+ */
+ rm = osb->recovery_map;
+ /* XXX: Should we bug if there are dirty entries? */
+
+ kfree(rm);
+}
+
+static int __ocfs2_recovery_map_test(struct ocfs2_super *osb,
+ unsigned int node_num)
+{
+ int i;
+ struct ocfs2_recovery_map *rm = osb->recovery_map;
+
+ assert_spin_locked(&osb->osb_lock);
+
+ for (i = 0; i < rm->rm_used; i++) {
+ if (rm->rm_entries[i] == node_num)
+ return 1;
+ }
+
+ return 0;
+}
+
+/* Behaves like test-and-set. Returns the previous value */
+static int ocfs2_recovery_map_set(struct ocfs2_super *osb,
+ unsigned int node_num)
+{
+ struct ocfs2_recovery_map *rm = osb->recovery_map;
+
+ spin_lock(&osb->osb_lock);
+ if (__ocfs2_recovery_map_test(osb, node_num)) {
+ spin_unlock(&osb->osb_lock);
+ return 1;
+ }
+
+ /* XXX: Can this be exploited? Not from o2dlm... */
+ BUG_ON(rm->rm_used >= osb->max_slots);
+
+ rm->rm_entries[rm->rm_used] = node_num;
+ rm->rm_used++;
+ spin_unlock(&osb->osb_lock);
+
+ return 0;
+}
+
+static void ocfs2_recovery_map_clear(struct ocfs2_super *osb,
+ unsigned int node_num)
+{
+ int i;
+ struct ocfs2_recovery_map *rm = osb->recovery_map;
+
+ spin_lock(&osb->osb_lock);
+
+ for (i = 0; i < rm->rm_used; i++) {
+ if (rm->rm_entries[i] == node_num)
+ break;
+ }
+
+ if (i < rm->rm_used) {
+ /* XXX: be careful with the pointer math */
+ memmove(&(rm->rm_entries[i]), &(rm->rm_entries[i + 1]),
+ (rm->rm_used - i - 1) * sizeof(unsigned int));
+ rm->rm_used--;
+ }
+
+ spin_unlock(&osb->osb_lock);
+}
+
static int ocfs2_commit_cache(struct ocfs2_super *osb)
{
int status = 0;
@@ -586,8 +717,7 @@ int ocfs2_journal_load(struct ocfs2_journal *journal, int local)
mlog_entry_void();
- if (!journal)
- BUG();
+ BUG_ON(!journal);
osb = journal->j_osb;
@@ -650,6 +780,23 @@ bail:
return status;
}
+static int ocfs2_recovery_completed(struct ocfs2_super *osb)
+{
+ int empty;
+ struct ocfs2_recovery_map *rm = osb->recovery_map;
+
+ spin_lock(&osb->osb_lock);
+ empty = (rm->rm_used == 0);
+ spin_unlock(&osb->osb_lock);
+
+ return empty;
+}
+
+void ocfs2_wait_for_recovery(struct ocfs2_super *osb)
+{
+ wait_event(osb->recovery_event, ocfs2_recovery_completed(osb));
+}
+
/*
* JBD Might read a cached version of another nodes journal file. We
* don't want this as this file changes often and we get no
@@ -848,6 +995,7 @@ static int __ocfs2_recovery_thread(void *arg)
{
int status, node_num;
struct ocfs2_super *osb = arg;
+ struct ocfs2_recovery_map *rm = osb->recovery_map;
mlog_entry_void();
@@ -863,26 +1011,29 @@ restart:
goto bail;
}
- while(!ocfs2_node_map_is_empty(osb, &osb->recovery_map)) {
- node_num = ocfs2_node_map_first_set_bit(osb,
- &osb->recovery_map);
- if (node_num == O2NM_INVALID_NODE_NUM) {
- mlog(0, "Out of nodes to recover.\n");
- break;
- }
+ spin_lock(&osb->osb_lock);
+ while (rm->rm_used) {
+ /* It's always safe to remove entry zero, as we won't
+ * clear it until ocfs2_recover_node() has succeeded. */
+ node_num = rm->rm_entries[0];
+ spin_unlock(&osb->osb_lock);
status = ocfs2_recover_node(osb, node_num);
- if (status < 0) {
+ if (!status) {
+ ocfs2_recovery_map_clear(osb, node_num);
+ } else {
mlog(ML_ERROR,
"Error %d recovering node %d on device (%u,%u)!\n",
status, node_num,
MAJOR(osb->sb->s_dev), MINOR(osb->sb->s_dev));
mlog(ML_ERROR, "Volume requires unmount.\n");
- continue;
}
- ocfs2_recovery_map_clear(osb, node_num);
+ spin_lock(&osb->osb_lock);
}
+ spin_unlock(&osb->osb_lock);
+ mlog(0, "All nodes recovered\n");
+
ocfs2_super_unlock(osb, 1);
/* We always run recovery on our own orphan dir - the dead
@@ -893,8 +1044,7 @@ restart:
bail:
mutex_lock(&osb->recovery_lock);
- if (!status &&
- !ocfs2_node_map_is_empty(osb, &osb->recovery_map)) {
+ if (!status && !ocfs2_recovery_completed(osb)) {
mutex_unlock(&osb->recovery_lock);
goto restart;
}
@@ -924,8 +1074,8 @@ void ocfs2_recovery_thread(struct ocfs2_super *osb, int node_num)
/* People waiting on recovery will wait on
* the recovery map to empty. */
- if (!ocfs2_recovery_map_set(osb, node_num))
- mlog(0, "node %d already be in recovery.\n", node_num);
+ if (ocfs2_recovery_map_set(osb, node_num))
+ mlog(0, "node %d already in recovery map.\n", node_num);
mlog(0, "starting recovery thread...\n");
@@ -1079,7 +1229,6 @@ static int ocfs2_recover_node(struct ocfs2_super *osb,
{
int status = 0;
int slot_num;
- struct ocfs2_slot_info *si = osb->slot_info;
struct ocfs2_dinode *la_copy = NULL;
struct ocfs2_dinode *tl_copy = NULL;
@@ -1092,8 +1241,8 @@ static int ocfs2_recover_node(struct ocfs2_super *osb,
* case we should've called ocfs2_journal_load instead. */
BUG_ON(osb->node_num == node_num);
- slot_num = ocfs2_node_num_to_slot(si, node_num);
- if (slot_num == OCFS2_INVALID_SLOT) {
+ slot_num = ocfs2_node_num_to_slot(osb, node_num);
+ if (slot_num == -ENOENT) {
status = 0;
mlog(0, "no slot for this node, so no recovery required.\n");
goto done;
@@ -1123,8 +1272,7 @@ static int ocfs2_recover_node(struct ocfs2_super *osb,
/* Likewise, this would be a strange but ultimately not so
* harmful place to get an error... */
- ocfs2_clear_slot(si, slot_num);
- status = ocfs2_update_disk_slots(osb, si);
+ status = ocfs2_clear_slot(osb, slot_num);
if (status < 0)
mlog_errno(status);
@@ -1184,23 +1332,24 @@ bail:
* slot info struct has been updated from disk. */
int ocfs2_mark_dead_nodes(struct ocfs2_super *osb)
{
- int status, i, node_num;
- struct ocfs2_slot_info *si = osb->slot_info;
+ unsigned int node_num;
+ int status, i;
/* This is called with the super block cluster lock, so we
* know that the slot map can't change underneath us. */
- spin_lock(&si->si_lock);
- for(i = 0; i < si->si_num_slots; i++) {
+ spin_lock(&osb->osb_lock);
+ for (i = 0; i < osb->max_slots; i++) {
if (i == osb->slot_num)
continue;
- if (ocfs2_is_empty_slot(si, i))
+
+ status = ocfs2_slot_to_node_num_locked(osb, i, &node_num);
+ if (status == -ENOENT)
continue;
- node_num = si->si_global_node_nums[i];
- if (ocfs2_node_map_test_bit(osb, &osb->recovery_map, node_num))
+ if (__ocfs2_recovery_map_test(osb, node_num))
continue;
- spin_unlock(&si->si_lock);
+ spin_unlock(&osb->osb_lock);
/* Ok, we have a slot occupied by another node which
* is not in the recovery map. We trylock his journal
@@ -1216,9 +1365,9 @@ int ocfs2_mark_dead_nodes(struct ocfs2_super *osb)
goto bail;
}
- spin_lock(&si->si_lock);
+ spin_lock(&osb->osb_lock);
}
- spin_unlock(&si->si_lock);
+ spin_unlock(&osb->osb_lock);
status = 0;
bail:
diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h
index 220f3e818e7..db82be2532e 100644
--- a/fs/ocfs2/journal.h
+++ b/fs/ocfs2/journal.h
@@ -134,6 +134,10 @@ static inline void ocfs2_inode_set_new(struct ocfs2_super *osb,
/* Exported only for the journal struct init code in super.c. Do not call. */
void ocfs2_complete_recovery(struct work_struct *work);
+void ocfs2_wait_for_recovery(struct ocfs2_super *osb);
+
+int ocfs2_recovery_init(struct ocfs2_super *osb);
+void ocfs2_recovery_exit(struct ocfs2_super *osb);
/*
* Journal Control:
diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c
index ab83fd56242..ce0dc147602 100644
--- a/fs/ocfs2/localalloc.c
+++ b/fs/ocfs2/localalloc.c
@@ -447,6 +447,8 @@ out_mutex:
iput(main_bm_inode);
out:
+ if (!status)
+ ocfs2_init_inode_steal_slot(osb);
mlog_exit(status);
return status;
}
@@ -523,6 +525,8 @@ int ocfs2_reserve_local_alloc_bits(struct ocfs2_super *osb,
}
ac->ac_inode = local_alloc_inode;
+ /* We should never use localalloc from another slot */
+ ac->ac_alloc_slot = osb->slot_num;
ac->ac_which = OCFS2_AC_USE_LOCAL;
get_bh(osb->local_alloc_bh);
ac->ac_bh = osb->local_alloc_bh;
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index ae9ad958751..d5d808fe014 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -424,7 +424,7 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb,
fe->i_fs_generation = cpu_to_le32(osb->fs_generation);
fe->i_blkno = cpu_to_le64(fe_blkno);
fe->i_suballoc_bit = cpu_to_le16(suballoc_bit);
- fe->i_suballoc_slot = cpu_to_le16(osb->slot_num);
+ fe->i_suballoc_slot = cpu_to_le16(inode_ac->ac_alloc_slot);
fe->i_uid = cpu_to_le32(current->fsuid);
if (dir->i_mode & S_ISGID) {
fe->i_gid = cpu_to_le32(dir->i_gid);
@@ -997,7 +997,7 @@ static int ocfs2_rename(struct inode *old_dir,
*
* And that's why, just like the VFS, we need a file system
* rename lock. */
- if (old_dentry != new_dentry) {
+ if (old_dir != new_dir && S_ISDIR(old_inode->i_mode)) {
status = ocfs2_rename_lock(osb);
if (status < 0) {
mlog_errno(status);
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index 6546cef212e..31692379c17 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -36,11 +36,8 @@
#include <linux/mutex.h>
#include <linux/jbd.h>
-#include "cluster/nodemanager.h"
-#include "cluster/heartbeat.h"
-#include "cluster/tcp.h"
-
-#include "dlm/dlmapi.h"
+/* For union ocfs2_dlm_lksb */
+#include "stackglue.h"
#include "ocfs2_fs.h"
#include "ocfs2_lockid.h"
@@ -101,6 +98,9 @@ enum ocfs2_unlock_action {
* dropped. */
#define OCFS2_LOCK_QUEUED (0x00000100) /* queued for downconvert */
#define OCFS2_LOCK_NOCACHE (0x00000200) /* don't use a holder count */
+#define OCFS2_LOCK_PENDING (0x00000400) /* This lockres is pending a
+ call to dlm_lock. Only
+ exists with BUSY set. */
struct ocfs2_lock_res_ops;
@@ -120,13 +120,14 @@ struct ocfs2_lock_res {
int l_level;
unsigned int l_ro_holders;
unsigned int l_ex_holders;
- struct dlm_lockstatus l_lksb;
+ union ocfs2_dlm_lksb l_lksb;
/* used from AST/BAST funcs. */
enum ocfs2_ast_action l_action;
enum ocfs2_unlock_action l_unlock_action;
int l_requested;
int l_blocking;
+ unsigned int l_pending_gen;
wait_queue_head_t l_event;
@@ -179,6 +180,8 @@ enum ocfs2_mount_options
#define OCFS2_DEFAULT_ATIME_QUANTUM 60
struct ocfs2_journal;
+struct ocfs2_slot_info;
+struct ocfs2_recovery_map;
struct ocfs2_super
{
struct task_struct *commit_task;
@@ -190,7 +193,6 @@ struct ocfs2_super
struct ocfs2_slot_info *slot_info;
spinlock_t node_map_lock;
- struct ocfs2_node_map recovery_map;
u64 root_blkno;
u64 system_dir_blkno;
@@ -206,25 +208,29 @@ struct ocfs2_super
u32 s_feature_incompat;
u32 s_feature_ro_compat;
- /* Protects s_next_generaion, osb_flags. Could protect more on
- * osb as it's very short lived. */
+ /* Protects s_next_generation, osb_flags and s_inode_steal_slot.
+ * Could protect more on osb as it's very short lived.
+ */
spinlock_t osb_lock;
u32 s_next_generation;
unsigned long osb_flags;
+ s16 s_inode_steal_slot;
+ atomic_t s_num_inodes_stolen;
unsigned long s_mount_opt;
unsigned int s_atime_quantum;
- u16 max_slots;
- s16 node_num;
- s16 slot_num;
- s16 preferred_slot;
+ unsigned int max_slots;
+ unsigned int node_num;
+ int slot_num;
+ int preferred_slot;
int s_sectsize_bits;
int s_clustersize;
int s_clustersize_bits;
atomic_t vol_state;
struct mutex recovery_lock;
+ struct ocfs2_recovery_map *recovery_map;
struct task_struct *recovery_thread_task;
int disable_recovery;
wait_queue_head_t checkpoint_event;
@@ -245,12 +251,11 @@ struct ocfs2_super
struct ocfs2_alloc_stats alloc_stats;
char dev_str[20]; /* "major,minor" of the device */
- struct dlm_ctxt *dlm;
+ char osb_cluster_stack[OCFS2_STACK_LABEL_LEN + 1];
+ struct ocfs2_cluster_connection *cconn;
struct ocfs2_lock_res osb_super_lockres;
struct ocfs2_lock_res osb_rename_lockres;
- struct dlm_eviction_cb osb_eviction_cb;
struct ocfs2_dlm_debug *osb_dlm_debug;
- struct dlm_protocol_version osb_locking_proto;
struct dentry *osb_debug_root;
@@ -367,11 +372,24 @@ static inline int ocfs2_is_soft_readonly(struct ocfs2_super *osb)
return ret;
}
+static inline int ocfs2_userspace_stack(struct ocfs2_super *osb)
+{
+ return (osb->s_feature_incompat &
+ OCFS2_FEATURE_INCOMPAT_USERSPACE_STACK);
+}
+
static inline int ocfs2_mount_local(struct ocfs2_super *osb)
{
return (osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_LOCAL_MOUNT);
}
+static inline int ocfs2_uses_extended_slot_map(struct ocfs2_super *osb)
+{
+ return (osb->s_feature_incompat &
+ OCFS2_FEATURE_INCOMPAT_EXTENDED_SLOT_MAP);
+}
+
+
#define OCFS2_IS_VALID_DINODE(ptr) \
(!strcmp((ptr)->i_signature, OCFS2_INODE_SIGNATURE))
@@ -522,6 +540,33 @@ static inline unsigned int ocfs2_pages_per_cluster(struct super_block *sb)
return pages_per_cluster;
}
+static inline void ocfs2_init_inode_steal_slot(struct ocfs2_super *osb)
+{
+ spin_lock(&osb->osb_lock);
+ osb->s_inode_steal_slot = OCFS2_INVALID_SLOT;
+ spin_unlock(&osb->osb_lock);
+ atomic_set(&osb->s_num_inodes_stolen, 0);
+}
+
+static inline void ocfs2_set_inode_steal_slot(struct ocfs2_super *osb,
+ s16 slot)
+{
+ spin_lock(&osb->osb_lock);
+ osb->s_inode_steal_slot = slot;
+ spin_unlock(&osb->osb_lock);
+}
+
+static inline s16 ocfs2_get_inode_steal_slot(struct ocfs2_super *osb)
+{
+ s16 slot;
+
+ spin_lock(&osb->osb_lock);
+ slot = osb->s_inode_steal_slot;
+ spin_unlock(&osb->osb_lock);
+
+ return slot;
+}
+
#define ocfs2_set_bit ext2_set_bit
#define ocfs2_clear_bit ext2_clear_bit
#define ocfs2_test_bit ext2_test_bit
diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h
index 3633edd3982..52c42666515 100644
--- a/fs/ocfs2/ocfs2_fs.h
+++ b/fs/ocfs2/ocfs2_fs.h
@@ -88,7 +88,9 @@
#define OCFS2_FEATURE_COMPAT_SUPP OCFS2_FEATURE_COMPAT_BACKUP_SB
#define OCFS2_FEATURE_INCOMPAT_SUPP (OCFS2_FEATURE_INCOMPAT_LOCAL_MOUNT \
| OCFS2_FEATURE_INCOMPAT_SPARSE_ALLOC \
- | OCFS2_FEATURE_INCOMPAT_INLINE_DATA)
+ | OCFS2_FEATURE_INCOMPAT_INLINE_DATA \
+ | OCFS2_FEATURE_INCOMPAT_EXTENDED_SLOT_MAP \
+ | OCFS2_FEATURE_INCOMPAT_USERSPACE_STACK)
#define OCFS2_FEATURE_RO_COMPAT_SUPP OCFS2_FEATURE_RO_COMPAT_UNWRITTEN
/*
@@ -125,6 +127,21 @@
/* Support for data packed into inode blocks */
#define OCFS2_FEATURE_INCOMPAT_INLINE_DATA 0x0040
+/* Support for the extended slot map */
+#define OCFS2_FEATURE_INCOMPAT_EXTENDED_SLOT_MAP 0x100
+
+
+/*
+ * Support for alternate, userspace cluster stacks. If set, the superblock
+ * field s_cluster_info contains a tag for the alternate stack in use as
+ * well as the name of the cluster being joined.
+ * mount.ocfs2 must pass in a matching stack name.
+ *
+ * If not set, the classic stack will be used. This is compatible with
+ * all older versions.
+ */
+#define OCFS2_FEATURE_INCOMPAT_USERSPACE_STACK 0x0080
+
/*
* backup superblock flag is used to indicate that this volume
* has backup superblocks.
@@ -267,6 +284,10 @@ struct ocfs2_new_group_input {
#define OCFS2_VOL_UUID_LEN 16
#define OCFS2_MAX_VOL_LABEL_LEN 64
+/* The alternate, userspace stack fields */
+#define OCFS2_STACK_LABEL_LEN 4
+#define OCFS2_CLUSTER_NAME_LEN 16
+
/* Journal limits (in bytes) */
#define OCFS2_MIN_JOURNAL_SIZE (4 * 1024 * 1024)
@@ -475,6 +496,47 @@ struct ocfs2_extent_block
};
/*
+ * On disk slot map for OCFS2. This defines the contents of the "slot_map"
+ * system file. A slot is valid if it contains a node number >= 0. The
+ * value -1 (0xFFFF) is OCFS2_INVALID_SLOT. This marks a slot empty.
+ */
+struct ocfs2_slot_map {
+/*00*/ __le16 sm_slots[0];
+/*
+ * Actual on-disk size is one block. OCFS2_MAX_SLOTS is 255,
+ * 255 * sizeof(__le16) == 510B, which fits within the 512B minimum blocksize.
+ */
+};
+
+struct ocfs2_extended_slot {
+/*00*/ __u8 es_valid;
+ __u8 es_reserved1[3];
+ __le32 es_node_num;
+/*10*/
+};
+
+/*
+ * The extended slot map, used when OCFS2_FEATURE_INCOMPAT_EXTENDED_SLOT_MAP
+ * is set. It separates out the valid marker from the node number, and
+ * has room to grow. Unlike the old slot map, this format is defined by
+ * i_size.
+ */
+struct ocfs2_slot_map_extended {
+/*00*/ struct ocfs2_extended_slot se_slots[0];
+/*
+ * Actual size is i_size of the slot_map system file. It should
+ * match s_max_slots * sizeof(struct ocfs2_extended_slot)
+ */
+};
+
+struct ocfs2_cluster_info {
+/*00*/ __u8 ci_stack[OCFS2_STACK_LABEL_LEN];
+ __le32 ci_reserved;
+/*08*/ __u8 ci_cluster[OCFS2_CLUSTER_NAME_LEN];
+/*18*/
+};
+
+/*
* On disk superblock for OCFS2
* Note that it is contained inside an ocfs2_dinode, so all offsets
* are relative to the start of ocfs2_dinode.id2.
@@ -506,7 +568,20 @@ struct ocfs2_super_block {
* group header */
/*50*/ __u8 s_label[OCFS2_MAX_VOL_LABEL_LEN]; /* Label for mounting, etc. */
/*90*/ __u8 s_uuid[OCFS2_VOL_UUID_LEN]; /* 128-bit uuid */
-/*A0*/
+/*A0*/ struct ocfs2_cluster_info s_cluster_info; /* Selected userspace
+ stack. Only valid
+ with INCOMPAT flag. */
+/*B8*/ __le64 s_reserved2[17]; /* Fill out superblock */
+/*140*/
+
+ /*
+ * NOTE: As stated above, all offsets are relative to
+ * ocfs2_dinode.id2, which is at 0xC0 in the inode.
+ * 0xC0 + 0x140 = 0x200 or 512 bytes. A superblock must fit within
+ * our smallest blocksize, which is 512 bytes. To ensure this,
+ * we reserve the space in s_reserved2. Anything past s_reserved2
+ * will not be available on the smallest blocksize.
+ */
};
/*
diff --git a/fs/ocfs2/ocfs2_lockid.h b/fs/ocfs2/ocfs2_lockid.h
index 86f3e3799c2..82c200f7a8f 100644
--- a/fs/ocfs2/ocfs2_lockid.h
+++ b/fs/ocfs2/ocfs2_lockid.h
@@ -100,7 +100,7 @@ static char *ocfs2_lock_type_strings[] = {
static inline const char *ocfs2_lock_type_string(enum ocfs2_lock_type type)
{
#ifdef __KERNEL__
- mlog_bug_on_msg(type >= OCFS2_NUM_LOCK_TYPES, "%d\n", type);
+ BUG_ON(type >= OCFS2_NUM_LOCK_TYPES);
#endif
return ocfs2_lock_type_strings[type];
}
diff --git a/fs/ocfs2/slot_map.c b/fs/ocfs2/slot_map.c
index 3a50ce555e6..bb5ff8939bf 100644
--- a/fs/ocfs2/slot_map.c
+++ b/fs/ocfs2/slot_map.c
@@ -42,81 +42,244 @@
#include "buffer_head_io.h"
-static s16 __ocfs2_node_num_to_slot(struct ocfs2_slot_info *si,
- s16 global);
-static void __ocfs2_fill_slot(struct ocfs2_slot_info *si,
- s16 slot_num,
- s16 node_num);
-
-/* post the slot information on disk into our slot_info struct. */
-void ocfs2_update_slot_info(struct ocfs2_slot_info *si)
+
+struct ocfs2_slot {
+ int sl_valid;
+ unsigned int sl_node_num;
+};
+
+struct ocfs2_slot_info {
+ int si_extended;
+ int si_slots_per_block;
+ struct inode *si_inode;
+ unsigned int si_blocks;
+ struct buffer_head **si_bh;
+ unsigned int si_num_slots;
+ struct ocfs2_slot *si_slots;
+};
+
+
+static int __ocfs2_node_num_to_slot(struct ocfs2_slot_info *si,
+ unsigned int node_num);
+
+static void ocfs2_invalidate_slot(struct ocfs2_slot_info *si,
+ int slot_num)
+{
+ BUG_ON((slot_num < 0) || (slot_num >= si->si_num_slots));
+ si->si_slots[slot_num].sl_valid = 0;
+}
+
+static void ocfs2_set_slot(struct ocfs2_slot_info *si,
+ int slot_num, unsigned int node_num)
+{
+ BUG_ON((slot_num < 0) || (slot_num >= si->si_num_slots));
+
+ si->si_slots[slot_num].sl_valid = 1;
+ si->si_slots[slot_num].sl_node_num = node_num;
+}
+
+/* This version is for the extended slot map */
+static void ocfs2_update_slot_info_extended(struct ocfs2_slot_info *si)
+{
+ int b, i, slotno;
+ struct ocfs2_slot_map_extended *se;
+
+ slotno = 0;
+ for (b = 0; b < si->si_blocks; b++) {
+ se = (struct ocfs2_slot_map_extended *)si->si_bh[b]->b_data;
+ for (i = 0;
+ (i < si->si_slots_per_block) &&
+ (slotno < si->si_num_slots);
+ i++, slotno++) {
+ if (se->se_slots[i].es_valid)
+ ocfs2_set_slot(si, slotno,
+ le32_to_cpu(se->se_slots[i].es_node_num));
+ else
+ ocfs2_invalidate_slot(si, slotno);
+ }
+ }
+}
+
+/*
+ * Post the slot information on disk into our slot_info struct.
+ * Must be protected by osb_lock.
+ */
+static void ocfs2_update_slot_info_old(struct ocfs2_slot_info *si)
{
int i;
- __le16 *disk_info;
+ struct ocfs2_slot_map *sm;
- /* we don't read the slot block here as ocfs2_super_lock
- * should've made sure we have the most recent copy. */
- spin_lock(&si->si_lock);
- disk_info = (__le16 *) si->si_bh->b_data;
+ sm = (struct ocfs2_slot_map *)si->si_bh[0]->b_data;
- for (i = 0; i < si->si_size; i++)
- si->si_global_node_nums[i] = le16_to_cpu(disk_info[i]);
+ for (i = 0; i < si->si_num_slots; i++) {
+ if (le16_to_cpu(sm->sm_slots[i]) == (u16)OCFS2_INVALID_SLOT)
+ ocfs2_invalidate_slot(si, i);
+ else
+ ocfs2_set_slot(si, i, le16_to_cpu(sm->sm_slots[i]));
+ }
+}
- spin_unlock(&si->si_lock);
+static void ocfs2_update_slot_info(struct ocfs2_slot_info *si)
+{
+ /*
+ * The slot data will have been refreshed when ocfs2_super_lock
+ * was taken.
+ */
+ if (si->si_extended)
+ ocfs2_update_slot_info_extended(si);
+ else
+ ocfs2_update_slot_info_old(si);
+}
+
+int ocfs2_refresh_slot_info(struct ocfs2_super *osb)
+{
+ int ret;
+ struct ocfs2_slot_info *si = osb->slot_info;
+
+ if (si == NULL)
+ return 0;
+
+ BUG_ON(si->si_blocks == 0);
+ BUG_ON(si->si_bh == NULL);
+
+ mlog(0, "Refreshing slot map, reading %u block(s)\n",
+ si->si_blocks);
+
+ /*
+ * We pass -1 as blocknr because we expect all of si->si_bh to
+ * be !NULL. Thus, ocfs2_read_blocks() will ignore blocknr. If
+ * this is not true, the read of -1 (UINT64_MAX) will fail.
+ */
+ ret = ocfs2_read_blocks(osb, -1, si->si_blocks, si->si_bh, 0,
+ si->si_inode);
+ if (ret == 0) {
+ spin_lock(&osb->osb_lock);
+ ocfs2_update_slot_info(si);
+ spin_unlock(&osb->osb_lock);
+ }
+
+ return ret;
}
/* post the our slot info stuff into it's destination bh and write it
* out. */
-int ocfs2_update_disk_slots(struct ocfs2_super *osb,
- struct ocfs2_slot_info *si)
+static void ocfs2_update_disk_slot_extended(struct ocfs2_slot_info *si,
+ int slot_num,
+ struct buffer_head **bh)
{
- int status, i;
- __le16 *disk_info = (__le16 *) si->si_bh->b_data;
+ int blkind = slot_num / si->si_slots_per_block;
+ int slotno = slot_num % si->si_slots_per_block;
+ struct ocfs2_slot_map_extended *se;
+
+ BUG_ON(blkind >= si->si_blocks);
+
+ se = (struct ocfs2_slot_map_extended *)si->si_bh[blkind]->b_data;
+ se->se_slots[slotno].es_valid = si->si_slots[slot_num].sl_valid;
+ if (si->si_slots[slot_num].sl_valid)
+ se->se_slots[slotno].es_node_num =
+ cpu_to_le32(si->si_slots[slot_num].sl_node_num);
+ *bh = si->si_bh[blkind];
+}
- spin_lock(&si->si_lock);
- for (i = 0; i < si->si_size; i++)
- disk_info[i] = cpu_to_le16(si->si_global_node_nums[i]);
- spin_unlock(&si->si_lock);
+static void ocfs2_update_disk_slot_old(struct ocfs2_slot_info *si,
+ int slot_num,
+ struct buffer_head **bh)
+{
+ int i;
+ struct ocfs2_slot_map *sm;
+
+ sm = (struct ocfs2_slot_map *)si->si_bh[0]->b_data;
+ for (i = 0; i < si->si_num_slots; i++) {
+ if (si->si_slots[i].sl_valid)
+ sm->sm_slots[i] =
+ cpu_to_le16(si->si_slots[i].sl_node_num);
+ else
+ sm->sm_slots[i] = cpu_to_le16(OCFS2_INVALID_SLOT);
+ }
+ *bh = si->si_bh[0];
+}
+
+static int ocfs2_update_disk_slot(struct ocfs2_super *osb,
+ struct ocfs2_slot_info *si,
+ int slot_num)
+{
+ int status;
+ struct buffer_head *bh;
+
+ spin_lock(&osb->osb_lock);
+ if (si->si_extended)
+ ocfs2_update_disk_slot_extended(si, slot_num, &bh);
+ else
+ ocfs2_update_disk_slot_old(si, slot_num, &bh);
+ spin_unlock(&osb->osb_lock);
- status = ocfs2_write_block(osb, si->si_bh, si->si_inode);
+ status = ocfs2_write_block(osb, bh, si->si_inode);
if (status < 0)
mlog_errno(status);
return status;
}
-/* try to find global node in the slot info. Returns
- * OCFS2_INVALID_SLOT if nothing is found. */
-static s16 __ocfs2_node_num_to_slot(struct ocfs2_slot_info *si,
- s16 global)
+/*
+ * Calculate how many bytes are needed by the slot map. Returns
+ * an error if the slot map file is too small.
+ */
+static int ocfs2_slot_map_physical_size(struct ocfs2_super *osb,
+ struct inode *inode,
+ unsigned long long *bytes)
{
- int i;
- s16 ret = OCFS2_INVALID_SLOT;
+ unsigned long long bytes_needed;
+
+ if (ocfs2_uses_extended_slot_map(osb)) {
+ bytes_needed = osb->max_slots *
+ sizeof(struct ocfs2_extended_slot);
+ } else {
+ bytes_needed = osb->max_slots * sizeof(__le16);
+ }
+ if (bytes_needed > i_size_read(inode)) {
+ mlog(ML_ERROR,
+ "Slot map file is too small! (size %llu, needed %llu)\n",
+ i_size_read(inode), bytes_needed);
+ return -ENOSPC;
+ }
+
+ *bytes = bytes_needed;
+ return 0;
+}
+
+/* try to find global node in the slot info. Returns -ENOENT
+ * if nothing is found. */
+static int __ocfs2_node_num_to_slot(struct ocfs2_slot_info *si,
+ unsigned int node_num)
+{
+ int i, ret = -ENOENT;
for(i = 0; i < si->si_num_slots; i++) {
- if (global == si->si_global_node_nums[i]) {
- ret = (s16) i;
+ if (si->si_slots[i].sl_valid &&
+ (node_num == si->si_slots[i].sl_node_num)) {
+ ret = i;
break;
}
}
+
return ret;
}
-static s16 __ocfs2_find_empty_slot(struct ocfs2_slot_info *si, s16 preferred)
+static int __ocfs2_find_empty_slot(struct ocfs2_slot_info *si,
+ int preferred)
{
- int i;
- s16 ret = OCFS2_INVALID_SLOT;
+ int i, ret = -ENOSPC;
- if (preferred >= 0 && preferred < si->si_num_slots) {
- if (OCFS2_INVALID_SLOT == si->si_global_node_nums[preferred]) {
+ if ((preferred >= 0) && (preferred < si->si_num_slots)) {
+ if (!si->si_slots[preferred].sl_valid) {
ret = preferred;
goto out;
}
}
for(i = 0; i < si->si_num_slots; i++) {
- if (OCFS2_INVALID_SLOT == si->si_global_node_nums[i]) {
- ret = (s16) i;
+ if (!si->si_slots[i].sl_valid) {
+ ret = i;
break;
}
}
@@ -124,58 +287,155 @@ out:
return ret;
}
-s16 ocfs2_node_num_to_slot(struct ocfs2_slot_info *si,
- s16 global)
+int ocfs2_node_num_to_slot(struct ocfs2_super *osb, unsigned int node_num)
{
- s16 ret;
+ int slot;
+ struct ocfs2_slot_info *si = osb->slot_info;
- spin_lock(&si->si_lock);
- ret = __ocfs2_node_num_to_slot(si, global);
- spin_unlock(&si->si_lock);
- return ret;
+ spin_lock(&osb->osb_lock);
+ slot = __ocfs2_node_num_to_slot(si, node_num);
+ spin_unlock(&osb->osb_lock);
+
+ return slot;
+}
+
+int ocfs2_slot_to_node_num_locked(struct ocfs2_super *osb, int slot_num,
+ unsigned int *node_num)
+{
+ struct ocfs2_slot_info *si = osb->slot_info;
+
+ assert_spin_locked(&osb->osb_lock);
+ /* si_slots[] has max_slots entries; valid indices are 0..max_slots-1 */
+ BUG_ON(slot_num < 0);
+ BUG_ON(slot_num >= osb->max_slots);
+ /* An invalid (empty) slot has no node number to report */
+ if (!si->si_slots[slot_num].sl_valid)
+ return -ENOENT;
+
+ *node_num = si->si_slots[slot_num].sl_node_num;
+ return 0;
+}
-static void __ocfs2_fill_slot(struct ocfs2_slot_info *si,
- s16 slot_num,
- s16 node_num)
+static void __ocfs2_free_slot_info(struct ocfs2_slot_info *si)
{
- BUG_ON(slot_num == OCFS2_INVALID_SLOT);
- BUG_ON(slot_num >= si->si_num_slots);
- BUG_ON((node_num != O2NM_INVALID_NODE_NUM) &&
- (node_num >= O2NM_MAX_NODES));
+ unsigned int i;
+
+ if (si == NULL)
+ return;
+
+ if (si->si_inode)
+ iput(si->si_inode);
+ if (si->si_bh) {
+ for (i = 0; i < si->si_blocks; i++) {
+ if (si->si_bh[i]) {
+ brelse(si->si_bh[i]);
+ si->si_bh[i] = NULL;
+ }
+ }
+ kfree(si->si_bh);
+ }
- si->si_global_node_nums[slot_num] = node_num;
+ kfree(si);
}
-void ocfs2_clear_slot(struct ocfs2_slot_info *si,
- s16 slot_num)
+int ocfs2_clear_slot(struct ocfs2_super *osb, int slot_num)
{
- spin_lock(&si->si_lock);
- __ocfs2_fill_slot(si, slot_num, OCFS2_INVALID_SLOT);
- spin_unlock(&si->si_lock);
+ struct ocfs2_slot_info *si = osb->slot_info;
+
+ if (si == NULL)
+ return 0;
+
+ spin_lock(&osb->osb_lock);
+ ocfs2_invalidate_slot(si, slot_num);
+ spin_unlock(&osb->osb_lock);
+
+ return ocfs2_update_disk_slot(osb, osb->slot_info, slot_num);
}
-int ocfs2_init_slot_info(struct ocfs2_super *osb)
+static int ocfs2_map_slot_buffers(struct ocfs2_super *osb,
+ struct ocfs2_slot_info *si)
{
- int status, i;
+ int status = 0;
u64 blkno;
+ unsigned long long blocks, bytes;
+ unsigned int i;
+ struct buffer_head *bh;
+
+ status = ocfs2_slot_map_physical_size(osb, si->si_inode, &bytes);
+ if (status)
+ goto bail;
+
+ blocks = ocfs2_blocks_for_bytes(si->si_inode->i_sb, bytes);
+ BUG_ON(blocks > UINT_MAX);
+ si->si_blocks = blocks;
+ if (!si->si_blocks)
+ goto bail;
+
+ if (si->si_extended)
+ si->si_slots_per_block =
+ (osb->sb->s_blocksize /
+ sizeof(struct ocfs2_extended_slot));
+ else
+ si->si_slots_per_block = osb->sb->s_blocksize / sizeof(__le16);
+
+ /* The size checks above should ensure this */
+ BUG_ON((osb->max_slots / si->si_slots_per_block) > blocks);
+
+ mlog(0, "Slot map needs %u buffers for %llu bytes\n",
+ si->si_blocks, bytes);
+
+ si->si_bh = kzalloc(sizeof(struct buffer_head *) * si->si_blocks,
+ GFP_KERNEL);
+ if (!si->si_bh) {
+ status = -ENOMEM;
+ mlog_errno(status);
+ goto bail;
+ }
+
+ for (i = 0; i < si->si_blocks; i++) {
+ status = ocfs2_extent_map_get_blocks(si->si_inode, i,
+ &blkno, NULL, NULL);
+ if (status < 0) {
+ mlog_errno(status);
+ goto bail;
+ }
+
+ mlog(0, "Reading slot map block %u at %llu\n", i,
+ (unsigned long long)blkno);
+
+ bh = NULL; /* Acquire a fresh bh */
+ status = ocfs2_read_block(osb, blkno, &bh, 0, si->si_inode);
+ if (status < 0) {
+ mlog_errno(status);
+ goto bail;
+ }
+
+ si->si_bh[i] = bh;
+ }
+
+bail:
+ return status;
+}
+
+int ocfs2_init_slot_info(struct ocfs2_super *osb)
+{
+ int status;
struct inode *inode = NULL;
- struct buffer_head *bh = NULL;
struct ocfs2_slot_info *si;
- si = kzalloc(sizeof(struct ocfs2_slot_info), GFP_KERNEL);
+ si = kzalloc(sizeof(struct ocfs2_slot_info) +
+ (sizeof(struct ocfs2_slot) * osb->max_slots),
+ GFP_KERNEL);
if (!si) {
status = -ENOMEM;
mlog_errno(status);
goto bail;
}
- spin_lock_init(&si->si_lock);
+ si->si_extended = ocfs2_uses_extended_slot_map(osb);
si->si_num_slots = osb->max_slots;
- si->si_size = OCFS2_MAX_SLOTS;
-
- for(i = 0; i < si->si_num_slots; i++)
- si->si_global_node_nums[i] = OCFS2_INVALID_SLOT;
+ si->si_slots = (struct ocfs2_slot *)((char *)si +
+ sizeof(struct ocfs2_slot_info));
inode = ocfs2_get_system_file_inode(osb, SLOT_MAP_SYSTEM_INODE,
OCFS2_INVALID_SLOT);
@@ -185,61 +445,53 @@ int ocfs2_init_slot_info(struct ocfs2_super *osb)
goto bail;
}
- status = ocfs2_extent_map_get_blocks(inode, 0ULL, &blkno, NULL, NULL);
- if (status < 0) {
- mlog_errno(status);
- goto bail;
- }
-
- status = ocfs2_read_block(osb, blkno, &bh, 0, inode);
+ si->si_inode = inode;
+ status = ocfs2_map_slot_buffers(osb, si);
if (status < 0) {
mlog_errno(status);
goto bail;
}
- si->si_inode = inode;
- si->si_bh = bh;
- osb->slot_info = si;
+ osb->slot_info = (struct ocfs2_slot_info *)si;
bail:
if (status < 0 && si)
- ocfs2_free_slot_info(si);
+ __ocfs2_free_slot_info(si);
return status;
}
-void ocfs2_free_slot_info(struct ocfs2_slot_info *si)
+void ocfs2_free_slot_info(struct ocfs2_super *osb)
{
- if (si->si_inode)
- iput(si->si_inode);
- if (si->si_bh)
- brelse(si->si_bh);
- kfree(si);
+ struct ocfs2_slot_info *si = osb->slot_info;
+
+ osb->slot_info = NULL;
+ __ocfs2_free_slot_info(si);
}
int ocfs2_find_slot(struct ocfs2_super *osb)
{
int status;
- s16 slot;
+ int slot;
struct ocfs2_slot_info *si;
mlog_entry_void();
si = osb->slot_info;
+ spin_lock(&osb->osb_lock);
ocfs2_update_slot_info(si);
- spin_lock(&si->si_lock);
/* search for ourselves first and take the slot if it already
* exists. Perhaps we need to mark this in a variable for our
* own journal recovery? Possibly not, though we certainly
* need to warn to the user */
slot = __ocfs2_node_num_to_slot(si, osb->node_num);
- if (slot == OCFS2_INVALID_SLOT) {
+ if (slot < 0) {
/* if no slot yet, then just take 1st available
* one. */
slot = __ocfs2_find_empty_slot(si, osb->preferred_slot);
- if (slot == OCFS2_INVALID_SLOT) {
- spin_unlock(&si->si_lock);
+ if (slot < 0) {
+ spin_unlock(&osb->osb_lock);
mlog(ML_ERROR, "no free slots available!\n");
status = -EINVAL;
goto bail;
@@ -248,13 +500,13 @@ int ocfs2_find_slot(struct ocfs2_super *osb)
mlog(ML_NOTICE, "slot %d is already allocated to this node!\n",
slot);
- __ocfs2_fill_slot(si, slot, osb->node_num);
+ ocfs2_set_slot(si, slot, osb->node_num);
osb->slot_num = slot;
- spin_unlock(&si->si_lock);
+ spin_unlock(&osb->osb_lock);
mlog(0, "taking node slot %d\n", osb->slot_num);
- status = ocfs2_update_disk_slots(osb, si);
+ status = ocfs2_update_disk_slot(osb, si, osb->slot_num);
if (status < 0)
mlog_errno(status);
@@ -265,27 +517,27 @@ bail:
void ocfs2_put_slot(struct ocfs2_super *osb)
{
- int status;
+ int status, slot_num;
struct ocfs2_slot_info *si = osb->slot_info;
if (!si)
return;
+ spin_lock(&osb->osb_lock);
ocfs2_update_slot_info(si);
- spin_lock(&si->si_lock);
- __ocfs2_fill_slot(si, osb->slot_num, OCFS2_INVALID_SLOT);
+ slot_num = osb->slot_num;
+ ocfs2_invalidate_slot(si, osb->slot_num);
osb->slot_num = OCFS2_INVALID_SLOT;
- spin_unlock(&si->si_lock);
+ spin_unlock(&osb->osb_lock);
- status = ocfs2_update_disk_slots(osb, si);
+ status = ocfs2_update_disk_slot(osb, si, slot_num);
if (status < 0) {
mlog_errno(status);
goto bail;
}
bail:
- osb->slot_info = NULL;
- ocfs2_free_slot_info(si);
+ ocfs2_free_slot_info(osb);
}
diff --git a/fs/ocfs2/slot_map.h b/fs/ocfs2/slot_map.h
index 1025872aaad..601c95fd700 100644
--- a/fs/ocfs2/slot_map.h
+++ b/fs/ocfs2/slot_map.h
@@ -27,38 +27,18 @@
#ifndef SLOTMAP_H
#define SLOTMAP_H
-struct ocfs2_slot_info {
- spinlock_t si_lock;
-
- struct inode *si_inode;
- struct buffer_head *si_bh;
- unsigned int si_num_slots;
- unsigned int si_size;
- s16 si_global_node_nums[OCFS2_MAX_SLOTS];
-};
-
int ocfs2_init_slot_info(struct ocfs2_super *osb);
-void ocfs2_free_slot_info(struct ocfs2_slot_info *si);
+void ocfs2_free_slot_info(struct ocfs2_super *osb);
int ocfs2_find_slot(struct ocfs2_super *osb);
void ocfs2_put_slot(struct ocfs2_super *osb);
-void ocfs2_update_slot_info(struct ocfs2_slot_info *si);
-int ocfs2_update_disk_slots(struct ocfs2_super *osb,
- struct ocfs2_slot_info *si);
-
-s16 ocfs2_node_num_to_slot(struct ocfs2_slot_info *si,
- s16 global);
-void ocfs2_clear_slot(struct ocfs2_slot_info *si,
- s16 slot_num);
+int ocfs2_refresh_slot_info(struct ocfs2_super *osb);
-static inline int ocfs2_is_empty_slot(struct ocfs2_slot_info *si,
- int slot_num)
-{
- BUG_ON(slot_num == OCFS2_INVALID_SLOT);
- assert_spin_locked(&si->si_lock);
+int ocfs2_node_num_to_slot(struct ocfs2_super *osb, unsigned int node_num);
+int ocfs2_slot_to_node_num_locked(struct ocfs2_super *osb, int slot_num,
+ unsigned int *node_num);
- return si->si_global_node_nums[slot_num] == OCFS2_INVALID_SLOT;
-}
+int ocfs2_clear_slot(struct ocfs2_super *osb, int slot_num);
#endif
diff --git a/fs/ocfs2/stack_o2cb.c b/fs/ocfs2/stack_o2cb.c
new file mode 100644
index 00000000000..ac1d74c63bf
--- /dev/null
+++ b/fs/ocfs2/stack_o2cb.c
@@ -0,0 +1,420 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * stack_o2cb.c
+ *
+ * Code which interfaces ocfs2 with the o2cb stack.
+ *
+ * Copyright (C) 2007 Oracle. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation, version 2.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+
+#include <linux/crc32.h>
+#include <linux/module.h>
+
+/* Needed for AOP_TRUNCATED_PAGE in mlog_errno() */
+#include <linux/fs.h>
+
+#include "cluster/masklog.h"
+#include "cluster/nodemanager.h"
+#include "cluster/heartbeat.h"
+
+#include "stackglue.h"
+
+struct o2dlm_private {
+ struct dlm_eviction_cb op_eviction_cb;
+};
+
+static struct ocfs2_stack_plugin o2cb_stack;
+
+/* These should be identical */
+#if (DLM_LOCK_IV != LKM_IVMODE)
+# error Lock modes do not match
+#endif
+#if (DLM_LOCK_NL != LKM_NLMODE)
+# error Lock modes do not match
+#endif
+#if (DLM_LOCK_CR != LKM_CRMODE)
+# error Lock modes do not match
+#endif
+#if (DLM_LOCK_CW != LKM_CWMODE)
+# error Lock modes do not match
+#endif
+#if (DLM_LOCK_PR != LKM_PRMODE)
+# error Lock modes do not match
+#endif
+#if (DLM_LOCK_PW != LKM_PWMODE)
+# error Lock modes do not match
+#endif
+#if (DLM_LOCK_EX != LKM_EXMODE)
+# error Lock modes do not match
+#endif
+static inline int mode_to_o2dlm(int mode)
+{
+ BUG_ON(mode > LKM_MAXMODE);
+
+ return mode;
+}
+
+#define map_flag(_generic, _o2dlm) \
+ if (flags & (_generic)) { \
+ flags &= ~(_generic); \
+ o2dlm_flags |= (_o2dlm); \
+ }
+static int flags_to_o2dlm(u32 flags)
+{
+ int o2dlm_flags = 0;
+
+ map_flag(DLM_LKF_NOQUEUE, LKM_NOQUEUE);
+ map_flag(DLM_LKF_CANCEL, LKM_CANCEL);
+ map_flag(DLM_LKF_CONVERT, LKM_CONVERT);
+ map_flag(DLM_LKF_VALBLK, LKM_VALBLK);
+ map_flag(DLM_LKF_IVVALBLK, LKM_INVVALBLK);
+ map_flag(DLM_LKF_ORPHAN, LKM_ORPHAN);
+ map_flag(DLM_LKF_FORCEUNLOCK, LKM_FORCE);
+ map_flag(DLM_LKF_TIMEOUT, LKM_TIMEOUT);
+ map_flag(DLM_LKF_LOCAL, LKM_LOCAL);
+
+ /* map_flag() should have cleared every flag passed in */
+ BUG_ON(flags != 0);
+
+ return o2dlm_flags;
+}
+#undef map_flag
+
+/*
+ * Map an o2dlm status to standard errno values.
+ *
+ * o2dlm only uses a handful of these, and returns even fewer to the
+ * caller. Still, we try to assign sane values to each error.
+ *
+ * The following value pairs have special meanings to dlmglue, thus
+ * the right hand side needs to stay unique - never duplicate the
+ * mapping elsewhere in the table!
+ *
+ * DLM_NORMAL: 0
+ * DLM_NOTQUEUED: -EAGAIN
+ * DLM_CANCELGRANT: -EBUSY
+ * DLM_CANCEL: -DLM_ECANCEL
+ */
+/* Keep in sync with dlmapi.h */
+static int status_map[] = {
+ [DLM_NORMAL] = 0, /* Success */
+ [DLM_GRANTED] = -EINVAL,
+ [DLM_DENIED] = -EACCES,
+ [DLM_DENIED_NOLOCKS] = -EACCES,
+ [DLM_WORKING] = -EACCES,
+ [DLM_BLOCKED] = -EINVAL,
+ [DLM_BLOCKED_ORPHAN] = -EINVAL,
+ [DLM_DENIED_GRACE_PERIOD] = -EACCES,
+ [DLM_SYSERR] = -ENOMEM, /* It is what it is */
+ [DLM_NOSUPPORT] = -EPROTO,
+ [DLM_CANCELGRANT] = -EBUSY, /* Cancel after grant */
+ [DLM_IVLOCKID] = -EINVAL,
+ [DLM_SYNC] = -EINVAL,
+ [DLM_BADTYPE] = -EINVAL,
+ [DLM_BADRESOURCE] = -EINVAL,
+ [DLM_MAXHANDLES] = -ENOMEM,
+ [DLM_NOCLINFO] = -EINVAL,
+ [DLM_NOLOCKMGR] = -EINVAL,
+ [DLM_NOPURGED] = -EINVAL,
+ [DLM_BADARGS] = -EINVAL,
+ [DLM_VOID] = -EINVAL,
+ [DLM_NOTQUEUED] = -EAGAIN, /* Trylock failed */
+ [DLM_IVBUFLEN] = -EINVAL,
+ [DLM_CVTUNGRANT] = -EPERM,
+ [DLM_BADPARAM] = -EINVAL,
+ [DLM_VALNOTVALID] = -EINVAL,
+ [DLM_REJECTED] = -EPERM,
+ [DLM_ABORT] = -EINVAL,
+ [DLM_CANCEL] = -DLM_ECANCEL, /* Successful cancel */
+ [DLM_IVRESHANDLE] = -EINVAL,
+ [DLM_DEADLOCK] = -EDEADLK,
+ [DLM_DENIED_NOASTS] = -EINVAL,
+ [DLM_FORWARD] = -EINVAL,
+ [DLM_TIMEOUT] = -ETIMEDOUT,
+ [DLM_IVGROUPID] = -EINVAL,
+ [DLM_VERS_CONFLICT] = -EOPNOTSUPP,
+ [DLM_BAD_DEVICE_PATH] = -ENOENT,
+ [DLM_NO_DEVICE_PERMISSION] = -EPERM,
+ [DLM_NO_CONTROL_DEVICE] = -ENOENT,
+ [DLM_RECOVERING] = -ENOTCONN,
+ [DLM_MIGRATING] = -ERESTART,
+ [DLM_MAXSTATS] = -EINVAL,
+};
+
+static int dlm_status_to_errno(enum dlm_status status)
+{
+ /* status indexes status_map[]; status == element count is out of range */
+ BUG_ON(status >= (sizeof(status_map) / sizeof(status_map[0])));
+ return status_map[status];
+}
+
+static void o2dlm_lock_ast_wrapper(void *astarg)
+{
+ BUG_ON(o2cb_stack.sp_proto == NULL);
+
+ o2cb_stack.sp_proto->lp_lock_ast(astarg);
+}
+
+static void o2dlm_blocking_ast_wrapper(void *astarg, int level)
+{
+ BUG_ON(o2cb_stack.sp_proto == NULL);
+
+ o2cb_stack.sp_proto->lp_blocking_ast(astarg, level);
+}
+
+static void o2dlm_unlock_ast_wrapper(void *astarg, enum dlm_status status)
+{
+ int error = dlm_status_to_errno(status);
+
+ BUG_ON(o2cb_stack.sp_proto == NULL);
+
+ /*
+ * In o2dlm, you can get both the lock_ast() for the lock being
+ * granted and the unlock_ast() for the CANCEL failing. A
+ * successful cancel sends DLM_NORMAL here. If the
+ * lock grant happened before the cancel arrived, you get
+ * DLM_CANCELGRANT.
+ *
+ * There's no need for the double-ast. If we see DLM_CANCELGRANT,
+ * we just ignore it. We expect the lock_ast() to handle the
+ * granted lock.
+ */
+ if (status == DLM_CANCELGRANT)
+ return;
+
+ o2cb_stack.sp_proto->lp_unlock_ast(astarg, error);
+}
+
+static int o2cb_dlm_lock(struct ocfs2_cluster_connection *conn,
+ int mode,
+ union ocfs2_dlm_lksb *lksb,
+ u32 flags,
+ void *name,
+ unsigned int namelen,
+ void *astarg)
+{
+ enum dlm_status status;
+ int o2dlm_mode = mode_to_o2dlm(mode);
+ int o2dlm_flags = flags_to_o2dlm(flags);
+ int ret;
+
+ status = dlmlock(conn->cc_lockspace, o2dlm_mode, &lksb->lksb_o2dlm,
+ o2dlm_flags, name, namelen,
+ o2dlm_lock_ast_wrapper, astarg,
+ o2dlm_blocking_ast_wrapper);
+ ret = dlm_status_to_errno(status);
+ return ret;
+}
+
+static int o2cb_dlm_unlock(struct ocfs2_cluster_connection *conn,
+ union ocfs2_dlm_lksb *lksb,
+ u32 flags,
+ void *astarg)
+{
+ enum dlm_status status;
+ int o2dlm_flags = flags_to_o2dlm(flags);
+ int ret;
+
+ status = dlmunlock(conn->cc_lockspace, &lksb->lksb_o2dlm,
+ o2dlm_flags, o2dlm_unlock_ast_wrapper, astarg);
+ ret = dlm_status_to_errno(status);
+ return ret;
+}
+
+static int o2cb_dlm_lock_status(union ocfs2_dlm_lksb *lksb)
+{
+ return dlm_status_to_errno(lksb->lksb_o2dlm.status);
+}
+
+static void *o2cb_dlm_lvb(union ocfs2_dlm_lksb *lksb)
+{
+ return (void *)(lksb->lksb_o2dlm.lvb);
+}
+
+static void o2cb_dump_lksb(union ocfs2_dlm_lksb *lksb)
+{
+ dlm_print_one_lock(lksb->lksb_o2dlm.lockid);
+}
+
+/*
+ * Called from the dlm when it's about to evict a node. This is how the
+ * classic stack signals node death.
+ */
+static void o2dlm_eviction_cb(int node_num, void *data)
+{
+ struct ocfs2_cluster_connection *conn = data;
+
+ mlog(ML_NOTICE, "o2dlm has evicted node %d from group %.*s\n",
+ node_num, conn->cc_namelen, conn->cc_name);
+
+ conn->cc_recovery_handler(node_num, conn->cc_recovery_data);
+}
+
+static int o2cb_cluster_connect(struct ocfs2_cluster_connection *conn)
+{
+ int rc = 0;
+ u32 dlm_key;
+ struct dlm_ctxt *dlm;
+ struct o2dlm_private *priv;
+ struct dlm_protocol_version dlm_version;
+
+ BUG_ON(conn == NULL);
+ BUG_ON(o2cb_stack.sp_proto == NULL);
+
+ /* for now we only have one cluster/node, make sure we see it
+ * in the heartbeat universe */
+ if (!o2hb_check_local_node_heartbeating()) {
+ rc = -EINVAL;
+ goto out;
+ }
+
+ priv = kzalloc(sizeof(struct o2dlm_private), GFP_KERNEL);
+ if (!priv) {
+ rc = -ENOMEM;
+ goto out_free;
+ }
+
+ /* This just fills the structure in. It is safe to pass conn. */
+ dlm_setup_eviction_cb(&priv->op_eviction_cb, o2dlm_eviction_cb,
+ conn);
+
+ conn->cc_private = priv;
+
+ /* used by the dlm code to make message headers unique, each
+ * node in this domain must agree on this. */
+ dlm_key = crc32_le(0, conn->cc_name, conn->cc_namelen);
+ dlm_version.pv_major = conn->cc_version.pv_major;
+ dlm_version.pv_minor = conn->cc_version.pv_minor;
+
+ dlm = dlm_register_domain(conn->cc_name, dlm_key, &dlm_version);
+ if (IS_ERR(dlm)) {
+ rc = PTR_ERR(dlm);
+ mlog_errno(rc);
+ goto out_free;
+ }
+
+ conn->cc_version.pv_major = dlm_version.pv_major;
+ conn->cc_version.pv_minor = dlm_version.pv_minor;
+ conn->cc_lockspace = dlm;
+
+ dlm_register_eviction_cb(dlm, &priv->op_eviction_cb);
+
+out_free:
+ if (rc && conn->cc_private)
+ kfree(conn->cc_private);
+
+out:
+ return rc;
+}
+
+static int o2cb_cluster_disconnect(struct ocfs2_cluster_connection *conn,
+ int hangup_pending)
+{
+ struct dlm_ctxt *dlm = conn->cc_lockspace;
+ struct o2dlm_private *priv = conn->cc_private;
+
+ dlm_unregister_eviction_cb(&priv->op_eviction_cb);
+ conn->cc_private = NULL;
+ kfree(priv);
+
+ dlm_unregister_domain(dlm);
+ conn->cc_lockspace = NULL;
+
+ return 0;
+}
+
+static void o2hb_stop(const char *group)
+{
+ int ret;
+ char *argv[5], *envp[3];
+
+ argv[0] = (char *)o2nm_get_hb_ctl_path();
+ argv[1] = "-K";
+ argv[2] = "-u";
+ argv[3] = (char *)group;
+ argv[4] = NULL;
+
+ mlog(0, "Run: %s %s %s %s\n", argv[0], argv[1], argv[2], argv[3]);
+
+ /* minimal command environment taken from cpu_run_sbin_hotplug */
+ envp[0] = "HOME=/";
+ envp[1] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin";
+ envp[2] = NULL;
+
+ ret = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC);
+ if (ret < 0)
+ mlog_errno(ret);
+}
+
+/*
+ * Hangup is a hack for tools compatibility. Older ocfs2-tools software
+ * expects the filesystem to call "ocfs2_hb_ctl" during unmount. This
+ * happens regardless of whether the DLM got started, so we can't do it
+ * in ocfs2_cluster_disconnect(). We bring the o2hb_stop() function into
+ * the glue and provide a "hangup" API for super.c to call.
+ *
+ * Other stacks will eventually provide a NULL ->hangup() pointer.
+ */
+static void o2cb_cluster_hangup(const char *group, int grouplen)
+{
+ o2hb_stop(group);
+}
+
+static int o2cb_cluster_this_node(unsigned int *node)
+{
+ int node_num;
+
+ node_num = o2nm_this_node();
+ if (node_num == O2NM_INVALID_NODE_NUM)
+ return -ENOENT;
+
+ if (node_num >= O2NM_MAX_NODES)
+ return -EOVERFLOW;
+
+ *node = node_num;
+ return 0;
+}
+
+struct ocfs2_stack_operations o2cb_stack_ops = {
+ .connect = o2cb_cluster_connect,
+ .disconnect = o2cb_cluster_disconnect,
+ .hangup = o2cb_cluster_hangup,
+ .this_node = o2cb_cluster_this_node,
+ .dlm_lock = o2cb_dlm_lock,
+ .dlm_unlock = o2cb_dlm_unlock,
+ .lock_status = o2cb_dlm_lock_status,
+ .lock_lvb = o2cb_dlm_lvb,
+ .dump_lksb = o2cb_dump_lksb,
+};
+
+static struct ocfs2_stack_plugin o2cb_stack = {
+ .sp_name = "o2cb",
+ .sp_ops = &o2cb_stack_ops,
+ .sp_owner = THIS_MODULE,
+};
+
+static int __init o2cb_stack_init(void)
+{
+ return ocfs2_stack_glue_register(&o2cb_stack);
+}
+
+static void __exit o2cb_stack_exit(void)
+{
+ ocfs2_stack_glue_unregister(&o2cb_stack);
+}
+
+MODULE_AUTHOR("Oracle");
+MODULE_DESCRIPTION("ocfs2 driver for the classic o2cb stack");
+MODULE_LICENSE("GPL");
+module_init(o2cb_stack_init);
+module_exit(o2cb_stack_exit);
diff --git a/fs/ocfs2/stack_user.c b/fs/ocfs2/stack_user.c
new file mode 100644
index 00000000000..7428663f9cb
--- /dev/null
+++ b/fs/ocfs2/stack_user.c
@@ -0,0 +1,883 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * stack_user.c
+ *
+ * Code which interfaces ocfs2 with fs/dlm and a userspace stack.
+ *
+ * Copyright (C) 2007 Oracle. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation, version 2.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/miscdevice.h>
+#include <linux/mutex.h>
+#include <linux/reboot.h>
+#include <asm/uaccess.h>
+
+#include "ocfs2.h" /* For struct ocfs2_lock_res */
+#include "stackglue.h"
+
+
+/*
+ * The control protocol starts with a handshake. Until the handshake
+ * is complete, the control device will fail all write(2)s.
+ *
+ * The handshake is simple. First, the client reads until EOF. Each line
+ * of output is a supported protocol tag. All protocol tags are a single
+ * character followed by a two hex digit version number. Currently the
+ * only thing supported is T01, for "Text-based version 0x01". Next, the
+ * client writes the version they would like to use, including the newline.
+ * Thus, the protocol tag is 'T01\n'. If the version tag written is
+ * unknown, -EINVAL is returned. Once the negotiation is complete, the
+ * client can start sending messages.
+ *
+ * The T01 protocol has three messages. First is the "SETN" message.
+ * It has the following syntax:
+ *
+ * SETN<space><8-char-hex-nodenum><newline>
+ *
+ * This is 14 characters.
+ *
+ * The "SETN" message must be the first message following the protocol.
+ * It tells ocfs2_control the local node number.
+ *
+ * Next comes the "SETV" message. It has the following syntax:
+ *
+ * SETV<space><2-char-hex-major><space><2-char-hex-minor><newline>
+ *
+ * This is 11 characters.
+ *
+ * The "SETV" message sets the filesystem locking protocol version as
+ * negotiated by the client. The client negotiates based on the maximum
+ * version advertised in /sys/fs/ocfs2/max_locking_protocol. The major
+ * number from the "SETV" message must match
+ * user_stack.sp_proto->lp_max_version.pv_major, and the minor number
+ * must be less than or equal to ...->lp_max_version.pv_minor.
+ *
+ * Once this information has been set, mounts will be allowed. From this
+ * point on, the "DOWN" message can be sent for node down notification.
+ * It has the following syntax:
+ *
+ * DOWN<space><32-char-cap-hex-uuid><space><8-char-hex-nodenum><newline>
+ *
+ * eg:
+ *
+ * DOWN 632A924FDD844190BDA93C0DF6B94899 00000001\n
+ *
+ * This is 47 characters.
+ */
+
+/*
+ * Whether or not the client has done the handshake.
+ * For now, we have just one protocol version.
+ */
+#define OCFS2_CONTROL_PROTO "T01\n"
+#define OCFS2_CONTROL_PROTO_LEN 4
+
+/* Handshake states */
+#define OCFS2_CONTROL_HANDSHAKE_INVALID (0)
+#define OCFS2_CONTROL_HANDSHAKE_READ (1)
+#define OCFS2_CONTROL_HANDSHAKE_PROTOCOL (2)
+#define OCFS2_CONTROL_HANDSHAKE_VALID (3)
+
+/* Messages */
+#define OCFS2_CONTROL_MESSAGE_OP_LEN 4
+#define OCFS2_CONTROL_MESSAGE_SETNODE_OP "SETN"
+#define OCFS2_CONTROL_MESSAGE_SETNODE_TOTAL_LEN 14
+#define OCFS2_CONTROL_MESSAGE_SETVERSION_OP "SETV"
+#define OCFS2_CONTROL_MESSAGE_SETVERSION_TOTAL_LEN 11
+#define OCFS2_CONTROL_MESSAGE_DOWN_OP "DOWN"
+#define OCFS2_CONTROL_MESSAGE_DOWN_TOTAL_LEN 47
+#define OCFS2_TEXT_UUID_LEN 32
+#define OCFS2_CONTROL_MESSAGE_VERNUM_LEN 2
+#define OCFS2_CONTROL_MESSAGE_NODENUM_LEN 8
+
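+/*
+ * ocfs2_live_connection ties a cluster connection to the control device.
+ * The filesystem and miscdevice sides can detach in different order, so
+ * the structure is meant to be refcounted. No reference count is
+ * implemented yet; oc_list and oc_conn are protected by ocfs2_control_lock.
+ */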
+struct ocfs2_live_connection {
+ struct list_head oc_list;
+ struct ocfs2_cluster_connection *oc_conn;
+};
+
+struct ocfs2_control_private {
+ struct list_head op_list;
+ int op_state;
+ int op_this_node;
+ struct ocfs2_protocol_version op_proto;
+};
+
+/* SETN<space><8-char-hex-nodenum><newline> */
+struct ocfs2_control_message_setn {
+ char tag[OCFS2_CONTROL_MESSAGE_OP_LEN];
+ char space;
+ char nodestr[OCFS2_CONTROL_MESSAGE_NODENUM_LEN];
+ char newline;
+};
+
+/* SETV<space><2-char-hex-major><space><2-char-hex-minor><newline> */
+struct ocfs2_control_message_setv {
+ char tag[OCFS2_CONTROL_MESSAGE_OP_LEN];
+ char space1;
+ char major[OCFS2_CONTROL_MESSAGE_VERNUM_LEN];
+ char space2;
+ char minor[OCFS2_CONTROL_MESSAGE_VERNUM_LEN];
+ char newline;
+};
+
+/* DOWN<space><32-char-cap-hex-uuid><space><8-char-hex-nodenum><newline> */
+struct ocfs2_control_message_down {
+ char tag[OCFS2_CONTROL_MESSAGE_OP_LEN];
+ char space1;
+ char uuid[OCFS2_TEXT_UUID_LEN];
+ char space2;
+ char nodestr[OCFS2_CONTROL_MESSAGE_NODENUM_LEN];
+ char newline;
+};
+
+union ocfs2_control_message {
+ char tag[OCFS2_CONTROL_MESSAGE_OP_LEN];
+ struct ocfs2_control_message_setn u_setn;
+ struct ocfs2_control_message_setv u_setv;
+ struct ocfs2_control_message_down u_down;
+};
+
+static struct ocfs2_stack_plugin user_stack;
+
+static atomic_t ocfs2_control_opened;
+static int ocfs2_control_this_node = -1;
+static struct ocfs2_protocol_version running_proto;
+
+static LIST_HEAD(ocfs2_live_connection_list);
+static LIST_HEAD(ocfs2_control_private_list);
+static DEFINE_MUTEX(ocfs2_control_lock);
+
+static inline void ocfs2_control_set_handshake_state(struct file *file,
+ int state)
+{
+ struct ocfs2_control_private *p = file->private_data;
+ p->op_state = state;
+}
+
+static inline int ocfs2_control_get_handshake_state(struct file *file)
+{
+ struct ocfs2_control_private *p = file->private_data;
+ return p->op_state;
+}
+
+/*
+ * Look up the live connection whose cluster connection is named @name.
+ *
+ * @name must be NUL-terminated. The caller must hold ocfs2_control_lock.
+ *
+ * Returns the matching entry from ocfs2_live_connection_list, or NULL
+ * if no connection has that name.
+ */
+static struct ocfs2_live_connection *ocfs2_connection_find(const char *name)
+{
+	size_t len = strlen(name);
+	struct ocfs2_live_connection *c;
+
+	BUG_ON(!mutex_is_locked(&ocfs2_control_lock));
+
+	list_for_each_entry(c, &ocfs2_live_connection_list, oc_list) {
+		if ((c->oc_conn->cc_namelen == len) &&
+		    !strncmp(c->oc_conn->cc_name, name, len))
+			return c;
+	}
+
+	/*
+	 * When list_for_each_entry() runs to completion the cursor is not
+	 * NULL; it points at a bogus entry computed from the list head.
+	 * Callers test the result against NULL, so report "not found"
+	 * explicitly.
+	 */
+	return NULL;
+}
+
+/*
+ * ocfs2_live_connection structures are created underneath the ocfs2
+ * mount path. Since the VFS prevents multiple calls to
+ * fill_super(), we can't get dupes here.
+ */
+static int ocfs2_live_connection_new(struct ocfs2_cluster_connection *conn,
+ struct ocfs2_live_connection **c_ret)
+{
+ int rc = 0;
+ struct ocfs2_live_connection *c;
+
+ c = kzalloc(sizeof(struct ocfs2_live_connection), GFP_KERNEL);
+ if (!c)
+ return -ENOMEM;
+
+ mutex_lock(&ocfs2_control_lock);
+ c->oc_conn = conn;
+
+ if (atomic_read(&ocfs2_control_opened))
+ list_add(&c->oc_list, &ocfs2_live_connection_list);
+ else {
+ printk(KERN_ERR
+ "ocfs2: Userspace control daemon is not present\n");
+ rc = -ESRCH;
+ }
+
+ mutex_unlock(&ocfs2_control_lock);
+
+ if (!rc)
+ *c_ret = c;
+ else
+ kfree(c);
+
+ return rc;
+}
+
+/*
+ * This function disconnects the cluster connection from ocfs2_control.
+ * Afterwards, userspace can't affect the cluster connection.
+ */
+static void ocfs2_live_connection_drop(struct ocfs2_live_connection *c)
+{
+ mutex_lock(&ocfs2_control_lock);
+ list_del_init(&c->oc_list);
+ c->oc_conn = NULL;
+ mutex_unlock(&ocfs2_control_lock);
+
+ kfree(c);
+}
+
+static int ocfs2_control_cfu(void *target, size_t target_len,
+ const char __user *buf, size_t count)
+{
+ /* The T01 expects write(2) calls to have exactly one command */
+ if ((count != target_len) ||
+ (count > sizeof(union ocfs2_control_message)))
+ return -EINVAL;
+
+ if (copy_from_user(target, buf, target_len))
+ return -EFAULT;
+
+ return 0;
+}
+
+static ssize_t ocfs2_control_validate_protocol(struct file *file,
+ const char __user *buf,
+ size_t count)
+{
+ ssize_t ret;
+ char kbuf[OCFS2_CONTROL_PROTO_LEN];
+
+ ret = ocfs2_control_cfu(kbuf, OCFS2_CONTROL_PROTO_LEN,
+ buf, count);
+ if (ret)
+ return ret;
+
+ if (strncmp(kbuf, OCFS2_CONTROL_PROTO, OCFS2_CONTROL_PROTO_LEN))
+ return -EINVAL;
+
+ ocfs2_control_set_handshake_state(file,
+ OCFS2_CONTROL_HANDSHAKE_PROTOCOL);
+
+ return count;
+}
+
+static void ocfs2_control_send_down(const char *uuid,
+ int nodenum)
+{
+ struct ocfs2_live_connection *c;
+
+ mutex_lock(&ocfs2_control_lock);
+
+ c = ocfs2_connection_find(uuid);
+ if (c) {
+ BUG_ON(c->oc_conn == NULL);
+ c->oc_conn->cc_recovery_handler(nodenum,
+ c->oc_conn->cc_recovery_data);
+ }
+
+ mutex_unlock(&ocfs2_control_lock);
+}
+
+/*
+ * Called whenever configuration elements are sent to /dev/ocfs2_control.
+ * If all configuration elements are present, try to set the global
+ * values. If there is a problem, return an error. Skip any missing
+ * elements, and only bump ocfs2_control_opened when we have all elements
+ * and are successful.
+ */
+static int ocfs2_control_install_private(struct file *file)
+{
+ int rc = 0;
+ int set_p = 1;
+ struct ocfs2_control_private *p = file->private_data;
+
+ BUG_ON(p->op_state != OCFS2_CONTROL_HANDSHAKE_PROTOCOL);
+
+ mutex_lock(&ocfs2_control_lock);
+
+ if (p->op_this_node < 0) {
+ set_p = 0;
+ } else if ((ocfs2_control_this_node >= 0) &&
+ (ocfs2_control_this_node != p->op_this_node)) {
+ rc = -EINVAL;
+ goto out_unlock;
+ }
+
+ if (!p->op_proto.pv_major) {
+ set_p = 0;
+ } else if (!list_empty(&ocfs2_live_connection_list) &&
+ ((running_proto.pv_major != p->op_proto.pv_major) ||
+ (running_proto.pv_minor != p->op_proto.pv_minor))) {
+ rc = -EINVAL;
+ goto out_unlock;
+ }
+
+ if (set_p) {
+ ocfs2_control_this_node = p->op_this_node;
+ running_proto.pv_major = p->op_proto.pv_major;
+ running_proto.pv_minor = p->op_proto.pv_minor;
+ }
+
+out_unlock:
+ mutex_unlock(&ocfs2_control_lock);
+
+ if (!rc && set_p) {
+ /* We set the global values successfully */
+ atomic_inc(&ocfs2_control_opened);
+ ocfs2_control_set_handshake_state(file,
+ OCFS2_CONTROL_HANDSHAKE_VALID);
+ }
+
+ return rc;
+}
+
+static int ocfs2_control_get_this_node(void)
+{
+ int rc;
+
+ mutex_lock(&ocfs2_control_lock);
+ if (ocfs2_control_this_node < 0)
+ rc = -EINVAL;
+ else
+ rc = ocfs2_control_this_node;
+ mutex_unlock(&ocfs2_control_lock);
+
+ return rc;
+}
+
+static int ocfs2_control_do_setnode_msg(struct file *file,
+ struct ocfs2_control_message_setn *msg)
+{
+ long nodenum;
+ char *ptr = NULL;
+ struct ocfs2_control_private *p = file->private_data;
+
+ if (ocfs2_control_get_handshake_state(file) !=
+ OCFS2_CONTROL_HANDSHAKE_PROTOCOL)
+ return -EINVAL;
+
+ if (strncmp(msg->tag, OCFS2_CONTROL_MESSAGE_SETNODE_OP,
+ OCFS2_CONTROL_MESSAGE_OP_LEN))
+ return -EINVAL;
+
+ if ((msg->space != ' ') || (msg->newline != '\n'))
+ return -EINVAL;
+ msg->space = msg->newline = '\0';
+
+ nodenum = simple_strtol(msg->nodestr, &ptr, 16);
+ if (!ptr || *ptr)
+ return -EINVAL;
+
+ if ((nodenum == LONG_MIN) || (nodenum == LONG_MAX) ||
+ (nodenum > INT_MAX) || (nodenum < 0))
+ return -ERANGE;
+ p->op_this_node = nodenum;
+
+ return ocfs2_control_install_private(file);
+}
+
+static int ocfs2_control_do_setversion_msg(struct file *file,
+ struct ocfs2_control_message_setv *msg)
+ {
+ long major, minor;
+ char *ptr = NULL;
+ struct ocfs2_control_private *p = file->private_data;
+ struct ocfs2_protocol_version *max =
+ &user_stack.sp_proto->lp_max_version;
+
+ if (ocfs2_control_get_handshake_state(file) !=
+ OCFS2_CONTROL_HANDSHAKE_PROTOCOL)
+ return -EINVAL;
+
+ if (strncmp(msg->tag, OCFS2_CONTROL_MESSAGE_SETVERSION_OP,
+ OCFS2_CONTROL_MESSAGE_OP_LEN))
+ return -EINVAL;
+
+ if ((msg->space1 != ' ') || (msg->space2 != ' ') ||
+ (msg->newline != '\n'))
+ return -EINVAL;
+ msg->space1 = msg->space2 = msg->newline = '\0';
+
+ major = simple_strtol(msg->major, &ptr, 16);
+ if (!ptr || *ptr)
+ return -EINVAL;
+ minor = simple_strtol(msg->minor, &ptr, 16);
+ if (!ptr || *ptr)
+ return -EINVAL;
+
+ /*
+ * The major must be between 1 and 255, inclusive. The minor
+ * must be between 0 and 255, inclusive. The version passed in
+ * must be within the maximum version supported by the filesystem.
+ */
+ if ((major == LONG_MIN) || (major == LONG_MAX) ||
+ (major > (u8)-1) || (major < 1))
+ return -ERANGE;
+ if ((minor == LONG_MIN) || (minor == LONG_MAX) ||
+ (minor > (u8)-1) || (minor < 0))
+ return -ERANGE;
+ if ((major != max->pv_major) ||
+ (minor > max->pv_minor))
+ return -EINVAL;
+
+ p->op_proto.pv_major = major;
+ p->op_proto.pv_minor = minor;
+
+ return ocfs2_control_install_private(file);
+}
+
+static int ocfs2_control_do_down_msg(struct file *file,
+ struct ocfs2_control_message_down *msg)
+{
+ long nodenum;
+ char *p = NULL;
+
+ if (ocfs2_control_get_handshake_state(file) !=
+ OCFS2_CONTROL_HANDSHAKE_VALID)
+ return -EINVAL;
+
+ if (strncmp(msg->tag, OCFS2_CONTROL_MESSAGE_DOWN_OP,
+ OCFS2_CONTROL_MESSAGE_OP_LEN))
+ return -EINVAL;
+
+ if ((msg->space1 != ' ') || (msg->space2 != ' ') ||
+ (msg->newline != '\n'))
+ return -EINVAL;
+ msg->space1 = msg->space2 = msg->newline = '\0';
+
+ nodenum = simple_strtol(msg->nodestr, &p, 16);
+ if (!p || *p)
+ return -EINVAL;
+
+ if ((nodenum == LONG_MIN) || (nodenum == LONG_MAX) ||
+ (nodenum > INT_MAX) || (nodenum < 0))
+ return -ERANGE;
+
+ ocfs2_control_send_down(msg->uuid, nodenum);
+
+ return 0;
+}
+
+static ssize_t ocfs2_control_message(struct file *file,
+ const char __user *buf,
+ size_t count)
+{
+ ssize_t ret;
+ union ocfs2_control_message msg;
+
+ /* Try to catch padding issues */
+ WARN_ON(offsetof(struct ocfs2_control_message_down, uuid) !=
+ (sizeof(msg.u_down.tag) + sizeof(msg.u_down.space1)));
+
+ memset(&msg, 0, sizeof(union ocfs2_control_message));
+ ret = ocfs2_control_cfu(&msg, count, buf, count);
+ if (ret)
+ goto out;
+
+ if ((count == OCFS2_CONTROL_MESSAGE_SETNODE_TOTAL_LEN) &&
+ !strncmp(msg.tag, OCFS2_CONTROL_MESSAGE_SETNODE_OP,
+ OCFS2_CONTROL_MESSAGE_OP_LEN))
+ ret = ocfs2_control_do_setnode_msg(file, &msg.u_setn);
+ else if ((count == OCFS2_CONTROL_MESSAGE_SETVERSION_TOTAL_LEN) &&
+ !strncmp(msg.tag, OCFS2_CONTROL_MESSAGE_SETVERSION_OP,
+ OCFS2_CONTROL_MESSAGE_OP_LEN))
+ ret = ocfs2_control_do_setversion_msg(file, &msg.u_setv);
+ else if ((count == OCFS2_CONTROL_MESSAGE_DOWN_TOTAL_LEN) &&
+ !strncmp(msg.tag, OCFS2_CONTROL_MESSAGE_DOWN_OP,
+ OCFS2_CONTROL_MESSAGE_OP_LEN))
+ ret = ocfs2_control_do_down_msg(file, &msg.u_down);
+ else
+ ret = -EINVAL;
+
+out:
+ return ret ? ret : count;
+}
+
+static ssize_t ocfs2_control_write(struct file *file,
+ const char __user *buf,
+ size_t count,
+ loff_t *ppos)
+{
+ ssize_t ret;
+
+ switch (ocfs2_control_get_handshake_state(file)) {
+ case OCFS2_CONTROL_HANDSHAKE_INVALID:
+ ret = -EINVAL;
+ break;
+
+ case OCFS2_CONTROL_HANDSHAKE_READ:
+ ret = ocfs2_control_validate_protocol(file, buf,
+ count);
+ break;
+
+ case OCFS2_CONTROL_HANDSHAKE_PROTOCOL:
+ case OCFS2_CONTROL_HANDSHAKE_VALID:
+ ret = ocfs2_control_message(file, buf, count);
+ break;
+
+ default:
+ BUG();
+ ret = -EIO;
+ break;
+ }
+
+ return ret;
+}
+
+/*
+ * This is a naive version. If we ever have a new protocol, we'll expand
+ * it. Probably using seq_file.
+ */
+static ssize_t ocfs2_control_read(struct file *file,
+ char __user *buf,
+ size_t count,
+ loff_t *ppos)
+{
+ char *proto_string = OCFS2_CONTROL_PROTO;
+ size_t to_write = 0;
+
+ if (*ppos >= OCFS2_CONTROL_PROTO_LEN)
+ return 0;
+
+ to_write = OCFS2_CONTROL_PROTO_LEN - *ppos;
+ if (to_write > count)
+ to_write = count;
+ if (copy_to_user(buf, proto_string + *ppos, to_write))
+ return -EFAULT;
+
+ *ppos += to_write;
+
+ /* Have we read the whole protocol list? */
+ if (*ppos >= OCFS2_CONTROL_PROTO_LEN)
+ ocfs2_control_set_handshake_state(file,
+ OCFS2_CONTROL_HANDSHAKE_READ);
+
+ return to_write;
+}
+
+/*
+ * release() handler for the ocfs2_control misc device.
+ *
+ * A file that completed the handshake (OCFS2_CONTROL_HANDSHAKE_VALID)
+ * holds one count in ocfs2_control_opened. When the last such file is
+ * closed while cluster connections are still live, an emergency restart
+ * is triggered. The last valid close also resets the global node number
+ * and the running locking protocol version.
+ *
+ * In every case the per-file private data is unlinked from its list and
+ * freed. Always returns 0.
+ */
+static int ocfs2_control_release(struct inode *inode, struct file *file)
+{
+	struct ocfs2_control_private *p = file->private_data;
+
+	mutex_lock(&ocfs2_control_lock);
+
+	/* Files that never finished the handshake hold no count */
+	if (ocfs2_control_get_handshake_state(file) !=
+	    OCFS2_CONTROL_HANDSHAKE_VALID)
+		goto out;
+
+	if (atomic_dec_and_test(&ocfs2_control_opened)) {
+		if (!list_empty(&ocfs2_live_connection_list)) {
+			/* XXX: Do bad things! */
+			printk(KERN_ERR
+			       "ocfs2: Unexpected release of ocfs2_control!\n"
+			       "       Loss of cluster connection requires "
+			       "an emergency restart!\n");
+			emergency_restart();
+		}
+		/*
+		 * Last valid close clears the node number and resets
+		 * the locking protocol version (both major and minor).
+		 */
+		ocfs2_control_this_node = -1;
+		running_proto.pv_major = 0;
+		running_proto.pv_minor = 0;
+	}
+
+out:
+	list_del_init(&p->op_list);
+	file->private_data = NULL;
+
+	mutex_unlock(&ocfs2_control_lock);
+
+	kfree(p);
+
+	return 0;
+}
+
+static int ocfs2_control_open(struct inode *inode, struct file *file)
+{
+ struct ocfs2_control_private *p;
+
+ p = kzalloc(sizeof(struct ocfs2_control_private), GFP_KERNEL);
+ if (!p)
+ return -ENOMEM;
+ p->op_this_node = -1;
+
+ mutex_lock(&ocfs2_control_lock);
+ file->private_data = p;
+ list_add(&p->op_list, &ocfs2_control_private_list);
+ mutex_unlock(&ocfs2_control_lock);
+
+ return 0;
+}
+
+static const struct file_operations ocfs2_control_fops = {
+ .open = ocfs2_control_open,
+ .release = ocfs2_control_release,
+ .read = ocfs2_control_read,
+ .write = ocfs2_control_write,
+ .owner = THIS_MODULE,
+};
+
+struct miscdevice ocfs2_control_device = {
+ .minor = MISC_DYNAMIC_MINOR,
+ .name = "ocfs2_control",
+ .fops = &ocfs2_control_fops,
+};
+
+static int ocfs2_control_init(void)
+{
+ int rc;
+
+ atomic_set(&ocfs2_control_opened, 0);
+
+ rc = misc_register(&ocfs2_control_device);
+ if (rc)
+ printk(KERN_ERR
+ "ocfs2: Unable to register ocfs2_control device "
+ "(errno %d)\n",
+ -rc);
+
+ return rc;
+}
+
+static void ocfs2_control_exit(void)
+{
+ int rc;
+
+ rc = misc_deregister(&ocfs2_control_device);
+ if (rc)
+ printk(KERN_ERR
+ "ocfs2: Unable to deregister ocfs2_control device "
+ "(errno %d)\n",
+ -rc);
+}
+
+static struct dlm_lksb *fsdlm_astarg_to_lksb(void *astarg)
+{
+ struct ocfs2_lock_res *res = astarg;
+ return &res->l_lksb.lksb_fsdlm;
+}
+
+static void fsdlm_lock_ast_wrapper(void *astarg)
+{
+ struct dlm_lksb *lksb = fsdlm_astarg_to_lksb(astarg);
+ int status = lksb->sb_status;
+
+ BUG_ON(user_stack.sp_proto == NULL);
+
+ /*
+ * For now we're punting on the issue of other non-standard errors
+ * where we can't tell if the unlock_ast or lock_ast should be called.
+ * The main "other error" that's possible is EINVAL which means the
+ * function was called with invalid args, which shouldn't be possible
+ * since the caller here is under our control. Other non-standard
+ * errors probably fall into the same category, or otherwise are fatal
+ * which means we can't carry on anyway.
+ */
+
+ if (status == -DLM_EUNLOCK || status == -DLM_ECANCEL)
+ user_stack.sp_proto->lp_unlock_ast(astarg, 0);
+ else
+ user_stack.sp_proto->lp_lock_ast(astarg);
+}
+
+static void fsdlm_blocking_ast_wrapper(void *astarg, int level)
+{
+ BUG_ON(user_stack.sp_proto == NULL);
+
+ user_stack.sp_proto->lp_blocking_ast(astarg, level);
+}
+
+static int user_dlm_lock(struct ocfs2_cluster_connection *conn,
+ int mode,
+ union ocfs2_dlm_lksb *lksb,
+ u32 flags,
+ void *name,
+ unsigned int namelen,
+ void *astarg)
+{
+ int ret;
+
+ if (!lksb->lksb_fsdlm.sb_lvbptr)
+ lksb->lksb_fsdlm.sb_lvbptr = (char *)lksb +
+ sizeof(struct dlm_lksb);
+
+ ret = dlm_lock(conn->cc_lockspace, mode, &lksb->lksb_fsdlm,
+ flags|DLM_LKF_NODLCKWT, name, namelen, 0,
+ fsdlm_lock_ast_wrapper, astarg,
+ fsdlm_blocking_ast_wrapper);
+ return ret;
+}
+
+static int user_dlm_unlock(struct ocfs2_cluster_connection *conn,
+ union ocfs2_dlm_lksb *lksb,
+ u32 flags,
+ void *astarg)
+{
+ int ret;
+
+ ret = dlm_unlock(conn->cc_lockspace, lksb->lksb_fsdlm.sb_lkid,
+ flags, &lksb->lksb_fsdlm, astarg);
+ return ret;
+}
+
+static int user_dlm_lock_status(union ocfs2_dlm_lksb *lksb)
+{
+ return lksb->lksb_fsdlm.sb_status;
+}
+
+static void *user_dlm_lvb(union ocfs2_dlm_lksb *lksb)
+{
+ return (void *)(lksb->lksb_fsdlm.sb_lvbptr);
+}
+
+static void user_dlm_dump_lksb(union ocfs2_dlm_lksb *lksb)
+{
+}
+
+/*
+ * Compare a requested locking protocol version against the current one.
+ *
+ * If the major numbers are different, they are incompatible.
+ * If the current minor is greater than the request, they are incompatible.
+ * If the current minor is less than or equal to the request, they are
+ * compatible, and the requester should run at the current minor version.
+ */
+static int fs_protocol_compare(struct ocfs2_protocol_version *existing,
+ struct ocfs2_protocol_version *request)
+{
+ if (existing->pv_major != request->pv_major)
+ return 1;
+
+ if (existing->pv_minor > request->pv_minor)
+ return 1;
+
+ if (existing->pv_minor < request->pv_minor)
+ request->pv_minor = existing->pv_minor;
+
+ return 0;
+}
+
+static int user_cluster_connect(struct ocfs2_cluster_connection *conn)
+{
+ dlm_lockspace_t *fsdlm;
+ struct ocfs2_live_connection *control;
+ int rc = 0;
+
+ BUG_ON(conn == NULL);
+
+ rc = ocfs2_live_connection_new(conn, &control);
+ if (rc)
+ goto out;
+
+ /*
+ * running_proto must have been set before we allowed any mounts
+ * to proceed.
+ */
+ if (fs_protocol_compare(&running_proto, &conn->cc_version)) {
+ printk(KERN_ERR
+ "Unable to mount with fs locking protocol version "
+ "%u.%u because the userspace control daemon has "
+ "negotiated %u.%u\n",
+ conn->cc_version.pv_major, conn->cc_version.pv_minor,
+ running_proto.pv_major, running_proto.pv_minor);
+ rc = -EPROTO;
+ ocfs2_live_connection_drop(control);
+ goto out;
+ }
+
+ rc = dlm_new_lockspace(conn->cc_name, strlen(conn->cc_name),
+ &fsdlm, DLM_LSFL_FS, DLM_LVB_LEN);
+ if (rc) {
+ ocfs2_live_connection_drop(control);
+ goto out;
+ }
+
+ conn->cc_private = control;
+ conn->cc_lockspace = fsdlm;
+out:
+ return rc;
+}
+
+static int user_cluster_disconnect(struct ocfs2_cluster_connection *conn,
+ int hangup_pending)
+{
+ dlm_release_lockspace(conn->cc_lockspace, 2);
+ conn->cc_lockspace = NULL;
+ ocfs2_live_connection_drop(conn->cc_private);
+ conn->cc_private = NULL;
+ return 0;
+}
+
+static int user_cluster_this_node(unsigned int *this_node)
+{
+ int rc;
+
+ rc = ocfs2_control_get_this_node();
+ if (rc < 0)
+ return rc;
+
+ *this_node = rc;
+ return 0;
+}
+
+static struct ocfs2_stack_operations user_stack_ops = {
+ .connect = user_cluster_connect,
+ .disconnect = user_cluster_disconnect,
+ .this_node = user_cluster_this_node,
+ .dlm_lock = user_dlm_lock,
+ .dlm_unlock = user_dlm_unlock,
+ .lock_status = user_dlm_lock_status,
+ .lock_lvb = user_dlm_lvb,
+ .dump_lksb = user_dlm_dump_lksb,
+};
+
+static struct ocfs2_stack_plugin user_stack = {
+ .sp_name = "user",
+ .sp_ops = &user_stack_ops,
+ .sp_owner = THIS_MODULE,
+};
+
+
+static int __init user_stack_init(void)
+{
+ int rc;
+
+ rc = ocfs2_control_init();
+ if (!rc) {
+ rc = ocfs2_stack_glue_register(&user_stack);
+ if (rc)
+ ocfs2_control_exit();
+ }
+
+ return rc;
+}
+
+static void __exit user_stack_exit(void)
+{
+ ocfs2_stack_glue_unregister(&user_stack);
+ ocfs2_control_exit();
+}
+
+MODULE_AUTHOR("Oracle");
+MODULE_DESCRIPTION("ocfs2 driver for userspace cluster stacks");
+MODULE_LICENSE("GPL");
+module_init(user_stack_init);
+module_exit(user_stack_exit);
diff --git a/fs/ocfs2/stackglue.c b/fs/ocfs2/stackglue.c
new file mode 100644
index 00000000000..119f60cea9c
--- /dev/null
+++ b/fs/ocfs2/stackglue.c
@@ -0,0 +1,568 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * stackglue.c
+ *
+ * Code which implements an OCFS2 specific interface to underlying
+ * cluster stacks.
+ *
+ * Copyright (C) 2007 Oracle. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation, version 2.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+
+#include <linux/list.h>
+#include <linux/spinlock.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/kmod.h>
+#include <linux/fs.h>
+#include <linux/kobject.h>
+#include <linux/sysfs.h>
+
+#include "ocfs2_fs.h"
+
+#include "stackglue.h"
+
+#define OCFS2_STACK_PLUGIN_O2CB "o2cb"
+#define OCFS2_STACK_PLUGIN_USER "user"
+
+static struct ocfs2_locking_protocol *lproto;
+static DEFINE_SPINLOCK(ocfs2_stack_lock);
+static LIST_HEAD(ocfs2_stack_list);
+static char cluster_stack_name[OCFS2_STACK_LABEL_LEN + 1];
+
+/*
+ * The stack currently in use. If not null, active_stack->sp_count > 0,
+ * the module is pinned, and the locking protocol cannot be changed.
+ */
+static struct ocfs2_stack_plugin *active_stack;
+
+static struct ocfs2_stack_plugin *ocfs2_stack_lookup(const char *name)
+{
+ struct ocfs2_stack_plugin *p;
+
+ assert_spin_locked(&ocfs2_stack_lock);
+
+ list_for_each_entry(p, &ocfs2_stack_list, sp_list) {
+ if (!strcmp(p->sp_name, name))
+ return p;
+ }
+
+ return NULL;
+}
+
+/*
+ * Pin the stack plugin @plugin_name on behalf of cluster stack
+ * @stack_name.
+ *
+ * Every successful call takes one reference in ->sp_count; each one is
+ * dropped again by ocfs2_stack_driver_put(). The plugin's module is
+ * pinned only when the plugin first becomes the active stack.
+ *
+ * Returns 0 on success, -EBUSY if @stack_name is not the selected
+ * cluster stack or a different plugin is already active, and -ENOENT
+ * if the plugin is not registered or its module cannot be pinned.
+ */
+static int ocfs2_stack_driver_request(const char *stack_name,
+				      const char *plugin_name)
+{
+	int rc;
+	struct ocfs2_stack_plugin *p;
+
+	spin_lock(&ocfs2_stack_lock);
+
+	/*
+	 * If the stack passed by the filesystem isn't the selected one,
+	 * we can't continue.
+	 */
+	if (strcmp(stack_name, cluster_stack_name)) {
+		rc = -EBUSY;
+		goto out;
+	}
+
+	if (active_stack) {
+		/*
+		 * If the active stack isn't the one we want, it cannot
+		 * be selected right now.
+		 */
+		if (!strcmp(active_stack->sp_name, plugin_name))
+			rc = 0;
+		else
+			rc = -EBUSY;
+		goto out;
+	}
+
+	p = ocfs2_stack_lookup(plugin_name);
+	if (!p || !try_module_get(p->sp_owner)) {
+		rc = -ENOENT;
+		goto out;
+	}
+
+	active_stack = p;
+	rc = 0;
+
+out:
+	/*
+	 * Take a reference for every successful request, including the
+	 * case where the stack was already active. Each connection later
+	 * calls ocfs2_stack_driver_put(), so skipping the increment here
+	 * would unpin the stack while other connections still use it.
+	 */
+	if (!rc)
+		active_stack->sp_count++;
+
+	spin_unlock(&ocfs2_stack_lock);
+	return rc;
+}
+
+/*
+ * This function looks up the appropriate stack and makes it active. If
+ * there is no stack, it tries to load it. It will fail if the stack still
+ * cannot be found. It will also fail if a different stack is in use.
+ */
+static int ocfs2_stack_driver_get(const char *stack_name)
+{
+ int rc;
+ char *plugin_name = OCFS2_STACK_PLUGIN_O2CB;
+
+ /*
+ * Classic stack does not pass in a stack name. This is
+ * compatible with older tools as well.
+ */
+ if (!stack_name || !*stack_name)
+ stack_name = OCFS2_STACK_PLUGIN_O2CB;
+
+ if (strlen(stack_name) != OCFS2_STACK_LABEL_LEN) {
+ printk(KERN_ERR
+ "ocfs2 passed an invalid cluster stack label: \"%s\"\n",
+ stack_name);
+ return -EINVAL;
+ }
+
+ /* Anything that isn't the classic stack is a user stack */
+ if (strcmp(stack_name, OCFS2_STACK_PLUGIN_O2CB))
+ plugin_name = OCFS2_STACK_PLUGIN_USER;
+
+ rc = ocfs2_stack_driver_request(stack_name, plugin_name);
+ if (rc == -ENOENT) {
+ request_module("ocfs2_stack_%s", plugin_name);
+ rc = ocfs2_stack_driver_request(stack_name, plugin_name);
+ }
+
+ if (rc == -ENOENT) {
+ printk(KERN_ERR
+ "ocfs2: Cluster stack driver \"%s\" cannot be found\n",
+ plugin_name);
+ } else if (rc == -EBUSY) {
+ printk(KERN_ERR
+ "ocfs2: A different cluster stack is in use\n");
+ }
+
+ return rc;
+}
+
+static void ocfs2_stack_driver_put(void)
+{
+ spin_lock(&ocfs2_stack_lock);
+ BUG_ON(active_stack == NULL);
+ BUG_ON(active_stack->sp_count == 0);
+
+ active_stack->sp_count--;
+ if (!active_stack->sp_count) {
+ module_put(active_stack->sp_owner);
+ active_stack = NULL;
+ }
+ spin_unlock(&ocfs2_stack_lock);
+}
+
+int ocfs2_stack_glue_register(struct ocfs2_stack_plugin *plugin)
+{
+ int rc;
+
+ spin_lock(&ocfs2_stack_lock);
+ if (!ocfs2_stack_lookup(plugin->sp_name)) {
+ plugin->sp_count = 0;
+ plugin->sp_proto = lproto;
+ list_add(&plugin->sp_list, &ocfs2_stack_list);
+ printk(KERN_INFO "ocfs2: Registered cluster interface %s\n",
+ plugin->sp_name);
+ rc = 0;
+ } else {
+ printk(KERN_ERR "ocfs2: Stack \"%s\" already registered\n",
+ plugin->sp_name);
+ rc = -EEXIST;
+ }
+ spin_unlock(&ocfs2_stack_lock);
+
+ return rc;
+}
+EXPORT_SYMBOL_GPL(ocfs2_stack_glue_register);
+
+void ocfs2_stack_glue_unregister(struct ocfs2_stack_plugin *plugin)
+{
+ struct ocfs2_stack_plugin *p;
+
+ spin_lock(&ocfs2_stack_lock);
+ p = ocfs2_stack_lookup(plugin->sp_name);
+ if (p) {
+ BUG_ON(p != plugin);
+ BUG_ON(plugin == active_stack);
+ BUG_ON(plugin->sp_count != 0);
+ list_del_init(&plugin->sp_list);
+ printk(KERN_INFO "ocfs2: Unregistered cluster interface %s\n",
+ plugin->sp_name);
+ } else {
+ printk(KERN_ERR "Stack \"%s\" is not registered\n",
+ plugin->sp_name);
+ }
+ spin_unlock(&ocfs2_stack_lock);
+}
+EXPORT_SYMBOL_GPL(ocfs2_stack_glue_unregister);
+
+void ocfs2_stack_glue_set_locking_protocol(struct ocfs2_locking_protocol *proto)
+{
+ struct ocfs2_stack_plugin *p;
+
+ BUG_ON(proto == NULL);
+
+ spin_lock(&ocfs2_stack_lock);
+ BUG_ON(active_stack != NULL);
+
+ lproto = proto;
+ list_for_each_entry(p, &ocfs2_stack_list, sp_list) {
+ p->sp_proto = lproto;
+ }
+
+ spin_unlock(&ocfs2_stack_lock);
+}
+EXPORT_SYMBOL_GPL(ocfs2_stack_glue_set_locking_protocol);
+
+
+/*
+ * The ocfs2_dlm_lock() and ocfs2_dlm_unlock() functions take
+ * "struct ocfs2_lock_res *astarg" instead of "void *astarg" because the
+ * underlying stack plugins need to pilfer the lksb off of the lock_res.
+ * If some other structure needs to be passed as an astarg, the plugins
+ * will need to be given a different avenue to the lksb.
+ */
+int ocfs2_dlm_lock(struct ocfs2_cluster_connection *conn,
+ int mode,
+ union ocfs2_dlm_lksb *lksb,
+ u32 flags,
+ void *name,
+ unsigned int namelen,
+ struct ocfs2_lock_res *astarg)
+{
+ BUG_ON(lproto == NULL);
+
+ return active_stack->sp_ops->dlm_lock(conn, mode, lksb, flags,
+ name, namelen, astarg);
+}
+EXPORT_SYMBOL_GPL(ocfs2_dlm_lock);
+
+int ocfs2_dlm_unlock(struct ocfs2_cluster_connection *conn,
+ union ocfs2_dlm_lksb *lksb,
+ u32 flags,
+ struct ocfs2_lock_res *astarg)
+{
+ BUG_ON(lproto == NULL);
+
+ return active_stack->sp_ops->dlm_unlock(conn, lksb, flags, astarg);
+}
+EXPORT_SYMBOL_GPL(ocfs2_dlm_unlock);
+
+int ocfs2_dlm_lock_status(union ocfs2_dlm_lksb *lksb)
+{
+ return active_stack->sp_ops->lock_status(lksb);
+}
+EXPORT_SYMBOL_GPL(ocfs2_dlm_lock_status);
+
+/*
+ * Why don't we cast to ocfs2_meta_lvb? The "clean" answer is that we
+ * don't cast at the glue level. The real answer is that the header
+ * ordering is nigh impossible.
+ */
+void *ocfs2_dlm_lvb(union ocfs2_dlm_lksb *lksb)
+{
+ return active_stack->sp_ops->lock_lvb(lksb);
+}
+EXPORT_SYMBOL_GPL(ocfs2_dlm_lvb);
+
+void ocfs2_dlm_dump_lksb(union ocfs2_dlm_lksb *lksb)
+{
+ active_stack->sp_ops->dump_lksb(lksb);
+}
+EXPORT_SYMBOL_GPL(ocfs2_dlm_dump_lksb);
+
+int ocfs2_cluster_connect(const char *stack_name,
+ const char *group,
+ int grouplen,
+ void (*recovery_handler)(int node_num,
+ void *recovery_data),
+ void *recovery_data,
+ struct ocfs2_cluster_connection **conn)
+{
+ int rc = 0;
+ struct ocfs2_cluster_connection *new_conn;
+
+ BUG_ON(group == NULL);
+ BUG_ON(conn == NULL);
+ BUG_ON(recovery_handler == NULL);
+
+ if (grouplen > GROUP_NAME_MAX) {
+ rc = -EINVAL;
+ goto out;
+ }
+
+ new_conn = kzalloc(sizeof(struct ocfs2_cluster_connection),
+ GFP_KERNEL);
+ if (!new_conn) {
+ rc = -ENOMEM;
+ goto out;
+ }
+
+ memcpy(new_conn->cc_name, group, grouplen);
+ new_conn->cc_namelen = grouplen;
+ new_conn->cc_recovery_handler = recovery_handler;
+ new_conn->cc_recovery_data = recovery_data;
+
+ /* Start the new connection at our maximum compatibility level */
+ new_conn->cc_version = lproto->lp_max_version;
+
+ /* This will pin the stack driver if successful */
+ rc = ocfs2_stack_driver_get(stack_name);
+ if (rc)
+ goto out_free;
+
+ rc = active_stack->sp_ops->connect(new_conn);
+ if (rc) {
+ ocfs2_stack_driver_put();
+ goto out_free;
+ }
+
+ *conn = new_conn;
+
+out_free:
+ if (rc)
+ kfree(new_conn);
+
+out:
+ return rc;
+}
+EXPORT_SYMBOL_GPL(ocfs2_cluster_connect);
+
+/*
+ * Detach a connection from the active stack.
+ *
+ * When ->disconnect() succeeds the connection is freed, and the stack
+ * driver is dropped as well unless hangup_pending is non-zero.  When it
+ * fails, its error is returned and nothing is released.
+ */
+int ocfs2_cluster_disconnect(struct ocfs2_cluster_connection *conn,
+			     int hangup_pending)
+{
+	int rc;
+
+	BUG_ON(!conn);
+
+	rc = active_stack->sp_ops->disconnect(conn, hangup_pending);
+	if (rc)
+		return rc;	/* XXX Should we free it anyway? */
+
+	kfree(conn);
+	if (!hangup_pending)
+		ocfs2_stack_driver_put();
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(ocfs2_cluster_disconnect);
+
+/*
+ * Finish a disconnect that was issued with hangup_pending set: run the
+ * stack's ->hangup() op if it has one, then drop the stack driver.
+ *
+ * group must be non-NULL and NUL-terminated at index grouplen.
+ */
+void ocfs2_cluster_hangup(const char *group, int grouplen)
+{
+	void (*hangup_op)(const char *group, int grouplen);
+
+	BUG_ON(!group);
+	BUG_ON(group[grouplen] != '\0');
+
+	hangup_op = active_stack->sp_ops->hangup;
+	if (hangup_op)
+		hangup_op(group, grouplen);
+
+	/* cluster_disconnect() was called with hangup_pending==1 */
+	ocfs2_stack_driver_put();
+}
+EXPORT_SYMBOL_GPL(ocfs2_cluster_hangup);
+
+/*
+ * Query the active stack for the local node's identifier; the result of
+ * the stack's ->this_node() op is returned unchanged.
+ */
+int ocfs2_cluster_this_node(unsigned int *node)
+{
+	struct ocfs2_stack_operations *ops = active_stack->sp_ops;
+
+	return ops->this_node(node);
+}
+EXPORT_SYMBOL_GPL(ocfs2_cluster_this_node);
+
+
+/*
+ * Sysfs bits
+ */
+
+/*
+ * sysfs show handler for "max_locking_protocol": prints lproto's maximum
+ * version as "major.minor\n".  Nothing is written (and 0 is returned)
+ * while no locking protocol is set.
+ */
+static ssize_t ocfs2_max_locking_protocol_show(struct kobject *kobj,
+					       struct kobj_attribute *attr,
+					       char *buf)
+{
+	ssize_t len = 0;
+
+	spin_lock(&ocfs2_stack_lock);
+	if (lproto) {
+		const struct ocfs2_protocol_version *max =
+			&lproto->lp_max_version;
+
+		len = snprintf(buf, PAGE_SIZE, "%u.%u\n",
+			       max->pv_major, max->pv_minor);
+	}
+	spin_unlock(&ocfs2_stack_lock);
+
+	return len;
+}
+
+static struct kobj_attribute ocfs2_attr_max_locking_protocol =
+	__ATTR(max_locking_protocol, S_IFREG | S_IRUGO,
+	       ocfs2_max_locking_protocol_show, NULL);
+
+/*
+ * sysfs show handler for "loaded_cluster_plugins": emits the sp_name of
+ * every plugin on ocfs2_stack_list, one per line.
+ *
+ * Returns the number of bytes written to buf, or -E2BIG if the names do
+ * not fit in the PAGE_SIZE buffer.
+ */
+static ssize_t ocfs2_loaded_cluster_plugins_show(struct kobject *kobj,
+						 struct kobj_attribute *attr,
+						 char *buf)
+{
+	ssize_t ret = 0, total = 0, remain = PAGE_SIZE;
+	struct ocfs2_stack_plugin *p;
+
+	spin_lock(&ocfs2_stack_lock);
+	list_for_each_entry(p, &ocfs2_stack_list, sp_list) {
+		/* Append after what was already written, don't overwrite it */
+		ret = snprintf(buf + total, remain, "%s\n",
+			       p->sp_name);
+		if (ret < 0) {
+			total = ret;
+			break;
+		}
+		/*
+		 * snprintf() returns the length the full string would
+		 * have had, so anything >= remain was truncated.
+		 */
+		if (ret >= remain) {
+			total = -E2BIG;
+			break;
+		}
+		total += ret;
+		remain -= ret;
+	}
+	spin_unlock(&ocfs2_stack_lock);
+
+	return total;
+}
+
+static struct kobj_attribute ocfs2_attr_loaded_cluster_plugins =
+	__ATTR(loaded_cluster_plugins, S_IFREG | S_IRUGO,
+	       ocfs2_loaded_cluster_plugins_show, NULL);
+
+/*
+ * sysfs show handler for "active_cluster_plugin": prints the sp_name of
+ * the active stack followed by a newline.
+ *
+ * Returns the number of bytes written, 0 when no stack is active, or
+ * -E2BIG if the name does not fit in the PAGE_SIZE buffer.
+ */
+static ssize_t ocfs2_active_cluster_plugin_show(struct kobject *kobj,
+						struct kobj_attribute *attr,
+						char *buf)
+{
+	ssize_t ret = 0;
+
+	spin_lock(&ocfs2_stack_lock);
+	if (active_stack) {
+		ret = snprintf(buf, PAGE_SIZE, "%s\n",
+			       active_stack->sp_name);
+		/*
+		 * snprintf() returns the untruncated length, so any value
+		 * >= PAGE_SIZE means the output was cut short.
+		 */
+		if (ret >= PAGE_SIZE)
+			ret = -E2BIG;
+	}
+	spin_unlock(&ocfs2_stack_lock);
+
+	return ret;
+}
+
+static struct kobj_attribute ocfs2_attr_active_cluster_plugin =
+	__ATTR(active_cluster_plugin, S_IFREG | S_IRUGO,
+	       ocfs2_active_cluster_plugin_show, NULL);
+
+/*
+ * sysfs show handler for "cluster_stack": prints cluster_stack_name
+ * followed by a newline and returns snprintf()'s result.
+ */
+static ssize_t ocfs2_cluster_stack_show(struct kobject *kobj,
+					struct kobj_attribute *attr,
+					char *buf)
+{
+	ssize_t len;
+
+	spin_lock(&ocfs2_stack_lock);
+	len = snprintf(buf, PAGE_SIZE, "%s\n", cluster_stack_name);
+	spin_unlock(&ocfs2_stack_lock);
+
+	return len;
+}
+
+/*
+ * sysfs store handler for "cluster_stack".
+ *
+ * The written value, minus one optional trailing newline, must be exactly
+ * OCFS2_STACK_LABEL_LEN characters with no embedded NUL, else -EINVAL.
+ * While a stack is active the name may only be rewritten with its current
+ * value; any other value yields -EBUSY.  An empty write returns 0.
+ * On success the full count is returned.
+ */
+static ssize_t ocfs2_cluster_stack_store(struct kobject *kobj,
+					 struct kobj_attribute *attr,
+					 const char *buf, size_t count)
+{
+	size_t label_len = count;
+	ssize_t rc = count;
+
+	if (!count)
+		return 0;
+
+	/* Tolerate a single trailing newline */
+	if (buf[label_len - 1] == '\n')
+		label_len--;
+
+	if (label_len != OCFS2_STACK_LABEL_LEN)
+		return -EINVAL;
+	if (strnlen(buf, label_len) != label_len)
+		return -EINVAL;
+
+	spin_lock(&ocfs2_stack_lock);
+	if (!active_stack)
+		memcpy(cluster_stack_name, buf, label_len);
+	else if (strncmp(buf, cluster_stack_name, label_len))
+		rc = -EBUSY;
+	spin_unlock(&ocfs2_stack_lock);
+
+	return rc;
+}
+
+
+static struct kobj_attribute ocfs2_attr_cluster_stack =
+	__ATTR(cluster_stack, S_IFREG | S_IRUGO | S_IWUSR,
+	       ocfs2_cluster_stack_show,
+	       ocfs2_cluster_stack_store);
+
+/* NULL-terminated list of the sysfs attributes defined in this file. */
+static struct attribute *ocfs2_attrs[] = {
+ &ocfs2_attr_max_locking_protocol.attr,
+ &ocfs2_attr_loaded_cluster_plugins.attr,
+ &ocfs2_attr_active_cluster_plugin.attr,
+ &ocfs2_attr_cluster_stack.attr,
+ NULL,
+};
+
+/* Group wrapper around ocfs2_attrs, passed to sysfs_create_group(). */
+static struct attribute_group ocfs2_attr_group = {
+ .attrs = ocfs2_attrs,
+};
+
+/* The "ocfs2" kset created under fs_kobj by ocfs2_sysfs_init(). */
+static struct kset *ocfs2_kset;
+
+/* Tear down the sysfs state by unregistering ocfs2_kset. */
+static void ocfs2_sysfs_exit(void)
+{
+ kset_unregister(ocfs2_kset);
+}
+
+/*
+ * Create the "ocfs2" kset under fs_kobj and attach ocfs2_attr_group to it.
+ *
+ * Returns 0 on success, -ENOMEM if the kset cannot be created, or the
+ * error from sysfs_create_group() (the kset is unregistered first).
+ */
+static int ocfs2_sysfs_init(void)
+{
+	int rc;
+
+	ocfs2_kset = kset_create_and_add("ocfs2", NULL, fs_kobj);
+	if (!ocfs2_kset)
+		return -ENOMEM;
+
+	rc = sysfs_create_group(&ocfs2_kset->kobj, &ocfs2_attr_group);
+	if (rc)
+		kset_unregister(ocfs2_kset);
+
+	return rc;
+}
+
+/*
+ * Module init: default cluster_stack_name to OCFS2_STACK_PLUGIN_O2CB,
+ * then set up sysfs.  Returns the result of ocfs2_sysfs_init().
+ */
+static int __init ocfs2_stack_glue_init(void)
+{
+ strcpy(cluster_stack_name, OCFS2_STACK_PLUGIN_O2CB);
+
+ return ocfs2_sysfs_init();
+}
+
+/* Module exit: forget the locking protocol and tear down sysfs. */
+static void __exit ocfs2_stack_glue_exit(void)
+{
+ lproto = NULL;
+ ocfs2_sysfs_exit();
+}
+
+/* Module metadata and entry points for the stack glue layer. */
+MODULE_AUTHOR("Oracle");
+MODULE_DESCRIPTION("ocfs2 cluster stack glue layer");
+MODULE_LICENSE("GPL");
+module_init(ocfs2_stack_glue_init);
+module_exit(ocfs2_stack_glue_exit);
diff --git a/fs/ocfs2/stackglue.h b/fs/ocfs2/stackglue.h
new file mode 100644
index 00000000000..005e4f170e0
--- /dev/null
+++ b/fs/ocfs2/stackglue.h
@@ -0,0 +1,261 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * stackglue.h
+ *
+ * Glue to the underlying cluster stack.
+ *
+ * Copyright (C) 2007 Oracle. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation, version 2.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+
+
+#ifndef STACKGLUE_H
+#define STACKGLUE_H
+
+#include <linux/types.h>
+#include <linux/list.h>
+#include <linux/dlmconstants.h>
+
+#include "dlm/dlmapi.h"
+#include <linux/dlm.h>
+
+/*
+ * dlmconstants.h does not have a LOCAL flag. We hope to remove it
+ * some day, but right now we need it. Let's fake it. This value is larger
+ * than any flag in dlmconstants.h.
+ */
+#define DLM_LKF_LOCAL 0x00100000
+
+/*
+ * This shadows DLM_LOCKSPACE_LEN in fs/dlm/dlm_internal.h. That probably
+ * wants to be in a public header.
+ */
+#define GROUP_NAME_MAX 64
+
+
+/*
+ * ocfs2_protocol_version changes when ocfs2 does something different in
+ * its inter-node behavior. See dlmglue.c for more information.
+ *
+ * The glue layer's sysfs code prints it as "pv_major.pv_minor".
+ */
+struct ocfs2_protocol_version {
+ u8 pv_major;
+ u8 pv_minor;
+};
+
+/*
+ * The ocfs2_locking_protocol defines the handlers called on ocfs2's behalf.
+ *
+ * lp_max_version is the version each new cluster connection starts at.
+ * NOTE(review): the three callbacks are presumably the lock, blocking and
+ * unlock AST handlers invoked by the stack plugin - confirm against the
+ * plugin implementations.
+ */
+struct ocfs2_locking_protocol {
+ struct ocfs2_protocol_version lp_max_version;
+ void (*lp_lock_ast)(void *astarg);
+ void (*lp_blocking_ast)(void *astarg, int level);
+ void (*lp_unlock_ast)(void *astarg, int error);
+};
+
+
+/*
+ * The dlm_lockstatus struct includes lvb space, but the dlm_lksb struct only
+ * has a pointer to separately allocated lvb space. This struct exists only to
+ * include in the lksb union to make space for a combined dlm_lksb and lvb.
+ */
+struct fsdlm_lksb_plus_lvb {
+ struct dlm_lksb lksb;
+ char lvb[DLM_LVB_LEN];
+};
+
+/*
+ * A union of all lock status structures. We define it here so that the
+ * size of the union is known. Lock status structures are embedded in
+ * ocfs2 inodes.
+ */
+union ocfs2_dlm_lksb {
+ struct dlm_lockstatus lksb_o2dlm;
+ struct dlm_lksb lksb_fsdlm;
+ struct fsdlm_lksb_plus_lvb padding;
+};
+
+/*
+ * A cluster connection. Mostly opaque to ocfs2, the connection holds
+ * state for the underlying stack. ocfs2 does use cc_version to determine
+ * locking compatibility.
+ *
+ * cc_name/cc_namelen:  group name and its length, as copied in by
+ *                      ocfs2_cluster_connect().
+ * cc_version:          initialised to the locking protocol's maximum
+ *                      version when the connection is created.
+ * cc_recovery_handler: callback for node down notifications;
+ *                      cc_recovery_data is the argument handed to it.
+ * cc_lockspace:        set by the stack's ->connect() to the filesystem
+ *                      lockspace.
+ * cc_private:          any other per-connection state the stack keeps.
+ */
+struct ocfs2_cluster_connection {
+ char cc_name[GROUP_NAME_MAX];
+ int cc_namelen;
+ struct ocfs2_protocol_version cc_version;
+ void (*cc_recovery_handler)(int node_num, void *recovery_data);
+ void *cc_recovery_data;
+ void *cc_lockspace;
+ void *cc_private;
+};
+
+/*
+ * Each cluster stack implements the stack operations structure. Not used
+ * in the ocfs2 code, the stackglue code translates generic cluster calls
+ * into stack operations.
+ */
+struct ocfs2_stack_operations {
+ /*
+ * The fs code calls ocfs2_cluster_connect() to attach a new
+ * filesystem to the cluster stack. The ->connect() op is passed
+ * an ocfs2_cluster_connection with the name and recovery field
+ * filled in.
+ *
+ * The stack must set up any notification mechanisms and create
+ * the filesystem lockspace in the DLM. The lockspace should be
+ * stored on cc_lockspace. Any other information can be stored on
+ * cc_private.
+ *
+ * ->connect() must not return until it is guaranteed that
+ *
+ * - Node down notifications for the filesystem will be received
+ * and passed to conn->cc_recovery_handler().
+ * - Locking requests for the filesystem will be processed.
+ */
+ int (*connect)(struct ocfs2_cluster_connection *conn);
+
+ /*
+ * The fs code calls ocfs2_cluster_disconnect() when a filesystem
+ * no longer needs cluster services. All DLM locks have been
+ * dropped, and recovery notification is being ignored by the
+ * fs code. The stack must disengage from the DLM and discontinue
+ * recovery notification.
+ *
+ * Once ->disconnect() has returned, the connection structure will
+ * be freed. Thus, a stack must not return from ->disconnect()
+ * until it will no longer reference the conn pointer.
+ *
+ * If hangup_pending is zero, ocfs2_cluster_disconnect() will also
+ * be dropping the reference on the module.
+ */
+ int (*disconnect)(struct ocfs2_cluster_connection *conn,
+ int hangup_pending);
+
+ /*
+ * ocfs2_cluster_hangup() exists for compatibility with older
+ * ocfs2 tools. Only the classic stack really needs it. As such
+ * ->hangup() is not required of all stacks. See the comment by
+ * ocfs2_cluster_hangup() for more details.
+ *
+ * Note that ocfs2_cluster_hangup() can only be called if
+ * hangup_pending was passed to ocfs2_cluster_disconnect().
+ */
+ void (*hangup)(const char *group, int grouplen);
+
+ /*
+ * ->this_node() returns the cluster's unique identifier for the
+ * local node.
+ */
+ int (*this_node)(unsigned int *node);
+
+ /*
+ * Call the underlying dlm lock function. The ->dlm_lock()
+ * callback should convert the flags and mode as appropriate.
+ *
+ * ast and bast functions are not part of the call because the
+ * stack will likely want to wrap ast and bast calls before passing
+ * them to stack->sp_proto.
+ */
+ int (*dlm_lock)(struct ocfs2_cluster_connection *conn,
+ int mode,
+ union ocfs2_dlm_lksb *lksb,
+ u32 flags,
+ void *name,
+ unsigned int namelen,
+ void *astarg);
+
+ /*
+ * Call the underlying dlm unlock function. The ->dlm_unlock()
+ * function should convert the flags as appropriate.
+ *
+ * The unlock ast is not passed, as the stack will want to wrap
+ * it before calling stack->sp_proto->lp_unlock_ast().
+ */
+ int (*dlm_unlock)(struct ocfs2_cluster_connection *conn,
+ union ocfs2_dlm_lksb *lksb,
+ u32 flags,
+ void *astarg);
+
+ /*
+ * Return the status of the current lock status block. The fs
+ * code should never dereference the union. The ->lock_status()
+ * callback pulls out the stack-specific lksb, converts the status
+ * to a proper errno, and returns it.
+ */
+ int (*lock_status)(union ocfs2_dlm_lksb *lksb);
+
+ /*
+ * Pull the lvb pointer off of the stack-specific lksb.
+ */
+ void *(*lock_lvb)(union ocfs2_dlm_lksb *lksb);
+
+ /*
+ * This is an optional debugging hook. If provided, the
+ * stack can dump debugging information about this lock.
+ */
+ void (*dump_lksb)(union ocfs2_dlm_lksb *lksb);
+};
+
+/*
+ * Each stack plugin must describe itself by registering a
+ * ocfs2_stack_plugin structure. This is only seen by stackglue and the
+ * stack driver.
+ *
+ * sp_name is the name reported through sysfs and sp_ops is the plugin's
+ * operations table.  NOTE(review): sp_owner is presumably the module that
+ * provides the plugin - confirm.
+ */
+struct ocfs2_stack_plugin {
+ char *sp_name;
+ struct ocfs2_stack_operations *sp_ops;
+ struct module *sp_owner;
+
+ /* These are managed by the stackglue code. */
+ struct list_head sp_list; /* linkage on the glue's plugin list */
+ unsigned int sp_count; /* NOTE(review): looks like a use count - confirm */
+ struct ocfs2_locking_protocol *sp_proto; /* protocol the stack passes asts to */
+};
+
+
+/* Used by the filesystem */
+int ocfs2_cluster_connect(const char *stack_name,
+ const char *group,
+ int grouplen,
+ void (*recovery_handler)(int node_num,
+ void *recovery_data),
+ void *recovery_data,
+ struct ocfs2_cluster_connection **conn);
+int ocfs2_cluster_disconnect(struct ocfs2_cluster_connection *conn,
+ int hangup_pending);
+void ocfs2_cluster_hangup(const char *group, int grouplen);
+int ocfs2_cluster_this_node(unsigned int *node);
+
+struct ocfs2_lock_res;
+int ocfs2_dlm_lock(struct ocfs2_cluster_connection *conn,
+ int mode,
+ union ocfs2_dlm_lksb *lksb,
+ u32 flags,
+ void *name,
+ unsigned int namelen,
+ struct ocfs2_lock_res *astarg);
+int ocfs2_dlm_unlock(struct ocfs2_cluster_connection *conn,
+ union ocfs2_dlm_lksb *lksb,
+ u32 flags,
+ struct ocfs2_lock_res *astarg);
+
+int ocfs2_dlm_lock_status(union ocfs2_dlm_lksb *lksb);
+void *ocfs2_dlm_lvb(union ocfs2_dlm_lksb *lksb);
+void ocfs2_dlm_dump_lksb(union ocfs2_dlm_lksb *lksb);
+
+void ocfs2_stack_glue_set_locking_protocol(struct ocfs2_locking_protocol *proto);
+
+
+/* Used by stack plugins */
+int ocfs2_stack_glue_register(struct ocfs2_stack_plugin *plugin);
+void ocfs2_stack_glue_unregister(struct ocfs2_stack_plugin *plugin);
+#endif /* STACKGLUE_H */
diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c
index 72c198a004d..d2d278fb981 100644
--- a/fs/ocfs2/suballoc.c
+++ b/fs/ocfs2/suballoc.c
@@ -46,6 +46,11 @@
#include "buffer_head_io.h"
+#define NOT_ALLOC_NEW_GROUP 0
+#define ALLOC_NEW_GROUP 1
+
+#define OCFS2_MAX_INODES_TO_STEAL 1024
+
static inline void ocfs2_debug_bg(struct ocfs2_group_desc *bg);
static inline void ocfs2_debug_suballoc_inode(struct ocfs2_dinode *fe);
static inline u16 ocfs2_find_victim_chain(struct ocfs2_chain_list *cl);
@@ -106,7 +111,7 @@ static inline void ocfs2_block_to_cluster_group(struct inode *inode,
u64 *bg_blkno,
u16 *bg_bit_off);
-void ocfs2_free_alloc_context(struct ocfs2_alloc_context *ac)
+static void ocfs2_free_ac_resource(struct ocfs2_alloc_context *ac)
{
struct inode *inode = ac->ac_inode;
@@ -117,9 +122,17 @@ void ocfs2_free_alloc_context(struct ocfs2_alloc_context *ac)
mutex_unlock(&inode->i_mutex);
iput(inode);
+ ac->ac_inode = NULL;
}
- if (ac->ac_bh)
+ if (ac->ac_bh) {
brelse(ac->ac_bh);
+ ac->ac_bh = NULL;
+ }
+}
+
+void ocfs2_free_alloc_context(struct ocfs2_alloc_context *ac)
+{
+ ocfs2_free_ac_resource(ac);
kfree(ac);
}
@@ -391,7 +404,8 @@ bail:
static int ocfs2_reserve_suballoc_bits(struct ocfs2_super *osb,
struct ocfs2_alloc_context *ac,
int type,
- u32 slot)
+ u32 slot,
+ int alloc_new_group)
{
int status;
u32 bits_wanted = ac->ac_bits_wanted;
@@ -420,6 +434,7 @@ static int ocfs2_reserve_suballoc_bits(struct ocfs2_super *osb,
}
ac->ac_inode = alloc_inode;
+ ac->ac_alloc_slot = slot;
fe = (struct ocfs2_dinode *) bh->b_data;
if (!OCFS2_IS_VALID_DINODE(fe)) {
@@ -446,6 +461,14 @@ static int ocfs2_reserve_suballoc_bits(struct ocfs2_super *osb,
goto bail;
}
+ if (alloc_new_group != ALLOC_NEW_GROUP) {
+ mlog(0, "Alloc File %u Full: wanted=%u, free_bits=%u, "
+ "and we don't alloc a new group for it.\n",
+ slot, bits_wanted, free_bits);
+ status = -ENOSPC;
+ goto bail;
+ }
+
status = ocfs2_block_group_alloc(osb, alloc_inode, bh);
if (status < 0) {
if (status != -ENOSPC)
@@ -490,7 +513,8 @@ int ocfs2_reserve_new_metadata(struct ocfs2_super *osb,
(*ac)->ac_group_search = ocfs2_block_group_search;
status = ocfs2_reserve_suballoc_bits(osb, (*ac),
- EXTENT_ALLOC_SYSTEM_INODE, slot);
+ EXTENT_ALLOC_SYSTEM_INODE,
+ slot, ALLOC_NEW_GROUP);
if (status < 0) {
if (status != -ENOSPC)
mlog_errno(status);
@@ -508,10 +532,42 @@ bail:
return status;
}
+/*
+ * Try to reserve inode bits from another slot's inode allocator.
+ *
+ * Scanning starts at the remembered steal slot or, if there is none, at
+ * the slot after ours, and wraps around at max_slots while skipping our
+ * own slot.  No new group is allocated for a foreign slot.  The first
+ * slot that works is remembered as the steal slot.
+ *
+ * Returns the status of the last reservation attempt, or -ENOSPC if no
+ * slot was tried.
+ */
+static int ocfs2_steal_inode_from_other_nodes(struct ocfs2_super *osb,
+					      struct ocfs2_alloc_context *ac)
+{
+	int tried, rc = -ENOSPC;
+	s16 victim = ocfs2_get_inode_steal_slot(osb);
+
+	/* Start to steal inodes from the first slot after ours. */
+	if (victim == OCFS2_INVALID_SLOT)
+		victim = osb->slot_num + 1;
+
+	for (tried = 0; tried < osb->max_slots; tried++, victim++) {
+		if (victim == osb->max_slots)
+			victim = 0;
+
+		if (victim == osb->slot_num)
+			continue;
+
+		rc = ocfs2_reserve_suballoc_bits(osb, ac,
+						 INODE_ALLOC_SYSTEM_INODE,
+						 victim, NOT_ALLOC_NEW_GROUP);
+		if (rc < 0) {
+			/* Release this slot's resources before moving on */
+			ocfs2_free_ac_resource(ac);
+			continue;
+		}
+
+		ocfs2_set_inode_steal_slot(osb, victim);
+		break;
+	}
+
+	return rc;
+}
+
int ocfs2_reserve_new_inode(struct ocfs2_super *osb,
struct ocfs2_alloc_context **ac)
{
int status;
+ s16 slot = ocfs2_get_inode_steal_slot(osb);
*ac = kzalloc(sizeof(struct ocfs2_alloc_context), GFP_KERNEL);
if (!(*ac)) {
@@ -525,9 +581,43 @@ int ocfs2_reserve_new_inode(struct ocfs2_super *osb,
(*ac)->ac_group_search = ocfs2_block_group_search;
+ /*
+ * slot is set when we successfully steal inode from other nodes.
+ * It is reset in 3 places:
+ * 1. when we flush the truncate log
+ * 2. when we complete local alloc recovery.
+ * 3. when we successfully allocate from our own slot.
+ * After it is set, we will go on stealing inodes until we find the
+ * need to check our slots to see whether there is some space for us.
+ */
+ if (slot != OCFS2_INVALID_SLOT &&
+ atomic_read(&osb->s_num_inodes_stolen) < OCFS2_MAX_INODES_TO_STEAL)
+ goto inode_steal;
+
+ atomic_set(&osb->s_num_inodes_stolen, 0);
status = ocfs2_reserve_suballoc_bits(osb, *ac,
INODE_ALLOC_SYSTEM_INODE,
- osb->slot_num);
+ osb->slot_num, ALLOC_NEW_GROUP);
+ if (status >= 0) {
+ status = 0;
+
+ /*
+ * Some inodes must be freed by us, so try to allocate
+ * from our own next time.
+ */
+ if (slot != OCFS2_INVALID_SLOT)
+ ocfs2_init_inode_steal_slot(osb);
+ goto bail;
+ } else if (status < 0 && status != -ENOSPC) {
+ mlog_errno(status);
+ goto bail;
+ }
+
+ ocfs2_free_ac_resource(*ac);
+
+inode_steal:
+ status = ocfs2_steal_inode_from_other_nodes(osb, *ac);
+ atomic_inc(&osb->s_num_inodes_stolen);
if (status < 0) {
if (status != -ENOSPC)
mlog_errno(status);
@@ -557,7 +647,8 @@ int ocfs2_reserve_cluster_bitmap_bits(struct ocfs2_super *osb,
status = ocfs2_reserve_suballoc_bits(osb, ac,
GLOBAL_BITMAP_SYSTEM_INODE,
- OCFS2_INVALID_SLOT);
+ OCFS2_INVALID_SLOT,
+ ALLOC_NEW_GROUP);
if (status < 0 && status != -ENOSPC) {
mlog_errno(status);
goto bail;
diff --git a/fs/ocfs2/suballoc.h b/fs/ocfs2/suballoc.h
index 8799033bb45..544c600662b 100644
--- a/fs/ocfs2/suballoc.h
+++ b/fs/ocfs2/suballoc.h
@@ -36,6 +36,7 @@ typedef int (group_search_t)(struct inode *,
struct ocfs2_alloc_context {
struct inode *ac_inode; /* which bitmap are we allocating from? */
struct buffer_head *ac_bh; /* file entry bh */
+ u32 ac_alloc_slot; /* which slot are we allocating from? */
u32 ac_bits_wanted;
u32 ac_bits_given;
#define OCFS2_AC_USE_LOCAL 1
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index bec75aff3d9..df63ba20ae9 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -40,8 +40,7 @@
#include <linux/crc32.h>
#include <linux/debugfs.h>
#include <linux/mount.h>
-
-#include <cluster/nodemanager.h>
+#include <linux/seq_file.h>
#define MLOG_MASK_PREFIX ML_SUPER
#include <cluster/masklog.h>
@@ -88,6 +87,7 @@ struct mount_options
unsigned int atime_quantum;
signed short slot;
unsigned int localalloc_opt;
+ char cluster_stack[OCFS2_STACK_LABEL_LEN + 1];
};
static int ocfs2_parse_options(struct super_block *sb, char *options,
@@ -109,7 +109,6 @@ static int ocfs2_sync_fs(struct super_block *sb, int wait);
static int ocfs2_init_global_system_inodes(struct ocfs2_super *osb);
static int ocfs2_init_local_system_inodes(struct ocfs2_super *osb);
static void ocfs2_release_system_inodes(struct ocfs2_super *osb);
-static int ocfs2_fill_local_node_info(struct ocfs2_super *osb);
static int ocfs2_check_volume(struct ocfs2_super *osb);
static int ocfs2_verify_volume(struct ocfs2_dinode *di,
struct buffer_head *bh,
@@ -154,6 +153,7 @@ enum {
Opt_commit,
Opt_localalloc,
Opt_localflocks,
+ Opt_stack,
Opt_err,
};
@@ -172,6 +172,7 @@ static match_table_t tokens = {
{Opt_commit, "commit=%u"},
{Opt_localalloc, "localalloc=%d"},
{Opt_localflocks, "localflocks"},
+ {Opt_stack, "cluster_stack=%s"},
{Opt_err, NULL}
};
@@ -551,8 +552,17 @@ static int ocfs2_verify_heartbeat(struct ocfs2_super *osb)
}
}
+ if (ocfs2_userspace_stack(osb)) {
+ if (osb->s_mount_opt & OCFS2_MOUNT_HB_LOCAL) {
+ mlog(ML_ERROR, "Userspace stack expected, but "
+ "o2cb heartbeat arguments passed to mount\n");
+ return -EINVAL;
+ }
+ }
+
if (!(osb->s_mount_opt & OCFS2_MOUNT_HB_LOCAL)) {
- if (!ocfs2_mount_local(osb) && !ocfs2_is_hard_readonly(osb)) {
+ if (!ocfs2_mount_local(osb) && !ocfs2_is_hard_readonly(osb) &&
+ !ocfs2_userspace_stack(osb)) {
mlog(ML_ERROR, "Heartbeat has to be started to mount "
"a read-write clustered device.\n");
return -EINVAL;
@@ -562,6 +572,35 @@ static int ocfs2_verify_heartbeat(struct ocfs2_super *osb)
return 0;
}
+/*
+ * Check the cluster_stack mount option against the filesystem.
+ *
+ * A filesystem using a userspace stack must be mounted with a stack name
+ * matching the one recorded in the osb; any other filesystem must be
+ * mounted without one.  Returns 0 if consistent, -EINVAL otherwise.
+ */
+static int ocfs2_verify_userspace_stack(struct ocfs2_super *osb,
+					struct mount_options *mopt)
+{
+	if (!ocfs2_userspace_stack(osb)) {
+		if (!mopt->cluster_stack[0])
+			return 0;
+
+		mlog(ML_ERROR,
+		     "cluster stack passed to mount, but this filesystem "
+		     "does not support it\n");
+		return -EINVAL;
+	}
+
+	if (strncmp(osb->osb_cluster_stack, mopt->cluster_stack,
+		    OCFS2_STACK_LABEL_LEN) != 0) {
+		mlog(ML_ERROR,
+		     "cluster stack passed to mount (\"%s\") does not "
+		     "match the filesystem (\"%s\")\n",
+		     mopt->cluster_stack,
+		     osb->osb_cluster_stack);
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
{
struct dentry *root;
@@ -579,15 +618,6 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
goto read_super_error;
}
- /* for now we only have one cluster/node, make sure we see it
- * in the heartbeat universe */
- if (parsed_options.mount_opt & OCFS2_MOUNT_HB_LOCAL) {
- if (!o2hb_check_local_node_heartbeating()) {
- status = -EINVAL;
- goto read_super_error;
- }
- }
-
/* probe for superblock */
status = ocfs2_sb_probe(sb, &bh, &sector_size);
if (status < 0) {
@@ -609,6 +639,10 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
osb->osb_commit_interval = parsed_options.commit_interval;
osb->local_alloc_size = parsed_options.localalloc_opt;
+ status = ocfs2_verify_userspace_stack(osb, &parsed_options);
+ if (status)
+ goto read_super_error;
+
sb->s_magic = OCFS2_SUPER_MAGIC;
/* Hard readonly mode only if: bdev_read_only, MS_RDONLY,
@@ -694,7 +728,7 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
if (ocfs2_mount_local(osb))
snprintf(nodestr, sizeof(nodestr), "local");
else
- snprintf(nodestr, sizeof(nodestr), "%d", osb->node_num);
+ snprintf(nodestr, sizeof(nodestr), "%u", osb->node_num);
printk(KERN_INFO "ocfs2: Mounting device (%s) on (node %s, slot %d) "
"with %s data mode.\n",
@@ -763,6 +797,7 @@ static int ocfs2_parse_options(struct super_block *sb,
mopt->atime_quantum = OCFS2_DEFAULT_ATIME_QUANTUM;
mopt->slot = OCFS2_INVALID_SLOT;
mopt->localalloc_opt = OCFS2_DEFAULT_LOCAL_ALLOC_SIZE;
+ mopt->cluster_stack[0] = '\0';
if (!options) {
status = 1;
@@ -864,6 +899,25 @@ static int ocfs2_parse_options(struct super_block *sb,
if (!is_remount)
mopt->mount_opt |= OCFS2_MOUNT_LOCALFLOCKS;
break;
+ case Opt_stack:
+ /* Check both that the option we were passed
+ * is of the right length and that it is a proper
+ * string of the right length.
+ */
+ if (((args[0].to - args[0].from) !=
+ OCFS2_STACK_LABEL_LEN) ||
+ (strnlen(args[0].from,
+ OCFS2_STACK_LABEL_LEN) !=
+ OCFS2_STACK_LABEL_LEN)) {
+ mlog(ML_ERROR,
+ "Invalid cluster_stack option\n");
+ status = 0;
+ goto bail;
+ }
+ memcpy(mopt->cluster_stack, args[0].from,
+ OCFS2_STACK_LABEL_LEN);
+ mopt->cluster_stack[OCFS2_STACK_LABEL_LEN] = '\0';
+ break;
default:
mlog(ML_ERROR,
"Unrecognized mount option \"%s\" "
@@ -922,6 +976,10 @@ static int ocfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
if (opts & OCFS2_MOUNT_LOCALFLOCKS)
seq_printf(s, ",localflocks,");
+ if (osb->osb_cluster_stack[0])
+ seq_printf(s, ",cluster_stack=%.*s", OCFS2_STACK_LABEL_LEN,
+ osb->osb_cluster_stack);
+
return 0;
}
@@ -957,6 +1015,8 @@ static int __init ocfs2_init(void)
mlog(ML_ERROR, "Unable to create ocfs2 debugfs root.\n");
}
+ ocfs2_set_locking_protocol();
+
leave:
if (status < 0) {
ocfs2_free_mem_caches();
@@ -1132,31 +1192,6 @@ static int ocfs2_get_sector(struct super_block *sb,
return 0;
}
-/* ocfs2 1.0 only allows one cluster and node identity per kernel image. */
-static int ocfs2_fill_local_node_info(struct ocfs2_super *osb)
-{
- int status;
-
- /* XXX hold a ref on the node while mounte? easy enough, if
- * desirable. */
- if (ocfs2_mount_local(osb))
- osb->node_num = 0;
- else
- osb->node_num = o2nm_this_node();
-
- if (osb->node_num == O2NM_MAX_NODES) {
- mlog(ML_ERROR, "could not find this host's node number\n");
- status = -ENOENT;
- goto bail;
- }
-
- mlog(0, "I am node %d\n", osb->node_num);
-
- status = 0;
-bail:
- return status;
-}
-
static int ocfs2_mount_volume(struct super_block *sb)
{
int status = 0;
@@ -1168,12 +1203,6 @@ static int ocfs2_mount_volume(struct super_block *sb)
if (ocfs2_is_hard_readonly(osb))
goto leave;
- status = ocfs2_fill_local_node_info(osb);
- if (status < 0) {
- mlog_errno(status);
- goto leave;
- }
-
status = ocfs2_dlm_init(osb);
if (status < 0) {
mlog_errno(status);
@@ -1224,18 +1253,9 @@ leave:
return status;
}
-/* we can't grab the goofy sem lock from inside wait_event, so we use
- * memory barriers to make sure that we'll see the null task before
- * being woken up */
-static int ocfs2_recovery_thread_running(struct ocfs2_super *osb)
-{
- mb();
- return osb->recovery_thread_task != NULL;
-}
-
static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err)
{
- int tmp;
+ int tmp, hangup_needed = 0;
struct ocfs2_super *osb = NULL;
char nodestr[8];
@@ -1249,25 +1269,16 @@ static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err)
ocfs2_truncate_log_shutdown(osb);
- /* disable any new recovery threads and wait for any currently
- * running ones to exit. Do this before setting the vol_state. */
- mutex_lock(&osb->recovery_lock);
- osb->disable_recovery = 1;
- mutex_unlock(&osb->recovery_lock);
- wait_event(osb->recovery_event, !ocfs2_recovery_thread_running(osb));
-
- /* At this point, we know that no more recovery threads can be
- * launched, so wait for any recovery completion work to
- * complete. */
- flush_workqueue(ocfs2_wq);
+ /* This will disable recovery and flush any recovery work. */
+ ocfs2_recovery_exit(osb);
ocfs2_journal_shutdown(osb);
ocfs2_sync_blockdev(sb);
- /* No dlm means we've failed during mount, so skip all the
- * steps which depended on that to complete. */
- if (osb->dlm) {
+ /* No cluster connection means we've failed during mount, so skip
+ * all the steps which depended on that to complete. */
+ if (osb->cconn) {
tmp = ocfs2_super_lock(osb, 1);
if (tmp < 0) {
mlog_errno(tmp);
@@ -1278,25 +1289,34 @@ static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err)
if (osb->slot_num != OCFS2_INVALID_SLOT)
ocfs2_put_slot(osb);
- if (osb->dlm)
+ if (osb->cconn)
ocfs2_super_unlock(osb, 1);
ocfs2_release_system_inodes(osb);
- if (osb->dlm)
- ocfs2_dlm_shutdown(osb);
+ /*
+ * If we're dismounting due to mount error, mount.ocfs2 will clean
+ * up heartbeat. If we're a local mount, there is no heartbeat.
+ * If we failed before we got a uuid_str yet, we can't stop
+ * heartbeat. Otherwise, do it.
+ */
+ if (!mnt_err && !ocfs2_mount_local(osb) && osb->uuid_str)
+ hangup_needed = 1;
+
+ if (osb->cconn)
+ ocfs2_dlm_shutdown(osb, hangup_needed);
debugfs_remove(osb->osb_debug_root);
- if (!mnt_err)
- ocfs2_stop_heartbeat(osb);
+ if (hangup_needed)
+ ocfs2_cluster_hangup(osb->uuid_str, strlen(osb->uuid_str));
atomic_set(&osb->vol_state, VOLUME_DISMOUNTED);
if (ocfs2_mount_local(osb))
snprintf(nodestr, sizeof(nodestr), "local");
else
- snprintf(nodestr, sizeof(nodestr), "%d", osb->node_num);
+ snprintf(nodestr, sizeof(nodestr), "%u", osb->node_num);
printk(KERN_INFO "ocfs2: Unmounting device (%s) on (node %s)\n",
osb->dev_str, nodestr);
@@ -1355,7 +1375,6 @@ static int ocfs2_initialize_super(struct super_block *sb,
sb->s_fs_info = osb;
sb->s_op = &ocfs2_sops;
sb->s_export_op = &ocfs2_export_ops;
- osb->osb_locking_proto = ocfs2_locking_protocol;
sb->s_time_gran = 1;
sb->s_flags |= MS_NOATIME;
/* this is needed to support O_LARGEFILE */
@@ -1368,7 +1387,6 @@ static int ocfs2_initialize_super(struct super_block *sb,
osb->s_sectsize_bits = blksize_bits(sector_size);
BUG_ON(!osb->s_sectsize_bits);
- init_waitqueue_head(&osb->recovery_event);
spin_lock_init(&osb->dc_task_lock);
init_waitqueue_head(&osb->dc_event);
osb->dc_work_sequence = 0;
@@ -1376,6 +1394,7 @@ static int ocfs2_initialize_super(struct super_block *sb,
INIT_LIST_HEAD(&osb->blocked_lock_list);
osb->blocked_lock_count = 0;
spin_lock_init(&osb->osb_lock);
+ ocfs2_init_inode_steal_slot(osb);
atomic_set(&osb->alloc_stats.moves, 0);
atomic_set(&osb->alloc_stats.local_data, 0);
@@ -1388,24 +1407,23 @@ static int ocfs2_initialize_super(struct super_block *sb,
snprintf(osb->dev_str, sizeof(osb->dev_str), "%u,%u",
MAJOR(osb->sb->s_dev), MINOR(osb->sb->s_dev));
- mutex_init(&osb->recovery_lock);
-
- osb->disable_recovery = 0;
- osb->recovery_thread_task = NULL;
+ status = ocfs2_recovery_init(osb);
+ if (status) {
+ mlog(ML_ERROR, "Unable to initialize recovery state\n");
+ mlog_errno(status);
+ goto bail;
+ }
init_waitqueue_head(&osb->checkpoint_event);
atomic_set(&osb->needs_checkpoint, 0);
osb->s_atime_quantum = OCFS2_DEFAULT_ATIME_QUANTUM;
- osb->node_num = O2NM_INVALID_NODE_NUM;
osb->slot_num = OCFS2_INVALID_SLOT;
osb->local_alloc_state = OCFS2_LA_UNUSED;
osb->local_alloc_bh = NULL;
- ocfs2_setup_hb_callbacks(osb);
-
init_waitqueue_head(&osb->osb_mount_event);
osb->vol_label = kmalloc(OCFS2_MAX_VOL_LABEL_LEN, GFP_KERNEL);
@@ -1455,6 +1473,25 @@ static int ocfs2_initialize_super(struct super_block *sb,
goto bail;
}
+ if (ocfs2_userspace_stack(osb)) {
+ memcpy(osb->osb_cluster_stack,
+ OCFS2_RAW_SB(di)->s_cluster_info.ci_stack,
+ OCFS2_STACK_LABEL_LEN);
+ osb->osb_cluster_stack[OCFS2_STACK_LABEL_LEN] = '\0';
+ if (strlen(osb->osb_cluster_stack) != OCFS2_STACK_LABEL_LEN) {
+ mlog(ML_ERROR,
+ "couldn't mount because of an invalid "
+ "cluster stack label (%s) \n",
+ osb->osb_cluster_stack);
+ status = -EINVAL;
+ goto bail;
+ }
+ } else {
+ /* The empty string is identical with classic tools that
+ * don't know about s_cluster_info. */
+ osb->osb_cluster_stack[0] = '\0';
+ }
+
get_random_bytes(&osb->s_next_generation, sizeof(u32));
/* FIXME
@@ -1724,8 +1761,7 @@ static void ocfs2_delete_osb(struct ocfs2_super *osb)
/* This function assumes that the caller has the main osb resource */
- if (osb->slot_info)
- ocfs2_free_slot_info(osb->slot_info);
+ ocfs2_free_slot_info(osb);
kfree(osb->osb_orphan_wipes);
/* FIXME
diff --git a/fs/open.c b/fs/open.c
index 3fa4e4ffce4..b70e7666bb2 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -244,21 +244,21 @@ static long do_sys_truncate(const char __user * path, loff_t length)
if (!S_ISREG(inode->i_mode))
goto dput_and_out;
- error = vfs_permission(&nd, MAY_WRITE);
+ error = mnt_want_write(nd.path.mnt);
if (error)
goto dput_and_out;
- error = -EROFS;
- if (IS_RDONLY(inode))
- goto dput_and_out;
+ error = vfs_permission(&nd, MAY_WRITE);
+ if (error)
+ goto mnt_drop_write_and_out;
error = -EPERM;
if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
- goto dput_and_out;
+ goto mnt_drop_write_and_out;
error = get_write_access(inode);
if (error)
- goto dput_and_out;
+ goto mnt_drop_write_and_out;
/*
* Make sure that there are no leases. get_write_access() protects
@@ -276,6 +276,8 @@ static long do_sys_truncate(const char __user * path, loff_t length)
put_write_and_out:
put_write_access(inode);
+mnt_drop_write_and_out:
+ mnt_drop_write(nd.path.mnt);
dput_and_out:
path_put(&nd.path);
out:
@@ -457,8 +459,17 @@ asmlinkage long sys_faccessat(int dfd, const char __user *filename, int mode)
if(res || !(mode & S_IWOTH) ||
special_file(nd.path.dentry->d_inode->i_mode))
goto out_path_release;
-
- if(IS_RDONLY(nd.path.dentry->d_inode))
+ /*
+ * This is a rare case where using __mnt_is_readonly()
+ * is OK without a mnt_want/drop_write() pair. Since
+ * no actual write to the fs is performed here, we do
+ * not need to telegraph that to anyone.
+ *
+ * By doing this, we accept that this access is
+ * inherently racy and know that the fs may change
+ * state before we even see this result.
+ */
+ if (__mnt_is_readonly(nd.path.mnt))
res = -EROFS;
out_path_release:
@@ -567,12 +578,12 @@ asmlinkage long sys_fchmod(unsigned int fd, mode_t mode)
audit_inode(NULL, dentry);
- err = -EROFS;
- if (IS_RDONLY(inode))
+ err = mnt_want_write(file->f_path.mnt);
+ if (err)
goto out_putf;
err = -EPERM;
if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
- goto out_putf;
+ goto out_drop_write;
mutex_lock(&inode->i_mutex);
if (mode == (mode_t) -1)
mode = inode->i_mode;
@@ -581,6 +592,8 @@ asmlinkage long sys_fchmod(unsigned int fd, mode_t mode)
err = notify_change(dentry, &newattrs);
mutex_unlock(&inode->i_mutex);
+out_drop_write:
+ mnt_drop_write(file->f_path.mnt);
out_putf:
fput(file);
out:
@@ -600,13 +613,13 @@ asmlinkage long sys_fchmodat(int dfd, const char __user *filename,
goto out;
inode = nd.path.dentry->d_inode;
- error = -EROFS;
- if (IS_RDONLY(inode))
+ error = mnt_want_write(nd.path.mnt);
+ if (error)
goto dput_and_out;
error = -EPERM;
if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
- goto dput_and_out;
+ goto out_drop_write;
mutex_lock(&inode->i_mutex);
if (mode == (mode_t) -1)
@@ -616,6 +629,8 @@ asmlinkage long sys_fchmodat(int dfd, const char __user *filename,
error = notify_change(nd.path.dentry, &newattrs);
mutex_unlock(&inode->i_mutex);
+out_drop_write:
+ mnt_drop_write(nd.path.mnt);
dput_and_out:
path_put(&nd.path);
out:
@@ -638,9 +653,6 @@ static int chown_common(struct dentry * dentry, uid_t user, gid_t group)
printk(KERN_ERR "chown_common: NULL inode\n");
goto out;
}
- error = -EROFS;
- if (IS_RDONLY(inode))
- goto out;
error = -EPERM;
if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
goto out;
@@ -671,7 +683,12 @@ asmlinkage long sys_chown(const char __user * filename, uid_t user, gid_t group)
error = user_path_walk(filename, &nd);
if (error)
goto out;
+ error = mnt_want_write(nd.path.mnt);
+ if (error)
+ goto out_release;
error = chown_common(nd.path.dentry, user, group);
+ mnt_drop_write(nd.path.mnt);
+out_release:
path_put(&nd.path);
out:
return error;
@@ -691,7 +708,12 @@ asmlinkage long sys_fchownat(int dfd, const char __user *filename, uid_t user,
error = __user_walk_fd(dfd, filename, follow, &nd);
if (error)
goto out;
+ error = mnt_want_write(nd.path.mnt);
+ if (error)
+ goto out_release;
error = chown_common(nd.path.dentry, user, group);
+ mnt_drop_write(nd.path.mnt);
+out_release:
path_put(&nd.path);
out:
return error;
@@ -705,7 +727,12 @@ asmlinkage long sys_lchown(const char __user * filename, uid_t user, gid_t group
error = user_path_walk_link(filename, &nd);
if (error)
goto out;
+ error = mnt_want_write(nd.path.mnt);
+ if (error)
+ goto out_release;
error = chown_common(nd.path.dentry, user, group);
+ mnt_drop_write(nd.path.mnt);
+out_release:
path_put(&nd.path);
out:
return error;
@@ -722,14 +749,48 @@ asmlinkage long sys_fchown(unsigned int fd, uid_t user, gid_t group)
if (!file)
goto out;
+ error = mnt_want_write(file->f_path.mnt);
+ if (error)
+ goto out_fput;
dentry = file->f_path.dentry;
audit_inode(NULL, dentry);
error = chown_common(dentry, user, group);
+ mnt_drop_write(file->f_path.mnt);
+out_fput:
fput(file);
out:
return error;
}
+/*
+ * You have to be very careful that these write
+ * counts get cleaned up in error cases and
+ * upon __fput(). This should probably never
+ * be called outside of __dentry_open().
+ */
+static inline int __get_file_write_access(struct inode *inode,
+ struct vfsmount *mnt)
+{
+ int error;
+ error = get_write_access(inode);
+ if (error)
+ return error;
+ /*
+ * Do not take mount writer counts on
+ * special files since no writes to
+ * the mount itself will occur.
+ */
+ if (!special_file(inode->i_mode)) {
+ /*
+ * Balanced in __fput()
+ */
+ error = mnt_want_write(mnt);
+ if (error)
+ put_write_access(inode);
+ }
+ return error;
+}
+
static struct file *__dentry_open(struct dentry *dentry, struct vfsmount *mnt,
int flags, struct file *f,
int (*open)(struct inode *, struct file *))
@@ -742,9 +803,11 @@ static struct file *__dentry_open(struct dentry *dentry, struct vfsmount *mnt,
FMODE_PREAD | FMODE_PWRITE;
inode = dentry->d_inode;
if (f->f_mode & FMODE_WRITE) {
- error = get_write_access(inode);
+ error = __get_file_write_access(inode, mnt);
if (error)
goto cleanup_file;
+ if (!special_file(inode->i_mode))
+ file_take_write(f);
}
f->f_mapping = inode->i_mapping;
@@ -784,8 +847,19 @@ static struct file *__dentry_open(struct dentry *dentry, struct vfsmount *mnt,
cleanup_all:
fops_put(f->f_op);
- if (f->f_mode & FMODE_WRITE)
+ if (f->f_mode & FMODE_WRITE) {
put_write_access(inode);
+ if (!special_file(inode->i_mode)) {
+ /*
+ * We don't consider this a real
+ * mnt_want/drop_write() pair
+ * because it all happened right
+ * here, so just reset the state.
+ */
+ file_reset_write(f);
+ mnt_drop_write(mnt);
+ }
+ }
file_kill(f);
f->f_path.dentry = NULL;
f->f_path.mnt = NULL;
@@ -796,43 +870,6 @@ cleanup_file:
return ERR_PTR(error);
}
-/*
- * Note that while the flag value (low two bits) for sys_open means:
- * 00 - read-only
- * 01 - write-only
- * 10 - read-write
- * 11 - special
- * it is changed into
- * 00 - no permissions needed
- * 01 - read-permission
- * 10 - write-permission
- * 11 - read-write
- * for the internal routines (ie open_namei()/follow_link() etc). 00 is
- * used by symlinks.
- */
-static struct file *do_filp_open(int dfd, const char *filename, int flags,
- int mode)
-{
- int namei_flags, error;
- struct nameidata nd;
-
- namei_flags = flags;
- if ((namei_flags+1) & O_ACCMODE)
- namei_flags++;
-
- error = open_namei(dfd, filename, namei_flags, mode, &nd);
- if (!error)
- return nameidata_to_filp(&nd, flags);
-
- return ERR_PTR(error);
-}
-
-struct file *filp_open(const char *filename, int flags, int mode)
-{
- return do_filp_open(AT_FDCWD, filename, flags, mode);
-}
-EXPORT_SYMBOL(filp_open);
-
/**
* lookup_instantiate_filp - instantiates the open intent filp
* @nd: pointer to nameidata
diff --git a/fs/partitions/check.c b/fs/partitions/check.c
index 03f808c5b79..6149e4b58c8 100644
--- a/fs/partitions/check.c
+++ b/fs/partitions/check.c
@@ -473,6 +473,10 @@ int rescan_partitions(struct gendisk *disk, struct block_device *bdev)
return 0;
if (IS_ERR(state)) /* I/O error reading the partition table */
return -EIO;
+
+ /* tell userspace that the media / partition table may have changed */
+ kobject_uevent(&disk->dev.kobj, KOBJ_CHANGE);
+
for (p = 1; p < state->limit; p++) {
sector_t size = state->parts[p].size;
sector_t from = state->parts[p].from;
diff --git a/fs/pipe.c b/fs/pipe.c
index 8be381bbcb5..f73492b6817 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -988,7 +988,10 @@ struct file *create_write_pipe(void)
return f;
err_dentry:
+ free_pipe_info(inode);
dput(dentry);
+ return ERR_PTR(err);
+
err_inode:
free_pipe_info(inode);
iput(inode);
diff --git a/fs/pnode.c b/fs/pnode.c
index 1d8f5447f3f..8d5f392ec3d 100644
--- a/fs/pnode.c
+++ b/fs/pnode.c
@@ -9,6 +9,7 @@
#include <linux/mnt_namespace.h>
#include <linux/mount.h>
#include <linux/fs.h>
+#include "internal.h"
#include "pnode.h"
/* return the next shared peer mount of @p */
@@ -27,6 +28,57 @@ static inline struct vfsmount *next_slave(struct vfsmount *p)
return list_entry(p->mnt_slave.next, struct vfsmount, mnt_slave);
}
+/*
+ * Return true if path is reachable from root
+ *
+ * namespace_sem is held, and mnt is attached
+ */
+static bool is_path_reachable(struct vfsmount *mnt, struct dentry *dentry,
+ const struct path *root)
+{
+ while (mnt != root->mnt && mnt->mnt_parent != mnt) {
+ dentry = mnt->mnt_mountpoint;
+ mnt = mnt->mnt_parent;
+ }
+ return mnt == root->mnt && is_subdir(dentry, root->dentry);
+}
+
+static struct vfsmount *get_peer_under_root(struct vfsmount *mnt,
+ struct mnt_namespace *ns,
+ const struct path *root)
+{
+ struct vfsmount *m = mnt;
+
+ do {
+ /* Check the namespace first for optimization */
+ if (m->mnt_ns == ns && is_path_reachable(m, m->mnt_root, root))
+ return m;
+
+ m = next_peer(m);
+ } while (m != mnt);
+
+ return NULL;
+}
+
+/*
+ * Get ID of closest dominating peer group having a representative
+ * under the given root.
+ *
+ * Caller must hold namespace_sem
+ */
+int get_dominating_id(struct vfsmount *mnt, const struct path *root)
+{
+ struct vfsmount *m;
+
+ for (m = mnt->mnt_master; m != NULL; m = m->mnt_master) {
+ struct vfsmount *d = get_peer_under_root(m, mnt->mnt_ns, root);
+ if (d)
+ return d->mnt_group_id;
+ }
+
+ return 0;
+}
+
static int do_make_slave(struct vfsmount *mnt)
{
struct vfsmount *peer_mnt = mnt, *master = mnt->mnt_master;
@@ -45,7 +97,11 @@ static int do_make_slave(struct vfsmount *mnt)
if (peer_mnt == mnt)
peer_mnt = NULL;
}
+ if (IS_MNT_SHARED(mnt) && list_empty(&mnt->mnt_share))
+ mnt_release_group_id(mnt);
+
list_del_init(&mnt->mnt_share);
+ mnt->mnt_group_id = 0;
if (peer_mnt)
master = peer_mnt;
@@ -67,7 +123,6 @@ static int do_make_slave(struct vfsmount *mnt)
}
mnt->mnt_master = master;
CLEAR_MNT_SHARED(mnt);
- INIT_LIST_HEAD(&mnt->mnt_slave_list);
return 0;
}
@@ -211,8 +266,7 @@ int propagate_mnt(struct vfsmount *dest_mnt, struct dentry *dest_dentry,
out:
spin_lock(&vfsmount_lock);
while (!list_empty(&tmp_list)) {
- child = list_entry(tmp_list.next, struct vfsmount, mnt_hash);
- list_del_init(&child->mnt_hash);
+ child = list_first_entry(&tmp_list, struct vfsmount, mnt_hash);
umount_tree(child, 0, &umount_list);
}
spin_unlock(&vfsmount_lock);
diff --git a/fs/pnode.h b/fs/pnode.h
index f249be2fee7..958665d662a 100644
--- a/fs/pnode.h
+++ b/fs/pnode.h
@@ -35,4 +35,6 @@ int propagate_mnt(struct vfsmount *, struct dentry *, struct vfsmount *,
struct list_head *);
int propagate_umount(struct list_head *);
int propagate_mount_busy(struct vfsmount *, int);
+void mnt_release_group_id(struct vfsmount *);
+int get_dominating_id(struct vfsmount *mnt, const struct path *root);
#endif /* _LINUX_PNODE_H */
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 81d7d145292..c5e412a00b1 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -502,17 +502,14 @@ static const struct inode_operations proc_def_inode_operations = {
.setattr = proc_setattr,
};
-extern const struct seq_operations mounts_op;
-struct proc_mounts {
- struct seq_file m;
- int event;
-};
-
-static int mounts_open(struct inode *inode, struct file *file)
+static int mounts_open_common(struct inode *inode, struct file *file,
+ const struct seq_operations *op)
{
struct task_struct *task = get_proc_task(inode);
struct nsproxy *nsp;
struct mnt_namespace *ns = NULL;
+ struct fs_struct *fs = NULL;
+ struct path root;
struct proc_mounts *p;
int ret = -EINVAL;
@@ -525,40 +522,61 @@ static int mounts_open(struct inode *inode, struct file *file)
get_mnt_ns(ns);
}
rcu_read_unlock();
-
+ if (ns)
+ fs = get_fs_struct(task);
put_task_struct(task);
}
- if (ns) {
- ret = -ENOMEM;
- p = kmalloc(sizeof(struct proc_mounts), GFP_KERNEL);
- if (p) {
- file->private_data = &p->m;
- ret = seq_open(file, &mounts_op);
- if (!ret) {
- p->m.private = ns;
- p->event = ns->event;
- return 0;
- }
- kfree(p);
- }
- put_mnt_ns(ns);
- }
+ if (!ns)
+ goto err;
+ if (!fs)
+ goto err_put_ns;
+
+ read_lock(&fs->lock);
+ root = fs->root;
+ path_get(&root);
+ read_unlock(&fs->lock);
+ put_fs_struct(fs);
+
+ ret = -ENOMEM;
+ p = kmalloc(sizeof(struct proc_mounts), GFP_KERNEL);
+ if (!p)
+ goto err_put_path;
+
+ file->private_data = &p->m;
+ ret = seq_open(file, op);
+ if (ret)
+ goto err_free;
+
+ p->m.private = p;
+ p->ns = ns;
+ p->root = root;
+ p->event = ns->event;
+
+ return 0;
+
+ err_free:
+ kfree(p);
+ err_put_path:
+ path_put(&root);
+ err_put_ns:
+ put_mnt_ns(ns);
+ err:
return ret;
}
static int mounts_release(struct inode *inode, struct file *file)
{
- struct seq_file *m = file->private_data;
- struct mnt_namespace *ns = m->private;
- put_mnt_ns(ns);
+ struct proc_mounts *p = file->private_data;
+ path_put(&p->root);
+ put_mnt_ns(p->ns);
return seq_release(inode, file);
}
static unsigned mounts_poll(struct file *file, poll_table *wait)
{
struct proc_mounts *p = file->private_data;
- struct mnt_namespace *ns = p->m.private;
+ struct mnt_namespace *ns = p->ns;
unsigned res = 0;
poll_wait(file, &ns->poll, wait);
@@ -573,6 +591,11 @@ static unsigned mounts_poll(struct file *file, poll_table *wait)
return res;
}
+static int mounts_open(struct inode *inode, struct file *file)
+{
+ return mounts_open_common(inode, file, &mounts_op);
+}
+
static const struct file_operations proc_mounts_operations = {
.open = mounts_open,
.read = seq_read,
@@ -581,38 +604,22 @@ static const struct file_operations proc_mounts_operations = {
.poll = mounts_poll,
};
-extern const struct seq_operations mountstats_op;
-static int mountstats_open(struct inode *inode, struct file *file)
+static int mountinfo_open(struct inode *inode, struct file *file)
{
- int ret = seq_open(file, &mountstats_op);
-
- if (!ret) {
- struct seq_file *m = file->private_data;
- struct nsproxy *nsp;
- struct mnt_namespace *mnt_ns = NULL;
- struct task_struct *task = get_proc_task(inode);
-
- if (task) {
- rcu_read_lock();
- nsp = task_nsproxy(task);
- if (nsp) {
- mnt_ns = nsp->mnt_ns;
- if (mnt_ns)
- get_mnt_ns(mnt_ns);
- }
- rcu_read_unlock();
+ return mounts_open_common(inode, file, &mountinfo_op);
+}
- put_task_struct(task);
- }
+static const struct file_operations proc_mountinfo_operations = {
+ .open = mountinfo_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = mounts_release,
+ .poll = mounts_poll,
+};
- if (mnt_ns)
- m->private = mnt_ns;
- else {
- seq_release(inode, file);
- ret = -EINVAL;
- }
- }
- return ret;
+static int mountstats_open(struct inode *inode, struct file *file)
+{
+ return mounts_open_common(inode, file, &mountstats_op);
}
static const struct file_operations proc_mountstats_operations = {
@@ -1626,7 +1633,6 @@ static int proc_readfd_common(struct file * filp, void * dirent,
unsigned int fd, ino;
int retval;
struct files_struct * files;
- struct fdtable *fdt;
retval = -ENOENT;
if (!p)
@@ -1649,9 +1655,8 @@ static int proc_readfd_common(struct file * filp, void * dirent,
if (!files)
goto out;
rcu_read_lock();
- fdt = files_fdtable(files);
for (fd = filp->f_pos-2;
- fd < fdt->max_fds;
+ fd < files_fdtable(files)->max_fds;
fd++, filp->f_pos++) {
char name[PROC_NUMBUF];
int len;
@@ -2311,6 +2316,7 @@ static const struct pid_entry tgid_base_stuff[] = {
LNK("root", root),
LNK("exe", exe),
REG("mounts", S_IRUGO, mounts),
+ REG("mountinfo", S_IRUGO, mountinfo),
REG("mountstats", S_IRUSR, mountstats),
#ifdef CONFIG_PROC_PAGE_MONITOR
REG("clear_refs", S_IWUSR, clear_refs),
@@ -2643,6 +2649,7 @@ static const struct pid_entry tid_base_stuff[] = {
LNK("root", root),
LNK("exe", exe),
REG("mounts", S_IRUGO, mounts),
+ REG("mountinfo", S_IRUGO, mountinfo),
#ifdef CONFIG_PROC_PAGE_MONITOR
REG("clear_refs", S_IWUSR, clear_refs),
REG("smaps", S_IRUGO, smaps),
diff --git a/fs/proc/proc_net.c b/fs/proc/proc_net.c
index 4caa5f774fb..13cd7835d0d 100644
--- a/fs/proc/proc_net.c
+++ b/fs/proc/proc_net.c
@@ -44,7 +44,9 @@ int seq_open_net(struct inode *ino, struct file *f,
put_net(net);
return -ENOMEM;
}
+#ifdef CONFIG_NET_NS
p->net = net;
+#endif
return 0;
}
EXPORT_SYMBOL_GPL(seq_open_net);
@@ -52,12 +54,10 @@ EXPORT_SYMBOL_GPL(seq_open_net);
int seq_release_net(struct inode *ino, struct file *f)
{
struct seq_file *seq;
- struct seq_net_private *p;
seq = f->private_data;
- p = seq->private;
- put_net(p->net);
+ put_net(seq_file_net(seq));
seq_release_private(ino, f);
return 0;
}
diff --git a/fs/read_write.c b/fs/read_write.c
index 49a98718ecd..f0d1240a5c6 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -33,7 +33,7 @@ EXPORT_SYMBOL(generic_ro_fops);
loff_t generic_file_llseek(struct file *file, loff_t offset, int origin)
{
- long long retval;
+ loff_t retval;
struct inode *inode = file->f_mapping->host;
mutex_lock(&inode->i_mutex);
@@ -60,7 +60,7 @@ EXPORT_SYMBOL(generic_file_llseek);
loff_t remote_llseek(struct file *file, loff_t offset, int origin)
{
- long long retval;
+ loff_t retval;
lock_kernel();
switch (origin) {
@@ -91,7 +91,7 @@ EXPORT_SYMBOL(no_llseek);
loff_t default_llseek(struct file *file, loff_t offset, int origin)
{
- long long retval;
+ loff_t retval;
lock_kernel();
switch (origin) {
diff --git a/fs/reiserfs/ioctl.c b/fs/reiserfs/ioctl.c
index e0f0f098a52..74363a7aacb 100644
--- a/fs/reiserfs/ioctl.c
+++ b/fs/reiserfs/ioctl.c
@@ -4,6 +4,7 @@
#include <linux/capability.h>
#include <linux/fs.h>
+#include <linux/mount.h>
#include <linux/reiserfs_fs.h>
#include <linux/time.h>
#include <asm/uaccess.h>
@@ -25,6 +26,7 @@ int reiserfs_ioctl(struct inode *inode, struct file *filp, unsigned int cmd,
unsigned long arg)
{
unsigned int flags;
+ int err = 0;
switch (cmd) {
case REISERFS_IOC_UNPACK:
@@ -48,50 +50,67 @@ int reiserfs_ioctl(struct inode *inode, struct file *filp, unsigned int cmd,
if (!reiserfs_attrs(inode->i_sb))
return -ENOTTY;
- if (IS_RDONLY(inode))
- return -EROFS;
+ err = mnt_want_write(filp->f_path.mnt);
+ if (err)
+ return err;
- if (!is_owner_or_cap(inode))
- return -EPERM;
-
- if (get_user(flags, (int __user *)arg))
- return -EFAULT;
-
- /* Is it quota file? Do not allow user to mess with it. */
- if (IS_NOQUOTA(inode))
- return -EPERM;
+ if (!is_owner_or_cap(inode)) {
+ err = -EPERM;
+ goto setflags_out;
+ }
+ if (get_user(flags, (int __user *)arg)) {
+ err = -EFAULT;
+ goto setflags_out;
+ }
+ /*
+ * Is it quota file? Do not allow user to mess with it
+ */
+ if (IS_NOQUOTA(inode)) {
+ err = -EPERM;
+ goto setflags_out;
+ }
if (((flags ^ REISERFS_I(inode)->
i_attrs) & (REISERFS_IMMUTABLE_FL |
REISERFS_APPEND_FL))
- && !capable(CAP_LINUX_IMMUTABLE))
- return -EPERM;
-
+ && !capable(CAP_LINUX_IMMUTABLE)) {
+ err = -EPERM;
+ goto setflags_out;
+ }
if ((flags & REISERFS_NOTAIL_FL) &&
S_ISREG(inode->i_mode)) {
int result;
result = reiserfs_unpack(inode, filp);
- if (result)
- return result;
+ if (result) {
+ err = result;
+ goto setflags_out;
+ }
}
sd_attrs_to_i_attrs(flags, inode);
REISERFS_I(inode)->i_attrs = flags;
inode->i_ctime = CURRENT_TIME_SEC;
mark_inode_dirty(inode);
- return 0;
+setflags_out:
+ mnt_drop_write(filp->f_path.mnt);
+ return err;
}
case REISERFS_IOC_GETVERSION:
return put_user(inode->i_generation, (int __user *)arg);
case REISERFS_IOC_SETVERSION:
if (!is_owner_or_cap(inode))
return -EPERM;
- if (IS_RDONLY(inode))
- return -EROFS;
- if (get_user(inode->i_generation, (int __user *)arg))
- return -EFAULT;
+ err = mnt_want_write(filp->f_path.mnt);
+ if (err)
+ return err;
+ if (get_user(inode->i_generation, (int __user *)arg)) {
+ err = -EFAULT;
+ goto setversion_out;
+ }
inode->i_ctime = CURRENT_TIME_SEC;
mark_inode_dirty(inode);
- return 0;
+setversion_out:
+ mnt_drop_write(filp->f_path.mnt);
+ return err;
default:
return -ENOTTY;
}
diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c
index bb05a3e51b9..060eb3f598e 100644
--- a/fs/reiserfs/journal.c
+++ b/fs/reiserfs/journal.c
@@ -38,7 +38,7 @@
#include <asm/system.h>
#include <linux/time.h>
-#include <asm/semaphore.h>
+#include <linux/semaphore.h>
#include <linux/vmalloc.h>
#include <linux/reiserfs_fs.h>
diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c
index 344b9b96cc5..d7c4935c103 100644
--- a/fs/reiserfs/xattr.c
+++ b/fs/reiserfs/xattr.c
@@ -44,7 +44,6 @@
#include <net/checksum.h>
#include <linux/smp_lock.h>
#include <linux/stat.h>
-#include <asm/semaphore.h>
#define FL_READONLY 128
#define FL_DIR_SEM_HELD 256
diff --git a/fs/select.c b/fs/select.c
index 5633fe98078..00f58c5c7e0 100644
--- a/fs/select.c
+++ b/fs/select.c
@@ -260,7 +260,7 @@ int do_select(int n, fd_set_bits *fds, s64 *timeout)
wait = NULL;
if (retval || !*timeout || signal_pending(current))
break;
- if(table.error) {
+ if (table.error) {
retval = table.error;
break;
}
diff --git a/fs/seq_file.c b/fs/seq_file.c
index 853770274f2..3f54dbd6c49 100644
--- a/fs/seq_file.c
+++ b/fs/seq_file.c
@@ -25,6 +25,7 @@
* into the buffer. In case of error ->start() and ->next() return
* ERR_PTR(error). In the end of sequence they return %NULL. ->show()
* returns 0 in case of success and negative number in case of error.
+ * Returning SEQ_SKIP means "discard this element and move on".
*/
int seq_open(struct file *file, const struct seq_operations *op)
{
@@ -114,8 +115,10 @@ ssize_t seq_read(struct file *file, char __user *buf, size_t size, loff_t *ppos)
if (!p || IS_ERR(p))
break;
err = m->op->show(m, p);
- if (err)
+ if (err < 0)
break;
+ if (unlikely(err))
+ m->count = 0;
if (m->count < m->size)
goto Fill;
m->op->stop(m, p);
@@ -140,9 +143,10 @@ Fill:
break;
}
err = m->op->show(m, p);
- if (err || m->count == m->size) {
+ if (m->count == m->size || err) {
m->count = offs;
- break;
+ if (likely(err <= 0))
+ break;
}
pos = next;
}
@@ -199,8 +203,12 @@ static int traverse(struct seq_file *m, loff_t offset)
if (IS_ERR(p))
break;
error = m->op->show(m, p);
- if (error)
+ if (error < 0)
break;
+ if (unlikely(error)) {
+ error = 0;
+ m->count = 0;
+ }
if (m->count == m->size)
goto Eoverflow;
if (pos + m->count > offset) {
@@ -239,7 +247,7 @@ Eoverflow:
loff_t seq_lseek(struct file *file, loff_t offset, int origin)
{
struct seq_file *m = (struct seq_file *)file->private_data;
- long long retval = -EINVAL;
+ loff_t retval = -EINVAL;
mutex_lock(&m->lock);
m->version = file->f_version;
@@ -342,28 +350,40 @@ int seq_printf(struct seq_file *m, const char *f, ...)
}
EXPORT_SYMBOL(seq_printf);
+static char *mangle_path(char *s, char *p, char *esc)
+{
+ while (s <= p) {
+ char c = *p++;
+ if (!c) {
+ return s;
+ } else if (!strchr(esc, c)) {
+ *s++ = c;
+ } else if (s + 4 > p) {
+ break;
+ } else {
+ *s++ = '\\';
+ *s++ = '0' + ((c & 0300) >> 6);
+ *s++ = '0' + ((c & 070) >> 3);
+ *s++ = '0' + (c & 07);
+ }
+ }
+ return NULL;
+}
+
+/*
+ * return the absolute path of 'path' (its dentry within its mount).
+ */
int seq_path(struct seq_file *m, struct path *path, char *esc)
{
if (m->count < m->size) {
char *s = m->buf + m->count;
char *p = d_path(path, s, m->size - m->count);
if (!IS_ERR(p)) {
- while (s <= p) {
- char c = *p++;
- if (!c) {
- p = m->buf + m->count;
- m->count = s - m->buf;
- return s - p;
- } else if (!strchr(esc, c)) {
- *s++ = c;
- } else if (s + 4 > p) {
- break;
- } else {
- *s++ = '\\';
- *s++ = '0' + ((c & 0300) >> 6);
- *s++ = '0' + ((c & 070) >> 3);
- *s++ = '0' + (c & 07);
- }
+ s = mangle_path(s, p, esc);
+ if (s) {
+ p = m->buf + m->count;
+ m->count = s - m->buf;
+ return s - p;
}
}
}
@@ -372,6 +392,57 @@ int seq_path(struct seq_file *m, struct path *path, char *esc)
}
EXPORT_SYMBOL(seq_path);
+/*
+ * Same as seq_path, but relative to supplied root.
+ *
+ * root may be changed, see __d_path().
+ */
+int seq_path_root(struct seq_file *m, struct path *path, struct path *root,
+ char *esc)
+{
+ int err = -ENAMETOOLONG;
+ if (m->count < m->size) {
+ char *s = m->buf + m->count;
+ char *p;
+
+ spin_lock(&dcache_lock);
+ p = __d_path(path, root, s, m->size - m->count);
+ spin_unlock(&dcache_lock);
+ err = PTR_ERR(p);
+ if (!IS_ERR(p)) {
+ s = mangle_path(s, p, esc);
+ if (s) {
+ p = m->buf + m->count;
+ m->count = s - m->buf;
+ return 0;
+ }
+ }
+ }
+ m->count = m->size;
+ return err;
+}
+
+/*
+ * returns the path of the 'dentry' from the root of its filesystem.
+ */
+int seq_dentry(struct seq_file *m, struct dentry *dentry, char *esc)
+{
+ if (m->count < m->size) {
+ char *s = m->buf + m->count;
+ char *p = dentry_path(dentry, s, m->size - m->count);
+ if (!IS_ERR(p)) {
+ s = mangle_path(s, p, esc);
+ if (s) {
+ p = m->buf + m->count;
+ m->count = s - m->buf;
+ return s - p;
+ }
+ }
+ }
+ m->count = m->size;
+ return -1;
+}
+
static void *single_start(struct seq_file *p, loff_t *pos)
{
return NULL + (*pos == 0);
diff --git a/fs/super.c b/fs/super.c
index 09008dbd264..4798350b2bc 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -37,7 +37,9 @@
#include <linux/idr.h>
#include <linux/kobject.h>
#include <linux/mutex.h>
+#include <linux/file.h>
#include <asm/uaccess.h>
+#include "internal.h"
LIST_HEAD(super_blocks);
@@ -567,10 +569,29 @@ static void mark_files_ro(struct super_block *sb)
{
struct file *f;
+retry:
file_list_lock();
list_for_each_entry(f, &sb->s_files, f_u.fu_list) {
- if (S_ISREG(f->f_path.dentry->d_inode->i_mode) && file_count(f))
- f->f_mode &= ~FMODE_WRITE;
+ struct vfsmount *mnt;
+ if (!S_ISREG(f->f_path.dentry->d_inode->i_mode))
+ continue;
+ if (!file_count(f))
+ continue;
+ if (!(f->f_mode & FMODE_WRITE))
+ continue;
+ f->f_mode &= ~FMODE_WRITE;
+ if (file_check_writeable(f) != 0)
+ continue;
+ file_release_write(f);
+ mnt = mntget(f->f_path.mnt);
+ file_list_unlock();
+ /*
+ * This can sleep, so we can't hold
+ * the file_list_lock() spinlock.
+ */
+ mnt_drop_write(mnt);
+ mntput(mnt);
+ goto retry;
}
file_list_unlock();
}
diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c
index 4948d9bc405..a1c3a1fab7f 100644
--- a/fs/sysfs/dir.c
+++ b/fs/sysfs/dir.c
@@ -20,6 +20,7 @@
#include <linux/idr.h>
#include <linux/completion.h>
#include <linux/mutex.h>
+#include <linux/slab.h>
#include "sysfs.h"
DEFINE_MUTEX(sysfs_mutex);
diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c
index baa663e6938..ade9a7e6a75 100644
--- a/fs/sysfs/file.c
+++ b/fs/sysfs/file.c
@@ -13,6 +13,7 @@
#include <linux/module.h>
#include <linux/kobject.h>
#include <linux/kallsyms.h>
+#include <linux/slab.h>
#include <linux/namei.h>
#include <linux/poll.h>
#include <linux/list.h>
@@ -128,7 +129,7 @@ sysfs_read_file(struct file *file, char __user *buf, size_t count, loff_t *ppos)
ssize_t retval = 0;
mutex_lock(&buffer->mutex);
- if (buffer->needs_read_fill) {
+ if (buffer->needs_read_fill || *ppos == 0) {
retval = fill_read_buffer(file->f_path.dentry,buffer);
if (retval)
goto out;
@@ -409,8 +410,7 @@ static int sysfs_release(struct inode *inode, struct file *filp)
* return POLLERR|POLLPRI, and select will return the fd whether
* it is waiting for read, write, or exceptions.
* Once poll/select indicates that the value has changed, you
- * need to close and re-open the file, as simply seeking and reading
- * again will not get new data, or reset the state of 'poll'.
+ * need to close and re-open the file, or seek to 0 and read again.
* Reminder: this only works for attributes which actively support
* it, and it is not possible to test an attribute from userspace
* to see if it supports poll (Neither 'poll' nor 'select' return
diff --git a/fs/sysfs/symlink.c b/fs/sysfs/symlink.c
index 5f66c446615..817f5966edc 100644
--- a/fs/sysfs/symlink.c
+++ b/fs/sysfs/symlink.c
@@ -87,7 +87,14 @@ int sysfs_create_link(struct kobject * kobj, struct kobject * target, const char
void sysfs_remove_link(struct kobject * kobj, const char * name)
{
- sysfs_hash_and_remove(kobj->sd, name);
+ struct sysfs_dirent *parent_sd = NULL;
+
+ if (!kobj)
+ parent_sd = &sysfs_root;
+ else
+ parent_sd = kobj->sd;
+
+ sysfs_hash_and_remove(parent_sd, name);
}
static int sysfs_get_target_path(struct sysfs_dirent *parent_sd,
diff --git a/fs/udf/Makefile b/fs/udf/Makefile
index be845e7540e..0d4503f7446 100644
--- a/fs/udf/Makefile
+++ b/fs/udf/Makefile
@@ -6,4 +6,4 @@ obj-$(CONFIG_UDF_FS) += udf.o
udf-objs := balloc.o dir.o file.o ialloc.o inode.o lowlevel.o namei.o \
partition.o super.o truncate.o symlink.o fsync.o \
- crc.o directory.o misc.o udftime.o unicode.o
+ directory.o misc.o udftime.o unicode.o
diff --git a/fs/udf/balloc.c b/fs/udf/balloc.c
index f855dcbbdfb..1b809bd494b 100644
--- a/fs/udf/balloc.c
+++ b/fs/udf/balloc.c
@@ -149,8 +149,7 @@ static bool udf_add_free_space(struct udf_sb_info *sbi,
return false;
lvid = (struct logicalVolIntegrityDesc *)sbi->s_lvid_bh->b_data;
- lvid->freeSpaceTable[partition] = cpu_to_le32(le32_to_cpu(
- lvid->freeSpaceTable[partition]) + cnt);
+ le32_add_cpu(&lvid->freeSpaceTable[partition], cnt);
return true;
}
@@ -589,10 +588,8 @@ static void udf_table_free_blocks(struct super_block *sb,
sptr = oepos.bh->b_data + epos.offset;
aed = (struct allocExtDesc *)
oepos.bh->b_data;
- aed->lengthAllocDescs =
- cpu_to_le32(le32_to_cpu(
- aed->lengthAllocDescs) +
- adsize);
+ le32_add_cpu(&aed->lengthAllocDescs,
+ adsize);
} else {
sptr = iinfo->i_ext.i_data +
epos.offset;
@@ -645,9 +642,7 @@ static void udf_table_free_blocks(struct super_block *sb,
mark_inode_dirty(table);
} else {
aed = (struct allocExtDesc *)epos.bh->b_data;
- aed->lengthAllocDescs =
- cpu_to_le32(le32_to_cpu(
- aed->lengthAllocDescs) + adsize);
+ le32_add_cpu(&aed->lengthAllocDescs, adsize);
udf_update_tag(epos.bh->b_data, epos.offset);
mark_buffer_dirty(epos.bh);
}
diff --git a/fs/udf/crc.c b/fs/udf/crc.c
deleted file mode 100644
index b1661296e78..00000000000
--- a/fs/udf/crc.c
+++ /dev/null
@@ -1,172 +0,0 @@
-/*
- * crc.c
- *
- * PURPOSE
- * Routines to generate, calculate, and test a 16-bit CRC.
- *
- * DESCRIPTION
- * The CRC code was devised by Don P. Mitchell of AT&T Bell Laboratories
- * and Ned W. Rhodes of Software Systems Group. It has been published in
- * "Design and Validation of Computer Protocols", Prentice Hall,
- * Englewood Cliffs, NJ, 1991, Chapter 3, ISBN 0-13-539925-4.
- *
- * Copyright is held by AT&T.
- *
- * AT&T gives permission for the free use of the CRC source code.
- *
- * COPYRIGHT
- * This file is distributed under the terms of the GNU General Public
- * License (GPL). Copies of the GPL can be obtained from:
- * ftp://prep.ai.mit.edu/pub/gnu/GPL
- * Each contributing author retains all rights to their own work.
- */
-
-#include "udfdecl.h"
-
-static uint16_t crc_table[256] = {
- 0x0000U, 0x1021U, 0x2042U, 0x3063U, 0x4084U, 0x50a5U, 0x60c6U, 0x70e7U,
- 0x8108U, 0x9129U, 0xa14aU, 0xb16bU, 0xc18cU, 0xd1adU, 0xe1ceU, 0xf1efU,
- 0x1231U, 0x0210U, 0x3273U, 0x2252U, 0x52b5U, 0x4294U, 0x72f7U, 0x62d6U,
- 0x9339U, 0x8318U, 0xb37bU, 0xa35aU, 0xd3bdU, 0xc39cU, 0xf3ffU, 0xe3deU,
- 0x2462U, 0x3443U, 0x0420U, 0x1401U, 0x64e6U, 0x74c7U, 0x44a4U, 0x5485U,
- 0xa56aU, 0xb54bU, 0x8528U, 0x9509U, 0xe5eeU, 0xf5cfU, 0xc5acU, 0xd58dU,
- 0x3653U, 0x2672U, 0x1611U, 0x0630U, 0x76d7U, 0x66f6U, 0x5695U, 0x46b4U,
- 0xb75bU, 0xa77aU, 0x9719U, 0x8738U, 0xf7dfU, 0xe7feU, 0xd79dU, 0xc7bcU,
- 0x48c4U, 0x58e5U, 0x6886U, 0x78a7U, 0x0840U, 0x1861U, 0x2802U, 0x3823U,
- 0xc9ccU, 0xd9edU, 0xe98eU, 0xf9afU, 0x8948U, 0x9969U, 0xa90aU, 0xb92bU,
- 0x5af5U, 0x4ad4U, 0x7ab7U, 0x6a96U, 0x1a71U, 0x0a50U, 0x3a33U, 0x2a12U,
- 0xdbfdU, 0xcbdcU, 0xfbbfU, 0xeb9eU, 0x9b79U, 0x8b58U, 0xbb3bU, 0xab1aU,
- 0x6ca6U, 0x7c87U, 0x4ce4U, 0x5cc5U, 0x2c22U, 0x3c03U, 0x0c60U, 0x1c41U,
- 0xedaeU, 0xfd8fU, 0xcdecU, 0xddcdU, 0xad2aU, 0xbd0bU, 0x8d68U, 0x9d49U,
- 0x7e97U, 0x6eb6U, 0x5ed5U, 0x4ef4U, 0x3e13U, 0x2e32U, 0x1e51U, 0x0e70U,
- 0xff9fU, 0xefbeU, 0xdfddU, 0xcffcU, 0xbf1bU, 0xaf3aU, 0x9f59U, 0x8f78U,
- 0x9188U, 0x81a9U, 0xb1caU, 0xa1ebU, 0xd10cU, 0xc12dU, 0xf14eU, 0xe16fU,
- 0x1080U, 0x00a1U, 0x30c2U, 0x20e3U, 0x5004U, 0x4025U, 0x7046U, 0x6067U,
- 0x83b9U, 0x9398U, 0xa3fbU, 0xb3daU, 0xc33dU, 0xd31cU, 0xe37fU, 0xf35eU,
- 0x02b1U, 0x1290U, 0x22f3U, 0x32d2U, 0x4235U, 0x5214U, 0x6277U, 0x7256U,
- 0xb5eaU, 0xa5cbU, 0x95a8U, 0x8589U, 0xf56eU, 0xe54fU, 0xd52cU, 0xc50dU,
- 0x34e2U, 0x24c3U, 0x14a0U, 0x0481U, 0x7466U, 0x6447U, 0x5424U, 0x4405U,
- 0xa7dbU, 0xb7faU, 0x8799U, 0x97b8U, 0xe75fU, 0xf77eU, 0xc71dU, 0xd73cU,
- 0x26d3U, 0x36f2U, 0x0691U, 0x16b0U, 0x6657U, 0x7676U, 0x4615U, 0x5634U,
- 0xd94cU, 0xc96dU, 0xf90eU, 0xe92fU, 0x99c8U, 0x89e9U, 0xb98aU, 0xa9abU,
- 0x5844U, 0x4865U, 0x7806U, 0x6827U, 0x18c0U, 0x08e1U, 0x3882U, 0x28a3U,
- 0xcb7dU, 0xdb5cU, 0xeb3fU, 0xfb1eU, 0x8bf9U, 0x9bd8U, 0xabbbU, 0xbb9aU,
- 0x4a75U, 0x5a54U, 0x6a37U, 0x7a16U, 0x0af1U, 0x1ad0U, 0x2ab3U, 0x3a92U,
- 0xfd2eU, 0xed0fU, 0xdd6cU, 0xcd4dU, 0xbdaaU, 0xad8bU, 0x9de8U, 0x8dc9U,
- 0x7c26U, 0x6c07U, 0x5c64U, 0x4c45U, 0x3ca2U, 0x2c83U, 0x1ce0U, 0x0cc1U,
- 0xef1fU, 0xff3eU, 0xcf5dU, 0xdf7cU, 0xaf9bU, 0xbfbaU, 0x8fd9U, 0x9ff8U,
- 0x6e17U, 0x7e36U, 0x4e55U, 0x5e74U, 0x2e93U, 0x3eb2U, 0x0ed1U, 0x1ef0U
-};
-
-/*
- * udf_crc
- *
- * PURPOSE
- * Calculate a 16-bit CRC checksum using ITU-T V.41 polynomial.
- *
- * DESCRIPTION
- * The OSTA-UDF(tm) 1.50 standard states that using CRCs is mandatory.
- * The polynomial used is: x^16 + x^12 + x^15 + 1
- *
- * PRE-CONDITIONS
- * data Pointer to the data block.
- * size Size of the data block.
- *
- * POST-CONDITIONS
- * <return> CRC of the data block.
- *
- * HISTORY
- * July 21, 1997 - Andrew E. Mileski
- * Adapted from OSTA-UDF(tm) 1.50 standard.
- */
-uint16_t udf_crc(uint8_t *data, uint32_t size, uint16_t crc)
-{
- while (size--)
- crc = crc_table[(crc >> 8 ^ *(data++)) & 0xffU] ^ (crc << 8);
-
- return crc;
-}
-
-/****************************************************************************/
-#if defined(TEST)
-
-/*
- * PURPOSE
- * Test udf_crc()
- *
- * HISTORY
- * July 21, 1997 - Andrew E. Mileski
- * Adapted from OSTA-UDF(tm) 1.50 standard.
- */
-
-unsigned char bytes[] = { 0x70U, 0x6AU, 0x77U };
-
-int main(void)
-{
- unsigned short x;
-
- x = udf_crc(bytes, sizeof bytes);
- printf("udf_crc: calculated = %4.4x, correct = %4.4x\n", x, 0x3299U);
-
- return 0;
-}
-
-#endif /* defined(TEST) */
-
-/****************************************************************************/
-#if defined(GENERATE)
-
-/*
- * PURPOSE
- * Generate a table for fast 16-bit CRC calculations (any polynomial).
- *
- * DESCRIPTION
- * The ITU-T V.41 polynomial is 010041.
- *
- * HISTORY
- * July 21, 1997 - Andrew E. Mileski
- * Adapted from OSTA-UDF(tm) 1.50 standard.
- */
-
-#include <stdio.h>
-
-int main(int argc, char **argv)
-{
- unsigned long crc, poly;
- int n, i;
-
- /* Get the polynomial */
- sscanf(argv[1], "%lo", &poly);
- if (poly & 0xffff0000U) {
- fprintf(stderr, "polynomial is too large\en");
- exit(1);
- }
-
- printf("/* CRC 0%o */\n", poly);
-
- /* Create a table */
- printf("static unsigned short crc_table[256] = {\n");
- for (n = 0; n < 256; n++) {
- if (n % 8 == 0)
- printf("\t");
- crc = n << 8;
- for (i = 0; i < 8; i++) {
- if (crc & 0x8000U)
- crc = (crc << 1) ^ poly;
- else
- crc <<= 1;
- crc &= 0xFFFFU;
- }
- if (n == 255)
- printf("0x%04xU ", crc);
- else
- printf("0x%04xU, ", crc);
- if (n % 8 == 7)
- printf("\n");
- }
- printf("};\n");
-
- return 0;
-}
-
-#endif /* defined(GENERATE) */
diff --git a/fs/udf/dir.c b/fs/udf/dir.c
index 8d8643ada19..62dc270c69d 100644
--- a/fs/udf/dir.c
+++ b/fs/udf/dir.c
@@ -39,13 +39,13 @@
static int do_udf_readdir(struct inode *dir, struct file *filp,
filldir_t filldir, void *dirent)
{
- struct udf_fileident_bh fibh;
+ struct udf_fileident_bh fibh = { .sbh = NULL, .ebh = NULL};
struct fileIdentDesc *fi = NULL;
struct fileIdentDesc cfi;
int block, iblock;
loff_t nf_pos = (filp->f_pos - 1) << 2;
int flen;
- char fname[UDF_NAME_LEN];
+ char *fname = NULL;
char *nameptr;
uint16_t liu;
uint8_t lfi;
@@ -54,23 +54,32 @@ static int do_udf_readdir(struct inode *dir, struct file *filp,
kernel_lb_addr eloc;
uint32_t elen;
sector_t offset;
- int i, num;
+ int i, num, ret = 0;
unsigned int dt_type;
struct extent_position epos = { NULL, 0, {0, 0} };
struct udf_inode_info *iinfo;
if (nf_pos >= size)
- return 0;
+ goto out;
+
+ fname = kmalloc(UDF_NAME_LEN, GFP_NOFS);
+ if (!fname) {
+ ret = -ENOMEM;
+ goto out;
+ }
if (nf_pos == 0)
nf_pos = udf_ext0_offset(dir);
fibh.soffset = fibh.eoffset = nf_pos & (dir->i_sb->s_blocksize - 1);
iinfo = UDF_I(dir);
- if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) {
- fibh.sbh = fibh.ebh = NULL;
- } else if (inode_bmap(dir, nf_pos >> dir->i_sb->s_blocksize_bits,
- &epos, &eloc, &elen, &offset) == (EXT_RECORDED_ALLOCATED >> 30)) {
+ if (iinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB) {
+ if (inode_bmap(dir, nf_pos >> dir->i_sb->s_blocksize_bits,
+ &epos, &eloc, &elen, &offset)
+ != (EXT_RECORDED_ALLOCATED >> 30)) {
+ ret = -ENOENT;
+ goto out;
+ }
block = udf_get_lb_pblock(dir->i_sb, eloc, offset);
if ((++offset << dir->i_sb->s_blocksize_bits) < elen) {
if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT)
@@ -83,8 +92,8 @@ static int do_udf_readdir(struct inode *dir, struct file *filp,
}
if (!(fibh.sbh = fibh.ebh = udf_tread(dir->i_sb, block))) {
- brelse(epos.bh);
- return -EIO;
+ ret = -EIO;
+ goto out;
}
if (!(offset & ((16 >> (dir->i_sb->s_blocksize_bits - 9)) - 1))) {
@@ -105,9 +114,6 @@ static int do_udf_readdir(struct inode *dir, struct file *filp,
brelse(bha[i]);
}
}
- } else {
- brelse(epos.bh);
- return -ENOENT;
}
while (nf_pos < size) {
@@ -115,13 +121,8 @@ static int do_udf_readdir(struct inode *dir, struct file *filp,
fi = udf_fileident_read(dir, &nf_pos, &fibh, &cfi, &epos, &eloc,
&elen, &offset);
- if (!fi) {
- if (fibh.sbh != fibh.ebh)
- brelse(fibh.ebh);
- brelse(fibh.sbh);
- brelse(epos.bh);
- return 0;
- }
+ if (!fi)
+ goto out;
liu = le16_to_cpu(cfi.lengthOfImpUse);
lfi = cfi.lengthFileIdent;
@@ -167,53 +168,23 @@ static int do_udf_readdir(struct inode *dir, struct file *filp,
dt_type = DT_UNKNOWN;
}
- if (flen) {
- if (filldir(dirent, fname, flen, filp->f_pos, iblock, dt_type) < 0) {
- if (fibh.sbh != fibh.ebh)
- brelse(fibh.ebh);
- brelse(fibh.sbh);
- brelse(epos.bh);
- return 0;
- }
- }
+ if (flen && filldir(dirent, fname, flen, filp->f_pos,
+ iblock, dt_type) < 0)
+ goto out;
} /* end while */
filp->f_pos = (nf_pos >> 2) + 1;
+out:
if (fibh.sbh != fibh.ebh)
brelse(fibh.ebh);
brelse(fibh.sbh);
brelse(epos.bh);
+ kfree(fname);
- return 0;
+ return ret;
}
-/*
- * udf_readdir
- *
- * PURPOSE
- * Read a directory entry.
- *
- * DESCRIPTION
- * Optional - sys_getdents() will return -ENOTDIR if this routine is not
- * available.
- *
- * Refer to sys_getdents() in fs/readdir.c
- * sys_getdents() -> .
- *
- * PRE-CONDITIONS
- * filp Pointer to directory file.
- * buf Pointer to directory entry buffer.
- * filldir Pointer to filldir function.
- *
- * POST-CONDITIONS
- * <return> >=0 on success.
- *
- * HISTORY
- * July 1, 1997 - Andrew E. Mileski
- * Written, tested, and released.
- */
-
static int udf_readdir(struct file *filp, void *dirent, filldir_t filldir)
{
struct inode *dir = filp->f_path.dentry->d_inode;
diff --git a/fs/udf/ecma_167.h b/fs/udf/ecma_167.h
index 56387711589..a0974df82b3 100644
--- a/fs/udf/ecma_167.h
+++ b/fs/udf/ecma_167.h
@@ -70,19 +70,6 @@ typedef struct {
uint8_t microseconds;
} __attribute__ ((packed)) timestamp;
-typedef struct {
- uint16_t typeAndTimezone;
- int16_t year;
- uint8_t month;
- uint8_t day;
- uint8_t hour;
- uint8_t minute;
- uint8_t second;
- uint8_t centiseconds;
- uint8_t hundredsOfMicroseconds;
- uint8_t microseconds;
-} __attribute__ ((packed)) kernel_timestamp;
-
/* Type and Time Zone (ECMA 167r3 1/7.3.1) */
#define TIMESTAMP_TYPE_MASK 0xF000
#define TIMESTAMP_TYPE_CUT 0x0000
diff --git a/fs/udf/file.c b/fs/udf/file.c
index 97c71ae7c68..0ed6e146a0d 100644
--- a/fs/udf/file.c
+++ b/fs/udf/file.c
@@ -27,7 +27,6 @@
#include "udfdecl.h"
#include <linux/fs.h>
-#include <linux/udf_fs.h>
#include <asm/uaccess.h>
#include <linux/kernel.h>
#include <linux/string.h> /* memset */
@@ -144,40 +143,6 @@ static ssize_t udf_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
return retval;
}
-/*
- * udf_ioctl
- *
- * PURPOSE
- * Issue an ioctl.
- *
- * DESCRIPTION
- * Optional - sys_ioctl() will return -ENOTTY if this routine is not
- * available, and the ioctl cannot be handled without filesystem help.
- *
- * sys_ioctl() handles these ioctls that apply only to regular files:
- * FIBMAP [requires udf_block_map()], FIGETBSZ, FIONREAD
- * These ioctls are also handled by sys_ioctl():
- * FIOCLEX, FIONCLEX, FIONBIO, FIOASYNC
- * All other ioctls are passed to the filesystem.
- *
- * Refer to sys_ioctl() in fs/ioctl.c
- * sys_ioctl() -> .
- *
- * PRE-CONDITIONS
- * inode Pointer to inode that ioctl was issued on.
- * filp Pointer to file that ioctl was issued on.
- * cmd The ioctl command.
- * arg The ioctl argument [can be interpreted as a
- * user-space pointer if desired].
- *
- * POST-CONDITIONS
- * <return> Success (>=0) or an error code (<=0) that
- * sys_ioctl() will return.
- *
- * HISTORY
- * July 1, 1997 - Andrew E. Mileski
- * Written, tested, and released.
- */
int udf_ioctl(struct inode *inode, struct file *filp, unsigned int cmd,
unsigned long arg)
{
@@ -225,18 +190,6 @@ int udf_ioctl(struct inode *inode, struct file *filp, unsigned int cmd,
return result;
}
-/*
- * udf_release_file
- *
- * PURPOSE
- * Called when all references to the file are closed
- *
- * DESCRIPTION
- * Discard prealloced blocks
- *
- * HISTORY
- *
- */
static int udf_release_file(struct inode *inode, struct file *filp)
{
if (filp->f_mode & FMODE_WRITE) {
diff --git a/fs/udf/ialloc.c b/fs/udf/ialloc.c
index 84360315aca..eb9cfa23dc3 100644
--- a/fs/udf/ialloc.c
+++ b/fs/udf/ialloc.c
@@ -21,7 +21,6 @@
#include "udfdecl.h"
#include <linux/fs.h>
#include <linux/quotaops.h>
-#include <linux/udf_fs.h>
#include <linux/sched.h>
#include <linux/slab.h>
@@ -47,11 +46,9 @@ void udf_free_inode(struct inode *inode)
struct logicalVolIntegrityDescImpUse *lvidiu =
udf_sb_lvidiu(sbi);
if (S_ISDIR(inode->i_mode))
- lvidiu->numDirs =
- cpu_to_le32(le32_to_cpu(lvidiu->numDirs) - 1);
+ le32_add_cpu(&lvidiu->numDirs, -1);
else
- lvidiu->numFiles =
- cpu_to_le32(le32_to_cpu(lvidiu->numFiles) - 1);
+ le32_add_cpu(&lvidiu->numFiles, -1);
mark_buffer_dirty(sbi->s_lvid_bh);
}
@@ -105,11 +102,9 @@ struct inode *udf_new_inode(struct inode *dir, int mode, int *err)
lvhd = (struct logicalVolHeaderDesc *)
(lvid->logicalVolContentsUse);
if (S_ISDIR(mode))
- lvidiu->numDirs =
- cpu_to_le32(le32_to_cpu(lvidiu->numDirs) + 1);
+ le32_add_cpu(&lvidiu->numDirs, 1);
else
- lvidiu->numFiles =
- cpu_to_le32(le32_to_cpu(lvidiu->numFiles) + 1);
+ le32_add_cpu(&lvidiu->numFiles, 1);
iinfo->i_unique = uniqueID = le64_to_cpu(lvhd->uniqueID);
if (!(++uniqueID & 0x00000000FFFFFFFFUL))
uniqueID += 16;
diff --git a/fs/udf/inode.c b/fs/udf/inode.c
index 24cfa55d0fd..6e74b117aaf 100644
--- a/fs/udf/inode.c
+++ b/fs/udf/inode.c
@@ -37,6 +37,7 @@
#include <linux/buffer_head.h>
#include <linux/writeback.h>
#include <linux/slab.h>
+#include <linux/crc-itu-t.h>
#include "udf_i.h"
#include "udf_sb.h"
@@ -66,22 +67,7 @@ static void udf_update_extents(struct inode *,
struct extent_position *);
static int udf_get_block(struct inode *, sector_t, struct buffer_head *, int);
-/*
- * udf_delete_inode
- *
- * PURPOSE
- * Clean-up before the specified inode is destroyed.
- *
- * DESCRIPTION
- * This routine is called when the kernel destroys an inode structure
- * ie. when iput() finds i_count == 0.
- *
- * HISTORY
- * July 1, 1997 - Andrew E. Mileski
- * Written, tested, and released.
- *
- * Called at the last iput() if i_nlink is zero.
- */
+
void udf_delete_inode(struct inode *inode)
{
truncate_inode_pages(&inode->i_data, 0);
@@ -323,9 +309,6 @@ static int udf_get_block(struct inode *inode, sector_t block,
lock_kernel();
- if (block < 0)
- goto abort_negative;
-
iinfo = UDF_I(inode);
if (block == iinfo->i_next_alloc_block + 1) {
iinfo->i_next_alloc_block++;
@@ -347,10 +330,6 @@ static int udf_get_block(struct inode *inode, sector_t block,
abort:
unlock_kernel();
return err;
-
-abort_negative:
- udf_warning(inode->i_sb, "udf_get_block", "block < 0");
- goto abort;
}
static struct buffer_head *udf_getblk(struct inode *inode, long block,
@@ -1116,42 +1095,36 @@ static void __udf_read_inode(struct inode *inode)
fe = (struct fileEntry *)bh->b_data;
if (fe->icbTag.strategyType == cpu_to_le16(4096)) {
- struct buffer_head *ibh = NULL, *nbh = NULL;
- struct indirectEntry *ie;
+ struct buffer_head *ibh;
ibh = udf_read_ptagged(inode->i_sb, iinfo->i_location, 1,
&ident);
- if (ident == TAG_IDENT_IE) {
- if (ibh) {
- kernel_lb_addr loc;
- ie = (struct indirectEntry *)ibh->b_data;
-
- loc = lelb_to_cpu(ie->indirectICB.extLocation);
-
- if (ie->indirectICB.extLength &&
- (nbh = udf_read_ptagged(inode->i_sb, loc, 0,
- &ident))) {
- if (ident == TAG_IDENT_FE ||
- ident == TAG_IDENT_EFE) {
- memcpy(&iinfo->i_location,
- &loc,
- sizeof(kernel_lb_addr));
- brelse(bh);
- brelse(ibh);
- brelse(nbh);
- __udf_read_inode(inode);
- return;
- } else {
- brelse(nbh);
- brelse(ibh);
- }
- } else {
+ if (ident == TAG_IDENT_IE && ibh) {
+ struct buffer_head *nbh = NULL;
+ kernel_lb_addr loc;
+ struct indirectEntry *ie;
+
+ ie = (struct indirectEntry *)ibh->b_data;
+ loc = lelb_to_cpu(ie->indirectICB.extLocation);
+
+ if (ie->indirectICB.extLength &&
+ (nbh = udf_read_ptagged(inode->i_sb, loc, 0,
+ &ident))) {
+ if (ident == TAG_IDENT_FE ||
+ ident == TAG_IDENT_EFE) {
+ memcpy(&iinfo->i_location,
+ &loc,
+ sizeof(kernel_lb_addr));
+ brelse(bh);
brelse(ibh);
+ brelse(nbh);
+ __udf_read_inode(inode);
+ return;
}
+ brelse(nbh);
}
- } else {
- brelse(ibh);
}
+ brelse(ibh);
} else if (fe->icbTag.strategyType != cpu_to_le16(4)) {
printk(KERN_ERR "udf: unsupported strategy type: %d\n",
le16_to_cpu(fe->icbTag.strategyType));
@@ -1168,8 +1141,6 @@ static void udf_fill_inode(struct inode *inode, struct buffer_head *bh)
{
struct fileEntry *fe;
struct extendedFileEntry *efe;
- time_t convtime;
- long convtime_usec;
int offset;
struct udf_sb_info *sbi = UDF_SB(inode->i_sb);
struct udf_inode_info *iinfo = UDF_I(inode);
@@ -1257,29 +1228,15 @@ static void udf_fill_inode(struct inode *inode, struct buffer_head *bh)
inode->i_blocks = le64_to_cpu(fe->logicalBlocksRecorded) <<
(inode->i_sb->s_blocksize_bits - 9);
- if (udf_stamp_to_time(&convtime, &convtime_usec,
- lets_to_cpu(fe->accessTime))) {
- inode->i_atime.tv_sec = convtime;
- inode->i_atime.tv_nsec = convtime_usec * 1000;
- } else {
+ if (!udf_disk_stamp_to_time(&inode->i_atime, fe->accessTime))
inode->i_atime = sbi->s_record_time;
- }
- if (udf_stamp_to_time(&convtime, &convtime_usec,
- lets_to_cpu(fe->modificationTime))) {
- inode->i_mtime.tv_sec = convtime;
- inode->i_mtime.tv_nsec = convtime_usec * 1000;
- } else {
+ if (!udf_disk_stamp_to_time(&inode->i_mtime,
+ fe->modificationTime))
inode->i_mtime = sbi->s_record_time;
- }
- if (udf_stamp_to_time(&convtime, &convtime_usec,
- lets_to_cpu(fe->attrTime))) {
- inode->i_ctime.tv_sec = convtime;
- inode->i_ctime.tv_nsec = convtime_usec * 1000;
- } else {
+ if (!udf_disk_stamp_to_time(&inode->i_ctime, fe->attrTime))
inode->i_ctime = sbi->s_record_time;
- }
iinfo->i_unique = le64_to_cpu(fe->uniqueID);
iinfo->i_lenEAttr = le32_to_cpu(fe->lengthExtendedAttr);
@@ -1289,37 +1246,18 @@ static void udf_fill_inode(struct inode *inode, struct buffer_head *bh)
inode->i_blocks = le64_to_cpu(efe->logicalBlocksRecorded) <<
(inode->i_sb->s_blocksize_bits - 9);
- if (udf_stamp_to_time(&convtime, &convtime_usec,
- lets_to_cpu(efe->accessTime))) {
- inode->i_atime.tv_sec = convtime;
- inode->i_atime.tv_nsec = convtime_usec * 1000;
- } else {
+ if (!udf_disk_stamp_to_time(&inode->i_atime, efe->accessTime))
inode->i_atime = sbi->s_record_time;
- }
- if (udf_stamp_to_time(&convtime, &convtime_usec,
- lets_to_cpu(efe->modificationTime))) {
- inode->i_mtime.tv_sec = convtime;
- inode->i_mtime.tv_nsec = convtime_usec * 1000;
- } else {
+ if (!udf_disk_stamp_to_time(&inode->i_mtime,
+ efe->modificationTime))
inode->i_mtime = sbi->s_record_time;
- }
- if (udf_stamp_to_time(&convtime, &convtime_usec,
- lets_to_cpu(efe->createTime))) {
- iinfo->i_crtime.tv_sec = convtime;
- iinfo->i_crtime.tv_nsec = convtime_usec * 1000;
- } else {
+ if (!udf_disk_stamp_to_time(&iinfo->i_crtime, efe->createTime))
iinfo->i_crtime = sbi->s_record_time;
- }
- if (udf_stamp_to_time(&convtime, &convtime_usec,
- lets_to_cpu(efe->attrTime))) {
- inode->i_ctime.tv_sec = convtime;
- inode->i_ctime.tv_nsec = convtime_usec * 1000;
- } else {
+ if (!udf_disk_stamp_to_time(&inode->i_ctime, efe->attrTime))
inode->i_ctime = sbi->s_record_time;
- }
iinfo->i_unique = le64_to_cpu(efe->uniqueID);
iinfo->i_lenEAttr = le32_to_cpu(efe->lengthExtendedAttr);
@@ -1338,6 +1276,7 @@ static void udf_fill_inode(struct inode *inode, struct buffer_head *bh)
case ICBTAG_FILE_TYPE_REALTIME:
case ICBTAG_FILE_TYPE_REGULAR:
case ICBTAG_FILE_TYPE_UNDEF:
+ case ICBTAG_FILE_TYPE_VAT20:
if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB)
inode->i_data.a_ops = &udf_adinicb_aops;
else
@@ -1363,6 +1302,15 @@ static void udf_fill_inode(struct inode *inode, struct buffer_head *bh)
inode->i_op = &page_symlink_inode_operations;
inode->i_mode = S_IFLNK | S_IRWXUGO;
break;
+ case ICBTAG_FILE_TYPE_MAIN:
+ udf_debug("METADATA FILE-----\n");
+ break;
+ case ICBTAG_FILE_TYPE_MIRROR:
+ udf_debug("METADATA MIRROR FILE-----\n");
+ break;
+ case ICBTAG_FILE_TYPE_BITMAP:
+ udf_debug("METADATA BITMAP FILE-----\n");
+ break;
default:
printk(KERN_ERR "udf: udf_fill_inode(ino %ld) failed unknown "
"file type=%d\n", inode->i_ino,
@@ -1416,21 +1364,6 @@ static mode_t udf_convert_permissions(struct fileEntry *fe)
return mode;
}
-/*
- * udf_write_inode
- *
- * PURPOSE
- * Write out the specified inode.
- *
- * DESCRIPTION
- * This routine is called whenever an inode is synced.
- * Currently this routine is just a placeholder.
- *
- * HISTORY
- * July 1, 1997 - Andrew E. Mileski
- * Written, tested, and released.
- */
-
int udf_write_inode(struct inode *inode, int sync)
{
int ret;
@@ -1455,7 +1388,6 @@ static int udf_update_inode(struct inode *inode, int do_sync)
uint32_t udfperms;
uint16_t icbflags;
uint16_t crclen;
- kernel_timestamp cpu_time;
int err = 0;
struct udf_sb_info *sbi = UDF_SB(inode->i_sb);
unsigned char blocksize_bits = inode->i_sb->s_blocksize_bits;
@@ -1488,9 +1420,9 @@ static int udf_update_inode(struct inode *inode, int do_sync)
iinfo->i_location.
logicalBlockNum);
use->descTag.descCRCLength = cpu_to_le16(crclen);
- use->descTag.descCRC = cpu_to_le16(udf_crc((char *)use +
- sizeof(tag), crclen,
- 0));
+ use->descTag.descCRC = cpu_to_le16(crc_itu_t(0, (char *)use +
+ sizeof(tag),
+ crclen));
use->descTag.tagChecksum = udf_tag_checksum(&use->descTag);
mark_buffer_dirty(bh);
@@ -1558,12 +1490,9 @@ static int udf_update_inode(struct inode *inode, int do_sync)
(inode->i_blocks + (1 << (blocksize_bits - 9)) - 1) >>
(blocksize_bits - 9));
- if (udf_time_to_stamp(&cpu_time, inode->i_atime))
- fe->accessTime = cpu_to_lets(cpu_time);
- if (udf_time_to_stamp(&cpu_time, inode->i_mtime))
- fe->modificationTime = cpu_to_lets(cpu_time);
- if (udf_time_to_stamp(&cpu_time, inode->i_ctime))
- fe->attrTime = cpu_to_lets(cpu_time);
+ udf_time_to_disk_stamp(&fe->accessTime, inode->i_atime);
+ udf_time_to_disk_stamp(&fe->modificationTime, inode->i_mtime);
+ udf_time_to_disk_stamp(&fe->attrTime, inode->i_ctime);
memset(&(fe->impIdent), 0, sizeof(regid));
strcpy(fe->impIdent.ident, UDF_ID_DEVELOPER);
fe->impIdent.identSuffix[0] = UDF_OS_CLASS_UNIX;
@@ -1598,14 +1527,10 @@ static int udf_update_inode(struct inode *inode, int do_sync)
iinfo->i_crtime.tv_nsec > inode->i_ctime.tv_nsec))
iinfo->i_crtime = inode->i_ctime;
- if (udf_time_to_stamp(&cpu_time, inode->i_atime))
- efe->accessTime = cpu_to_lets(cpu_time);
- if (udf_time_to_stamp(&cpu_time, inode->i_mtime))
- efe->modificationTime = cpu_to_lets(cpu_time);
- if (udf_time_to_stamp(&cpu_time, iinfo->i_crtime))
- efe->createTime = cpu_to_lets(cpu_time);
- if (udf_time_to_stamp(&cpu_time, inode->i_ctime))
- efe->attrTime = cpu_to_lets(cpu_time);
+ udf_time_to_disk_stamp(&efe->accessTime, inode->i_atime);
+ udf_time_to_disk_stamp(&efe->modificationTime, inode->i_mtime);
+ udf_time_to_disk_stamp(&efe->createTime, iinfo->i_crtime);
+ udf_time_to_disk_stamp(&efe->attrTime, inode->i_ctime);
memset(&(efe->impIdent), 0, sizeof(regid));
strcpy(efe->impIdent.ident, UDF_ID_DEVELOPER);
@@ -1660,8 +1585,8 @@ static int udf_update_inode(struct inode *inode, int do_sync)
crclen += iinfo->i_lenEAttr + iinfo->i_lenAlloc -
sizeof(tag);
fe->descTag.descCRCLength = cpu_to_le16(crclen);
- fe->descTag.descCRC = cpu_to_le16(udf_crc((char *)fe + sizeof(tag),
- crclen, 0));
+ fe->descTag.descCRC = cpu_to_le16(crc_itu_t(0, (char *)fe + sizeof(tag),
+ crclen));
fe->descTag.tagChecksum = udf_tag_checksum(&fe->descTag);
/* write the data blocks */
@@ -1778,9 +1703,7 @@ int8_t udf_add_aext(struct inode *inode, struct extent_position *epos,
if (epos->bh) {
aed = (struct allocExtDesc *)epos->bh->b_data;
- aed->lengthAllocDescs =
- cpu_to_le32(le32_to_cpu(
- aed->lengthAllocDescs) + adsize);
+ le32_add_cpu(&aed->lengthAllocDescs, adsize);
} else {
iinfo->i_lenAlloc += adsize;
mark_inode_dirty(inode);
@@ -1830,9 +1753,7 @@ int8_t udf_add_aext(struct inode *inode, struct extent_position *epos,
mark_inode_dirty(inode);
} else {
aed = (struct allocExtDesc *)epos->bh->b_data;
- aed->lengthAllocDescs =
- cpu_to_le32(le32_to_cpu(aed->lengthAllocDescs) +
- adsize);
+ le32_add_cpu(&aed->lengthAllocDescs, adsize);
if (!UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_STRICT) ||
UDF_SB(inode->i_sb)->s_udfrev >= 0x0201)
udf_update_tag(epos->bh->b_data,
@@ -2046,9 +1967,7 @@ int8_t udf_delete_aext(struct inode *inode, struct extent_position epos,
mark_inode_dirty(inode);
} else {
aed = (struct allocExtDesc *)oepos.bh->b_data;
- aed->lengthAllocDescs =
- cpu_to_le32(le32_to_cpu(aed->lengthAllocDescs) -
- (2 * adsize));
+ le32_add_cpu(&aed->lengthAllocDescs, -(2 * adsize));
if (!UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_STRICT) ||
UDF_SB(inode->i_sb)->s_udfrev >= 0x0201)
udf_update_tag(oepos.bh->b_data,
@@ -2065,9 +1984,7 @@ int8_t udf_delete_aext(struct inode *inode, struct extent_position epos,
mark_inode_dirty(inode);
} else {
aed = (struct allocExtDesc *)oepos.bh->b_data;
- aed->lengthAllocDescs =
- cpu_to_le32(le32_to_cpu(aed->lengthAllocDescs) -
- adsize);
+ le32_add_cpu(&aed->lengthAllocDescs, -adsize);
if (!UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_STRICT) ||
UDF_SB(inode->i_sb)->s_udfrev >= 0x0201)
udf_update_tag(oepos.bh->b_data,
@@ -2095,11 +2012,6 @@ int8_t inode_bmap(struct inode *inode, sector_t block,
int8_t etype;
struct udf_inode_info *iinfo;
- if (block < 0) {
- printk(KERN_ERR "udf: inode_bmap: block < 0\n");
- return -1;
- }
-
iinfo = UDF_I(inode);
pos->offset = 0;
pos->block = iinfo->i_location;
diff --git a/fs/udf/lowlevel.c b/fs/udf/lowlevel.c
index 579bae71e67..703843f30ff 100644
--- a/fs/udf/lowlevel.c
+++ b/fs/udf/lowlevel.c
@@ -23,7 +23,6 @@
#include <linux/cdrom.h>
#include <asm/uaccess.h>
-#include <linux/udf_fs.h>
#include "udf_sb.h"
unsigned int udf_get_last_session(struct super_block *sb)
diff --git a/fs/udf/misc.c b/fs/udf/misc.c
index a1d6da0caf7..84bf0fd4a4f 100644
--- a/fs/udf/misc.c
+++ b/fs/udf/misc.c
@@ -23,8 +23,8 @@
#include <linux/fs.h>
#include <linux/string.h>
-#include <linux/udf_fs.h>
#include <linux/buffer_head.h>
+#include <linux/crc-itu-t.h>
#include "udf_i.h"
#include "udf_sb.h"
@@ -136,8 +136,8 @@ struct genericFormat *udf_add_extendedattr(struct inode *inode, uint32_t size,
/* rewrite CRC + checksum of eahd */
crclen = sizeof(struct extendedAttrHeaderDesc) - sizeof(tag);
eahd->descTag.descCRCLength = cpu_to_le16(crclen);
- eahd->descTag.descCRC = cpu_to_le16(udf_crc((char *)eahd +
- sizeof(tag), crclen, 0));
+ eahd->descTag.descCRC = cpu_to_le16(crc_itu_t(0, (char *)eahd +
+ sizeof(tag), crclen));
eahd->descTag.tagChecksum = udf_tag_checksum(&eahd->descTag);
iinfo->i_lenEAttr += size;
return (struct genericFormat *)&ea[offset];
@@ -204,16 +204,15 @@ struct buffer_head *udf_read_tagged(struct super_block *sb, uint32_t block,
{
tag *tag_p;
struct buffer_head *bh = NULL;
- struct udf_sb_info *sbi = UDF_SB(sb);
/* Read the block */
if (block == 0xFFFFFFFF)
return NULL;
- bh = udf_tread(sb, block + sbi->s_session);
+ bh = udf_tread(sb, block);
if (!bh) {
udf_debug("block=%d, location=%d: read failed\n",
- block + sbi->s_session, location);
+ block, location);
return NULL;
}
@@ -223,8 +222,7 @@ struct buffer_head *udf_read_tagged(struct super_block *sb, uint32_t block,
if (location != le32_to_cpu(tag_p->tagLocation)) {
udf_debug("location mismatch block %u, tag %u != %u\n",
- block + sbi->s_session,
- le32_to_cpu(tag_p->tagLocation), location);
+ block, le32_to_cpu(tag_p->tagLocation), location);
goto error_out;
}
@@ -244,13 +242,13 @@ struct buffer_head *udf_read_tagged(struct super_block *sb, uint32_t block,
/* Verify the descriptor CRC */
if (le16_to_cpu(tag_p->descCRCLength) + sizeof(tag) > sb->s_blocksize ||
- le16_to_cpu(tag_p->descCRC) == udf_crc(bh->b_data + sizeof(tag),
- le16_to_cpu(tag_p->descCRCLength), 0))
+ le16_to_cpu(tag_p->descCRC) == crc_itu_t(0,
+ bh->b_data + sizeof(tag),
+ le16_to_cpu(tag_p->descCRCLength)))
return bh;
- udf_debug("Crc failure block %d: crc = %d, crclen = %d\n",
- block + sbi->s_session, le16_to_cpu(tag_p->descCRC),
- le16_to_cpu(tag_p->descCRCLength));
+ udf_debug("Crc failure block %d: crc = %d, crclen = %d\n", block,
+ le16_to_cpu(tag_p->descCRC), le16_to_cpu(tag_p->descCRCLength));
error_out:
brelse(bh);
@@ -270,7 +268,7 @@ void udf_update_tag(char *data, int length)
length -= sizeof(tag);
tptr->descCRCLength = cpu_to_le16(length);
- tptr->descCRC = cpu_to_le16(udf_crc(data + sizeof(tag), length, 0));
+ tptr->descCRC = cpu_to_le16(crc_itu_t(0, data + sizeof(tag), length));
tptr->tagChecksum = udf_tag_checksum(tptr);
}
diff --git a/fs/udf/namei.c b/fs/udf/namei.c
index 112a5fb0b27..ba5537d4bc1 100644
--- a/fs/udf/namei.c
+++ b/fs/udf/namei.c
@@ -31,6 +31,7 @@
#include <linux/smp_lock.h>
#include <linux/buffer_head.h>
#include <linux/sched.h>
+#include <linux/crc-itu-t.h>
static inline int udf_match(int len1, const char *name1, int len2,
const char *name2)
@@ -97,25 +98,23 @@ int udf_write_fi(struct inode *inode, struct fileIdentDesc *cfi,
memset(fibh->ebh->b_data, 0x00, padlen + offset);
}
- crc = udf_crc((uint8_t *)cfi + sizeof(tag),
- sizeof(struct fileIdentDesc) - sizeof(tag), 0);
+ crc = crc_itu_t(0, (uint8_t *)cfi + sizeof(tag),
+ sizeof(struct fileIdentDesc) - sizeof(tag));
if (fibh->sbh == fibh->ebh) {
- crc = udf_crc((uint8_t *)sfi->impUse,
+ crc = crc_itu_t(crc, (uint8_t *)sfi->impUse,
crclen + sizeof(tag) -
- sizeof(struct fileIdentDesc), crc);
+ sizeof(struct fileIdentDesc));
} else if (sizeof(struct fileIdentDesc) >= -fibh->soffset) {
- crc = udf_crc(fibh->ebh->b_data +
+ crc = crc_itu_t(crc, fibh->ebh->b_data +
sizeof(struct fileIdentDesc) +
fibh->soffset,
crclen + sizeof(tag) -
- sizeof(struct fileIdentDesc),
- crc);
+ sizeof(struct fileIdentDesc));
} else {
- crc = udf_crc((uint8_t *)sfi->impUse,
- -fibh->soffset - sizeof(struct fileIdentDesc),
- crc);
- crc = udf_crc(fibh->ebh->b_data, fibh->eoffset, crc);
+ crc = crc_itu_t(crc, (uint8_t *)sfi->impUse,
+ -fibh->soffset - sizeof(struct fileIdentDesc));
+ crc = crc_itu_t(crc, fibh->ebh->b_data, fibh->eoffset);
}
cfi->descTag.descCRC = cpu_to_le16(crc);
@@ -149,7 +148,7 @@ static struct fileIdentDesc *udf_find_entry(struct inode *dir,
struct fileIdentDesc *fi = NULL;
loff_t f_pos;
int block, flen;
- char fname[UDF_NAME_LEN];
+ char *fname = NULL;
char *nameptr;
uint8_t lfi;
uint16_t liu;
@@ -163,12 +162,12 @@ static struct fileIdentDesc *udf_find_entry(struct inode *dir,
size = udf_ext0_offset(dir) + dir->i_size;
f_pos = udf_ext0_offset(dir);
+ fibh->sbh = fibh->ebh = NULL;
fibh->soffset = fibh->eoffset = f_pos & (dir->i_sb->s_blocksize - 1);
- if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB)
- fibh->sbh = fibh->ebh = NULL;
- else if (inode_bmap(dir, f_pos >> dir->i_sb->s_blocksize_bits,
- &epos, &eloc, &elen, &offset) ==
- (EXT_RECORDED_ALLOCATED >> 30)) {
+ if (dinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB) {
+ if (inode_bmap(dir, f_pos >> dir->i_sb->s_blocksize_bits, &epos,
+ &eloc, &elen, &offset) != (EXT_RECORDED_ALLOCATED >> 30))
+ goto out_err;
block = udf_get_lb_pblock(dir->i_sb, eloc, offset);
if ((++offset << dir->i_sb->s_blocksize_bits) < elen) {
if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT)
@@ -179,25 +178,19 @@ static struct fileIdentDesc *udf_find_entry(struct inode *dir,
offset = 0;
fibh->sbh = fibh->ebh = udf_tread(dir->i_sb, block);
- if (!fibh->sbh) {
- brelse(epos.bh);
- return NULL;
- }
- } else {
- brelse(epos.bh);
- return NULL;
+ if (!fibh->sbh)
+ goto out_err;
}
+ fname = kmalloc(UDF_NAME_LEN, GFP_NOFS);
+ if (!fname)
+ goto out_err;
+
while (f_pos < size) {
fi = udf_fileident_read(dir, &f_pos, fibh, cfi, &epos, &eloc,
&elen, &offset);
- if (!fi) {
- if (fibh->sbh != fibh->ebh)
- brelse(fibh->ebh);
- brelse(fibh->sbh);
- brelse(epos.bh);
- return NULL;
- }
+ if (!fi)
+ goto out_err;
liu = le16_to_cpu(cfi->lengthOfImpUse);
lfi = cfi->lengthFileIdent;
@@ -237,53 +230,22 @@ static struct fileIdentDesc *udf_find_entry(struct inode *dir,
flen = udf_get_filename(dir->i_sb, nameptr, fname, lfi);
if (flen && udf_match(flen, fname, dentry->d_name.len,
- dentry->d_name.name)) {
- brelse(epos.bh);
- return fi;
- }
+ dentry->d_name.name))
+ goto out_ok;
}
+out_err:
+ fi = NULL;
if (fibh->sbh != fibh->ebh)
brelse(fibh->ebh);
brelse(fibh->sbh);
+out_ok:
brelse(epos.bh);
+ kfree(fname);
- return NULL;
+ return fi;
}
-/*
- * udf_lookup
- *
- * PURPOSE
- * Look-up the inode for a given name.
- *
- * DESCRIPTION
- * Required - lookup_dentry() will return -ENOTDIR if this routine is not
- * available for a directory. The filesystem is useless if this routine is
- * not available for at least the filesystem's root directory.
- *
- * This routine is passed an incomplete dentry - it must be completed by
- * calling d_add(dentry, inode). If the name does not exist, then the
- * specified inode must be set to null. An error should only be returned
- * when the lookup fails for a reason other than the name not existing.
- * Note that the directory inode semaphore is held during the call.
- *
- * Refer to lookup_dentry() in fs/namei.c
- * lookup_dentry() -> lookup() -> real_lookup() -> .
- *
- * PRE-CONDITIONS
- * dir Pointer to inode of parent directory.
- * dentry Pointer to dentry to complete.
- * nd Pointer to lookup nameidata
- *
- * POST-CONDITIONS
- * <return> Zero on success.
- *
- * HISTORY
- * July 1, 1997 - Andrew E. Mileski
- * Written, tested, and released.
- */
-
static struct dentry *udf_lookup(struct inode *dir, struct dentry *dentry,
struct nameidata *nd)
{
@@ -336,11 +298,9 @@ static struct fileIdentDesc *udf_add_entry(struct inode *dir,
{
struct super_block *sb = dir->i_sb;
struct fileIdentDesc *fi = NULL;
- char name[UDF_NAME_LEN], fname[UDF_NAME_LEN];
+ char *name = NULL;
int namelen;
loff_t f_pos;
- int flen;
- char *nameptr;
loff_t size = udf_ext0_offset(dir) + dir->i_size;
int nfidlen;
uint8_t lfi;
@@ -352,16 +312,23 @@ static struct fileIdentDesc *udf_add_entry(struct inode *dir,
struct extent_position epos = {};
struct udf_inode_info *dinfo;
+ fibh->sbh = fibh->ebh = NULL;
+ name = kmalloc(UDF_NAME_LEN, GFP_NOFS);
+ if (!name) {
+ *err = -ENOMEM;
+ goto out_err;
+ }
+
if (dentry) {
if (!dentry->d_name.len) {
*err = -EINVAL;
- return NULL;
+ goto out_err;
}
namelen = udf_put_filename(sb, dentry->d_name.name, name,
dentry->d_name.len);
if (!namelen) {
*err = -ENAMETOOLONG;
- return NULL;
+ goto out_err;
}
} else {
namelen = 0;
@@ -373,11 +340,14 @@ static struct fileIdentDesc *udf_add_entry(struct inode *dir,
fibh->soffset = fibh->eoffset = f_pos & (dir->i_sb->s_blocksize - 1);
dinfo = UDF_I(dir);
- if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB)
- fibh->sbh = fibh->ebh = NULL;
- else if (inode_bmap(dir, f_pos >> dir->i_sb->s_blocksize_bits,
- &epos, &eloc, &elen, &offset) ==
- (EXT_RECORDED_ALLOCATED >> 30)) {
+ if (dinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB) {
+ if (inode_bmap(dir, f_pos >> dir->i_sb->s_blocksize_bits, &epos,
+ &eloc, &elen, &offset) != (EXT_RECORDED_ALLOCATED >> 30)) {
+ block = udf_get_lb_pblock(dir->i_sb,
+ dinfo->i_location, 0);
+ fibh->soffset = fibh->eoffset = sb->s_blocksize;
+ goto add;
+ }
block = udf_get_lb_pblock(dir->i_sb, eloc, offset);
if ((++offset << dir->i_sb->s_blocksize_bits) < elen) {
if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT)
@@ -389,17 +359,11 @@ static struct fileIdentDesc *udf_add_entry(struct inode *dir,
fibh->sbh = fibh->ebh = udf_tread(dir->i_sb, block);
if (!fibh->sbh) {
- brelse(epos.bh);
*err = -EIO;
- return NULL;
+ goto out_err;
}
block = dinfo->i_location.logicalBlockNum;
- } else {
- block = udf_get_lb_pblock(dir->i_sb, dinfo->i_location, 0);
- fibh->sbh = fibh->ebh = NULL;
- fibh->soffset = fibh->eoffset = sb->s_blocksize;
- goto add;
}
while (f_pos < size) {
@@ -407,41 +371,16 @@ static struct fileIdentDesc *udf_add_entry(struct inode *dir,
&elen, &offset);
if (!fi) {
- if (fibh->sbh != fibh->ebh)
- brelse(fibh->ebh);
- brelse(fibh->sbh);
- brelse(epos.bh);
*err = -EIO;
- return NULL;
+ goto out_err;
}
liu = le16_to_cpu(cfi->lengthOfImpUse);
lfi = cfi->lengthFileIdent;
- if (fibh->sbh == fibh->ebh)
- nameptr = fi->fileIdent + liu;
- else {
- int poffset; /* Unpaded ending offset */
-
- poffset = fibh->soffset + sizeof(struct fileIdentDesc) +
- liu + lfi;
-
- if (poffset >= lfi)
- nameptr = (char *)(fibh->ebh->b_data +
- poffset - lfi);
- else {
- nameptr = fname;
- memcpy(nameptr, fi->fileIdent + liu,
- lfi - poffset);
- memcpy(nameptr + lfi - poffset,
- fibh->ebh->b_data, poffset);
- }
- }
-
if ((cfi->fileCharacteristics & FID_FILE_CHAR_DELETED) != 0) {
if (((sizeof(struct fileIdentDesc) +
liu + lfi + 3) & ~3) == nfidlen) {
- brelse(epos.bh);
cfi->descTag.tagSerialNum = cpu_to_le16(1);
cfi->fileVersionNum = cpu_to_le16(1);
cfi->fileCharacteristics = 0;
@@ -449,27 +388,13 @@ static struct fileIdentDesc *udf_add_entry(struct inode *dir,
cfi->lengthOfImpUse = cpu_to_le16(0);
if (!udf_write_fi(dir, cfi, fi, fibh, NULL,
name))
- return fi;
+ goto out_ok;
else {
*err = -EIO;
- return NULL;
+ goto out_err;
}
}
}
-
- if (!lfi || !dentry)
- continue;
-
- flen = udf_get_filename(dir->i_sb, nameptr, fname, lfi);
- if (flen && udf_match(flen, fname, dentry->d_name.len,
- dentry->d_name.name)) {
- if (fibh->sbh != fibh->ebh)
- brelse(fibh->ebh);
- brelse(fibh->sbh);
- brelse(epos.bh);
- *err = -EEXIST;
- return NULL;
- }
}
add:
@@ -496,7 +421,7 @@ add:
fibh->sbh = fibh->ebh =
udf_expand_dir_adinicb(dir, &block, err);
if (!fibh->sbh)
- return NULL;
+ goto out_err;
epos.block = dinfo->i_location;
epos.offset = udf_file_entry_alloc_offset(dir);
/* Load extent udf_expand_dir_adinicb() has created */
@@ -537,11 +462,8 @@ add:
dir->i_sb->s_blocksize_bits);
fibh->ebh = udf_bread(dir,
f_pos >> dir->i_sb->s_blocksize_bits, 1, err);
- if (!fibh->ebh) {
- brelse(epos.bh);
- brelse(fibh->sbh);
- return NULL;
- }
+ if (!fibh->ebh)
+ goto out_err;
if (!fibh->soffset) {
if (udf_next_aext(dir, &epos, &eloc, &elen, 1) ==
@@ -572,20 +494,25 @@ add:
cfi->lengthFileIdent = namelen;
cfi->lengthOfImpUse = cpu_to_le16(0);
if (!udf_write_fi(dir, cfi, fi, fibh, NULL, name)) {
- brelse(epos.bh);
dir->i_size += nfidlen;
if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB)
dinfo->i_lenAlloc += nfidlen;
mark_inode_dirty(dir);
- return fi;
+ goto out_ok;
} else {
- brelse(epos.bh);
- if (fibh->sbh != fibh->ebh)
- brelse(fibh->ebh);
- brelse(fibh->sbh);
*err = -EIO;
- return NULL;
+ goto out_err;
}
+
+out_err:
+ fi = NULL;
+ if (fibh->sbh != fibh->ebh)
+ brelse(fibh->ebh);
+ brelse(fibh->sbh);
+out_ok:
+ brelse(epos.bh);
+ kfree(name);
+ return fi;
}
static int udf_delete_entry(struct inode *inode, struct fileIdentDesc *fi,
@@ -940,7 +867,7 @@ static int udf_symlink(struct inode *dir, struct dentry *dentry,
char *ea;
int err;
int block;
- char name[UDF_NAME_LEN];
+ char *name = NULL;
int namelen;
struct buffer_head *bh;
struct udf_inode_info *iinfo;
@@ -950,6 +877,12 @@ static int udf_symlink(struct inode *dir, struct dentry *dentry,
if (!inode)
goto out;
+ name = kmalloc(UDF_NAME_LEN, GFP_NOFS);
+ if (!name) {
+ err = -ENOMEM;
+ goto out_no_entry;
+ }
+
iinfo = UDF_I(inode);
inode->i_mode = S_IFLNK | S_IRWXUGO;
inode->i_data.a_ops = &udf_symlink_aops;
@@ -1089,6 +1022,7 @@ static int udf_symlink(struct inode *dir, struct dentry *dentry,
err = 0;
out:
+ kfree(name);
unlock_kernel();
return err;
diff --git a/fs/udf/partition.c b/fs/udf/partition.c
index fc533345ab8..63610f026ae 100644
--- a/fs/udf/partition.c
+++ b/fs/udf/partition.c
@@ -24,7 +24,6 @@
#include <linux/fs.h>
#include <linux/string.h>
-#include <linux/udf_fs.h>
#include <linux/slab.h>
#include <linux/buffer_head.h>
@@ -55,11 +54,10 @@ uint32_t udf_get_pblock_virt15(struct super_block *sb, uint32_t block,
struct udf_sb_info *sbi = UDF_SB(sb);
struct udf_part_map *map;
struct udf_virtual_data *vdata;
- struct udf_inode_info *iinfo;
+ struct udf_inode_info *iinfo = UDF_I(sbi->s_vat_inode);
map = &sbi->s_partmaps[partition];
vdata = &map->s_type_specific.s_virtual;
- index = (sb->s_blocksize - vdata->s_start_offset) / sizeof(uint32_t);
if (block > vdata->s_num_entries) {
udf_debug("Trying to access block beyond end of VAT "
@@ -67,6 +65,12 @@ uint32_t udf_get_pblock_virt15(struct super_block *sb, uint32_t block,
return 0xFFFFFFFF;
}
+ if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) {
+ loc = le32_to_cpu(((__le32 *)(iinfo->i_ext.i_data +
+ vdata->s_start_offset))[block]);
+ goto translate;
+ }
+ index = (sb->s_blocksize - vdata->s_start_offset) / sizeof(uint32_t);
if (block >= index) {
block -= index;
newblock = 1 + (block / (sb->s_blocksize / sizeof(uint32_t)));
@@ -89,7 +93,7 @@ uint32_t udf_get_pblock_virt15(struct super_block *sb, uint32_t block,
brelse(bh);
- iinfo = UDF_I(sbi->s_vat_inode);
+translate:
if (iinfo->i_location.partitionReferenceNum == partition) {
udf_debug("recursive call to udf_get_pblock!\n");
return 0xFFFFFFFF;
@@ -263,3 +267,58 @@ int udf_relocate_blocks(struct super_block *sb, long old_block, long *new_block)
return 0;
}
+
+static uint32_t udf_try_read_meta(struct inode *inode, uint32_t block,
+ uint16_t partition, uint32_t offset)
+{
+ struct super_block *sb = inode->i_sb;
+ struct udf_part_map *map;
+ kernel_lb_addr eloc;
+ uint32_t elen;
+ sector_t ext_offset;
+ struct extent_position epos = {};
+ uint32_t phyblock;
+
+ if (inode_bmap(inode, block, &epos, &eloc, &elen, &ext_offset) !=
+ (EXT_RECORDED_ALLOCATED >> 30))
+ phyblock = 0xFFFFFFFF;
+ else {
+ map = &UDF_SB(sb)->s_partmaps[partition];
+ /* map to sparable/physical partition desc */
+ phyblock = udf_get_pblock(sb, eloc.logicalBlockNum,
+ map->s_partition_num, ext_offset + offset);
+ }
+
+ brelse(epos.bh);
+ return phyblock;
+}
+
+uint32_t udf_get_pblock_meta25(struct super_block *sb, uint32_t block,
+ uint16_t partition, uint32_t offset)
+{
+ struct udf_sb_info *sbi = UDF_SB(sb);
+ struct udf_part_map *map;
+ struct udf_meta_data *mdata;
+ uint32_t retblk;
+ struct inode *inode;
+
+ udf_debug("READING from METADATA\n");
+
+ map = &sbi->s_partmaps[partition];
+ mdata = &map->s_type_specific.s_metadata;
+ inode = mdata->s_metadata_fe ? : mdata->s_mirror_fe;
+
+ /* We shouldn't mount such media... */
+ BUG_ON(!inode);
+ retblk = udf_try_read_meta(inode, block, partition, offset);
+ if (retblk == 0xFFFFFFFF) {
+ udf_warning(sb, __func__, "error reading from METADATA, "
+ "trying to read from MIRROR");
+ inode = mdata->s_mirror_fe;
+ if (!inode)
+ return 0xFFFFFFFF;
+ retblk = udf_try_read_meta(inode, block, partition, offset);
+ }
+
+ return retblk;
+}
diff --git a/fs/udf/super.c b/fs/udf/super.c
index f3ac4abfc94..b564fc140fe 100644
--- a/fs/udf/super.c
+++ b/fs/udf/super.c
@@ -55,9 +55,10 @@
#include <linux/errno.h>
#include <linux/mount.h>
#include <linux/seq_file.h>
+#include <linux/bitmap.h>
+#include <linux/crc-itu-t.h>
#include <asm/byteorder.h>
-#include <linux/udf_fs.h>
#include "udf_sb.h"
#include "udf_i.h"
@@ -84,22 +85,19 @@ static void udf_write_super(struct super_block *);
static int udf_remount_fs(struct super_block *, int *, char *);
static int udf_check_valid(struct super_block *, int, int);
static int udf_vrs(struct super_block *sb, int silent);
-static int udf_load_partition(struct super_block *, kernel_lb_addr *);
-static int udf_load_logicalvol(struct super_block *, struct buffer_head *,
- kernel_lb_addr *);
static void udf_load_logicalvolint(struct super_block *, kernel_extent_ad);
static void udf_find_anchor(struct super_block *);
static int udf_find_fileset(struct super_block *, kernel_lb_addr *,
kernel_lb_addr *);
-static void udf_load_pvoldesc(struct super_block *, struct buffer_head *);
static void udf_load_fileset(struct super_block *, struct buffer_head *,
kernel_lb_addr *);
-static int udf_load_partdesc(struct super_block *, struct buffer_head *);
static void udf_open_lvid(struct super_block *);
static void udf_close_lvid(struct super_block *);
static unsigned int udf_count_free(struct super_block *);
static int udf_statfs(struct dentry *, struct kstatfs *);
static int udf_show_options(struct seq_file *, struct vfsmount *);
+static void udf_error(struct super_block *sb, const char *function,
+ const char *fmt, ...);
struct logicalVolIntegrityDescImpUse *udf_sb_lvidiu(struct udf_sb_info *sbi)
{
@@ -587,48 +585,10 @@ static int udf_remount_fs(struct super_block *sb, int *flags, char *options)
return 0;
}
-/*
- * udf_set_blocksize
- *
- * PURPOSE
- * Set the block size to be used in all transfers.
- *
- * DESCRIPTION
- * To allow room for a DMA transfer, it is best to guess big when unsure.
- * This routine picks 2048 bytes as the blocksize when guessing. This
- * should be adequate until devices with larger block sizes become common.
- *
- * Note that the Linux kernel can currently only deal with blocksizes of
- * 512, 1024, 2048, 4096, and 8192 bytes.
- *
- * PRE-CONDITIONS
- * sb Pointer to _locked_ superblock.
- *
- * POST-CONDITIONS
- * sb->s_blocksize Blocksize.
- * sb->s_blocksize_bits log2 of blocksize.
- * <return> 0 Blocksize is valid.
- * <return> 1 Blocksize is invalid.
- *
- * HISTORY
- * July 1, 1997 - Andrew E. Mileski
- * Written, tested, and released.
- */
-static int udf_set_blocksize(struct super_block *sb, int bsize)
-{
- if (!sb_min_blocksize(sb, bsize)) {
- udf_debug("Bad block size (%d)\n", bsize);
- printk(KERN_ERR "udf: bad block size (%d)\n", bsize);
- return 0;
- }
-
- return sb->s_blocksize;
-}
-
static int udf_vrs(struct super_block *sb, int silent)
{
struct volStructDesc *vsd = NULL;
- int sector = 32768;
+ loff_t sector = 32768;
int sectorsize;
struct buffer_head *bh = NULL;
int iso9660 = 0;
@@ -649,7 +609,8 @@ static int udf_vrs(struct super_block *sb, int silent)
sector += (sbi->s_session << sb->s_blocksize_bits);
udf_debug("Starting at sector %u (%ld byte sectors)\n",
- (sector >> sb->s_blocksize_bits), sb->s_blocksize);
+ (unsigned int)(sector >> sb->s_blocksize_bits),
+ sb->s_blocksize);
/* Process the sequence (if applicable) */
for (; !nsr02 && !nsr03; sector += sectorsize) {
/* Read a block */
@@ -719,162 +680,140 @@ static int udf_vrs(struct super_block *sb, int silent)
}
/*
- * udf_find_anchor
- *
- * PURPOSE
- * Find an anchor volume descriptor.
- *
- * PRE-CONDITIONS
- * sb Pointer to _locked_ superblock.
- * lastblock Last block on media.
- *
- * POST-CONDITIONS
- * <return> 1 if not found, 0 if ok
- *
- * HISTORY
- * July 1, 1997 - Andrew E. Mileski
- * Written, tested, and released.
+ * Check whether the given block holds an anchor volume descriptor pointer
*/
-static void udf_find_anchor(struct super_block *sb)
+static int udf_check_anchor_block(struct super_block *sb, sector_t block,
+ bool varconv)
{
- int lastblock;
struct buffer_head *bh = NULL;
+ tag *t;
uint16_t ident;
uint32_t location;
- int i;
- struct udf_sb_info *sbi;
- sbi = UDF_SB(sb);
- lastblock = sbi->s_last_block;
+ if (varconv) {
+ if (udf_fixed_to_variable(block) >=
+ sb->s_bdev->bd_inode->i_size >> sb->s_blocksize_bits)
+ return 0;
+ bh = sb_bread(sb, udf_fixed_to_variable(block));
+ }
+ else
+ bh = sb_bread(sb, block);
- if (lastblock) {
- int varlastblock = udf_variable_to_fixed(lastblock);
- int last[] = { lastblock, lastblock - 2,
- lastblock - 150, lastblock - 152,
- varlastblock, varlastblock - 2,
- varlastblock - 150, varlastblock - 152 };
-
- lastblock = 0;
-
- /* Search for an anchor volume descriptor pointer */
-
- /* according to spec, anchor is in either:
- * block 256
- * lastblock-256
- * lastblock
- * however, if the disc isn't closed, it could be 512 */
-
- for (i = 0; !lastblock && i < ARRAY_SIZE(last); i++) {
- ident = location = 0;
- if (last[i] >= 0) {
- bh = sb_bread(sb, last[i]);
- if (bh) {
- tag *t = (tag *)bh->b_data;
- ident = le16_to_cpu(t->tagIdent);
- location = le32_to_cpu(t->tagLocation);
- brelse(bh);
- }
- }
+ if (!bh)
+ return 0;
- if (ident == TAG_IDENT_AVDP) {
- if (location == last[i] - sbi->s_session) {
- lastblock = last[i] - sbi->s_session;
- sbi->s_anchor[0] = lastblock;
- sbi->s_anchor[1] = lastblock - 256;
- } else if (location ==
- udf_variable_to_fixed(last[i]) -
- sbi->s_session) {
- UDF_SET_FLAG(sb, UDF_FLAG_VARCONV);
- lastblock =
- udf_variable_to_fixed(last[i]) -
- sbi->s_session;
- sbi->s_anchor[0] = lastblock;
- sbi->s_anchor[1] = lastblock - 256 -
- sbi->s_session;
- } else {
- udf_debug("Anchor found at block %d, "
- "location mismatch %d.\n",
- last[i], location);
- }
- } else if (ident == TAG_IDENT_FE ||
- ident == TAG_IDENT_EFE) {
- lastblock = last[i];
- sbi->s_anchor[3] = 512;
- } else {
- ident = location = 0;
- if (last[i] >= 256) {
- bh = sb_bread(sb, last[i] - 256);
- if (bh) {
- tag *t = (tag *)bh->b_data;
- ident = le16_to_cpu(
- t->tagIdent);
- location = le32_to_cpu(
- t->tagLocation);
- brelse(bh);
- }
- }
+ t = (tag *)bh->b_data;
+ ident = le16_to_cpu(t->tagIdent);
+ location = le32_to_cpu(t->tagLocation);
+ brelse(bh);
+ if (ident != TAG_IDENT_AVDP)
+ return 0;
+ return location == block;
+}
- if (ident == TAG_IDENT_AVDP &&
- location == last[i] - 256 -
- sbi->s_session) {
- lastblock = last[i];
- sbi->s_anchor[1] = last[i] - 256;
- } else {
- ident = location = 0;
- if (last[i] >= 312 + sbi->s_session) {
- bh = sb_bread(sb,
- last[i] - 312 -
- sbi->s_session);
- if (bh) {
- tag *t = (tag *)
- bh->b_data;
- ident = le16_to_cpu(
- t->tagIdent);
- location = le32_to_cpu(
- t->tagLocation);
- brelse(bh);
- }
- }
+/* Search for an anchor volume descriptor pointer */
+static sector_t udf_scan_anchors(struct super_block *sb, bool varconv,
+ sector_t lastblock)
+{
+ sector_t last[6];
+ int i;
+ struct udf_sb_info *sbi = UDF_SB(sb);
- if (ident == TAG_IDENT_AVDP &&
- location == udf_variable_to_fixed(last[i]) - 256) {
- UDF_SET_FLAG(sb,
- UDF_FLAG_VARCONV);
- lastblock = udf_variable_to_fixed(last[i]);
- sbi->s_anchor[1] = lastblock - 256;
- }
- }
- }
+ last[0] = lastblock;
+ last[1] = last[0] - 1;
+ last[2] = last[0] + 1;
+ last[3] = last[0] - 2;
+ last[4] = last[0] - 150;
+ last[5] = last[0] - 152;
+
+ /* according to spec, anchor is in either:
+ * block 256
+ * lastblock-256
+ * lastblock
+ * however, if the disc isn't closed, it could be 512 */
+
+ for (i = 0; i < ARRAY_SIZE(last); i++) {
+ if (last[i] < 0)
+ continue;
+ if (last[i] >= sb->s_bdev->bd_inode->i_size >>
+ sb->s_blocksize_bits)
+ continue;
+
+ if (udf_check_anchor_block(sb, last[i], varconv)) {
+ sbi->s_anchor[0] = last[i];
+ sbi->s_anchor[1] = last[i] - 256;
+ return last[i];
}
- }
- if (!lastblock) {
- /* We haven't found the lastblock. check 312 */
- bh = sb_bread(sb, 312 + sbi->s_session);
- if (bh) {
- tag *t = (tag *)bh->b_data;
- ident = le16_to_cpu(t->tagIdent);
- location = le32_to_cpu(t->tagLocation);
- brelse(bh);
+ if (last[i] < 256)
+ continue;
- if (ident == TAG_IDENT_AVDP && location == 256)
- UDF_SET_FLAG(sb, UDF_FLAG_VARCONV);
+ if (udf_check_anchor_block(sb, last[i] - 256, varconv)) {
+ sbi->s_anchor[1] = last[i] - 256;
+ return last[i];
}
}
+ if (udf_check_anchor_block(sb, sbi->s_session + 256, varconv)) {
+ sbi->s_anchor[0] = sbi->s_session + 256;
+ return last[0];
+ }
+ if (udf_check_anchor_block(sb, sbi->s_session + 512, varconv)) {
+ sbi->s_anchor[0] = sbi->s_session + 512;
+ return last[0];
+ }
+ return 0;
+}
+
+/*
+ * Find an anchor volume descriptor. The function expects
+ * sbi->s_last_block to be the last block on the media.
+ *
+ * Returns nothing; candidate anchors that fail the tag check below are
+ * cleared from sbi->s_anchor[].
+ */
+static void udf_find_anchor(struct super_block *sb)
+{
+ sector_t lastblock;
+ struct buffer_head *bh = NULL;
+ uint16_t ident;
+ int i;
+ struct udf_sb_info *sbi = UDF_SB(sb);
+
+ lastblock = udf_scan_anchors(sb, 0, sbi->s_last_block);
+ if (lastblock)
+ goto check_anchor;
+
+ /* No anchor found? Try VARCONV conversion of block numbers */
+ /* Firstly, we try to not convert number of the last block */
+ lastblock = udf_scan_anchors(sb, 1,
+ udf_variable_to_fixed(sbi->s_last_block));
+ if (lastblock) {
+ UDF_SET_FLAG(sb, UDF_FLAG_VARCONV);
+ goto check_anchor;
+ }
+
+ /* Secondly, we try with converted number of the last block */
+ lastblock = udf_scan_anchors(sb, 1, sbi->s_last_block);
+ if (lastblock)
+ UDF_SET_FLAG(sb, UDF_FLAG_VARCONV);
+
+check_anchor:
+ /*
+ * Check located anchors and the anchor block supplied via
+ * mount options
+ */
for (i = 0; i < ARRAY_SIZE(sbi->s_anchor); i++) {
- if (sbi->s_anchor[i]) {
- bh = udf_read_tagged(sb, sbi->s_anchor[i],
- sbi->s_anchor[i], &ident);
- if (!bh)
+ if (!sbi->s_anchor[i])
+ continue;
+ bh = udf_read_tagged(sb, sbi->s_anchor[i],
+ sbi->s_anchor[i], &ident);
+ if (!bh)
+ sbi->s_anchor[i] = 0;
+ else {
+ brelse(bh);
+ if (ident != TAG_IDENT_AVDP)
sbi->s_anchor[i] = 0;
- else {
- brelse(bh);
- if ((ident != TAG_IDENT_AVDP) &&
- (i || (ident != TAG_IDENT_FE &&
- ident != TAG_IDENT_EFE)))
- sbi->s_anchor[i] = 0;
- }
}
}
@@ -971,27 +910,30 @@ static int udf_find_fileset(struct super_block *sb,
return 1;
}
-static void udf_load_pvoldesc(struct super_block *sb, struct buffer_head *bh)
+static int udf_load_pvoldesc(struct super_block *sb, sector_t block)
{
struct primaryVolDesc *pvoldesc;
- time_t recording;
- long recording_usec;
struct ustr instr;
struct ustr outstr;
+ struct buffer_head *bh;
+ uint16_t ident;
+
+ bh = udf_read_tagged(sb, block, block, &ident);
+ if (!bh)
+ return 1;
+ BUG_ON(ident != TAG_IDENT_PVD);
pvoldesc = (struct primaryVolDesc *)bh->b_data;
- if (udf_stamp_to_time(&recording, &recording_usec,
- lets_to_cpu(pvoldesc->recordingDateAndTime))) {
- kernel_timestamp ts;
- ts = lets_to_cpu(pvoldesc->recordingDateAndTime);
- udf_debug("recording time %ld/%ld, %04u/%02u/%02u"
+ if (udf_disk_stamp_to_time(&UDF_SB(sb)->s_record_time,
+ pvoldesc->recordingDateAndTime)) {
+#ifdef UDFFS_DEBUG
+ timestamp *ts = &pvoldesc->recordingDateAndTime;
+ udf_debug("recording time %04u/%02u/%02u"
" %02u:%02u (%x)\n",
- recording, recording_usec,
- ts.year, ts.month, ts.day, ts.hour,
- ts.minute, ts.typeAndTimezone);
- UDF_SB(sb)->s_record_time.tv_sec = recording;
- UDF_SB(sb)->s_record_time.tv_nsec = recording_usec * 1000;
+ le16_to_cpu(ts->year), ts->month, ts->day, ts->hour,
+ ts->minute, le16_to_cpu(ts->typeAndTimezone));
+#endif
}
if (!udf_build_ustr(&instr, pvoldesc->volIdent, 32))
@@ -1005,6 +947,104 @@ static void udf_load_pvoldesc(struct super_block *sb, struct buffer_head *bh)
if (!udf_build_ustr(&instr, pvoldesc->volSetIdent, 128))
if (udf_CS0toUTF8(&outstr, &instr))
udf_debug("volSetIdent[] = '%s'\n", outstr.u_name);
+
+ brelse(bh);
+ return 0;
+}
+
+static int udf_load_metadata_files(struct super_block *sb, int partition)
+{
+ struct udf_sb_info *sbi = UDF_SB(sb);
+ struct udf_part_map *map;
+ struct udf_meta_data *mdata;
+ kernel_lb_addr addr;
+ int fe_error = 0;
+
+ map = &sbi->s_partmaps[partition];
+ mdata = &map->s_type_specific.s_metadata;
+
+ /* metadata address */
+ addr.logicalBlockNum = mdata->s_meta_file_loc;
+ addr.partitionReferenceNum = map->s_partition_num;
+
+ udf_debug("Metadata file location: block = %d part = %d\n",
+ addr.logicalBlockNum, addr.partitionReferenceNum);
+
+ mdata->s_metadata_fe = udf_iget(sb, addr);
+
+ if (mdata->s_metadata_fe == NULL) {
+ udf_warning(sb, __func__, "metadata inode efe not found, "
+ "will try mirror inode.");
+ fe_error = 1;
+ } else if (UDF_I(mdata->s_metadata_fe)->i_alloc_type !=
+ ICBTAG_FLAG_AD_SHORT) {
+ udf_warning(sb, __func__, "metadata inode efe does not have "
+ "short allocation descriptors!");
+ fe_error = 1;
+ iput(mdata->s_metadata_fe);
+ mdata->s_metadata_fe = NULL;
+ }
+
+ /* mirror file entry */
+ addr.logicalBlockNum = mdata->s_mirror_file_loc;
+ addr.partitionReferenceNum = map->s_partition_num;
+
+ udf_debug("Mirror metadata file location: block = %d part = %d\n",
+ addr.logicalBlockNum, addr.partitionReferenceNum);
+
+ mdata->s_mirror_fe = udf_iget(sb, addr);
+
+ if (mdata->s_mirror_fe == NULL) {
+ if (fe_error) {
+ udf_error(sb, __func__, "mirror inode efe not found "
+ "and metadata inode is missing too, exiting...");
+ goto error_exit;
+ } else
+ udf_warning(sb, __func__, "mirror inode efe not found,"
+ " but metadata inode is OK");
+ } else if (UDF_I(mdata->s_mirror_fe)->i_alloc_type !=
+ ICBTAG_FLAG_AD_SHORT) {
+ udf_warning(sb, __func__, "mirror inode efe does not have "
+ "short allocation descriptors!");
+ iput(mdata->s_mirror_fe);
+ mdata->s_mirror_fe = NULL;
+ if (fe_error)
+ goto error_exit;
+ }
+
+ /*
+ * bitmap file entry
+ * Note:
+ * Load only if bitmap file location differs from 0xFFFFFFFF (DCN-5102)
+ */
+ if (mdata->s_bitmap_file_loc != 0xFFFFFFFF) {
+ addr.logicalBlockNum = mdata->s_bitmap_file_loc;
+ addr.partitionReferenceNum = map->s_partition_num;
+
+ udf_debug("Bitmap file location: block = %d part = %d\n",
+ addr.logicalBlockNum, addr.partitionReferenceNum);
+
+ mdata->s_bitmap_fe = udf_iget(sb, addr);
+
+ if (mdata->s_bitmap_fe == NULL) {
+ if (sb->s_flags & MS_RDONLY)
+ udf_warning(sb, __func__, "bitmap inode efe "
+ "not found but it's ok since the disc"
+ " is mounted read-only");
+ else {
+ udf_error(sb, __func__, "bitmap inode efe not "
+ "found and attempted read-write mount");
+ goto error_exit;
+ }
+ }
+ }
+
+ udf_debug("udf_load_metadata_files Ok\n");
+
+ return 0;
+
+error_exit:
+ return 1;
}
static void udf_load_fileset(struct super_block *sb, struct buffer_head *bh,
@@ -1025,10 +1065,9 @@ static void udf_load_fileset(struct super_block *sb, struct buffer_head *bh,
int udf_compute_nr_groups(struct super_block *sb, u32 partition)
{
struct udf_part_map *map = &UDF_SB(sb)->s_partmaps[partition];
- return (map->s_partition_len +
- (sizeof(struct spaceBitmapDesc) << 3) +
- (sb->s_blocksize * 8) - 1) /
- (sb->s_blocksize * 8);
+ return DIV_ROUND_UP(map->s_partition_len +
+ (sizeof(struct spaceBitmapDesc) << 3),
+ sb->s_blocksize * 8);
}
static struct udf_bitmap *udf_sb_alloc_bitmap(struct super_block *sb, u32 index)
@@ -1059,134 +1098,241 @@ static struct udf_bitmap *udf_sb_alloc_bitmap(struct super_block *sb, u32 index)
return bitmap;
}
-static int udf_load_partdesc(struct super_block *sb, struct buffer_head *bh)
+static int udf_fill_partdesc_info(struct super_block *sb,
+ struct partitionDesc *p, int p_index)
+{
+ struct udf_part_map *map;
+ struct udf_sb_info *sbi = UDF_SB(sb);
+ struct partitionHeaderDesc *phd;
+
+ map = &sbi->s_partmaps[p_index];
+
+ map->s_partition_len = le32_to_cpu(p->partitionLength); /* blocks */
+ map->s_partition_root = le32_to_cpu(p->partitionStartingLocation);
+
+ if (p->accessType == cpu_to_le32(PD_ACCESS_TYPE_READ_ONLY))
+ map->s_partition_flags |= UDF_PART_FLAG_READ_ONLY;
+ if (p->accessType == cpu_to_le32(PD_ACCESS_TYPE_WRITE_ONCE))
+ map->s_partition_flags |= UDF_PART_FLAG_WRITE_ONCE;
+ if (p->accessType == cpu_to_le32(PD_ACCESS_TYPE_REWRITABLE))
+ map->s_partition_flags |= UDF_PART_FLAG_REWRITABLE;
+ if (p->accessType == cpu_to_le32(PD_ACCESS_TYPE_OVERWRITABLE))
+ map->s_partition_flags |= UDF_PART_FLAG_OVERWRITABLE;
+
+ udf_debug("Partition (%d type %x) starts at physical %d, "
+ "block length %d\n", p_index,
+ map->s_partition_type, map->s_partition_root,
+ map->s_partition_len);
+
+ if (strcmp(p->partitionContents.ident, PD_PARTITION_CONTENTS_NSR02) &&
+ strcmp(p->partitionContents.ident, PD_PARTITION_CONTENTS_NSR03))
+ return 0;
+
+ phd = (struct partitionHeaderDesc *)p->partitionContentsUse;
+ if (phd->unallocSpaceTable.extLength) {
+ kernel_lb_addr loc = {
+ .logicalBlockNum = le32_to_cpu(
+ phd->unallocSpaceTable.extPosition),
+ .partitionReferenceNum = p_index,
+ };
+
+ map->s_uspace.s_table = udf_iget(sb, loc);
+ if (!map->s_uspace.s_table) {
+ udf_debug("cannot load unallocSpaceTable (part %d)\n",
+ p_index);
+ return 1;
+ }
+ map->s_partition_flags |= UDF_PART_FLAG_UNALLOC_TABLE;
+ udf_debug("unallocSpaceTable (part %d) @ %ld\n",
+ p_index, map->s_uspace.s_table->i_ino);
+ }
+
+ if (phd->unallocSpaceBitmap.extLength) {
+ struct udf_bitmap *bitmap = udf_sb_alloc_bitmap(sb, p_index);
+ if (!bitmap)
+ return 1;
+ map->s_uspace.s_bitmap = bitmap;
+ bitmap->s_extLength = le32_to_cpu(
+ phd->unallocSpaceBitmap.extLength);
+ bitmap->s_extPosition = le32_to_cpu(
+ phd->unallocSpaceBitmap.extPosition);
+ map->s_partition_flags |= UDF_PART_FLAG_UNALLOC_BITMAP;
+ udf_debug("unallocSpaceBitmap (part %d) @ %d\n", p_index,
+ bitmap->s_extPosition);
+ }
+
+ if (phd->partitionIntegrityTable.extLength)
+ udf_debug("partitionIntegrityTable (part %d)\n", p_index);
+
+ if (phd->freedSpaceTable.extLength) {
+ kernel_lb_addr loc = {
+ .logicalBlockNum = le32_to_cpu(
+ phd->freedSpaceTable.extPosition),
+ .partitionReferenceNum = p_index,
+ };
+
+ map->s_fspace.s_table = udf_iget(sb, loc);
+ if (!map->s_fspace.s_table) {
+ udf_debug("cannot load freedSpaceTable (part %d)\n",
+ p_index);
+ return 1;
+ }
+
+ map->s_partition_flags |= UDF_PART_FLAG_FREED_TABLE;
+ udf_debug("freedSpaceTable (part %d) @ %ld\n",
+ p_index, map->s_fspace.s_table->i_ino);
+ }
+
+ if (phd->freedSpaceBitmap.extLength) {
+ struct udf_bitmap *bitmap = udf_sb_alloc_bitmap(sb, p_index);
+ if (!bitmap)
+ return 1;
+ map->s_fspace.s_bitmap = bitmap;
+ bitmap->s_extLength = le32_to_cpu(
+ phd->freedSpaceBitmap.extLength);
+ bitmap->s_extPosition = le32_to_cpu(
+ phd->freedSpaceBitmap.extPosition);
+ map->s_partition_flags |= UDF_PART_FLAG_FREED_BITMAP;
+ udf_debug("freedSpaceBitmap (part %d) @ %d\n", p_index,
+ bitmap->s_extPosition);
+ }
+ return 0;
+}
+
+static int udf_load_vat(struct super_block *sb, int p_index, int type1_index)
+{
+ struct udf_sb_info *sbi = UDF_SB(sb);
+ struct udf_part_map *map = &sbi->s_partmaps[p_index];
+ kernel_lb_addr ino;
+ struct buffer_head *bh = NULL;
+ struct udf_inode_info *vati;
+ uint32_t pos;
+ struct virtualAllocationTable20 *vat20;
+
+ /* VAT file entry is in the last recorded block */
+ ino.partitionReferenceNum = type1_index;
+ ino.logicalBlockNum = sbi->s_last_block - map->s_partition_root;
+ sbi->s_vat_inode = udf_iget(sb, ino);
+ if (!sbi->s_vat_inode)
+ return 1;
+
+ if (map->s_partition_type == UDF_VIRTUAL_MAP15) {
+ map->s_type_specific.s_virtual.s_start_offset = 0;
+ map->s_type_specific.s_virtual.s_num_entries =
+ (sbi->s_vat_inode->i_size - 36) >> 2;
+ } else if (map->s_partition_type == UDF_VIRTUAL_MAP20) {
+ vati = UDF_I(sbi->s_vat_inode);
+ if (vati->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB) {
+ pos = udf_block_map(sbi->s_vat_inode, 0);
+ bh = sb_bread(sb, pos);
+ if (!bh)
+ return 1;
+ vat20 = (struct virtualAllocationTable20 *)bh->b_data;
+ } else {
+ vat20 = (struct virtualAllocationTable20 *)
+ vati->i_ext.i_data;
+ }
+
+ map->s_type_specific.s_virtual.s_start_offset =
+ le16_to_cpu(vat20->lengthHeader);
+ map->s_type_specific.s_virtual.s_num_entries =
+ (sbi->s_vat_inode->i_size -
+ map->s_type_specific.s_virtual.
+ s_start_offset) >> 2;
+ brelse(bh);
+ }
+ return 0;
+}
+
+/*
+ * Read the descriptor stored in 'block' and, when it is a partition
+ * descriptor, apply it to the partition maps that refer to its partition
+ * number: first the TYPE1/SPARABLE map, then a VIRTUAL or METADATA map
+ * layered on top of it (loading the VAT or the metadata files as needed).
+ *
+ * Returns 0 on success, and also when the block holds some other descriptor
+ * or no map references the partition. Returns non-zero when the block
+ * cannot be read or when setting up either map fails; partially loaded
+ * state is left for udf_fill_super to clean up.
+ */
+static int udf_load_partdesc(struct super_block *sb, sector_t block)
{
+ struct buffer_head *bh;
struct partitionDesc *p;
- int i;
struct udf_part_map *map;
- struct udf_sb_info *sbi;
+ struct udf_sb_info *sbi = UDF_SB(sb);
+ int i, type1_idx;
+ uint16_t partitionNumber;
+ uint16_t ident;
+ int ret = 0;
+
+ bh = udf_read_tagged(sb, block, block, &ident);
+ if (!bh)
+ return 1;
+ /* Not a partition descriptor: nothing to do, ret is still 0 */
+ if (ident != TAG_IDENT_PD)
+ goto out_bh;
p = (struct partitionDesc *)bh->b_data;
- sbi = UDF_SB(sb);
+ partitionNumber = le16_to_cpu(p->partitionNumber);
+ /* First scan for TYPE1 and SPARABLE partitions */
for (i = 0; i < sbi->s_partitions; i++) {
map = &sbi->s_partmaps[i];
udf_debug("Searching map: (%d == %d)\n",
- map->s_partition_num,
- le16_to_cpu(p->partitionNumber));
- if (map->s_partition_num ==
- le16_to_cpu(p->partitionNumber)) {
- map->s_partition_len =
- le32_to_cpu(p->partitionLength); /* blocks */
- map->s_partition_root =
- le32_to_cpu(p->partitionStartingLocation);
- if (p->accessType ==
- cpu_to_le32(PD_ACCESS_TYPE_READ_ONLY))
- map->s_partition_flags |=
- UDF_PART_FLAG_READ_ONLY;
- if (p->accessType ==
- cpu_to_le32(PD_ACCESS_TYPE_WRITE_ONCE))
- map->s_partition_flags |=
- UDF_PART_FLAG_WRITE_ONCE;
- if (p->accessType ==
- cpu_to_le32(PD_ACCESS_TYPE_REWRITABLE))
- map->s_partition_flags |=
- UDF_PART_FLAG_REWRITABLE;
- if (p->accessType ==
- cpu_to_le32(PD_ACCESS_TYPE_OVERWRITABLE))
- map->s_partition_flags |=
- UDF_PART_FLAG_OVERWRITABLE;
-
- if (!strcmp(p->partitionContents.ident,
- PD_PARTITION_CONTENTS_NSR02) ||
- !strcmp(p->partitionContents.ident,
- PD_PARTITION_CONTENTS_NSR03)) {
- struct partitionHeaderDesc *phd;
-
- phd = (struct partitionHeaderDesc *)
- (p->partitionContentsUse);
- if (phd->unallocSpaceTable.extLength) {
- kernel_lb_addr loc = {
- .logicalBlockNum = le32_to_cpu(phd->unallocSpaceTable.extPosition),
- .partitionReferenceNum = i,
- };
-
- map->s_uspace.s_table =
- udf_iget(sb, loc);
- if (!map->s_uspace.s_table) {
- udf_debug("cannot load unallocSpaceTable (part %d)\n", i);
- return 1;
- }
- map->s_partition_flags |=
- UDF_PART_FLAG_UNALLOC_TABLE;
- udf_debug("unallocSpaceTable (part %d) @ %ld\n",
- i, map->s_uspace.s_table->i_ino);
- }
- if (phd->unallocSpaceBitmap.extLength) {
- struct udf_bitmap *bitmap =
- udf_sb_alloc_bitmap(sb, i);
- map->s_uspace.s_bitmap = bitmap;
- if (bitmap != NULL) {
- bitmap->s_extLength =
- le32_to_cpu(phd->unallocSpaceBitmap.extLength);
- bitmap->s_extPosition =
- le32_to_cpu(phd->unallocSpaceBitmap.extPosition);
- map->s_partition_flags |= UDF_PART_FLAG_UNALLOC_BITMAP;
- udf_debug("unallocSpaceBitmap (part %d) @ %d\n",
- i, bitmap->s_extPosition);
- }
- }
- if (phd->partitionIntegrityTable.extLength)
- udf_debug("partitionIntegrityTable (part %d)\n", i);
- if (phd->freedSpaceTable.extLength) {
- kernel_lb_addr loc = {
- .logicalBlockNum = le32_to_cpu(phd->freedSpaceTable.extPosition),
- .partitionReferenceNum = i,
- };
-
- map->s_fspace.s_table =
- udf_iget(sb, loc);
- if (!map->s_fspace.s_table) {
- udf_debug("cannot load freedSpaceTable (part %d)\n", i);
- return 1;
- }
- map->s_partition_flags |=
- UDF_PART_FLAG_FREED_TABLE;
- udf_debug("freedSpaceTable (part %d) @ %ld\n",
- i, map->s_fspace.s_table->i_ino);
- }
- if (phd->freedSpaceBitmap.extLength) {
- struct udf_bitmap *bitmap =
- udf_sb_alloc_bitmap(sb, i);
- map->s_fspace.s_bitmap = bitmap;
- if (bitmap != NULL) {
- bitmap->s_extLength =
- le32_to_cpu(phd->freedSpaceBitmap.extLength);
- bitmap->s_extPosition =
- le32_to_cpu(phd->freedSpaceBitmap.extPosition);
- map->s_partition_flags |= UDF_PART_FLAG_FREED_BITMAP;
- udf_debug("freedSpaceBitmap (part %d) @ %d\n",
- i, bitmap->s_extPosition);
- }
- }
- }
+ map->s_partition_num, partitionNumber);
+ if (map->s_partition_num == partitionNumber &&
+ (map->s_partition_type == UDF_TYPE1_MAP15 ||
+ map->s_partition_type == UDF_SPARABLE_MAP15))
break;
- }
}
- if (i == sbi->s_partitions)
+
+ if (i >= sbi->s_partitions) {
udf_debug("Partition (%d) not found in partition map\n",
- le16_to_cpu(p->partitionNumber));
- else
- udf_debug("Partition (%d:%d type %x) starts at physical %d, "
- "block length %d\n",
- le16_to_cpu(p->partitionNumber), i,
- map->s_partition_type,
- map->s_partition_root,
- map->s_partition_len);
- return 0;
+ partitionNumber);
+ goto out_bh;
+ }
+
+ ret = udf_fill_partdesc_info(sb, p, i);
+ /*
+ * Stop here on failure: otherwise the second udf_fill_partdesc_info()
+ * call below would overwrite ret and hide the error.
+ */
+ if (ret)
+ goto out_bh;
+
+ /*
+ * Now rescan for VIRTUAL or METADATA partitions when SPARABLE and
+ * PHYSICAL partitions are already set up
+ */
+ type1_idx = i;
+ for (i = 0; i < sbi->s_partitions; i++) {
+ map = &sbi->s_partmaps[i];
+
+ if (map->s_partition_num == partitionNumber &&
+ (map->s_partition_type == UDF_VIRTUAL_MAP15 ||
+ map->s_partition_type == UDF_VIRTUAL_MAP20 ||
+ map->s_partition_type == UDF_METADATA_MAP25))
+ break;
+ }
+
+ /* No virtual/metadata map on top of this partition: done, ret is 0 */
+ if (i >= sbi->s_partitions)
+ goto out_bh;
+
+ ret = udf_fill_partdesc_info(sb, p, i);
+ if (ret)
+ goto out_bh;
+
+ if (map->s_partition_type == UDF_METADATA_MAP25) {
+ ret = udf_load_metadata_files(sb, i);
+ if (ret) {
+ printk(KERN_ERR "UDF-fs: error loading MetaData "
+ "partition map %d\n", i);
+ goto out_bh;
+ }
+ } else {
+ ret = udf_load_vat(sb, i, type1_idx);
+ if (ret)
+ goto out_bh;
+ /*
+ * Mark filesystem read-only if we have a partition with
+ * virtual map since we don't handle writing to it (we
+ * overwrite blocks instead of relocating them).
+ */
+ sb->s_flags |= MS_RDONLY;
+ printk(KERN_NOTICE "UDF-fs: Filesystem marked read-only "
+ "because writing to pseudooverwrite partition is "
+ "not implemented.\n");
+ }
+out_bh:
+ /* In case loading failed, we handle cleanup in udf_fill_super */
+ brelse(bh);
+ return ret;
}
-static int udf_load_logicalvol(struct super_block *sb, struct buffer_head *bh,
+static int udf_load_logicalvol(struct super_block *sb, sector_t block,
kernel_lb_addr *fileset)
{
struct logicalVolDesc *lvd;
@@ -1194,12 +1340,21 @@ static int udf_load_logicalvol(struct super_block *sb, struct buffer_head *bh,
uint8_t type;
struct udf_sb_info *sbi = UDF_SB(sb);
struct genericPartitionMap *gpm;
+ uint16_t ident;
+ struct buffer_head *bh;
+ int ret = 0;
+ bh = udf_read_tagged(sb, block, block, &ident);
+ if (!bh)
+ return 1;
+ BUG_ON(ident != TAG_IDENT_LVD);
lvd = (struct logicalVolDesc *)bh->b_data;
i = udf_sb_alloc_partition_maps(sb, le32_to_cpu(lvd->numPartitionMaps));
- if (i != 0)
- return i;
+ if (i != 0) {
+ ret = i;
+ goto out_bh;
+ }
for (i = 0, offset = 0;
i < sbi->s_partitions && offset < le32_to_cpu(lvd->mapTableLength);
@@ -1223,12 +1378,12 @@ static int udf_load_logicalvol(struct super_block *sb, struct buffer_head *bh,
u16 suf =
le16_to_cpu(((__le16 *)upm2->partIdent.
identSuffix)[0]);
- if (suf == 0x0150) {
+ if (suf < 0x0200) {
map->s_partition_type =
UDF_VIRTUAL_MAP15;
map->s_partition_func =
udf_get_pblock_virt15;
- } else if (suf == 0x0200) {
+ } else {
map->s_partition_type =
UDF_VIRTUAL_MAP20;
map->s_partition_func =
@@ -1238,7 +1393,6 @@ static int udf_load_logicalvol(struct super_block *sb, struct buffer_head *bh,
UDF_ID_SPARABLE,
strlen(UDF_ID_SPARABLE))) {
uint32_t loc;
- uint16_t ident;
struct sparingTable *st;
struct sparablePartitionMap *spm =
(struct sparablePartitionMap *)gpm;
@@ -1256,22 +1410,64 @@ static int udf_load_logicalvol(struct super_block *sb, struct buffer_head *bh,
map->s_type_specific.s_sparing.
s_spar_map[j] = bh2;
- if (bh2 != NULL) {
- st = (struct sparingTable *)
- bh2->b_data;
- if (ident != 0 || strncmp(
- st->sparingIdent.ident,
- UDF_ID_SPARING,
- strlen(UDF_ID_SPARING))) {
- brelse(bh2);
- map->s_type_specific.
- s_sparing.
- s_spar_map[j] =
- NULL;
- }
+ if (bh2 == NULL)
+ continue;
+
+ st = (struct sparingTable *)bh2->b_data;
+ if (ident != 0 || strncmp(
+ st->sparingIdent.ident,
+ UDF_ID_SPARING,
+ strlen(UDF_ID_SPARING))) {
+ brelse(bh2);
+ map->s_type_specific.s_sparing.
+ s_spar_map[j] = NULL;
}
}
map->s_partition_func = udf_get_pblock_spar15;
+ } else if (!strncmp(upm2->partIdent.ident,
+ UDF_ID_METADATA,
+ strlen(UDF_ID_METADATA))) {
+ struct udf_meta_data *mdata =
+ &map->s_type_specific.s_metadata;
+ struct metadataPartitionMap *mdm =
+ (struct metadataPartitionMap *)
+ &(lvd->partitionMaps[offset]);
+ udf_debug("Parsing Logical vol part %d "
+ "type %d id=%s\n", i, type,
+ UDF_ID_METADATA);
+
+ map->s_partition_type = UDF_METADATA_MAP25;
+ map->s_partition_func = udf_get_pblock_meta25;
+
+ mdata->s_meta_file_loc =
+ le32_to_cpu(mdm->metadataFileLoc);
+ mdata->s_mirror_file_loc =
+ le32_to_cpu(mdm->metadataMirrorFileLoc);
+ mdata->s_bitmap_file_loc =
+ le32_to_cpu(mdm->metadataBitmapFileLoc);
+ mdata->s_alloc_unit_size =
+ le32_to_cpu(mdm->allocUnitSize);
+ mdata->s_align_unit_size =
+ le16_to_cpu(mdm->alignUnitSize);
+ mdata->s_dup_md_flag =
+ mdm->flags & 0x01;
+
+ udf_debug("Metadata Ident suffix=0x%x\n",
+ (le16_to_cpu(
+ ((__le16 *)
+ mdm->partIdent.identSuffix)[0])));
+ udf_debug("Metadata part num=%d\n",
+ le16_to_cpu(mdm->partitionNum));
+ udf_debug("Metadata part alloc unit size=%d\n",
+ le32_to_cpu(mdm->allocUnitSize));
+ udf_debug("Metadata file loc=%d\n",
+ le32_to_cpu(mdm->metadataFileLoc));
+ udf_debug("Mirror file loc=%d\n",
+ le32_to_cpu(mdm->metadataMirrorFileLoc));
+ udf_debug("Bitmap file loc=%d\n",
+ le32_to_cpu(mdm->metadataBitmapFileLoc));
+ udf_debug("Duplicate Flag: %d %d\n",
+ mdata->s_dup_md_flag, mdm->flags);
} else {
udf_debug("Unknown ident: %s\n",
upm2->partIdent.ident);
@@ -1296,7 +1492,9 @@ static int udf_load_logicalvol(struct super_block *sb, struct buffer_head *bh,
if (lvd->integritySeqExt.extLength)
udf_load_logicalvolint(sb, leea_to_cpu(lvd->integritySeqExt));
- return 0;
+out_bh:
+ brelse(bh);
+ return ret;
}
/*
@@ -1345,7 +1543,7 @@ static void udf_load_logicalvolint(struct super_block *sb, kernel_extent_ad loc)
* July 1, 1997 - Andrew E. Mileski
* Written, tested, and released.
*/
-static int udf_process_sequence(struct super_block *sb, long block,
+static noinline int udf_process_sequence(struct super_block *sb, long block,
long lastblock, kernel_lb_addr *fileset)
{
struct buffer_head *bh = NULL;
@@ -1354,19 +1552,25 @@ static int udf_process_sequence(struct super_block *sb, long block,
struct generic_desc *gd;
struct volDescPtr *vdp;
int done = 0;
- int i, j;
uint32_t vdsn;
uint16_t ident;
long next_s = 0, next_e = 0;
memset(vds, 0, sizeof(struct udf_vds_record) * VDS_POS_LENGTH);
- /* Read the main descriptor sequence */
+ /*
+ * Read the main descriptor sequence and find which descriptors
+ * are in it.
+ */
for (; (!done && block <= lastblock); block++) {
bh = udf_read_tagged(sb, block, block, &ident);
- if (!bh)
- break;
+ if (!bh) {
+ printk(KERN_ERR "udf: Block %Lu of volume descriptor "
+ "sequence is corrupted or we could not read "
+ "it.\n", (unsigned long long)block);
+ return 1;
+ }
/* Process each descriptor (ISO 13346 3/8.3-8.4) */
gd = (struct generic_desc *)bh->b_data;
@@ -1432,41 +1636,31 @@ static int udf_process_sequence(struct super_block *sb, long block,
}
brelse(bh);
}
- for (i = 0; i < VDS_POS_LENGTH; i++) {
- if (vds[i].block) {
- bh = udf_read_tagged(sb, vds[i].block, vds[i].block,
- &ident);
-
- if (i == VDS_POS_PRIMARY_VOL_DESC) {
- udf_load_pvoldesc(sb, bh);
- } else if (i == VDS_POS_LOGICAL_VOL_DESC) {
- if (udf_load_logicalvol(sb, bh, fileset)) {
- brelse(bh);
- return 1;
- }
- } else if (i == VDS_POS_PARTITION_DESC) {
- struct buffer_head *bh2 = NULL;
- if (udf_load_partdesc(sb, bh)) {
- brelse(bh);
- return 1;
- }
- for (j = vds[i].block + 1;
- j < vds[VDS_POS_TERMINATING_DESC].block;
- j++) {
- bh2 = udf_read_tagged(sb, j, j, &ident);
- gd = (struct generic_desc *)bh2->b_data;
- if (ident == TAG_IDENT_PD)
- if (udf_load_partdesc(sb,
- bh2)) {
- brelse(bh);
- brelse(bh2);
- return 1;
- }
- brelse(bh2);
- }
- }
- brelse(bh);
- }
+ /*
+ * Now read interesting descriptors again and process them
+ * in a suitable order
+ */
+ if (!vds[VDS_POS_PRIMARY_VOL_DESC].block) {
+ printk(KERN_ERR "udf: Primary Volume Descriptor not found!\n");
+ return 1;
+ }
+ if (udf_load_pvoldesc(sb, vds[VDS_POS_PRIMARY_VOL_DESC].block))
+ return 1;
+
+ if (vds[VDS_POS_LOGICAL_VOL_DESC].block && udf_load_logicalvol(sb,
+ vds[VDS_POS_LOGICAL_VOL_DESC].block, fileset))
+ return 1;
+
+ if (vds[VDS_POS_PARTITION_DESC].block) {
+ /*
+ * We rescan the whole descriptor sequence to find
+ * partition descriptor blocks and process them.
+ */
+ for (block = vds[VDS_POS_PARTITION_DESC].block;
+ block < vds[VDS_POS_TERMINATING_DESC].block;
+ block++)
+ if (udf_load_partdesc(sb, block))
+ return 1;
}
return 0;
@@ -1478,6 +1672,7 @@ static int udf_process_sequence(struct super_block *sb, long block,
static int udf_check_valid(struct super_block *sb, int novrs, int silent)
{
long block;
+ struct udf_sb_info *sbi = UDF_SB(sb);
if (novrs) {
udf_debug("Validity check skipped because of novrs option\n");
@@ -1485,27 +1680,22 @@ static int udf_check_valid(struct super_block *sb, int novrs, int silent)
}
/* Check that it is NSR02 compliant */
/* Process any "CD-ROM Volume Descriptor Set" (ECMA 167 2/8.3.1) */
- else {
- block = udf_vrs(sb, silent);
- if (block == -1) {
- struct udf_sb_info *sbi = UDF_SB(sb);
- udf_debug("Failed to read byte 32768. Assuming open "
- "disc. Skipping validity check\n");
- if (!sbi->s_last_block)
- sbi->s_last_block = udf_get_last_block(sb);
- return 0;
- } else
- return !block;
- }
+ block = udf_vrs(sb, silent);
+ if (block == -1)
+ udf_debug("Failed to read byte 32768. Assuming open "
+ "disc. Skipping validity check\n");
+ if (block && !sbi->s_last_block)
+ sbi->s_last_block = udf_get_last_block(sb);
+ return !block;
}
-static int udf_load_partition(struct super_block *sb, kernel_lb_addr *fileset)
+static int udf_load_sequence(struct super_block *sb, kernel_lb_addr *fileset)
{
struct anchorVolDescPtr *anchor;
uint16_t ident;
struct buffer_head *bh;
long main_s, main_e, reserve_s, reserve_e;
- int i, j;
+ int i;
struct udf_sb_info *sbi;
if (!sb)
@@ -1515,6 +1705,7 @@ static int udf_load_partition(struct super_block *sb, kernel_lb_addr *fileset)
for (i = 0; i < ARRAY_SIZE(sbi->s_anchor); i++) {
if (!sbi->s_anchor[i])
continue;
+
bh = udf_read_tagged(sb, sbi->s_anchor[i], sbi->s_anchor[i],
&ident);
if (!bh)
@@ -1553,76 +1744,6 @@ static int udf_load_partition(struct super_block *sb, kernel_lb_addr *fileset)
}
udf_debug("Using anchor in block %d\n", sbi->s_anchor[i]);
- for (i = 0; i < sbi->s_partitions; i++) {
- kernel_lb_addr uninitialized_var(ino);
- struct udf_part_map *map = &sbi->s_partmaps[i];
- switch (map->s_partition_type) {
- case UDF_VIRTUAL_MAP15:
- case UDF_VIRTUAL_MAP20:
- if (!sbi->s_last_block) {
- sbi->s_last_block = udf_get_last_block(sb);
- udf_find_anchor(sb);
- }
-
- if (!sbi->s_last_block) {
- udf_debug("Unable to determine Lastblock (For "
- "Virtual Partition)\n");
- return 1;
- }
-
- for (j = 0; j < sbi->s_partitions; j++) {
- struct udf_part_map *map2 = &sbi->s_partmaps[j];
- if (j != i &&
- map->s_volumeseqnum ==
- map2->s_volumeseqnum &&
- map->s_partition_num ==
- map2->s_partition_num) {
- ino.partitionReferenceNum = j;
- ino.logicalBlockNum =
- sbi->s_last_block -
- map2->s_partition_root;
- break;
- }
- }
-
- if (j == sbi->s_partitions)
- return 1;
-
- sbi->s_vat_inode = udf_iget(sb, ino);
- if (!sbi->s_vat_inode)
- return 1;
-
- if (map->s_partition_type == UDF_VIRTUAL_MAP15) {
- map->s_type_specific.s_virtual.s_start_offset =
- udf_ext0_offset(sbi->s_vat_inode);
- map->s_type_specific.s_virtual.s_num_entries =
- (sbi->s_vat_inode->i_size - 36) >> 2;
- } else if (map->s_partition_type == UDF_VIRTUAL_MAP20) {
- uint32_t pos;
- struct virtualAllocationTable20 *vat20;
-
- pos = udf_block_map(sbi->s_vat_inode, 0);
- bh = sb_bread(sb, pos);
- if (!bh)
- return 1;
- vat20 = (struct virtualAllocationTable20 *)
- bh->b_data +
- udf_ext0_offset(sbi->s_vat_inode);
- map->s_type_specific.s_virtual.s_start_offset =
- le16_to_cpu(vat20->lengthHeader) +
- udf_ext0_offset(sbi->s_vat_inode);
- map->s_type_specific.s_virtual.s_num_entries =
- (sbi->s_vat_inode->i_size -
- map->s_type_specific.s_virtual.
- s_start_offset) >> 2;
- brelse(bh);
- }
- map->s_partition_root = udf_get_pblock(sb, 0, i, 0);
- map->s_partition_len =
- sbi->s_partmaps[ino.partitionReferenceNum].
- s_partition_len;
- }
- }
return 0;
}
@@ -1630,65 +1751,61 @@ static void udf_open_lvid(struct super_block *sb)
{
struct udf_sb_info *sbi = UDF_SB(sb);
struct buffer_head *bh = sbi->s_lvid_bh;
- if (bh) {
- kernel_timestamp cpu_time;
- struct logicalVolIntegrityDesc *lvid =
- (struct logicalVolIntegrityDesc *)bh->b_data;
- struct logicalVolIntegrityDescImpUse *lvidiu =
- udf_sb_lvidiu(sbi);
+ struct logicalVolIntegrityDesc *lvid;
+ struct logicalVolIntegrityDescImpUse *lvidiu;
+ if (!bh)
+ return;
- lvidiu->impIdent.identSuffix[0] = UDF_OS_CLASS_UNIX;
- lvidiu->impIdent.identSuffix[1] = UDF_OS_ID_LINUX;
- if (udf_time_to_stamp(&cpu_time, CURRENT_TIME))
- lvid->recordingDateAndTime = cpu_to_lets(cpu_time);
- lvid->integrityType = LVID_INTEGRITY_TYPE_OPEN;
+ lvid = (struct logicalVolIntegrityDesc *)bh->b_data;
+ lvidiu = udf_sb_lvidiu(sbi);
- lvid->descTag.descCRC = cpu_to_le16(
- udf_crc((char *)lvid + sizeof(tag),
- le16_to_cpu(lvid->descTag.descCRCLength),
- 0));
+ lvidiu->impIdent.identSuffix[0] = UDF_OS_CLASS_UNIX;
+ lvidiu->impIdent.identSuffix[1] = UDF_OS_ID_LINUX;
+ udf_time_to_disk_stamp(&lvid->recordingDateAndTime,
+ CURRENT_TIME);
+ lvid->integrityType = LVID_INTEGRITY_TYPE_OPEN;
- lvid->descTag.tagChecksum = udf_tag_checksum(&lvid->descTag);
- mark_buffer_dirty(bh);
- }
+ lvid->descTag.descCRC = cpu_to_le16(
+ crc_itu_t(0, (char *)lvid + sizeof(tag),
+ le16_to_cpu(lvid->descTag.descCRCLength)));
+
+ lvid->descTag.tagChecksum = udf_tag_checksum(&lvid->descTag);
+ mark_buffer_dirty(bh);
}
+/*
+ * Mark the logical volume integrity descriptor cached in sbi->s_lvid_bh as
+ * closed. Does nothing when no descriptor buffer is cached or when the
+ * descriptor is not currently marked open. Otherwise it stamps the
+ * implementation identifier and the recording time, raises the recorded
+ * UDF revisions where this driver's values are higher, recomputes the
+ * descriptor CRC and tag checksum, and marks the buffer dirty.
+ */
static void udf_close_lvid(struct super_block *sb)
{
- kernel_timestamp cpu_time;
struct udf_sb_info *sbi = UDF_SB(sb);
struct buffer_head *bh = sbi->s_lvid_bh;
struct logicalVolIntegrityDesc *lvid;
+ struct logicalVolIntegrityDescImpUse *lvidiu;
if (!bh)
return;
lvid = (struct logicalVolIntegrityDesc *)bh->b_data;
- if (lvid->integrityType == LVID_INTEGRITY_TYPE_OPEN) {
- struct logicalVolIntegrityDescImpUse *lvidiu =
- udf_sb_lvidiu(sbi);
- lvidiu->impIdent.identSuffix[0] = UDF_OS_CLASS_UNIX;
- lvidiu->impIdent.identSuffix[1] = UDF_OS_ID_LINUX;
- if (udf_time_to_stamp(&cpu_time, CURRENT_TIME))
- lvid->recordingDateAndTime = cpu_to_lets(cpu_time);
- if (UDF_MAX_WRITE_VERSION > le16_to_cpu(lvidiu->maxUDFWriteRev))
- lvidiu->maxUDFWriteRev =
- cpu_to_le16(UDF_MAX_WRITE_VERSION);
- if (sbi->s_udfrev > le16_to_cpu(lvidiu->minUDFReadRev))
- lvidiu->minUDFReadRev = cpu_to_le16(sbi->s_udfrev);
- if (sbi->s_udfrev > le16_to_cpu(lvidiu->minUDFWriteRev))
- lvidiu->minUDFWriteRev = cpu_to_le16(sbi->s_udfrev);
- lvid->integrityType = cpu_to_le32(LVID_INTEGRITY_TYPE_CLOSE);
-
- lvid->descTag.descCRC = cpu_to_le16(
- udf_crc((char *)lvid + sizeof(tag),
- le16_to_cpu(lvid->descTag.descCRCLength),
- 0));
-
- lvid->descTag.tagChecksum = udf_tag_checksum(&lvid->descTag);
- mark_buffer_dirty(bh);
- }
+ /* Only a descriptor that is marked open needs closing */
+ if (lvid->integrityType != LVID_INTEGRITY_TYPE_OPEN)
+ return;
+
+ lvidiu = udf_sb_lvidiu(sbi);
+ lvidiu->impIdent.identSuffix[0] = UDF_OS_CLASS_UNIX;
+ lvidiu->impIdent.identSuffix[1] = UDF_OS_ID_LINUX;
+ udf_time_to_disk_stamp(&lvid->recordingDateAndTime, CURRENT_TIME);
+ /* Revisions are only ever raised, never lowered */
+ if (UDF_MAX_WRITE_VERSION > le16_to_cpu(lvidiu->maxUDFWriteRev))
+ lvidiu->maxUDFWriteRev = cpu_to_le16(UDF_MAX_WRITE_VERSION);
+ if (sbi->s_udfrev > le16_to_cpu(lvidiu->minUDFReadRev))
+ lvidiu->minUDFReadRev = cpu_to_le16(sbi->s_udfrev);
+ if (sbi->s_udfrev > le16_to_cpu(lvidiu->minUDFWriteRev))
+ lvidiu->minUDFWriteRev = cpu_to_le16(sbi->s_udfrev);
+ lvid->integrityType = cpu_to_le32(LVID_INTEGRITY_TYPE_CLOSE);
+
+ /* The CRC covers descCRCLength bytes following the tag */
+ lvid->descTag.descCRC = cpu_to_le16(
+ crc_itu_t(0, (char *)lvid + sizeof(tag),
+ le16_to_cpu(lvid->descTag.descCRCLength)));
+
+ lvid->descTag.tagChecksum = udf_tag_checksum(&lvid->descTag);
+ mark_buffer_dirty(bh);
}
static void udf_sb_free_bitmap(struct udf_bitmap *bitmap)
@@ -1708,22 +1825,35 @@ static void udf_sb_free_bitmap(struct udf_bitmap *bitmap)
vfree(bitmap);
}
-/*
- * udf_read_super
- *
- * PURPOSE
- * Complete the specified super block.
- *
- * PRE-CONDITIONS
- * sb Pointer to superblock to complete - never NULL.
- * sb->s_dev Device to read suberblock from.
- * options Pointer to mount options.
- * silent Silent flag.
- *
- * HISTORY
- * July 1, 1997 - Andrew E. Mileski
- * Written, tested, and released.
- */
+/*
+ * Drop every reference a single partition map holds: the space table
+ * inodes or space bitmaps flagged in s_partition_flags, plus the
+ * type-specific resources (sparing table buffers for a sparable map,
+ * metadata/mirror/bitmap file inodes for a metadata map).
+ */
+static void udf_free_partition(struct udf_part_map *map)
+{
+	struct udf_meta_data *mdata;
+	int slot;
+
+	/* The flags say which member of each space union is in use */
+	if (map->s_partition_flags & UDF_PART_FLAG_UNALLOC_TABLE) {
+		iput(map->s_uspace.s_table);
+	}
+	if (map->s_partition_flags & UDF_PART_FLAG_FREED_TABLE) {
+		iput(map->s_fspace.s_table);
+	}
+	if (map->s_partition_flags & UDF_PART_FLAG_UNALLOC_BITMAP) {
+		udf_sb_free_bitmap(map->s_uspace.s_bitmap);
+	}
+	if (map->s_partition_flags & UDF_PART_FLAG_FREED_BITMAP) {
+		udf_sb_free_bitmap(map->s_fspace.s_bitmap);
+	}
+
+	switch (map->s_partition_type) {
+	case UDF_SPARABLE_MAP15:
+		for (slot = 0; slot < 4; slot++)
+			brelse(map->s_type_specific.s_sparing.s_spar_map[slot]);
+		break;
+	case UDF_METADATA_MAP25:
+		mdata = &map->s_type_specific.s_metadata;
+
+		/* Clear each pointer once its reference has been dropped */
+		iput(mdata->s_metadata_fe);
+		mdata->s_metadata_fe = NULL;
+
+		iput(mdata->s_mirror_fe);
+		mdata->s_mirror_fe = NULL;
+
+		iput(mdata->s_bitmap_fe);
+		mdata->s_bitmap_fe = NULL;
+		break;
+	default:
+		break;
+	}
+}
+
+
static int udf_fill_super(struct super_block *sb, void *options, int silent)
{
int i;
@@ -1776,8 +1906,11 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent)
sbi->s_nls_map = uopt.nls_map;
/* Set the block size for all transfers */
- if (!udf_set_blocksize(sb, uopt.blocksize))
+ if (!sb_min_blocksize(sb, uopt.blocksize)) {
+ udf_debug("Bad block size (%d)\n", uopt.blocksize);
+ printk(KERN_ERR "udf: bad block size (%d)\n", uopt.blocksize);
goto error_out;
+ }
if (uopt.session == 0xFFFFFFFF)
sbi->s_session = udf_get_last_session(sb);
@@ -1789,7 +1922,6 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent)
sbi->s_last_block = uopt.lastblock;
sbi->s_anchor[0] = sbi->s_anchor[1] = 0;
sbi->s_anchor[2] = uopt.anchor;
- sbi->s_anchor[3] = 256;
if (udf_check_valid(sb, uopt.novrs, silent)) {
/* read volume recognition sequences */
@@ -1806,7 +1938,7 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent)
sb->s_magic = UDF_SUPER_MAGIC;
sb->s_time_gran = 1000;
- if (udf_load_partition(sb, &fileset)) {
+ if (udf_load_sequence(sb, &fileset)) {
printk(KERN_WARNING "UDF-fs: No partition found (1)\n");
goto error_out;
}
@@ -1856,12 +1988,12 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent)
}
if (!silent) {
- kernel_timestamp ts;
- udf_time_to_stamp(&ts, sbi->s_record_time);
+ timestamp ts;
+ udf_time_to_disk_stamp(&ts, sbi->s_record_time);
udf_info("UDF: Mounting volume '%s', "
"timestamp %04u/%02u/%02u %02u:%02u (%x)\n",
- sbi->s_volume_ident, ts.year, ts.month, ts.day,
- ts.hour, ts.minute, ts.typeAndTimezone);
+ sbi->s_volume_ident, le16_to_cpu(ts.year), ts.month, ts.day,
+ ts.hour, ts.minute, le16_to_cpu(ts.typeAndTimezone));
}
if (!(sb->s_flags & MS_RDONLY))
udf_open_lvid(sb);
@@ -1890,21 +2022,9 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent)
error_out:
if (sbi->s_vat_inode)
iput(sbi->s_vat_inode);
- if (sbi->s_partitions) {
- struct udf_part_map *map = &sbi->s_partmaps[sbi->s_partition];
- if (map->s_partition_flags & UDF_PART_FLAG_UNALLOC_TABLE)
- iput(map->s_uspace.s_table);
- if (map->s_partition_flags & UDF_PART_FLAG_FREED_TABLE)
- iput(map->s_fspace.s_table);
- if (map->s_partition_flags & UDF_PART_FLAG_UNALLOC_BITMAP)
- udf_sb_free_bitmap(map->s_uspace.s_bitmap);
- if (map->s_partition_flags & UDF_PART_FLAG_FREED_BITMAP)
- udf_sb_free_bitmap(map->s_fspace.s_bitmap);
- if (map->s_partition_type == UDF_SPARABLE_MAP15)
- for (i = 0; i < 4; i++)
- brelse(map->s_type_specific.s_sparing.
- s_spar_map[i]);
- }
+ if (sbi->s_partitions)
+ for (i = 0; i < sbi->s_partitions; i++)
+ udf_free_partition(&sbi->s_partmaps[i]);
#ifdef CONFIG_UDF_NLS
if (UDF_QUERY_FLAG(sb, UDF_FLAG_NLS_MAP))
unload_nls(sbi->s_nls_map);
@@ -1920,8 +2040,8 @@ error_out:
return -EINVAL;
}
-void udf_error(struct super_block *sb, const char *function,
- const char *fmt, ...)
+static void udf_error(struct super_block *sb, const char *function,
+ const char *fmt, ...)
{
va_list args;
@@ -1948,19 +2068,6 @@ void udf_warning(struct super_block *sb, const char *function,
sb->s_id, function, error_buf);
}
-/*
- * udf_put_super
- *
- * PURPOSE
- * Prepare for destruction of the superblock.
- *
- * DESCRIPTION
- * Called before the filesystem is unmounted.
- *
- * HISTORY
- * July 1, 1997 - Andrew E. Mileski
- * Written, tested, and released.
- */
static void udf_put_super(struct super_block *sb)
{
int i;
@@ -1969,21 +2076,9 @@ static void udf_put_super(struct super_block *sb)
sbi = UDF_SB(sb);
if (sbi->s_vat_inode)
iput(sbi->s_vat_inode);
- if (sbi->s_partitions) {
- struct udf_part_map *map = &sbi->s_partmaps[sbi->s_partition];
- if (map->s_partition_flags & UDF_PART_FLAG_UNALLOC_TABLE)
- iput(map->s_uspace.s_table);
- if (map->s_partition_flags & UDF_PART_FLAG_FREED_TABLE)
- iput(map->s_fspace.s_table);
- if (map->s_partition_flags & UDF_PART_FLAG_UNALLOC_BITMAP)
- udf_sb_free_bitmap(map->s_uspace.s_bitmap);
- if (map->s_partition_flags & UDF_PART_FLAG_FREED_BITMAP)
- udf_sb_free_bitmap(map->s_fspace.s_bitmap);
- if (map->s_partition_type == UDF_SPARABLE_MAP15)
- for (i = 0; i < 4; i++)
- brelse(map->s_type_specific.s_sparing.
- s_spar_map[i]);
- }
+ if (sbi->s_partitions)
+ for (i = 0; i < sbi->s_partitions; i++)
+ udf_free_partition(&sbi->s_partmaps[i]);
#ifdef CONFIG_UDF_NLS
if (UDF_QUERY_FLAG(sb, UDF_FLAG_NLS_MAP))
unload_nls(sbi->s_nls_map);
@@ -1996,19 +2091,6 @@ static void udf_put_super(struct super_block *sb)
sb->s_fs_info = NULL;
}
-/*
- * udf_stat_fs
- *
- * PURPOSE
- * Return info about the filesystem.
- *
- * DESCRIPTION
- * Called by sys_statfs()
- *
- * HISTORY
- * July 1, 1997 - Andrew E. Mileski
- * Written, tested, and released.
- */
static int udf_statfs(struct dentry *dentry, struct kstatfs *buf)
{
struct super_block *sb = dentry->d_sb;
@@ -2035,10 +2117,6 @@ static int udf_statfs(struct dentry *dentry, struct kstatfs *buf)
return 0;
}
-static unsigned char udf_bitmap_lookup[16] = {
- 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4
-};
-
static unsigned int udf_count_free_bitmap(struct super_block *sb,
struct udf_bitmap *bitmap)
{
@@ -2048,7 +2126,6 @@ static unsigned int udf_count_free_bitmap(struct super_block *sb,
int block = 0, newblock;
kernel_lb_addr loc;
uint32_t bytes;
- uint8_t value;
uint8_t *ptr;
uint16_t ident;
struct spaceBitmapDesc *bm;
@@ -2074,13 +2151,10 @@ static unsigned int udf_count_free_bitmap(struct super_block *sb,
ptr = (uint8_t *)bh->b_data;
while (bytes > 0) {
- while ((bytes > 0) && (index < sb->s_blocksize)) {
- value = ptr[index];
- accum += udf_bitmap_lookup[value & 0x0f];
- accum += udf_bitmap_lookup[value >> 4];
- index++;
- bytes--;
- }
+ u32 cur_bytes = min_t(u32, bytes, sb->s_blocksize - index);
+ accum += bitmap_weight((const unsigned long *)(ptr + index),
+ cur_bytes * 8);
+ bytes -= cur_bytes;
if (bytes) {
brelse(bh);
newblock = udf_get_lb_pblock(sb, loc, ++block);
diff --git a/fs/udf/symlink.c b/fs/udf/symlink.c
index 6ec99221e50..c3265e1385d 100644
--- a/fs/udf/symlink.c
+++ b/fs/udf/symlink.c
@@ -23,7 +23,6 @@
#include <asm/uaccess.h>
#include <linux/errno.h>
#include <linux/fs.h>
-#include <linux/udf_fs.h>
#include <linux/time.h>
#include <linux/mm.h>
#include <linux/stat.h>
diff --git a/fs/udf/truncate.c b/fs/udf/truncate.c
index fe61be17cda..65e19b4f942 100644
--- a/fs/udf/truncate.c
+++ b/fs/udf/truncate.c
@@ -22,7 +22,6 @@
#include "udfdecl.h"
#include <linux/fs.h>
#include <linux/mm.h>
-#include <linux/udf_fs.h>
#include <linux/buffer_head.h>
#include "udf_i.h"
@@ -180,6 +179,24 @@ void udf_discard_prealloc(struct inode *inode)
brelse(epos.bh);
}
+/*
+ * Store lenalloc as the allocation-descriptor length of the allocation
+ * extent descriptor held in epos->bh, refresh the descriptor tag and mark
+ * the buffer dirty against inode. The tag length also covers the
+ * allocation descriptors themselves, except in strict mode on a volume
+ * whose UDF revision is below 2.01.
+ */
+static void udf_update_alloc_ext_desc(struct inode *inode,
+				      struct extent_position *epos,
+				      u32 lenalloc)
+{
+	struct super_block *sb = inode->i_sb;
+	struct buffer_head *ext_bh = epos->bh;
+	struct allocExtDesc *aed = (struct allocExtDesc *)ext_bh->b_data;
+	int tag_len = sizeof(struct allocExtDesc);
+
+	aed->lengthAllocDescs = cpu_to_le32(lenalloc);
+
+	if (UDF_SB(sb)->s_udfrev >= 0x0201 ||
+	    !UDF_QUERY_FLAG(sb, UDF_FLAG_STRICT))
+		tag_len += lenalloc;
+
+	udf_update_tag(ext_bh->b_data, tag_len);
+	mark_buffer_dirty_inode(ext_bh, inode);
+}
+
+
void udf_truncate_extents(struct inode *inode)
{
struct extent_position epos;
@@ -187,7 +204,6 @@ void udf_truncate_extents(struct inode *inode)
uint32_t elen, nelen = 0, indirect_ext_len = 0, lenalloc;
int8_t etype;
struct super_block *sb = inode->i_sb;
- struct udf_sb_info *sbi = UDF_SB(sb);
sector_t first_block = inode->i_size >> sb->s_blocksize_bits, offset;
loff_t byte_offset;
int adsize;
@@ -224,35 +240,15 @@ void udf_truncate_extents(struct inode *inode)
if (indirect_ext_len) {
/* We managed to free all extents in the
* indirect extent - free it too */
- if (!epos.bh)
- BUG();
+ BUG_ON(!epos.bh);
udf_free_blocks(sb, inode, epos.block,
0, indirect_ext_len);
- } else {
- if (!epos.bh) {
- iinfo->i_lenAlloc =
- lenalloc;
- mark_inode_dirty(inode);
- } else {
- struct allocExtDesc *aed =
- (struct allocExtDesc *)
- (epos.bh->b_data);
- int len =
- sizeof(struct allocExtDesc);
-
- aed->lengthAllocDescs =
- cpu_to_le32(lenalloc);
- if (!UDF_QUERY_FLAG(sb,
- UDF_FLAG_STRICT) ||
- sbi->s_udfrev >= 0x0201)
- len += lenalloc;
-
- udf_update_tag(epos.bh->b_data,
- len);
- mark_buffer_dirty_inode(
- epos.bh, inode);
- }
- }
+ } else if (!epos.bh) {
+ iinfo->i_lenAlloc = lenalloc;
+ mark_inode_dirty(inode);
+ } else
+ udf_update_alloc_ext_desc(inode,
+ &epos, lenalloc);
brelse(epos.bh);
epos.offset = sizeof(struct allocExtDesc);
epos.block = eloc;
@@ -272,29 +268,14 @@ void udf_truncate_extents(struct inode *inode)
}
if (indirect_ext_len) {
- if (!epos.bh)
- BUG();
+ BUG_ON(!epos.bh);
udf_free_blocks(sb, inode, epos.block, 0,
indirect_ext_len);
- } else {
- if (!epos.bh) {
- iinfo->i_lenAlloc = lenalloc;
- mark_inode_dirty(inode);
- } else {
- struct allocExtDesc *aed =
- (struct allocExtDesc *)(epos.bh->b_data);
- aed->lengthAllocDescs = cpu_to_le32(lenalloc);
- if (!UDF_QUERY_FLAG(sb, UDF_FLAG_STRICT) ||
- sbi->s_udfrev >= 0x0201)
- udf_update_tag(epos.bh->b_data,
- lenalloc +
- sizeof(struct allocExtDesc));
- else
- udf_update_tag(epos.bh->b_data,
- sizeof(struct allocExtDesc));
- mark_buffer_dirty_inode(epos.bh, inode);
- }
- }
+ } else if (!epos.bh) {
+ iinfo->i_lenAlloc = lenalloc;
+ mark_inode_dirty(inode);
+ } else
+ udf_update_alloc_ext_desc(inode, &epos, lenalloc);
} else if (inode->i_size) {
if (byte_offset) {
kernel_long_ad extent;
diff --git a/fs/udf/udf_i.h b/fs/udf/udf_i.h
index ccc52f16bf7..4f86b1d98a5 100644
--- a/fs/udf/udf_i.h
+++ b/fs/udf/udf_i.h
@@ -1,10 +1,32 @@
-#ifndef __LINUX_UDF_I_H
-#define __LINUX_UDF_I_H
+#ifndef _UDF_I_H
+#define _UDF_I_H
+
+/*
+ * In-core UDF inode state. The generic VFS inode is embedded as the
+ * vfs_inode member, which is how UDF_I() gets from a struct inode back to
+ * this structure.
+ */
+struct udf_inode_info {
+ struct timespec i_crtime;
+ /* Physical address of inode */
+ kernel_lb_addr i_location;
+ __u64 i_unique;
+ __u32 i_lenEAttr;
+ /* Length of the allocation descriptors; presumably in bytes - TODO confirm */
+ __u32 i_lenAlloc;
+ __u64 i_lenExtents;
+ __u32 i_next_alloc_block;
+ __u32 i_next_alloc_goal;
+ /* Holds an ICBTAG_FLAG_AD_* value, e.g. ICBTAG_FLAG_AD_IN_ICB */
+ unsigned i_alloc_type : 3;
+ unsigned i_efe : 1; /* extendedFileEntry */
+ unsigned i_use : 1; /* unallocSpaceEntry */
+ unsigned i_strat4096 : 1;
+ unsigned reserved : 26;
+ /* Differently typed views of one pointer; i_data is the raw byte view */
+ union {
+ short_ad *i_sad;
+ long_ad *i_lad;
+ __u8 *i_data;
+ } i_ext;
+ struct inode vfs_inode;
+};
-#include <linux/udf_fs_i.h>
+/* Return the udf_inode_info that embeds the given VFS inode. */
static inline struct udf_inode_info *UDF_I(struct inode *inode)
{
return list_entry(inode, struct udf_inode_info, vfs_inode);
}
-#endif /* !defined(_LINUX_UDF_I_H) */
+#endif /* _UDF_I_H */
diff --git a/fs/udf/udf_sb.h b/fs/udf/udf_sb.h
index 737d1c604ee..1c1c514a972 100644
--- a/fs/udf/udf_sb.h
+++ b/fs/udf/udf_sb.h
@@ -1,10 +1,12 @@
#ifndef __LINUX_UDF_SB_H
#define __LINUX_UDF_SB_H
+#include <linux/mutex.h>
+
/* Since UDF 2.01 is ISO 13346 based... */
#define UDF_SUPER_MAGIC 0x15013346
-#define UDF_MAX_READ_VERSION 0x0201
+#define UDF_MAX_READ_VERSION 0x0250
#define UDF_MAX_WRITE_VERSION 0x0201
#define UDF_FLAG_USE_EXTENDED_FE 0
@@ -38,6 +40,111 @@
#define UDF_PART_FLAG_REWRITABLE 0x0040
#define UDF_PART_FLAG_OVERWRITABLE 0x0080
+#define UDF_MAX_BLOCK_LOADED 8
+
+#define UDF_TYPE1_MAP15 0x1511U
+#define UDF_VIRTUAL_MAP15 0x1512U
+#define UDF_VIRTUAL_MAP20 0x2012U
+#define UDF_SPARABLE_MAP15 0x1522U
+#define UDF_METADATA_MAP25 0x2511U
+
+#pragma pack(1) /* XXX(hch): Why? This file just defines in-core structures */
+
+struct udf_meta_data {
+ __u32 s_meta_file_loc;
+ __u32 s_mirror_file_loc;
+ __u32 s_bitmap_file_loc;
+ __u32 s_alloc_unit_size;
+ __u16 s_align_unit_size;
+ __u8 s_dup_md_flag;
+ struct inode *s_metadata_fe;
+ struct inode *s_mirror_fe;
+ struct inode *s_bitmap_fe;
+};
+
+struct udf_sparing_data {
+ __u16 s_packet_len;
+ struct buffer_head *s_spar_map[4];
+};
+
+struct udf_virtual_data {
+ __u32 s_num_entries;
+ __u16 s_start_offset;
+};
+
+struct udf_bitmap {
+ __u32 s_extLength;
+ __u32 s_extPosition;
+ __u16 s_nr_groups;
+ struct buffer_head **s_block_bitmap;
+};
+
+struct udf_part_map {
+ union {
+ struct udf_bitmap *s_bitmap;
+ struct inode *s_table;
+ } s_uspace;
+ union {
+ struct udf_bitmap *s_bitmap;
+ struct inode *s_table;
+ } s_fspace;
+ __u32 s_partition_root;
+ __u32 s_partition_len;
+ __u16 s_partition_type;
+ __u16 s_partition_num;
+ union {
+ struct udf_sparing_data s_sparing;
+ struct udf_virtual_data s_virtual;
+ struct udf_meta_data s_metadata;
+ } s_type_specific;
+ __u32 (*s_partition_func)(struct super_block *, __u32, __u16, __u32);
+ __u16 s_volumeseqnum;
+ __u16 s_partition_flags;
+};
+
+#pragma pack()
+
+struct udf_sb_info {
+ struct udf_part_map *s_partmaps;
+ __u8 s_volume_ident[32];
+
+ /* Overall info */
+ __u16 s_partitions;
+ __u16 s_partition;
+
+ /* Sector headers */
+ __s32 s_session;
+ __u32 s_anchor[3];
+ __u32 s_last_block;
+
+ struct buffer_head *s_lvid_bh;
+
+ /* Default permissions */
+ mode_t s_umask;
+ gid_t s_gid;
+ uid_t s_uid;
+
+ /* Root Info */
+ struct timespec s_record_time;
+
+ /* Fileset Info */
+ __u16 s_serial_number;
+
+ /* highest UDF revision we have recorded to this media */
+ __u16 s_udfrev;
+
+ /* Miscellaneous flags */
+ __u32 s_flags;
+
+ /* Encoding info */
+ struct nls_table *s_nls_map;
+
+ /* VAT inode */
+ struct inode *s_vat_inode;
+
+ struct mutex s_alloc_mutex;
+};
+
static inline struct udf_sb_info *UDF_SB(struct super_block *sb)
{
return sb->s_fs_info;
diff --git a/fs/udf/udfdecl.h b/fs/udf/udfdecl.h
index 681dc2b66cd..f3f45d02927 100644
--- a/fs/udf/udfdecl.h
+++ b/fs/udf/udfdecl.h
@@ -1,17 +1,37 @@
#ifndef __UDF_DECL_H
#define __UDF_DECL_H
-#include <linux/udf_fs.h>
#include "ecma_167.h"
#include "osta_udf.h"
#include <linux/fs.h>
#include <linux/types.h>
-#include <linux/udf_fs_i.h>
-#include <linux/udf_fs_sb.h>
#include <linux/buffer_head.h>
+#include <linux/udf_fs_i.h>
+#include "udf_sb.h"
#include "udfend.h"
+#include "udf_i.h"
+
+#define UDF_PREALLOCATE
+#define UDF_DEFAULT_PREALLOC_BLOCKS 8
+
+#define UDFFS_DEBUG
+
+#ifdef UDFFS_DEBUG
+#define udf_debug(f, a...) \
+do { \
+ printk(KERN_DEBUG "UDF-fs DEBUG %s:%d:%s: ", \
+ __FILE__, __LINE__, __func__); \
+ printk(f, ##a); \
+} while (0)
+#else
+#define udf_debug(f, a...) /**/
+#endif
+
+#define udf_info(f, a...) \
+ printk(KERN_INFO "UDF-fs INFO " f, ##a);
+
#define udf_fixed_to_variable(x) ( ( ( (x) >> 5 ) * 39 ) + ( (x) & 0x0000001F ) )
#define udf_variable_to_fixed(x) ( ( ( (x) / 39 ) << 5 ) + ( (x) % 39 ) )
@@ -23,16 +43,24 @@
#define UDF_NAME_LEN 256
#define UDF_PATH_LEN 1023
-#define udf_file_entry_alloc_offset(inode)\
- (UDF_I(inode)->i_use ?\
- sizeof(struct unallocSpaceEntry) :\
- ((UDF_I(inode)->i_efe ?\
- sizeof(struct extendedFileEntry) :\
- sizeof(struct fileEntry)) + UDF_I(inode)->i_lenEAttr))
-
-#define udf_ext0_offset(inode)\
- (UDF_I(inode)->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB ?\
- udf_file_entry_alloc_offset(inode) : 0)
+static inline size_t udf_file_entry_alloc_offset(struct inode *inode)
+{
+ struct udf_inode_info *iinfo = UDF_I(inode);
+ if (iinfo->i_use)
+ return sizeof(struct unallocSpaceEntry);
+ else if (iinfo->i_efe)
+ return sizeof(struct extendedFileEntry) + iinfo->i_lenEAttr;
+ else
+ return sizeof(struct fileEntry) + iinfo->i_lenEAttr;
+}
+
+static inline size_t udf_ext0_offset(struct inode *inode)
+{
+ if (UDF_I(inode)->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB)
+ return udf_file_entry_alloc_offset(inode);
+ else
+ return 0;
+}
#define udf_get_lb_pblock(sb,loc,offset) udf_get_pblock((sb), (loc).logicalBlockNum, (loc).partitionReferenceNum, (offset))
@@ -83,7 +111,6 @@ struct extent_position {
};
/* super.c */
-extern void udf_error(struct super_block *, const char *, const char *, ...);
extern void udf_warning(struct super_block *, const char *, const char *, ...);
/* namei.c */
@@ -150,6 +177,8 @@ extern uint32_t udf_get_pblock_virt20(struct super_block *, uint32_t, uint16_t,
uint32_t);
extern uint32_t udf_get_pblock_spar15(struct super_block *, uint32_t, uint16_t,
uint32_t);
+extern uint32_t udf_get_pblock_meta25(struct super_block *, uint32_t, uint16_t,
+ uint32_t);
extern int udf_relocate_blocks(struct super_block *, long, long *);
/* unicode.c */
@@ -157,7 +186,7 @@ extern int udf_get_filename(struct super_block *, uint8_t *, uint8_t *, int);
extern int udf_put_filename(struct super_block *, const uint8_t *, uint8_t *,
int);
extern int udf_build_ustr(struct ustr *, dstring *, int);
-extern int udf_CS0toUTF8(struct ustr *, struct ustr *);
+extern int udf_CS0toUTF8(struct ustr *, const struct ustr *);
/* ialloc.c */
extern void udf_free_inode(struct inode *);
@@ -191,11 +220,9 @@ extern struct fileIdentDesc *udf_get_fileident(void *buffer, int bufsize,
extern long_ad *udf_get_filelongad(uint8_t *, int, uint32_t *, int);
extern short_ad *udf_get_fileshortad(uint8_t *, int, uint32_t *, int);
-/* crc.c */
-extern uint16_t udf_crc(uint8_t *, uint32_t, uint16_t);
-
/* udftime.c */
-extern time_t *udf_stamp_to_time(time_t *, long *, kernel_timestamp);
-extern kernel_timestamp *udf_time_to_stamp(kernel_timestamp *, struct timespec);
+extern struct timespec *udf_disk_stamp_to_time(struct timespec *dest,
+ timestamp src);
+extern timestamp *udf_time_to_disk_stamp(timestamp *dest, struct timespec src);
#endif /* __UDF_DECL_H */
diff --git a/fs/udf/udfend.h b/fs/udf/udfend.h
index c4bd1203f85..489f52fb428 100644
--- a/fs/udf/udfend.h
+++ b/fs/udf/udfend.h
@@ -24,17 +24,6 @@ static inline lb_addr cpu_to_lelb(kernel_lb_addr in)
return out;
}
-static inline kernel_timestamp lets_to_cpu(timestamp in)
-{
- kernel_timestamp out;
-
- memcpy(&out, &in, sizeof(timestamp));
- out.typeAndTimezone = le16_to_cpu(in.typeAndTimezone);
- out.year = le16_to_cpu(in.year);
-
- return out;
-}
-
static inline short_ad lesa_to_cpu(short_ad in)
{
short_ad out;
@@ -85,15 +74,4 @@ static inline kernel_extent_ad leea_to_cpu(extent_ad in)
return out;
}
-static inline timestamp cpu_to_lets(kernel_timestamp in)
-{
- timestamp out;
-
- memcpy(&out, &in, sizeof(timestamp));
- out.typeAndTimezone = cpu_to_le16(in.typeAndTimezone);
- out.year = cpu_to_le16(in.year);
-
- return out;
-}
-
#endif /* __UDF_ENDIAN_H */
diff --git a/fs/udf/udftime.c b/fs/udf/udftime.c
index ce595732ba6..5f811655c9b 100644
--- a/fs/udf/udftime.c
+++ b/fs/udf/udftime.c
@@ -85,39 +85,38 @@ extern struct timezone sys_tz;
#define SECS_PER_HOUR (60 * 60)
#define SECS_PER_DAY (SECS_PER_HOUR * 24)
-time_t *udf_stamp_to_time(time_t *dest, long *dest_usec, kernel_timestamp src)
+struct timespec *udf_disk_stamp_to_time(struct timespec *dest, timestamp src)
{
int yday;
- uint8_t type = src.typeAndTimezone >> 12;
+ u16 typeAndTimezone = le16_to_cpu(src.typeAndTimezone);
+ u16 year = le16_to_cpu(src.year);
+ uint8_t type = typeAndTimezone >> 12;
int16_t offset;
if (type == 1) {
- offset = src.typeAndTimezone << 4;
+ offset = typeAndTimezone << 4;
/* sign extent offset */
offset = (offset >> 4);
if (offset == -2047) /* unspecified offset */
offset = 0;
- } else {
+ } else
offset = 0;
- }
- if ((src.year < EPOCH_YEAR) ||
- (src.year >= EPOCH_YEAR + MAX_YEAR_SECONDS)) {
- *dest = -1;
- *dest_usec = -1;
+ if ((year < EPOCH_YEAR) ||
+ (year >= EPOCH_YEAR + MAX_YEAR_SECONDS)) {
return NULL;
}
- *dest = year_seconds[src.year - EPOCH_YEAR];
- *dest -= offset * 60;
+ dest->tv_sec = year_seconds[year - EPOCH_YEAR];
+ dest->tv_sec -= offset * 60;
- yday = ((__mon_yday[__isleap(src.year)][src.month - 1]) + src.day - 1);
- *dest += (((yday * 24) + src.hour) * 60 + src.minute) * 60 + src.second;
- *dest_usec = src.centiseconds * 10000 +
- src.hundredsOfMicroseconds * 100 + src.microseconds;
+ yday = ((__mon_yday[__isleap(year)][src.month - 1]) + src.day - 1);
+ dest->tv_sec += (((yday * 24) + src.hour) * 60 + src.minute) * 60 + src.second;
+ dest->tv_nsec = 1000 * (src.centiseconds * 10000 +
+ src.hundredsOfMicroseconds * 100 + src.microseconds);
return dest;
}
-kernel_timestamp *udf_time_to_stamp(kernel_timestamp *dest, struct timespec ts)
+timestamp *udf_time_to_disk_stamp(timestamp *dest, struct timespec ts)
{
long int days, rem, y;
const unsigned short int *ip;
@@ -128,7 +127,7 @@ kernel_timestamp *udf_time_to_stamp(kernel_timestamp *dest, struct timespec ts)
if (!dest)
return NULL;
- dest->typeAndTimezone = 0x1000 | (offset & 0x0FFF);
+ dest->typeAndTimezone = cpu_to_le16(0x1000 | (offset & 0x0FFF));
ts.tv_sec += offset * 60;
days = ts.tv_sec / SECS_PER_DAY;
@@ -151,7 +150,7 @@ kernel_timestamp *udf_time_to_stamp(kernel_timestamp *dest, struct timespec ts)
- LEAPS_THRU_END_OF(y - 1));
y = yg;
}
- dest->year = y;
+ dest->year = cpu_to_le16(y);
ip = __mon_yday[__isleap(y)];
for (y = 11; days < (long int)ip[y]; --y)
continue;
diff --git a/fs/udf/unicode.c b/fs/udf/unicode.c
index e533b11703b..9fdf8c93c58 100644
--- a/fs/udf/unicode.c
+++ b/fs/udf/unicode.c
@@ -23,7 +23,7 @@
#include <linux/kernel.h>
#include <linux/string.h> /* for memset */
#include <linux/nls.h>
-#include <linux/udf_fs.h>
+#include <linux/crc-itu-t.h>
#include "udf_sb.h"
@@ -49,14 +49,16 @@ int udf_build_ustr(struct ustr *dest, dstring *ptr, int size)
{
int usesize;
- if ((!dest) || (!ptr) || (!size))
+ if (!dest || !ptr || !size)
return -1;
+ BUG_ON(size < 2);
- memset(dest, 0, sizeof(struct ustr));
- usesize = (size > UDF_NAME_LEN) ? UDF_NAME_LEN : size;
+ usesize = min_t(size_t, ptr[size - 1], sizeof(dest->u_name));
+ usesize = min(usesize, size - 2);
dest->u_cmpID = ptr[0];
- dest->u_len = ptr[size - 1];
- memcpy(dest->u_name, ptr + 1, usesize - 1);
+ dest->u_len = usesize;
+ memcpy(dest->u_name, ptr + 1, usesize);
+ memset(dest->u_name + usesize, 0, sizeof(dest->u_name) - usesize);
return 0;
}
@@ -83,9 +85,6 @@ static int udf_build_ustr_exact(struct ustr *dest, dstring *ptr, int exactsize)
* PURPOSE
* Convert OSTA Compressed Unicode to the UTF-8 equivalent.
*
- * DESCRIPTION
- * This routine is only called by udf_filldir().
- *
* PRE-CONDITIONS
* utf Pointer to UTF-8 output buffer.
* ocu Pointer to OSTA Compressed Unicode input buffer
@@ -99,43 +98,39 @@ static int udf_build_ustr_exact(struct ustr *dest, dstring *ptr, int exactsize)
* November 12, 1997 - Andrew E. Mileski
* Written, tested, and released.
*/
-int udf_CS0toUTF8(struct ustr *utf_o, struct ustr *ocu_i)
+int udf_CS0toUTF8(struct ustr *utf_o, const struct ustr *ocu_i)
{
- uint8_t *ocu;
- uint32_t c;
+ const uint8_t *ocu;
uint8_t cmp_id, ocu_len;
int i;
- ocu = ocu_i->u_name;
-
ocu_len = ocu_i->u_len;
- cmp_id = ocu_i->u_cmpID;
- utf_o->u_len = 0;
-
if (ocu_len == 0) {
memset(utf_o, 0, sizeof(struct ustr));
- utf_o->u_cmpID = 0;
- utf_o->u_len = 0;
return 0;
}
- if ((cmp_id != 8) && (cmp_id != 16)) {
+ cmp_id = ocu_i->u_cmpID;
+ if (cmp_id != 8 && cmp_id != 16) {
+ memset(utf_o, 0, sizeof(struct ustr));
printk(KERN_ERR "udf: unknown compression code (%d) stri=%s\n",
cmp_id, ocu_i->u_name);
return 0;
}
+ ocu = ocu_i->u_name;
+ utf_o->u_len = 0;
for (i = 0; (i < ocu_len) && (utf_o->u_len <= (UDF_NAME_LEN - 3));) {
/* Expand OSTA compressed Unicode to Unicode */
- c = ocu[i++];
+ uint32_t c = ocu[i++];
if (cmp_id == 16)
c = (c << 8) | ocu[i++];
/* Compress Unicode to UTF-8 */
- if (c < 0x80U) {
+ if (c < 0x80U)
utf_o->u_name[utf_o->u_len++] = (uint8_t)c;
- } else if (c < 0x800U) {
+ else if (c < 0x800U) {
utf_o->u_name[utf_o->u_len++] =
(uint8_t)(0xc0 | (c >> 6));
utf_o->u_name[utf_o->u_len++] =
@@ -255,35 +250,32 @@ error_out:
}
static int udf_CS0toNLS(struct nls_table *nls, struct ustr *utf_o,
- struct ustr *ocu_i)
+ const struct ustr *ocu_i)
{
- uint8_t *ocu;
- uint32_t c;
+ const uint8_t *ocu;
uint8_t cmp_id, ocu_len;
int i;
- ocu = ocu_i->u_name;
ocu_len = ocu_i->u_len;
- cmp_id = ocu_i->u_cmpID;
- utf_o->u_len = 0;
-
if (ocu_len == 0) {
memset(utf_o, 0, sizeof(struct ustr));
- utf_o->u_cmpID = 0;
- utf_o->u_len = 0;
return 0;
}
- if ((cmp_id != 8) && (cmp_id != 16)) {
+ cmp_id = ocu_i->u_cmpID;
+ if (cmp_id != 8 && cmp_id != 16) {
+ memset(utf_o, 0, sizeof(struct ustr));
printk(KERN_ERR "udf: unknown compression code (%d) stri=%s\n",
cmp_id, ocu_i->u_name);
return 0;
}
+ ocu = ocu_i->u_name;
+ utf_o->u_len = 0;
for (i = 0; (i < ocu_len) && (utf_o->u_len <= (UDF_NAME_LEN - 3));) {
/* Expand OSTA compressed Unicode to Unicode */
- c = ocu[i++];
+ uint32_t c = ocu[i++];
if (cmp_id == 16)
c = (c << 8) | ocu[i++];
@@ -463,7 +455,7 @@ static int udf_translate_to_linux(uint8_t *newName, uint8_t *udfName,
} else if (newIndex > 250)
newIndex = 250;
newName[newIndex++] = CRC_MARK;
- valueCRC = udf_crc(fidName, fidNameLen, 0);
+ valueCRC = crc_itu_t(0, fidName, fidNameLen);
newName[newIndex++] = hexChar[(valueCRC & 0xf000) >> 12];
newName[newIndex++] = hexChar[(valueCRC & 0x0f00) >> 8];
newName[newIndex++] = hexChar[(valueCRC & 0x00f0) >> 4];
diff --git a/fs/utimes.c b/fs/utimes.c
index b18da9c0b97..a2bef77dc9c 100644
--- a/fs/utimes.c
+++ b/fs/utimes.c
@@ -2,6 +2,7 @@
#include <linux/file.h>
#include <linux/fs.h>
#include <linux/linkage.h>
+#include <linux/mount.h>
#include <linux/namei.h>
#include <linux/sched.h>
#include <linux/stat.h>
@@ -59,6 +60,7 @@ long do_utimes(int dfd, char __user *filename, struct timespec *times, int flags
struct inode *inode;
struct iattr newattrs;
struct file *f = NULL;
+ struct vfsmount *mnt;
error = -EINVAL;
if (times && (!nsec_valid(times[0].tv_nsec) ||
@@ -79,18 +81,20 @@ long do_utimes(int dfd, char __user *filename, struct timespec *times, int flags
if (!f)
goto out;
dentry = f->f_path.dentry;
+ mnt = f->f_path.mnt;
} else {
error = __user_walk_fd(dfd, filename, (flags & AT_SYMLINK_NOFOLLOW) ? 0 : LOOKUP_FOLLOW, &nd);
if (error)
goto out;
dentry = nd.path.dentry;
+ mnt = nd.path.mnt;
}
inode = dentry->d_inode;
- error = -EROFS;
- if (IS_RDONLY(inode))
+ error = mnt_want_write(mnt);
+ if (error)
goto dput_and_out;
/* Don't worry, the checks are done in inode_change_ok() */
@@ -98,7 +102,7 @@ long do_utimes(int dfd, char __user *filename, struct timespec *times, int flags
if (times) {
error = -EPERM;
if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
- goto dput_and_out;
+ goto mnt_drop_write_and_out;
if (times[0].tv_nsec == UTIME_OMIT)
newattrs.ia_valid &= ~ATTR_ATIME;
@@ -118,22 +122,24 @@ long do_utimes(int dfd, char __user *filename, struct timespec *times, int flags
} else {
error = -EACCES;
if (IS_IMMUTABLE(inode))
- goto dput_and_out;
+ goto mnt_drop_write_and_out;
if (!is_owner_or_cap(inode)) {
if (f) {
if (!(f->f_mode & FMODE_WRITE))
- goto dput_and_out;
+ goto mnt_drop_write_and_out;
} else {
error = vfs_permission(&nd, MAY_WRITE);
if (error)
- goto dput_and_out;
+ goto mnt_drop_write_and_out;
}
}
}
mutex_lock(&inode->i_mutex);
error = notify_change(dentry, &newattrs);
mutex_unlock(&inode->i_mutex);
+mnt_drop_write_and_out:
+ mnt_drop_write(mnt);
dput_and_out:
if (f)
fput(f);
diff --git a/fs/xattr.c b/fs/xattr.c
index 3acab161546..89a942f07e1 100644
--- a/fs/xattr.c
+++ b/fs/xattr.c
@@ -11,6 +11,7 @@
#include <linux/slab.h>
#include <linux/file.h>
#include <linux/xattr.h>
+#include <linux/mount.h>
#include <linux/namei.h>
#include <linux/security.h>
#include <linux/syscalls.h>
@@ -32,8 +33,6 @@ xattr_permission(struct inode *inode, const char *name, int mask)
* filesystem or on an immutable / append-only inode.
*/
if (mask & MAY_WRITE) {
- if (IS_RDONLY(inode))
- return -EROFS;
if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
return -EPERM;
}
@@ -262,7 +261,11 @@ sys_setxattr(char __user *path, char __user *name, void __user *value,
error = user_path_walk(path, &nd);
if (error)
return error;
- error = setxattr(nd.path.dentry, name, value, size, flags);
+ error = mnt_want_write(nd.path.mnt);
+ if (!error) {
+ error = setxattr(nd.path.dentry, name, value, size, flags);
+ mnt_drop_write(nd.path.mnt);
+ }
path_put(&nd.path);
return error;
}
@@ -277,7 +280,11 @@ sys_lsetxattr(char __user *path, char __user *name, void __user *value,
error = user_path_walk_link(path, &nd);
if (error)
return error;
- error = setxattr(nd.path.dentry, name, value, size, flags);
+ error = mnt_want_write(nd.path.mnt);
+ if (!error) {
+ error = setxattr(nd.path.dentry, name, value, size, flags);
+ mnt_drop_write(nd.path.mnt);
+ }
path_put(&nd.path);
return error;
}
@@ -295,7 +302,11 @@ sys_fsetxattr(int fd, char __user *name, void __user *value,
return error;
dentry = f->f_path.dentry;
audit_inode(NULL, dentry);
- error = setxattr(dentry, name, value, size, flags);
+ error = mnt_want_write(f->f_path.mnt);
+ if (!error) {
+ error = setxattr(dentry, name, value, size, flags);
+ mnt_drop_write(f->f_path.mnt);
+ }
fput(f);
return error;
}
@@ -482,7 +493,11 @@ sys_removexattr(char __user *path, char __user *name)
error = user_path_walk(path, &nd);
if (error)
return error;
- error = removexattr(nd.path.dentry, name);
+ error = mnt_want_write(nd.path.mnt);
+ if (!error) {
+ error = removexattr(nd.path.dentry, name);
+ mnt_drop_write(nd.path.mnt);
+ }
path_put(&nd.path);
return error;
}
@@ -496,7 +511,11 @@ sys_lremovexattr(char __user *path, char __user *name)
error = user_path_walk_link(path, &nd);
if (error)
return error;
- error = removexattr(nd.path.dentry, name);
+ error = mnt_want_write(nd.path.mnt);
+ if (!error) {
+ error = removexattr(nd.path.dentry, name);
+ mnt_drop_write(nd.path.mnt);
+ }
path_put(&nd.path);
return error;
}
@@ -513,7 +532,11 @@ sys_fremovexattr(int fd, char __user *name)
return error;
dentry = f->f_path.dentry;
audit_inode(NULL, dentry);
- error = removexattr(dentry, name);
+ error = mnt_want_write(f->f_path.mnt);
+ if (!error) {
+ error = removexattr(dentry, name);
+ mnt_drop_write(f->f_path.mnt);
+ }
fput(f);
return error;
}
diff --git a/fs/xfs/Kconfig b/fs/xfs/Kconfig
index 35115bca036..524021ff543 100644
--- a/fs/xfs/Kconfig
+++ b/fs/xfs/Kconfig
@@ -35,18 +35,6 @@ config XFS_QUOTA
with or without the generic quota support enabled (CONFIG_QUOTA) -
they are completely independent subsystems.
-config XFS_SECURITY
- bool "XFS Security Label support"
- depends on XFS_FS
- help
- Security labels support alternative access control models
- implemented by security modules like SELinux. This option
- enables an extended attribute namespace for inode security
- labels in the XFS filesystem.
-
- If you are not using a security module that requires using
- extended attributes for inode security labels, say N.
-
config XFS_POSIX_ACL
bool "XFS POSIX ACL support"
depends on XFS_FS
diff --git a/fs/xfs/linux-2.6/kmem.c b/fs/xfs/linux-2.6/kmem.c
index e040f1ce1b6..9b1bb17a050 100644
--- a/fs/xfs/linux-2.6/kmem.c
+++ b/fs/xfs/linux-2.6/kmem.c
@@ -37,7 +37,7 @@ kmem_alloc(size_t size, unsigned int __nocast flags)
#ifdef DEBUG
if (unlikely(!(flags & KM_LARGE) && (size > PAGE_SIZE))) {
printk(KERN_WARNING "Large %s attempt, size=%ld\n",
- __FUNCTION__, (long)size);
+ __func__, (long)size);
dump_stack();
}
#endif
@@ -52,7 +52,7 @@ kmem_alloc(size_t size, unsigned int __nocast flags)
if (!(++retries % 100))
printk(KERN_ERR "XFS: possible memory allocation "
"deadlock in %s (mode:0x%x)\n",
- __FUNCTION__, lflags);
+ __func__, lflags);
congestion_wait(WRITE, HZ/50);
} while (1);
}
@@ -129,7 +129,7 @@ kmem_zone_alloc(kmem_zone_t *zone, unsigned int __nocast flags)
if (!(++retries % 100))
printk(KERN_ERR "XFS: possible memory allocation "
"deadlock in %s (mode:0x%x)\n",
- __FUNCTION__, lflags);
+ __func__, lflags);
congestion_wait(WRITE, HZ/50);
} while (1);
}
diff --git a/fs/xfs/linux-2.6/sema.h b/fs/xfs/linux-2.6/sema.h
index 2009e6d922c..3abe7e9ceb3 100644
--- a/fs/xfs/linux-2.6/sema.h
+++ b/fs/xfs/linux-2.6/sema.h
@@ -20,8 +20,8 @@
#include <linux/time.h>
#include <linux/wait.h>
+#include <linux/semaphore.h>
#include <asm/atomic.h>
-#include <asm/semaphore.h>
/*
* sema_t structure just maps to struct semaphore in Linux kernel.
diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c
index e0519529c26..a55c3b26d84 100644
--- a/fs/xfs/linux-2.6/xfs_aops.c
+++ b/fs/xfs/linux-2.6/xfs_aops.c
@@ -243,8 +243,12 @@ xfs_end_bio_unwritten(
size_t size = ioend->io_size;
if (likely(!ioend->io_error)) {
- if (!XFS_FORCED_SHUTDOWN(ip->i_mount))
- xfs_iomap_write_unwritten(ip, offset, size);
+ if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) {
+ int error;
+ error = xfs_iomap_write_unwritten(ip, offset, size);
+ if (error)
+ ioend->io_error = error;
+ }
xfs_setfilesize(ioend);
}
xfs_destroy_ioend(ioend);
@@ -1532,9 +1536,9 @@ xfs_vm_bmap(
struct xfs_inode *ip = XFS_I(inode);
xfs_itrace_entry(XFS_I(inode));
- xfs_rwlock(ip, VRWLOCK_READ);
+ xfs_ilock(ip, XFS_IOLOCK_SHARED);
xfs_flush_pages(ip, (xfs_off_t)0, -1, 0, FI_REMAPF);
- xfs_rwunlock(ip, VRWLOCK_READ);
+ xfs_iunlock(ip, XFS_IOLOCK_SHARED);
return generic_block_bmap(mapping, block, xfs_get_blocks);
}
diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c
index e347bfd47c9..52f6846101d 100644
--- a/fs/xfs/linux-2.6/xfs_buf.c
+++ b/fs/xfs/linux-2.6/xfs_buf.c
@@ -400,7 +400,7 @@ _xfs_buf_lookup_pages(
printk(KERN_ERR
"XFS: possible memory allocation "
"deadlock in %s (mode:0x%x)\n",
- __FUNCTION__, gfp_mask);
+ __func__, gfp_mask);
XFS_STATS_INC(xb_page_retries);
xfsbufd_wakeup(0, gfp_mask);
@@ -598,7 +598,7 @@ xfs_buf_get_flags(
error = _xfs_buf_map_pages(bp, flags);
if (unlikely(error)) {
printk(KERN_WARNING "%s: failed to map pages\n",
- __FUNCTION__);
+ __func__);
goto no_buffer;
}
}
@@ -778,7 +778,7 @@ xfs_buf_get_noaddr(
error = _xfs_buf_map_pages(bp, XBF_MAPPED);
if (unlikely(error)) {
printk(KERN_WARNING "%s: failed to map pages\n",
- __FUNCTION__);
+ __func__);
goto fail_free_mem;
}
@@ -1060,7 +1060,7 @@ xfs_buf_iostart(
bp->b_flags &= ~(XBF_READ | XBF_WRITE | XBF_ASYNC);
bp->b_flags |= flags & (XBF_DELWRI | XBF_ASYNC);
xfs_buf_delwri_queue(bp, 1);
- return status;
+ return 0;
}
bp->b_flags &= ~(XBF_READ | XBF_WRITE | XBF_ASYNC | XBF_DELWRI | \
diff --git a/fs/xfs/linux-2.6/xfs_buf.h b/fs/xfs/linux-2.6/xfs_buf.h
index a3d207de48b..841d7883528 100644
--- a/fs/xfs/linux-2.6/xfs_buf.h
+++ b/fs/xfs/linux-2.6/xfs_buf.h
@@ -387,11 +387,15 @@ static inline int XFS_bwrite(xfs_buf_t *bp)
return error;
}
-static inline int xfs_bdwrite(void *mp, xfs_buf_t *bp)
+/*
+ * No error can be returned from xfs_buf_iostart for delwri
+ * buffers as they are queued and no I/O is issued.
+ */
+static inline void xfs_bdwrite(void *mp, xfs_buf_t *bp)
{
bp->b_strat = xfs_bdstrat_cb;
bp->b_fspriv3 = mp;
- return xfs_buf_iostart(bp, XBF_DELWRI | XBF_ASYNC);
+ (void)xfs_buf_iostart(bp, XBF_DELWRI | XBF_ASYNC);
}
#define XFS_bdstrat(bp) xfs_buf_iorequest(bp)
diff --git a/fs/xfs/linux-2.6/xfs_cred.h b/fs/xfs/linux-2.6/xfs_cred.h
index e7f3da61c6c..652721ce0ea 100644
--- a/fs/xfs/linux-2.6/xfs_cred.h
+++ b/fs/xfs/linux-2.6/xfs_cred.h
@@ -30,7 +30,7 @@ typedef struct cred {
extern struct cred *sys_cred;
/* this is a hack.. (assumes sys_cred is the only cred_t in the system) */
-static __inline int capable_cred(cred_t *cr, int cid)
+static inline int capable_cred(cred_t *cr, int cid)
{
return (cr == sys_cred) ? 1 : capable(cid);
}
diff --git a/fs/xfs/linux-2.6/xfs_export.c b/fs/xfs/linux-2.6/xfs_export.c
index ca4f66c4de1..265f0168ab7 100644
--- a/fs/xfs/linux-2.6/xfs_export.c
+++ b/fs/xfs/linux-2.6/xfs_export.c
@@ -22,6 +22,7 @@
#include "xfs_trans.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
+#include "xfs_dir2.h"
#include "xfs_dmapi.h"
#include "xfs_mount.h"
#include "xfs_export.h"
@@ -30,8 +31,6 @@
#include "xfs_inode.h"
#include "xfs_vfsops.h"
-static struct dentry dotdot = { .d_name.name = "..", .d_name.len = 2, };
-
/*
* Note that we only accept fileids which are long enough rather than allow
* the parent generation number to default to zero. XFS considers zero a
@@ -66,7 +65,7 @@ xfs_fs_encode_fh(
int len;
/* Directories don't need their parent encoded, they have ".." */
- if (S_ISDIR(inode->i_mode))
+ if (S_ISDIR(inode->i_mode) || !connectable)
fileid_type = FILEID_INO32_GEN;
else
fileid_type = FILEID_INO32_GEN_PARENT;
@@ -213,17 +212,16 @@ xfs_fs_get_parent(
struct dentry *child)
{
int error;
- bhv_vnode_t *cvp;
+ struct xfs_inode *cip;
struct dentry *parent;
- cvp = NULL;
- error = xfs_lookup(XFS_I(child->d_inode), &dotdot, &cvp);
+ error = xfs_lookup(XFS_I(child->d_inode), &xfs_name_dotdot, &cip);
if (unlikely(error))
return ERR_PTR(-error);
- parent = d_alloc_anon(vn_to_inode(cvp));
+ parent = d_alloc_anon(cip->i_vnode);
if (unlikely(!parent)) {
- VN_RELE(cvp);
+ iput(cip->i_vnode);
return ERR_PTR(-ENOMEM);
}
return parent;
diff --git a/fs/xfs/linux-2.6/xfs_file.c b/fs/xfs/linux-2.6/xfs_file.c
index edab1ffbb16..05905246434 100644
--- a/fs/xfs/linux-2.6/xfs_file.c
+++ b/fs/xfs/linux-2.6/xfs_file.c
@@ -469,16 +469,11 @@ xfs_file_open_exec(
struct inode *inode)
{
struct xfs_mount *mp = XFS_M(inode->i_sb);
+ struct xfs_inode *ip = XFS_I(inode);
- if (unlikely(mp->m_flags & XFS_MOUNT_DMAPI)) {
- if (DM_EVENT_ENABLED(XFS_I(inode), DM_EVENT_READ)) {
- bhv_vnode_t *vp = vn_from_inode(inode);
-
- return -XFS_SEND_DATA(mp, DM_EVENT_READ,
- vp, 0, 0, 0, NULL);
- }
- }
-
+ if (unlikely(mp->m_flags & XFS_MOUNT_DMAPI) &&
+ DM_EVENT_ENABLED(ip, DM_EVENT_READ))
+ return -XFS_SEND_DATA(mp, DM_EVENT_READ, ip, 0, 0, 0, NULL);
return 0;
}
#endif /* HAVE_FOP_OPEN_EXEC */
diff --git a/fs/xfs/linux-2.6/xfs_fs_subr.c b/fs/xfs/linux-2.6/xfs_fs_subr.c
index ac6d34cc355..1eefe61f0e1 100644
--- a/fs/xfs/linux-2.6/xfs_fs_subr.c
+++ b/fs/xfs/linux-2.6/xfs_fs_subr.c
@@ -17,18 +17,7 @@
*/
#include "xfs.h"
#include "xfs_vnodeops.h"
-
-/*
- * The following six includes are needed so that we can include
- * xfs_inode.h. What a mess..
- */
#include "xfs_bmap_btree.h"
-#include "xfs_inum.h"
-#include "xfs_dir2.h"
-#include "xfs_dir2_sf.h"
-#include "xfs_attr_sf.h"
-#include "xfs_dinode.h"
-
#include "xfs_inode.h"
int fs_noerr(void) { return 0; }
@@ -42,11 +31,10 @@ xfs_tosspages(
xfs_off_t last,
int fiopt)
{
- bhv_vnode_t *vp = XFS_ITOV(ip);
- struct inode *inode = vn_to_inode(vp);
+ struct address_space *mapping = ip->i_vnode->i_mapping;
- if (VN_CACHED(vp))
- truncate_inode_pages(inode->i_mapping, first);
+ if (mapping->nrpages)
+ truncate_inode_pages(mapping, first);
}
int
@@ -56,15 +44,14 @@ xfs_flushinval_pages(
xfs_off_t last,
int fiopt)
{
- bhv_vnode_t *vp = XFS_ITOV(ip);
- struct inode *inode = vn_to_inode(vp);
+ struct address_space *mapping = ip->i_vnode->i_mapping;
int ret = 0;
- if (VN_CACHED(vp)) {
+ if (mapping->nrpages) {
xfs_iflags_clear(ip, XFS_ITRUNCATED);
- ret = filemap_write_and_wait(inode->i_mapping);
+ ret = filemap_write_and_wait(mapping);
if (!ret)
- truncate_inode_pages(inode->i_mapping, first);
+ truncate_inode_pages(mapping, first);
}
return ret;
}
@@ -77,17 +64,16 @@ xfs_flush_pages(
uint64_t flags,
int fiopt)
{
- bhv_vnode_t *vp = XFS_ITOV(ip);
- struct inode *inode = vn_to_inode(vp);
+ struct address_space *mapping = ip->i_vnode->i_mapping;
int ret = 0;
int ret2;
- if (VN_DIRTY(vp)) {
+ if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
xfs_iflags_clear(ip, XFS_ITRUNCATED);
- ret = filemap_fdatawrite(inode->i_mapping);
+ ret = filemap_fdatawrite(mapping);
if (flags & XFS_B_ASYNC)
return ret;
- ret2 = filemap_fdatawait(inode->i_mapping);
+ ret2 = filemap_fdatawait(mapping);
if (!ret)
ret = ret2;
}
diff --git a/fs/xfs/linux-2.6/xfs_ioctl.c b/fs/xfs/linux-2.6/xfs_ioctl.c
index f34bd010eb5..4ddb86b73c6 100644
--- a/fs/xfs/linux-2.6/xfs_ioctl.c
+++ b/fs/xfs/linux-2.6/xfs_ioctl.c
@@ -535,8 +535,6 @@ xfs_attrmulti_attr_set(
char *kbuf;
int error = EFAULT;
- if (IS_RDONLY(inode))
- return -EROFS;
if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
return EPERM;
if (len > XATTR_SIZE_MAX)
@@ -562,8 +560,6 @@ xfs_attrmulti_attr_remove(
char *name,
__uint32_t flags)
{
- if (IS_RDONLY(inode))
- return -EROFS;
if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
return EPERM;
return xfs_attr_remove(XFS_I(inode), name, flags);
@@ -573,6 +569,7 @@ STATIC int
xfs_attrmulti_by_handle(
xfs_mount_t *mp,
void __user *arg,
+ struct file *parfilp,
struct inode *parinode)
{
int error;
@@ -626,13 +623,21 @@ xfs_attrmulti_by_handle(
&ops[i].am_length, ops[i].am_flags);
break;
case ATTR_OP_SET:
+ ops[i].am_error = mnt_want_write(parfilp->f_path.mnt);
+ if (ops[i].am_error)
+ break;
ops[i].am_error = xfs_attrmulti_attr_set(inode,
attr_name, ops[i].am_attrvalue,
ops[i].am_length, ops[i].am_flags);
+ mnt_drop_write(parfilp->f_path.mnt);
break;
case ATTR_OP_REMOVE:
+ ops[i].am_error = mnt_want_write(parfilp->f_path.mnt);
+ if (ops[i].am_error)
+ break;
ops[i].am_error = xfs_attrmulti_attr_remove(inode,
attr_name, ops[i].am_flags);
+ mnt_drop_write(parfilp->f_path.mnt);
break;
default:
ops[i].am_error = EINVAL;
@@ -651,314 +656,6 @@ xfs_attrmulti_by_handle(
return -error;
}
-/* prototypes for a few of the stack-hungry cases that have
- * their own functions. Functions are defined after their use
- * so gcc doesn't get fancy and inline them with -03 */
-
-STATIC int
-xfs_ioc_space(
- struct xfs_inode *ip,
- struct inode *inode,
- struct file *filp,
- int flags,
- unsigned int cmd,
- void __user *arg);
-
-STATIC int
-xfs_ioc_bulkstat(
- xfs_mount_t *mp,
- unsigned int cmd,
- void __user *arg);
-
-STATIC int
-xfs_ioc_fsgeometry_v1(
- xfs_mount_t *mp,
- void __user *arg);
-
-STATIC int
-xfs_ioc_fsgeometry(
- xfs_mount_t *mp,
- void __user *arg);
-
-STATIC int
-xfs_ioc_xattr(
- xfs_inode_t *ip,
- struct file *filp,
- unsigned int cmd,
- void __user *arg);
-
-STATIC int
-xfs_ioc_fsgetxattr(
- xfs_inode_t *ip,
- int attr,
- void __user *arg);
-
-STATIC int
-xfs_ioc_getbmap(
- struct xfs_inode *ip,
- int flags,
- unsigned int cmd,
- void __user *arg);
-
-STATIC int
-xfs_ioc_getbmapx(
- struct xfs_inode *ip,
- void __user *arg);
-
-int
-xfs_ioctl(
- xfs_inode_t *ip,
- struct file *filp,
- int ioflags,
- unsigned int cmd,
- void __user *arg)
-{
- struct inode *inode = filp->f_path.dentry->d_inode;
- xfs_mount_t *mp = ip->i_mount;
- int error;
-
- xfs_itrace_entry(XFS_I(inode));
- switch (cmd) {
-
- case XFS_IOC_ALLOCSP:
- case XFS_IOC_FREESP:
- case XFS_IOC_RESVSP:
- case XFS_IOC_UNRESVSP:
- case XFS_IOC_ALLOCSP64:
- case XFS_IOC_FREESP64:
- case XFS_IOC_RESVSP64:
- case XFS_IOC_UNRESVSP64:
- /*
- * Only allow the sys admin to reserve space unless
- * unwritten extents are enabled.
- */
- if (!xfs_sb_version_hasextflgbit(&mp->m_sb) &&
- !capable(CAP_SYS_ADMIN))
- return -EPERM;
-
- return xfs_ioc_space(ip, inode, filp, ioflags, cmd, arg);
-
- case XFS_IOC_DIOINFO: {
- struct dioattr da;
- xfs_buftarg_t *target =
- XFS_IS_REALTIME_INODE(ip) ?
- mp->m_rtdev_targp : mp->m_ddev_targp;
-
- da.d_mem = da.d_miniosz = 1 << target->bt_sshift;
- da.d_maxiosz = INT_MAX & ~(da.d_miniosz - 1);
-
- if (copy_to_user(arg, &da, sizeof(da)))
- return -XFS_ERROR(EFAULT);
- return 0;
- }
-
- case XFS_IOC_FSBULKSTAT_SINGLE:
- case XFS_IOC_FSBULKSTAT:
- case XFS_IOC_FSINUMBERS:
- return xfs_ioc_bulkstat(mp, cmd, arg);
-
- case XFS_IOC_FSGEOMETRY_V1:
- return xfs_ioc_fsgeometry_v1(mp, arg);
-
- case XFS_IOC_FSGEOMETRY:
- return xfs_ioc_fsgeometry(mp, arg);
-
- case XFS_IOC_GETVERSION:
- return put_user(inode->i_generation, (int __user *)arg);
-
- case XFS_IOC_FSGETXATTR:
- return xfs_ioc_fsgetxattr(ip, 0, arg);
- case XFS_IOC_FSGETXATTRA:
- return xfs_ioc_fsgetxattr(ip, 1, arg);
- case XFS_IOC_GETXFLAGS:
- case XFS_IOC_SETXFLAGS:
- case XFS_IOC_FSSETXATTR:
- return xfs_ioc_xattr(ip, filp, cmd, arg);
-
- case XFS_IOC_FSSETDM: {
- struct fsdmidata dmi;
-
- if (copy_from_user(&dmi, arg, sizeof(dmi)))
- return -XFS_ERROR(EFAULT);
-
- error = xfs_set_dmattrs(ip, dmi.fsd_dmevmask,
- dmi.fsd_dmstate);
- return -error;
- }
-
- case XFS_IOC_GETBMAP:
- case XFS_IOC_GETBMAPA:
- return xfs_ioc_getbmap(ip, ioflags, cmd, arg);
-
- case XFS_IOC_GETBMAPX:
- return xfs_ioc_getbmapx(ip, arg);
-
- case XFS_IOC_FD_TO_HANDLE:
- case XFS_IOC_PATH_TO_HANDLE:
- case XFS_IOC_PATH_TO_FSHANDLE:
- return xfs_find_handle(cmd, arg);
-
- case XFS_IOC_OPEN_BY_HANDLE:
- return xfs_open_by_handle(mp, arg, filp, inode);
-
- case XFS_IOC_FSSETDM_BY_HANDLE:
- return xfs_fssetdm_by_handle(mp, arg, inode);
-
- case XFS_IOC_READLINK_BY_HANDLE:
- return xfs_readlink_by_handle(mp, arg, inode);
-
- case XFS_IOC_ATTRLIST_BY_HANDLE:
- return xfs_attrlist_by_handle(mp, arg, inode);
-
- case XFS_IOC_ATTRMULTI_BY_HANDLE:
- return xfs_attrmulti_by_handle(mp, arg, inode);
-
- case XFS_IOC_SWAPEXT: {
- error = xfs_swapext((struct xfs_swapext __user *)arg);
- return -error;
- }
-
- case XFS_IOC_FSCOUNTS: {
- xfs_fsop_counts_t out;
-
- error = xfs_fs_counts(mp, &out);
- if (error)
- return -error;
-
- if (copy_to_user(arg, &out, sizeof(out)))
- return -XFS_ERROR(EFAULT);
- return 0;
- }
-
- case XFS_IOC_SET_RESBLKS: {
- xfs_fsop_resblks_t inout;
- __uint64_t in;
-
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
-
- if (copy_from_user(&inout, arg, sizeof(inout)))
- return -XFS_ERROR(EFAULT);
-
- /* input parameter is passed in resblks field of structure */
- in = inout.resblks;
- error = xfs_reserve_blocks(mp, &in, &inout);
- if (error)
- return -error;
-
- if (copy_to_user(arg, &inout, sizeof(inout)))
- return -XFS_ERROR(EFAULT);
- return 0;
- }
-
- case XFS_IOC_GET_RESBLKS: {
- xfs_fsop_resblks_t out;
-
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
-
- error = xfs_reserve_blocks(mp, NULL, &out);
- if (error)
- return -error;
-
- if (copy_to_user(arg, &out, sizeof(out)))
- return -XFS_ERROR(EFAULT);
-
- return 0;
- }
-
- case XFS_IOC_FSGROWFSDATA: {
- xfs_growfs_data_t in;
-
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
-
- if (copy_from_user(&in, arg, sizeof(in)))
- return -XFS_ERROR(EFAULT);
-
- error = xfs_growfs_data(mp, &in);
- return -error;
- }
-
- case XFS_IOC_FSGROWFSLOG: {
- xfs_growfs_log_t in;
-
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
-
- if (copy_from_user(&in, arg, sizeof(in)))
- return -XFS_ERROR(EFAULT);
-
- error = xfs_growfs_log(mp, &in);
- return -error;
- }
-
- case XFS_IOC_FSGROWFSRT: {
- xfs_growfs_rt_t in;
-
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
-
- if (copy_from_user(&in, arg, sizeof(in)))
- return -XFS_ERROR(EFAULT);
-
- error = xfs_growfs_rt(mp, &in);
- return -error;
- }
-
- case XFS_IOC_FREEZE:
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
-
- if (inode->i_sb->s_frozen == SB_UNFROZEN)
- freeze_bdev(inode->i_sb->s_bdev);
- return 0;
-
- case XFS_IOC_THAW:
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
- if (inode->i_sb->s_frozen != SB_UNFROZEN)
- thaw_bdev(inode->i_sb->s_bdev, inode->i_sb);
- return 0;
-
- case XFS_IOC_GOINGDOWN: {
- __uint32_t in;
-
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
-
- if (get_user(in, (__uint32_t __user *)arg))
- return -XFS_ERROR(EFAULT);
-
- error = xfs_fs_goingdown(mp, in);
- return -error;
- }
-
- case XFS_IOC_ERROR_INJECTION: {
- xfs_error_injection_t in;
-
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
-
- if (copy_from_user(&in, arg, sizeof(in)))
- return -XFS_ERROR(EFAULT);
-
- error = xfs_errortag_add(in.errtag, mp);
- return -error;
- }
-
- case XFS_IOC_ERROR_CLEARALL:
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
-
- error = xfs_errortag_clearall(mp, 1);
- return -error;
-
- default:
- return -ENOTTY;
- }
-}
-
STATIC int
xfs_ioc_space(
struct xfs_inode *ip,
@@ -1179,85 +876,85 @@ xfs_ioc_fsgetxattr(
}
+/*
+ * XFS_IOC_FSSETXATTR: copy a struct fsxattr from user space and apply its
+ * xflags, extent size and project id to the inode through xfs_setattr().
+ *
+ * Returns 0 on success or a negative errno: -EFAULT if the user buffer
+ * cannot be read, -ENOMEM if the attribute structure cannot be allocated,
+ * or the negated error reported by xfs_setattr().
+ */
STATIC int
-xfs_ioc_xattr(
+xfs_ioc_fssetxattr(
xfs_inode_t *ip,
struct file *filp,
- unsigned int cmd,
void __user *arg)
{
struct fsxattr fa;
struct bhv_vattr *vattr;
- int error = 0;
+ int error;
int attr_flags;
- unsigned int flags;
+
+ if (copy_from_user(&fa, arg, sizeof(fa)))
+ return -EFAULT;
vattr = kmalloc(sizeof(*vattr), GFP_KERNEL);
if (unlikely(!vattr))
return -ENOMEM;
- switch (cmd) {
- case XFS_IOC_FSSETXATTR: {
- if (copy_from_user(&fa, arg, sizeof(fa))) {
- error = -EFAULT;
- break;
- }
+ attr_flags = 0;
+ if (filp->f_flags & (O_NDELAY|O_NONBLOCK))
+ attr_flags |= ATTR_NONBLOCK;
- attr_flags = 0;
- if (filp->f_flags & (O_NDELAY|O_NONBLOCK))
- attr_flags |= ATTR_NONBLOCK;
+ vattr->va_mask = XFS_AT_XFLAGS | XFS_AT_EXTSIZE | XFS_AT_PROJID;
+ vattr->va_xflags = fa.fsx_xflags;
+ vattr->va_extsize = fa.fsx_extsize;
+ vattr->va_projid = fa.fsx_projid;
- vattr->va_mask = XFS_AT_XFLAGS | XFS_AT_EXTSIZE | XFS_AT_PROJID;
- vattr->va_xflags = fa.fsx_xflags;
- vattr->va_extsize = fa.fsx_extsize;
- vattr->va_projid = fa.fsx_projid;
+ error = -xfs_setattr(ip, vattr, attr_flags, NULL);
+ if (!error)
+ vn_revalidate(XFS_ITOV(ip)); /* update flags */
+ kfree(vattr);
+ /* propagate a setattr failure instead of reporting success */
+ return error;
+}
- error = xfs_setattr(ip, vattr, attr_flags, NULL);
- if (likely(!error))
- vn_revalidate(XFS_ITOV(ip)); /* update flags */
- error = -error;
- break;
- }
+STATIC int
+xfs_ioc_getxflags(
+ xfs_inode_t *ip,
+ void __user *arg)
+{
+ unsigned int flags;
- case XFS_IOC_GETXFLAGS: {
- flags = xfs_di2lxflags(ip->i_d.di_flags);
- if (copy_to_user(arg, &flags, sizeof(flags)))
- error = -EFAULT;
- break;
- }
+ flags = xfs_di2lxflags(ip->i_d.di_flags);
+ if (copy_to_user(arg, &flags, sizeof(flags)))
+ return -EFAULT;
+ return 0;
+}
- case XFS_IOC_SETXFLAGS: {
- if (copy_from_user(&flags, arg, sizeof(flags))) {
- error = -EFAULT;
- break;
- }
+STATIC int
+xfs_ioc_setxflags(
+ xfs_inode_t *ip,
+ struct file *filp,
+ void __user *arg)
+{
+ struct bhv_vattr *vattr;
+ unsigned int flags;
+ int attr_flags;
+ int error;
- if (flags & ~(FS_IMMUTABLE_FL | FS_APPEND_FL | \
- FS_NOATIME_FL | FS_NODUMP_FL | \
- FS_SYNC_FL)) {
- error = -EOPNOTSUPP;
- break;
- }
+ if (copy_from_user(&flags, arg, sizeof(flags)))
+ return -EFAULT;
- attr_flags = 0;
- if (filp->f_flags & (O_NDELAY|O_NONBLOCK))
- attr_flags |= ATTR_NONBLOCK;
+ if (flags & ~(FS_IMMUTABLE_FL | FS_APPEND_FL | \
+ FS_NOATIME_FL | FS_NODUMP_FL | \
+ FS_SYNC_FL))
+ return -EOPNOTSUPP;
- vattr->va_mask = XFS_AT_XFLAGS;
- vattr->va_xflags = xfs_merge_ioc_xflags(flags,
- xfs_ip2xflags(ip));
+ vattr = kmalloc(sizeof(*vattr), GFP_KERNEL);
+ if (unlikely(!vattr))
+ return -ENOMEM;
- error = xfs_setattr(ip, vattr, attr_flags, NULL);
- if (likely(!error))
- vn_revalidate(XFS_ITOV(ip)); /* update flags */
- error = -error;
- break;
- }
+ attr_flags = 0;
+ if (filp->f_flags & (O_NDELAY|O_NONBLOCK))
+ attr_flags |= ATTR_NONBLOCK;
- default:
- error = -ENOTTY;
- break;
- }
+ vattr->va_mask = XFS_AT_XFLAGS;
+ vattr->va_xflags = xfs_merge_ioc_xflags(flags, xfs_ip2xflags(ip));
+ error = -xfs_setattr(ip, vattr, attr_flags, NULL);
+ if (likely(!error))
+ vn_revalidate(XFS_ITOV(ip)); /* update flags */
kfree(vattr);
return error;
}
@@ -1332,3 +1029,259 @@ xfs_ioc_getbmapx(
return 0;
}
+
+int
+xfs_ioctl(
+ xfs_inode_t *ip,
+ struct file *filp,
+ int ioflags,
+ unsigned int cmd,
+ void __user *arg)
+{
+ struct inode *inode = filp->f_path.dentry->d_inode;
+ xfs_mount_t *mp = ip->i_mount;
+ int error;
+
+ xfs_itrace_entry(XFS_I(inode));
+ switch (cmd) {
+
+ case XFS_IOC_ALLOCSP:
+ case XFS_IOC_FREESP:
+ case XFS_IOC_RESVSP:
+ case XFS_IOC_UNRESVSP:
+ case XFS_IOC_ALLOCSP64:
+ case XFS_IOC_FREESP64:
+ case XFS_IOC_RESVSP64:
+ case XFS_IOC_UNRESVSP64:
+ /*
+ * Only allow the sys admin to reserve space unless
+ * unwritten extents are enabled.
+ */
+ if (!xfs_sb_version_hasextflgbit(&mp->m_sb) &&
+ !capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ return xfs_ioc_space(ip, inode, filp, ioflags, cmd, arg);
+
+ case XFS_IOC_DIOINFO: {
+ struct dioattr da;
+ xfs_buftarg_t *target =
+ XFS_IS_REALTIME_INODE(ip) ?
+ mp->m_rtdev_targp : mp->m_ddev_targp;
+
+ da.d_mem = da.d_miniosz = 1 << target->bt_sshift;
+ da.d_maxiosz = INT_MAX & ~(da.d_miniosz - 1);
+
+ if (copy_to_user(arg, &da, sizeof(da)))
+ return -XFS_ERROR(EFAULT);
+ return 0;
+ }
+
+ case XFS_IOC_FSBULKSTAT_SINGLE:
+ case XFS_IOC_FSBULKSTAT:
+ case XFS_IOC_FSINUMBERS:
+ return xfs_ioc_bulkstat(mp, cmd, arg);
+
+ case XFS_IOC_FSGEOMETRY_V1:
+ return xfs_ioc_fsgeometry_v1(mp, arg);
+
+ case XFS_IOC_FSGEOMETRY:
+ return xfs_ioc_fsgeometry(mp, arg);
+
+ case XFS_IOC_GETVERSION:
+ return put_user(inode->i_generation, (int __user *)arg);
+
+ case XFS_IOC_FSGETXATTR:
+ return xfs_ioc_fsgetxattr(ip, 0, arg);
+ case XFS_IOC_FSGETXATTRA:
+ return xfs_ioc_fsgetxattr(ip, 1, arg);
+ case XFS_IOC_FSSETXATTR:
+ return xfs_ioc_fssetxattr(ip, filp, arg);
+ case XFS_IOC_GETXFLAGS:
+ return xfs_ioc_getxflags(ip, arg);
+ case XFS_IOC_SETXFLAGS:
+ return xfs_ioc_setxflags(ip, filp, arg);
+
+ case XFS_IOC_FSSETDM: {
+ struct fsdmidata dmi;
+
+ if (copy_from_user(&dmi, arg, sizeof(dmi)))
+ return -XFS_ERROR(EFAULT);
+
+ error = xfs_set_dmattrs(ip, dmi.fsd_dmevmask,
+ dmi.fsd_dmstate);
+ return -error;
+ }
+
+ case XFS_IOC_GETBMAP:
+ case XFS_IOC_GETBMAPA:
+ return xfs_ioc_getbmap(ip, ioflags, cmd, arg);
+
+ case XFS_IOC_GETBMAPX:
+ return xfs_ioc_getbmapx(ip, arg);
+
+ case XFS_IOC_FD_TO_HANDLE:
+ case XFS_IOC_PATH_TO_HANDLE:
+ case XFS_IOC_PATH_TO_FSHANDLE:
+ return xfs_find_handle(cmd, arg);
+
+ case XFS_IOC_OPEN_BY_HANDLE:
+ return xfs_open_by_handle(mp, arg, filp, inode);
+
+ case XFS_IOC_FSSETDM_BY_HANDLE:
+ return xfs_fssetdm_by_handle(mp, arg, inode);
+
+ case XFS_IOC_READLINK_BY_HANDLE:
+ return xfs_readlink_by_handle(mp, arg, inode);
+
+ case XFS_IOC_ATTRLIST_BY_HANDLE:
+ return xfs_attrlist_by_handle(mp, arg, inode);
+
+ case XFS_IOC_ATTRMULTI_BY_HANDLE:
+ return xfs_attrmulti_by_handle(mp, arg, filp, inode);
+
+ case XFS_IOC_SWAPEXT: {
+ error = xfs_swapext((struct xfs_swapext __user *)arg);
+ return -error;
+ }
+
+ case XFS_IOC_FSCOUNTS: {
+ xfs_fsop_counts_t out;
+
+ error = xfs_fs_counts(mp, &out);
+ if (error)
+ return -error;
+
+ if (copy_to_user(arg, &out, sizeof(out)))
+ return -XFS_ERROR(EFAULT);
+ return 0;
+ }
+
+ case XFS_IOC_SET_RESBLKS: {
+ xfs_fsop_resblks_t inout;
+ __uint64_t in;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ if (copy_from_user(&inout, arg, sizeof(inout)))
+ return -XFS_ERROR(EFAULT);
+
+ /* input parameter is passed in resblks field of structure */
+ in = inout.resblks;
+ error = xfs_reserve_blocks(mp, &in, &inout);
+ if (error)
+ return -error;
+
+ if (copy_to_user(arg, &inout, sizeof(inout)))
+ return -XFS_ERROR(EFAULT);
+ return 0;
+ }
+
+ case XFS_IOC_GET_RESBLKS: {
+ xfs_fsop_resblks_t out;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ error = xfs_reserve_blocks(mp, NULL, &out);
+ if (error)
+ return -error;
+
+ if (copy_to_user(arg, &out, sizeof(out)))
+ return -XFS_ERROR(EFAULT);
+
+ return 0;
+ }
+
+ case XFS_IOC_FSGROWFSDATA: {
+ xfs_growfs_data_t in;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ if (copy_from_user(&in, arg, sizeof(in)))
+ return -XFS_ERROR(EFAULT);
+
+ error = xfs_growfs_data(mp, &in);
+ return -error;
+ }
+
+ case XFS_IOC_FSGROWFSLOG: {
+ xfs_growfs_log_t in;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ if (copy_from_user(&in, arg, sizeof(in)))
+ return -XFS_ERROR(EFAULT);
+
+ error = xfs_growfs_log(mp, &in);
+ return -error;
+ }
+
+ case XFS_IOC_FSGROWFSRT: {
+ xfs_growfs_rt_t in;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ if (copy_from_user(&in, arg, sizeof(in)))
+ return -XFS_ERROR(EFAULT);
+
+ error = xfs_growfs_rt(mp, &in);
+ return -error;
+ }
+
+ case XFS_IOC_FREEZE:
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ if (inode->i_sb->s_frozen == SB_UNFROZEN)
+ freeze_bdev(inode->i_sb->s_bdev);
+ return 0;
+
+ case XFS_IOC_THAW:
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+ if (inode->i_sb->s_frozen != SB_UNFROZEN)
+ thaw_bdev(inode->i_sb->s_bdev, inode->i_sb);
+ return 0;
+
+ case XFS_IOC_GOINGDOWN: {
+ __uint32_t in;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ if (get_user(in, (__uint32_t __user *)arg))
+ return -XFS_ERROR(EFAULT);
+
+ error = xfs_fs_goingdown(mp, in);
+ return -error;
+ }
+
+ case XFS_IOC_ERROR_INJECTION: {
+ xfs_error_injection_t in;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ if (copy_from_user(&in, arg, sizeof(in)))
+ return -XFS_ERROR(EFAULT);
+
+ error = xfs_errortag_add(in.errtag, mp);
+ return -error;
+ }
+
+ case XFS_IOC_ERROR_CLEARALL:
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ error = xfs_errortag_clearall(mp, 1);
+ return -error;
+
+ default:
+ return -ENOTTY;
+ }
+}
diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c
index cc4abd3daa4..a1237dad643 100644
--- a/fs/xfs/linux-2.6/xfs_iops.c
+++ b/fs/xfs/linux-2.6/xfs_iops.c
@@ -62,12 +62,11 @@ void
xfs_synchronize_atime(
xfs_inode_t *ip)
{
- bhv_vnode_t *vp;
+ struct inode *inode = ip->i_vnode;
- vp = XFS_ITOV_NULL(ip);
- if (vp) {
- ip->i_d.di_atime.t_sec = (__int32_t)vp->i_atime.tv_sec;
- ip->i_d.di_atime.t_nsec = (__int32_t)vp->i_atime.tv_nsec;
+ if (inode) {
+ ip->i_d.di_atime.t_sec = (__int32_t)inode->i_atime.tv_sec;
+ ip->i_d.di_atime.t_nsec = (__int32_t)inode->i_atime.tv_nsec;
}
}
@@ -80,11 +79,10 @@ void
xfs_mark_inode_dirty_sync(
xfs_inode_t *ip)
{
- bhv_vnode_t *vp;
+ struct inode *inode = ip->i_vnode;
- vp = XFS_ITOV_NULL(ip);
- if (vp)
- mark_inode_dirty_sync(vn_to_inode(vp));
+ if (inode)
+ mark_inode_dirty_sync(inode);
}
/*
@@ -157,13 +155,6 @@ xfs_ichgtime_fast(
*/
ASSERT((flags & XFS_ICHGTIME_ACC) == 0);
- /*
- * We're not supposed to change timestamps in readonly-mounted
- * filesystems. Throw it away if anyone asks us.
- */
- if (unlikely(IS_RDONLY(inode)))
- return;
-
if (flags & XFS_ICHGTIME_MOD) {
tvp = &inode->i_mtime;
ip->i_d.di_mtime.t_sec = (__int32_t)tvp->tv_sec;
@@ -215,66 +206,62 @@ xfs_validate_fields(
*/
STATIC int
xfs_init_security(
- bhv_vnode_t *vp,
+ struct inode *inode,
struct inode *dir)
{
- struct inode *ip = vn_to_inode(vp);
+ struct xfs_inode *ip = XFS_I(inode);
size_t length;
void *value;
char *name;
int error;
- error = security_inode_init_security(ip, dir, &name, &value, &length);
+ error = security_inode_init_security(inode, dir, &name,
+ &value, &length);
if (error) {
if (error == -EOPNOTSUPP)
return 0;
return -error;
}
- error = xfs_attr_set(XFS_I(ip), name, value,
- length, ATTR_SECURE);
+ error = xfs_attr_set(ip, name, value, length, ATTR_SECURE);
if (!error)
- xfs_iflags_set(XFS_I(ip), XFS_IMODIFIED);
+ xfs_iflags_set(ip, XFS_IMODIFIED);
kfree(name);
kfree(value);
return error;
}
-/*
- * Determine whether a process has a valid fs_struct (kernel daemons
- * like knfsd don't have an fs_struct).
- *
- * XXX(hch): nfsd is broken, better fix it instead.
- */
-STATIC_INLINE int
-xfs_has_fs_struct(struct task_struct *task)
+static void
+xfs_dentry_to_name(
+ struct xfs_name *namep,
+ struct dentry *dentry)
{
- return (task->fs != init_task.fs);
+ namep->name = dentry->d_name.name;
+ namep->len = dentry->d_name.len;
}
STATIC void
xfs_cleanup_inode(
struct inode *dir,
- bhv_vnode_t *vp,
+ struct inode *inode,
struct dentry *dentry,
int mode)
{
- struct dentry teardown = {};
+ struct xfs_name teardown;
/* Oh, the horror.
* If we can't add the ACL or we fail in
* xfs_init_security we must back out.
* ENOSPC can hit here, among other things.
*/
- teardown.d_inode = vn_to_inode(vp);
- teardown.d_name = dentry->d_name;
+ xfs_dentry_to_name(&teardown, dentry);
if (S_ISDIR(mode))
- xfs_rmdir(XFS_I(dir), &teardown);
+ xfs_rmdir(XFS_I(dir), &teardown, XFS_I(inode));
else
- xfs_remove(XFS_I(dir), &teardown);
- VN_RELE(vp);
+ xfs_remove(XFS_I(dir), &teardown, XFS_I(inode));
+ iput(inode);
}
STATIC int
@@ -284,9 +271,10 @@ xfs_vn_mknod(
int mode,
dev_t rdev)
{
- struct inode *ip;
- bhv_vnode_t *vp = NULL, *dvp = vn_from_inode(dir);
+ struct inode *inode;
+ struct xfs_inode *ip = NULL;
xfs_acl_t *default_acl = NULL;
+ struct xfs_name name;
attrexists_t test_default_acl = _ACL_DEFAULT_EXISTS;
int error;
@@ -297,59 +285,67 @@ xfs_vn_mknod(
if (unlikely(!sysv_valid_dev(rdev) || MAJOR(rdev) & ~0x1ff))
return -EINVAL;
- if (unlikely(test_default_acl && test_default_acl(dvp))) {
+ if (test_default_acl && test_default_acl(dir)) {
if (!_ACL_ALLOC(default_acl)) {
return -ENOMEM;
}
- if (!_ACL_GET_DEFAULT(dvp, default_acl)) {
+ if (!_ACL_GET_DEFAULT(dir, default_acl)) {
_ACL_FREE(default_acl);
default_acl = NULL;
}
}
- if (IS_POSIXACL(dir) && !default_acl && xfs_has_fs_struct(current))
+ xfs_dentry_to_name(&name, dentry);
+
+ if (IS_POSIXACL(dir) && !default_acl)
mode &= ~current->fs->umask;
switch (mode & S_IFMT) {
- case S_IFCHR: case S_IFBLK: case S_IFIFO: case S_IFSOCK:
+ case S_IFCHR:
+ case S_IFBLK:
+ case S_IFIFO:
+ case S_IFSOCK:
rdev = sysv_encode_dev(rdev);
case S_IFREG:
- error = xfs_create(XFS_I(dir), dentry, mode, rdev, &vp, NULL);
+ error = xfs_create(XFS_I(dir), &name, mode, rdev, &ip, NULL);
break;
case S_IFDIR:
- error = xfs_mkdir(XFS_I(dir), dentry, mode, &vp, NULL);
+ error = xfs_mkdir(XFS_I(dir), &name, mode, &ip, NULL);
break;
default:
error = EINVAL;
break;
}
- if (unlikely(!error)) {
- error = xfs_init_security(vp, dir);
- if (error)
- xfs_cleanup_inode(dir, vp, dentry, mode);
- }
+ if (unlikely(error))
+ goto out_free_acl;
- if (unlikely(default_acl)) {
- if (!error) {
- error = _ACL_INHERIT(vp, mode, default_acl);
- if (!error)
- xfs_iflags_set(XFS_I(vp), XFS_IMODIFIED);
- else
- xfs_cleanup_inode(dir, vp, dentry, mode);
- }
+ inode = ip->i_vnode;
+
+ error = xfs_init_security(inode, dir);
+ if (unlikely(error))
+ goto out_cleanup_inode;
+
+ if (default_acl) {
+ error = _ACL_INHERIT(inode, mode, default_acl);
+ if (unlikely(error))
+ goto out_cleanup_inode;
+ xfs_iflags_set(ip, XFS_IMODIFIED);
_ACL_FREE(default_acl);
}
- if (likely(!error)) {
- ASSERT(vp);
- ip = vn_to_inode(vp);
- if (S_ISDIR(mode))
- xfs_validate_fields(ip);
- d_instantiate(dentry, ip);
- xfs_validate_fields(dir);
- }
+ if (S_ISDIR(mode))
+ xfs_validate_fields(inode);
+ d_instantiate(dentry, inode);
+ xfs_validate_fields(dir);
+ return -error;
+
+ out_cleanup_inode:
+ xfs_cleanup_inode(dir, inode, dentry, mode);
+ out_free_acl:
+ if (default_acl)
+ _ACL_FREE(default_acl);
return -error;
}
@@ -378,13 +374,15 @@ xfs_vn_lookup(
struct dentry *dentry,
struct nameidata *nd)
{
- bhv_vnode_t *cvp;
+ struct xfs_inode *cip;
+ struct xfs_name name;
int error;
if (dentry->d_name.len >= MAXNAMELEN)
return ERR_PTR(-ENAMETOOLONG);
- error = xfs_lookup(XFS_I(dir), dentry, &cvp);
+ xfs_dentry_to_name(&name, dentry);
+ error = xfs_lookup(XFS_I(dir), &name, &cip);
if (unlikely(error)) {
if (unlikely(error != ENOENT))
return ERR_PTR(-error);
@@ -392,7 +390,7 @@ xfs_vn_lookup(
return NULL;
}
- return d_splice_alias(vn_to_inode(cvp), dentry);
+ return d_splice_alias(cip->i_vnode, dentry);
}
STATIC int
@@ -401,23 +399,24 @@ xfs_vn_link(
struct inode *dir,
struct dentry *dentry)
{
- struct inode *ip; /* inode of guy being linked to */
- bhv_vnode_t *vp; /* vp of name being linked */
+ struct inode *inode; /* inode of guy being linked to */
+ struct xfs_name name;
int error;
- ip = old_dentry->d_inode; /* inode being linked to */
- vp = vn_from_inode(ip);
+ inode = old_dentry->d_inode;
+ xfs_dentry_to_name(&name, dentry);
- VN_HOLD(vp);
- error = xfs_link(XFS_I(dir), vp, dentry);
+ igrab(inode);
+ error = xfs_link(XFS_I(dir), XFS_I(inode), &name);
if (unlikely(error)) {
- VN_RELE(vp);
- } else {
- xfs_iflags_set(XFS_I(dir), XFS_IMODIFIED);
- xfs_validate_fields(ip);
- d_instantiate(dentry, ip);
+ iput(inode);
+ return -error;
}
- return -error;
+
+ xfs_iflags_set(XFS_I(dir), XFS_IMODIFIED);
+ xfs_validate_fields(inode);
+ d_instantiate(dentry, inode);
+ return 0;
}
STATIC int
@@ -426,11 +425,13 @@ xfs_vn_unlink(
struct dentry *dentry)
{
struct inode *inode;
+ struct xfs_name name;
int error;
inode = dentry->d_inode;
+ xfs_dentry_to_name(&name, dentry);
- error = xfs_remove(XFS_I(dir), dentry);
+ error = xfs_remove(XFS_I(dir), &name, XFS_I(inode));
if (likely(!error)) {
xfs_validate_fields(dir); /* size needs update */
xfs_validate_fields(inode);
@@ -444,29 +445,34 @@ xfs_vn_symlink(
struct dentry *dentry,
const char *symname)
{
- struct inode *ip;
- bhv_vnode_t *cvp; /* used to lookup symlink to put in dentry */
+ struct inode *inode;
+ struct xfs_inode *cip = NULL;
+ struct xfs_name name;
int error;
mode_t mode;
- cvp = NULL;
-
mode = S_IFLNK |
(irix_symlink_mode ? 0777 & ~current->fs->umask : S_IRWXUGO);
+ xfs_dentry_to_name(&name, dentry);
- error = xfs_symlink(XFS_I(dir), dentry, (char *)symname, mode,
- &cvp, NULL);
- if (likely(!error && cvp)) {
- error = xfs_init_security(cvp, dir);
- if (likely(!error)) {
- ip = vn_to_inode(cvp);
- d_instantiate(dentry, ip);
- xfs_validate_fields(dir);
- xfs_validate_fields(ip);
- } else {
- xfs_cleanup_inode(dir, cvp, dentry, 0);
- }
- }
+ error = xfs_symlink(XFS_I(dir), &name, symname, mode, &cip, NULL);
+ if (unlikely(error))
+ goto out;
+
+ inode = cip->i_vnode;
+
+ error = xfs_init_security(inode, dir);
+ if (unlikely(error))
+ goto out_cleanup_inode;
+
+ d_instantiate(dentry, inode);
+ xfs_validate_fields(dir);
+ xfs_validate_fields(inode);
+ return 0;
+
+ out_cleanup_inode:
+ xfs_cleanup_inode(dir, inode, dentry, 0);
+ out:
return -error;
}
@@ -476,9 +482,12 @@ xfs_vn_rmdir(
struct dentry *dentry)
{
struct inode *inode = dentry->d_inode;
+ struct xfs_name name;
int error;
- error = xfs_rmdir(XFS_I(dir), dentry);
+ xfs_dentry_to_name(&name, dentry);
+
+ error = xfs_rmdir(XFS_I(dir), &name, XFS_I(inode));
if (likely(!error)) {
xfs_validate_fields(inode);
xfs_validate_fields(dir);
@@ -494,12 +503,15 @@ xfs_vn_rename(
struct dentry *ndentry)
{
struct inode *new_inode = ndentry->d_inode;
- bhv_vnode_t *tvp; /* target directory */
+ struct xfs_name oname;
+ struct xfs_name nname;
int error;
- tvp = vn_from_inode(ndir);
+ xfs_dentry_to_name(&oname, odentry);
+ xfs_dentry_to_name(&nname, ndentry);
- error = xfs_rename(XFS_I(odir), odentry, tvp, ndentry);
+ error = xfs_rename(XFS_I(odir), &oname, XFS_I(odentry->d_inode),
+ XFS_I(ndir), &nname);
if (likely(!error)) {
if (new_inode)
xfs_validate_fields(new_inode);
@@ -700,11 +712,19 @@ xfs_vn_setattr(
return -error;
}
+/*
+ * block_truncate_page can return an error, but we can't propagate it
+ * at all here. Leave a complaint + stack trace in the syslog because
+ * this could be bad. If it is bad, we need to propagate the error further.
+ */
STATIC void
xfs_vn_truncate(
struct inode *inode)
{
- block_truncate_page(inode->i_mapping, inode->i_size, xfs_get_blocks);
+ int error;
+ error = block_truncate_page(inode->i_mapping, inode->i_size,
+ xfs_get_blocks);
+ WARN_ON(error);
}
STATIC int
diff --git a/fs/xfs/linux-2.6/xfs_linux.h b/fs/xfs/linux-2.6/xfs_linux.h
index 3ca39c4e5d2..e5143323e71 100644
--- a/fs/xfs/linux-2.6/xfs_linux.h
+++ b/fs/xfs/linux-2.6/xfs_linux.h
@@ -99,7 +99,6 @@
/*
* Feature macros (disable/enable)
*/
-#undef HAVE_REFCACHE /* reference cache not needed for NFS in 2.6 */
#define HAVE_SPLICE /* a splice(2) exists in 2.6, but not in 2.4 */
#ifdef CONFIG_SMP
#define HAVE_PERCPU_SB /* per cpu superblock counters are a 2.6 feature */
diff --git a/fs/xfs/linux-2.6/xfs_lrw.c b/fs/xfs/linux-2.6/xfs_lrw.c
index 16635338849..1ebd8004469 100644
--- a/fs/xfs/linux-2.6/xfs_lrw.c
+++ b/fs/xfs/linux-2.6/xfs_lrw.c
@@ -51,6 +51,7 @@
#include "xfs_vnodeops.h"
#include <linux/capability.h>
+#include <linux/mount.h>
#include <linux/writeback.h>
@@ -176,7 +177,6 @@ xfs_read(
{
struct file *file = iocb->ki_filp;
struct inode *inode = file->f_mapping->host;
- bhv_vnode_t *vp = XFS_ITOV(ip);
xfs_mount_t *mp = ip->i_mount;
size_t size = 0;
ssize_t ret = 0;
@@ -228,11 +228,11 @@ xfs_read(
xfs_ilock(ip, XFS_IOLOCK_SHARED);
if (DM_EVENT_ENABLED(ip, DM_EVENT_READ) && !(ioflags & IO_INVIS)) {
- bhv_vrwlock_t locktype = VRWLOCK_READ;
int dmflags = FILP_DELAY_FLAG(file) | DM_SEM_FLAG_RD(ioflags);
+ int iolock = XFS_IOLOCK_SHARED;
- ret = -XFS_SEND_DATA(mp, DM_EVENT_READ, vp, *offset, size,
- dmflags, &locktype);
+ ret = -XFS_SEND_DATA(mp, DM_EVENT_READ, ip, *offset, size,
+ dmflags, &iolock);
if (ret) {
xfs_iunlock(ip, XFS_IOLOCK_SHARED);
if (unlikely(ioflags & IO_ISDIRECT))
@@ -242,7 +242,7 @@ xfs_read(
}
if (unlikely(ioflags & IO_ISDIRECT)) {
- if (VN_CACHED(vp))
+ if (inode->i_mapping->nrpages)
ret = xfs_flushinval_pages(ip, (*offset & PAGE_CACHE_MASK),
-1, FI_REMAPF_LOCKED);
mutex_unlock(&inode->i_mutex);
@@ -276,7 +276,6 @@ xfs_splice_read(
int flags,
int ioflags)
{
- bhv_vnode_t *vp = XFS_ITOV(ip);
xfs_mount_t *mp = ip->i_mount;
ssize_t ret;
@@ -287,11 +286,11 @@ xfs_splice_read(
xfs_ilock(ip, XFS_IOLOCK_SHARED);
if (DM_EVENT_ENABLED(ip, DM_EVENT_READ) && !(ioflags & IO_INVIS)) {
- bhv_vrwlock_t locktype = VRWLOCK_READ;
+ int iolock = XFS_IOLOCK_SHARED;
int error;
- error = XFS_SEND_DATA(mp, DM_EVENT_READ, vp, *ppos, count,
- FILP_DELAY_FLAG(infilp), &locktype);
+ error = XFS_SEND_DATA(mp, DM_EVENT_READ, ip, *ppos, count,
+ FILP_DELAY_FLAG(infilp), &iolock);
if (error) {
xfs_iunlock(ip, XFS_IOLOCK_SHARED);
return -error;
@@ -317,7 +316,6 @@ xfs_splice_write(
int flags,
int ioflags)
{
- bhv_vnode_t *vp = XFS_ITOV(ip);
xfs_mount_t *mp = ip->i_mount;
ssize_t ret;
struct inode *inode = outfilp->f_mapping->host;
@@ -330,11 +328,11 @@ xfs_splice_write(
xfs_ilock(ip, XFS_IOLOCK_EXCL);
if (DM_EVENT_ENABLED(ip, DM_EVENT_WRITE) && !(ioflags & IO_INVIS)) {
- bhv_vrwlock_t locktype = VRWLOCK_WRITE;
+ int iolock = XFS_IOLOCK_EXCL;
int error;
- error = XFS_SEND_DATA(mp, DM_EVENT_WRITE, vp, *ppos, count,
- FILP_DELAY_FLAG(outfilp), &locktype);
+ error = XFS_SEND_DATA(mp, DM_EVENT_WRITE, ip, *ppos, count,
+ FILP_DELAY_FLAG(outfilp), &iolock);
if (error) {
xfs_iunlock(ip, XFS_IOLOCK_EXCL);
return -error;
@@ -573,14 +571,12 @@ xfs_write(
struct file *file = iocb->ki_filp;
struct address_space *mapping = file->f_mapping;
struct inode *inode = mapping->host;
- bhv_vnode_t *vp = XFS_ITOV(xip);
unsigned long segs = nsegs;
xfs_mount_t *mp;
ssize_t ret = 0, error = 0;
xfs_fsize_t isize, new_size;
int iolock;
int eventsent = 0;
- bhv_vrwlock_t locktype;
size_t ocount = 0, count;
loff_t pos;
int need_i_mutex;
@@ -607,11 +603,9 @@ xfs_write(
relock:
if (ioflags & IO_ISDIRECT) {
iolock = XFS_IOLOCK_SHARED;
- locktype = VRWLOCK_WRITE_DIRECT;
need_i_mutex = 0;
} else {
iolock = XFS_IOLOCK_EXCL;
- locktype = VRWLOCK_WRITE;
need_i_mutex = 1;
mutex_lock(&inode->i_mutex);
}
@@ -634,9 +628,8 @@ start:
dmflags |= DM_FLAGS_IMUX;
xfs_iunlock(xip, XFS_ILOCK_EXCL);
- error = XFS_SEND_DATA(xip->i_mount, DM_EVENT_WRITE, vp,
- pos, count,
- dmflags, &locktype);
+ error = XFS_SEND_DATA(xip->i_mount, DM_EVENT_WRITE, xip,
+ pos, count, dmflags, &iolock);
if (error) {
goto out_unlock_internal;
}
@@ -664,10 +657,9 @@ start:
return XFS_ERROR(-EINVAL);
}
- if (!need_i_mutex && (VN_CACHED(vp) || pos > xip->i_size)) {
+ if (!need_i_mutex && (mapping->nrpages || pos > xip->i_size)) {
xfs_iunlock(xip, XFS_ILOCK_EXCL|iolock);
iolock = XFS_IOLOCK_EXCL;
- locktype = VRWLOCK_WRITE;
need_i_mutex = 1;
mutex_lock(&inode->i_mutex);
xfs_ilock(xip, XFS_ILOCK_EXCL|iolock);
@@ -679,10 +671,16 @@ start:
if (new_size > xip->i_size)
xip->i_new_size = new_size;
- if (likely(!(ioflags & IO_INVIS))) {
+ /*
+ * We're not supposed to change timestamps in readonly-mounted
+ * filesystems. Throw it away if anyone asks us.
+ */
+ if (likely(!(ioflags & IO_INVIS) &&
+ !mnt_want_write(file->f_path.mnt))) {
file_update_time(file);
xfs_ichgtime_fast(xip, inode,
XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
+ mnt_drop_write(file->f_path.mnt);
}
/*
@@ -727,7 +725,7 @@ retry:
current->backing_dev_info = mapping->backing_dev_info;
if ((ioflags & IO_ISDIRECT)) {
- if (VN_CACHED(vp)) {
+ if (mapping->nrpages) {
WARN_ON(need_i_mutex == 0);
xfs_inval_cached_trace(xip, pos, -1,
(pos & PAGE_CACHE_MASK), -1);
@@ -744,7 +742,6 @@ retry:
mutex_unlock(&inode->i_mutex);
iolock = XFS_IOLOCK_SHARED;
- locktype = VRWLOCK_WRITE_DIRECT;
need_i_mutex = 0;
}
@@ -781,15 +778,15 @@ retry:
if (ret == -ENOSPC &&
DM_EVENT_ENABLED(xip, DM_EVENT_NOSPACE) && !(ioflags & IO_INVIS)) {
- xfs_rwunlock(xip, locktype);
+ xfs_iunlock(xip, iolock);
if (need_i_mutex)
mutex_unlock(&inode->i_mutex);
- error = XFS_SEND_NAMESP(xip->i_mount, DM_EVENT_NOSPACE, vp,
- DM_RIGHT_NULL, vp, DM_RIGHT_NULL, NULL, NULL,
+ error = XFS_SEND_NAMESP(xip->i_mount, DM_EVENT_NOSPACE, xip,
+ DM_RIGHT_NULL, xip, DM_RIGHT_NULL, NULL, NULL,
0, 0, 0); /* Delay flag intentionally unused */
if (need_i_mutex)
mutex_lock(&inode->i_mutex);
- xfs_rwlock(xip, locktype);
+ xfs_ilock(xip, iolock);
if (error)
goto out_unlock_internal;
pos = xip->i_size;
@@ -817,7 +814,8 @@ retry:
/* Handle various SYNC-type writes */
if ((file->f_flags & O_SYNC) || IS_SYNC(inode)) {
int error2;
- xfs_rwunlock(xip, locktype);
+
+ xfs_iunlock(xip, iolock);
if (need_i_mutex)
mutex_unlock(&inode->i_mutex);
error2 = sync_page_range(inode, mapping, pos, ret);
@@ -825,7 +823,7 @@ retry:
error = error2;
if (need_i_mutex)
mutex_lock(&inode->i_mutex);
- xfs_rwlock(xip, locktype);
+ xfs_ilock(xip, iolock);
error2 = xfs_write_sync_logforce(mp, xip);
if (!error)
error = error2;
@@ -846,7 +844,7 @@ retry:
xip->i_d.di_size = xip->i_size;
xfs_iunlock(xip, XFS_ILOCK_EXCL);
}
- xfs_rwunlock(xip, locktype);
+ xfs_iunlock(xip, iolock);
out_unlock_mutex:
if (need_i_mutex)
mutex_unlock(&inode->i_mutex);
@@ -884,28 +882,23 @@ xfs_bdstrat_cb(struct xfs_buf *bp)
}
/*
- * Wrapper around bdstrat so that we can stop data
- * from going to disk in case we are shutting down the filesystem.
- * Typically user data goes thru this path; one of the exceptions
- * is the superblock.
+ * Wrapper around bdstrat so that we can stop data from going to disk in case
+ * we are shutting down the filesystem. Typically user data goes thru this
+ * path; one of the exceptions is the superblock.
*/
-int
+void
xfsbdstrat(
struct xfs_mount *mp,
struct xfs_buf *bp)
{
ASSERT(mp);
if (!XFS_FORCED_SHUTDOWN(mp)) {
- /* Grio redirection would go here
- * if (XFS_BUF_IS_GRIO(bp)) {
- */
-
xfs_buf_iorequest(bp);
- return 0;
+ return;
}
xfs_buftrace("XFSBDSTRAT IOERROR", bp);
- return (xfs_bioerror_relse(bp));
+ xfs_bioerror_relse(bp);
}
/*
diff --git a/fs/xfs/linux-2.6/xfs_lrw.h b/fs/xfs/linux-2.6/xfs_lrw.h
index e200253139c..e1d498b4ba7 100644
--- a/fs/xfs/linux-2.6/xfs_lrw.h
+++ b/fs/xfs/linux-2.6/xfs_lrw.h
@@ -68,7 +68,8 @@ extern void xfs_inval_cached_trace(struct xfs_inode *,
#define xfs_inval_cached_trace(ip, offset, len, first, last)
#endif
-extern int xfsbdstrat(struct xfs_mount *, struct xfs_buf *);
+/* errors from xfsbdstrat() must be extracted from the buffer */
+extern void xfsbdstrat(struct xfs_mount *, struct xfs_buf *);
extern int xfs_bdstrat_cb(struct xfs_buf *);
extern int xfs_dev_is_read_only(struct xfs_mount *, char *);
diff --git a/fs/xfs/linux-2.6/xfs_stats.h b/fs/xfs/linux-2.6/xfs_stats.h
index 8ba7a2fa6c1..afd0b0d5fdb 100644
--- a/fs/xfs/linux-2.6/xfs_stats.h
+++ b/fs/xfs/linux-2.6/xfs_stats.h
@@ -144,8 +144,8 @@ extern void xfs_cleanup_procfs(void);
# define XFS_STATS_DEC(count)
# define XFS_STATS_ADD(count, inc)
-static __inline void xfs_init_procfs(void) { };
-static __inline void xfs_cleanup_procfs(void) { };
+static inline void xfs_init_procfs(void) { };
+static inline void xfs_cleanup_procfs(void) { };
#endif /* !CONFIG_PROC_FS */
diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
index 8831d951879..865eb708aa9 100644
--- a/fs/xfs/linux-2.6/xfs_super.c
+++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -896,7 +896,8 @@ xfs_fs_write_inode(
struct inode *inode,
int sync)
{
- int error = 0, flags = FLUSH_INODE;
+ int error = 0;
+ int flags = 0;
xfs_itrace_entry(XFS_I(inode));
if (sync) {
@@ -934,7 +935,7 @@ xfs_fs_clear_inode(
xfs_inactive(ip);
xfs_iflags_clear(ip, XFS_IMODIFIED);
if (xfs_reclaim(ip))
- panic("%s: cannot reclaim 0x%p\n", __FUNCTION__, inode);
+ panic("%s: cannot reclaim 0x%p\n", __func__, inode);
}
ASSERT(XFS_I(inode) == NULL);
@@ -1027,8 +1028,7 @@ xfs_sync_worker(
int error;
if (!(mp->m_flags & XFS_MOUNT_RDONLY))
- error = xfs_sync(mp, SYNC_FSDATA | SYNC_BDFLUSH | SYNC_ATTR |
- SYNC_REFCACHE | SYNC_SUPER);
+ error = xfs_sync(mp, SYNC_FSDATA | SYNC_BDFLUSH | SYNC_ATTR);
mp->m_sync_seq++;
wake_up(&mp->m_wait_single_sync_task);
}
@@ -1306,7 +1306,7 @@ xfs_fs_fill_super(
void *data,
int silent)
{
- struct inode *rootvp;
+ struct inode *root;
struct xfs_mount *mp = NULL;
struct xfs_mount_args *args = xfs_args_allocate(sb, silent);
int error;
@@ -1344,19 +1344,18 @@ xfs_fs_fill_super(
sb->s_time_gran = 1;
set_posix_acl_flag(sb);
- rootvp = igrab(mp->m_rootip->i_vnode);
- if (!rootvp) {
+ root = igrab(mp->m_rootip->i_vnode);
+ if (!root) {
error = ENOENT;
goto fail_unmount;
}
-
- sb->s_root = d_alloc_root(vn_to_inode(rootvp));
- if (!sb->s_root) {
- error = ENOMEM;
+ if (is_bad_inode(root)) {
+ error = EINVAL;
goto fail_vnrele;
}
- if (is_bad_inode(sb->s_root->d_inode)) {
- error = EINVAL;
+ sb->s_root = d_alloc_root(root);
+ if (!sb->s_root) {
+ error = ENOMEM;
goto fail_vnrele;
}
@@ -1378,7 +1377,7 @@ fail_vnrele:
dput(sb->s_root);
sb->s_root = NULL;
} else {
- VN_RELE(rootvp);
+ iput(root);
}
fail_unmount:
diff --git a/fs/xfs/linux-2.6/xfs_super.h b/fs/xfs/linux-2.6/xfs_super.h
index 3efcf45b14a..3efb7c6d330 100644
--- a/fs/xfs/linux-2.6/xfs_super.h
+++ b/fs/xfs/linux-2.6/xfs_super.h
@@ -50,13 +50,7 @@ extern void xfs_qm_exit(void);
# define set_posix_acl_flag(sb) do { } while (0)
#endif
-#ifdef CONFIG_XFS_SECURITY
-# define XFS_SECURITY_STRING "security attributes, "
-# define ENOSECURITY 0
-#else
-# define XFS_SECURITY_STRING
-# define ENOSECURITY EOPNOTSUPP
-#endif
+#define XFS_SECURITY_STRING "security attributes, "
#ifdef CONFIG_XFS_RT
# define XFS_REALTIME_STRING "realtime, "
diff --git a/fs/xfs/linux-2.6/xfs_vfs.h b/fs/xfs/linux-2.6/xfs_vfs.h
index 4da03a4e352..7e60c7776b1 100644
--- a/fs/xfs/linux-2.6/xfs_vfs.h
+++ b/fs/xfs/linux-2.6/xfs_vfs.h
@@ -49,7 +49,6 @@ typedef struct bhv_vfs_sync_work {
#define SYNC_REFCACHE 0x0040 /* prune some of the nfs ref cache */
#define SYNC_REMOUNT 0x0080 /* remount readonly, no dummy LRs */
#define SYNC_IOWAIT 0x0100 /* wait for all I/O to complete */
-#define SYNC_SUPER 0x0200 /* flush superblock to disk */
/*
* When remounting a filesystem read-only or freezing the filesystem,
diff --git a/fs/xfs/linux-2.6/xfs_vnode.h b/fs/xfs/linux-2.6/xfs_vnode.h
index b5ea418693b..8b4d63ce869 100644
--- a/fs/xfs/linux-2.6/xfs_vnode.h
+++ b/fs/xfs/linux-2.6/xfs_vnode.h
@@ -23,8 +23,6 @@ struct bhv_vattr;
struct xfs_iomap;
struct attrlist_cursor_kern;
-typedef struct dentry bhv_vname_t;
-typedef __u64 bhv_vnumber_t;
typedef struct inode bhv_vnode_t;
#define VN_ISLNK(vp) S_ISLNK((vp)->i_mode)
@@ -46,18 +44,6 @@ static inline struct inode *vn_to_inode(bhv_vnode_t *vnode)
}
/*
- * Values for the vop_rwlock/rwunlock flags parameter.
- */
-typedef enum bhv_vrwlock {
- VRWLOCK_NONE,
- VRWLOCK_READ,
- VRWLOCK_WRITE,
- VRWLOCK_WRITE_DIRECT,
- VRWLOCK_TRY_READ,
- VRWLOCK_TRY_WRITE
-} bhv_vrwlock_t;
-
-/*
* Return values for xfs_inactive. A return value of
* VN_INACTIVE_NOCACHE implies that the file system behavior
* has disassociated its state and bhv_desc_t from the vnode.
@@ -73,12 +59,9 @@ typedef enum bhv_vrwlock {
#define IO_INVIS 0x00020 /* don't update inode timestamps */
/*
- * Flags for vop_iflush call
+ * Flags for xfs_inode_flush
*/
#define FLUSH_SYNC 1 /* wait for flush to complete */
-#define FLUSH_INODE 2 /* flush the inode itself */
-#define FLUSH_LOG 4 /* force the last log entry for
- * this inode out to disk */
/*
* Flush/Invalidate options for vop_toss/flush/flushinval_pages.
@@ -226,13 +209,6 @@ static inline bhv_vnode_t *vn_grab(bhv_vnode_t *vp)
}
/*
- * Vname handling macros.
- */
-#define VNAME(dentry) ((char *) (dentry)->d_name.name)
-#define VNAMELEN(dentry) ((dentry)->d_name.len)
-#define VNAME_TO_VNODE(dentry) (vn_from_inode((dentry)->d_inode))
-
-/*
* Dealing with bad inodes
*/
static inline int VN_BAD(bhv_vnode_t *vp)
@@ -303,9 +279,9 @@ extern void xfs_itrace_hold(struct xfs_inode *, char *, int, inst_t *);
extern void _xfs_itrace_ref(struct xfs_inode *, char *, int, inst_t *);
extern void xfs_itrace_rele(struct xfs_inode *, char *, int, inst_t *);
#define xfs_itrace_entry(ip) \
- _xfs_itrace_entry(ip, __FUNCTION__, (inst_t *)__return_address)
+ _xfs_itrace_entry(ip, __func__, (inst_t *)__return_address)
#define xfs_itrace_exit(ip) \
- _xfs_itrace_exit(ip, __FUNCTION__, (inst_t *)__return_address)
+ _xfs_itrace_exit(ip, __func__, (inst_t *)__return_address)
#define xfs_itrace_exit_tag(ip, tag) \
_xfs_itrace_exit(ip, tag, (inst_t *)__return_address)
#define xfs_itrace_ref(ip) \
diff --git a/fs/xfs/quota/xfs_dquot.c b/fs/xfs/quota/xfs_dquot.c
index 665babcca6a..631ebb31b29 100644
--- a/fs/xfs/quota/xfs_dquot.c
+++ b/fs/xfs/quota/xfs_dquot.c
@@ -1291,7 +1291,7 @@ xfs_qm_dqflush(
if (flags & XFS_QMOPT_DELWRI) {
xfs_bdwrite(mp, bp);
} else if (flags & XFS_QMOPT_ASYNC) {
- xfs_bawrite(mp, bp);
+ error = xfs_bawrite(mp, bp);
} else {
error = xfs_bwrite(mp, bp);
}
@@ -1439,9 +1439,7 @@ xfs_qm_dqpurge(
uint flags)
{
xfs_dqhash_t *thishash;
- xfs_mount_t *mp;
-
- mp = dqp->q_mount;
+ xfs_mount_t *mp = dqp->q_mount;
ASSERT(XFS_QM_IS_MPLIST_LOCKED(mp));
ASSERT(XFS_DQ_IS_HASH_LOCKED(dqp->q_hash));
@@ -1485,6 +1483,7 @@ xfs_qm_dqpurge(
* we're unmounting, we do care, so we flush it and wait.
*/
if (XFS_DQ_IS_DIRTY(dqp)) {
+ int error;
xfs_dqtrace_entry(dqp, "DQPURGE ->DQFLUSH: DQDIRTY");
/* dqflush unlocks dqflock */
/*
@@ -1495,7 +1494,10 @@ xfs_qm_dqpurge(
* We don't care about getting disk errors here. We need
* to purge this dquot anyway, so we go ahead regardless.
*/
- (void) xfs_qm_dqflush(dqp, XFS_QMOPT_SYNC);
+ error = xfs_qm_dqflush(dqp, XFS_QMOPT_SYNC);
+ if (error)
+ xfs_fs_cmn_err(CE_WARN, mp,
+ "xfs_qm_dqpurge: dquot %p flush failed", dqp);
xfs_dqflock(dqp);
}
ASSERT(dqp->q_pincount == 0);
@@ -1580,12 +1582,18 @@ xfs_qm_dqflock_pushbuf_wait(
XFS_INCORE_TRYLOCK);
if (bp != NULL) {
if (XFS_BUF_ISDELAYWRITE(bp)) {
+ int error;
if (XFS_BUF_ISPINNED(bp)) {
xfs_log_force(dqp->q_mount,
(xfs_lsn_t)0,
XFS_LOG_FORCE);
}
- xfs_bawrite(dqp->q_mount, bp);
+ error = xfs_bawrite(dqp->q_mount, bp);
+ if (error)
+ xfs_fs_cmn_err(CE_WARN, dqp->q_mount,
+ "xfs_qm_dqflock_pushbuf_wait: "
+ "pushbuf error %d on dqp %p, bp %p",
+ error, dqp, bp);
} else {
xfs_buf_relse(bp);
}
diff --git a/fs/xfs/quota/xfs_dquot_item.c b/fs/xfs/quota/xfs_dquot_item.c
index 1800e8d1f64..36e05ca7841 100644
--- a/fs/xfs/quota/xfs_dquot_item.c
+++ b/fs/xfs/quota/xfs_dquot_item.c
@@ -146,6 +146,7 @@ xfs_qm_dquot_logitem_push(
xfs_dq_logitem_t *logitem)
{
xfs_dquot_t *dqp;
+ int error;
dqp = logitem->qli_dquot;
@@ -161,7 +162,11 @@ xfs_qm_dquot_logitem_push(
* lock without sleeping, then there must not have been
* anyone in the process of flushing the dquot.
*/
- xfs_qm_dqflush(dqp, XFS_B_DELWRI);
+ error = xfs_qm_dqflush(dqp, XFS_QMOPT_DELWRI);
+ if (error)
+ xfs_fs_cmn_err(CE_WARN, dqp->q_mount,
+ "xfs_qm_dquot_logitem_push: push error %d on dqp %p",
+ error, dqp);
xfs_dqunlock(dqp);
}
@@ -262,11 +267,16 @@ xfs_qm_dquot_logitem_pushbuf(
XFS_LOG_FORCE);
}
if (dopush) {
+ int error;
#ifdef XFSRACEDEBUG
delay_for_intr();
delay(300);
#endif
- xfs_bawrite(mp, bp);
+ error = xfs_bawrite(mp, bp);
+ if (error)
+ xfs_fs_cmn_err(CE_WARN, mp,
+ "xfs_qm_dquot_logitem_pushbuf: pushbuf error %d on qip %p, bp %p",
+ error, qip, bp);
} else {
xfs_buf_relse(bp);
}
diff --git a/fs/xfs/quota/xfs_qm.c b/fs/xfs/quota/xfs_qm.c
index 8e9c5ae6504..40ea5640956 100644
--- a/fs/xfs/quota/xfs_qm.c
+++ b/fs/xfs/quota/xfs_qm.c
@@ -304,8 +304,11 @@ xfs_qm_unmount_quotadestroy(
* necessary data structures like quotainfo. This is also responsible for
* running a quotacheck as necessary. We are guaranteed that the superblock
* is consistently read in at this point.
+ *
+ * If we fail here, the mount will continue with quota turned off. We don't
+ * need to indicate success or failure at all.
*/
-int
+void
xfs_qm_mount_quotas(
xfs_mount_t *mp,
int mfsi_flags)
@@ -313,7 +316,6 @@ xfs_qm_mount_quotas(
int error = 0;
uint sbf;
-
/*
* If quotas on realtime volumes is not supported, we disable
* quotas immediately.
@@ -332,7 +334,8 @@ xfs_qm_mount_quotas(
* Allocate the quotainfo structure inside the mount struct, and
* create quotainode(s), and change/rev superblock if necessary.
*/
- if ((error = xfs_qm_init_quotainfo(mp))) {
+ error = xfs_qm_init_quotainfo(mp);
+ if (error) {
/*
* We must turn off quotas.
*/
@@ -344,12 +347,11 @@ xfs_qm_mount_quotas(
* If any of the quotas are not consistent, do a quotacheck.
*/
if (XFS_QM_NEED_QUOTACHECK(mp) &&
- !(mfsi_flags & XFS_MFSI_NO_QUOTACHECK)) {
- if ((error = xfs_qm_quotacheck(mp))) {
- /* Quotacheck has failed and quotas have
- * been disabled.
- */
- return XFS_ERROR(error);
+ !(mfsi_flags & XFS_MFSI_NO_QUOTACHECK)) {
+ error = xfs_qm_quotacheck(mp);
+ if (error) {
+ /* Quotacheck failed and disabled quotas. */
+ return;
}
}
/*
@@ -357,12 +359,10 @@ xfs_qm_mount_quotas(
* quotachecked status, since we won't be doing accounting for
* that type anymore.
*/
- if (!XFS_IS_UQUOTA_ON(mp)) {
+ if (!XFS_IS_UQUOTA_ON(mp))
mp->m_qflags &= ~XFS_UQUOTA_CHKD;
- }
- if (!(XFS_IS_GQUOTA_ON(mp) || XFS_IS_PQUOTA_ON(mp))) {
+ if (!(XFS_IS_GQUOTA_ON(mp) || XFS_IS_PQUOTA_ON(mp)))
mp->m_qflags &= ~XFS_OQUOTA_CHKD;
- }
write_changes:
/*
@@ -392,7 +392,7 @@ xfs_qm_mount_quotas(
xfs_fs_cmn_err(CE_WARN, mp,
"Failed to initialize disk quotas.");
}
- return XFS_ERROR(error);
+ return;
}
/*
@@ -1438,7 +1438,7 @@ xfs_qm_qino_alloc(
}
-STATIC int
+STATIC void
xfs_qm_reset_dqcounts(
xfs_mount_t *mp,
xfs_buf_t *bp,
@@ -1478,8 +1478,6 @@ xfs_qm_reset_dqcounts(
ddq->d_rtbwarns = 0;
ddq = (xfs_disk_dquot_t *) ((xfs_dqblk_t *)ddq + 1);
}
-
- return 0;
}
STATIC int
@@ -1520,7 +1518,7 @@ xfs_qm_dqiter_bufs(
if (error)
break;
- (void) xfs_qm_reset_dqcounts(mp, bp, firstid, type);
+ xfs_qm_reset_dqcounts(mp, bp, firstid, type);
xfs_bdwrite(mp, bp);
/*
* goto the next block.
@@ -1810,7 +1808,7 @@ xfs_qm_dqusage_adjust(
* Now release the inode. This will send it to 'inactive', and
* possibly even free blocks.
*/
- VN_RELE(XFS_ITOV(ip));
+ IRELE(ip);
/*
* Goto next inode.
@@ -1880,6 +1878,14 @@ xfs_qm_quotacheck(
} while (! done);
/*
+ * We've made all the changes that we need to make incore.
+ * Flush them down to disk buffers if everything was updated
+ * successfully.
+ */
+ if (!error)
+ error = xfs_qm_dqflush_all(mp, XFS_QMOPT_DELWRI);
+
+ /*
* We can get this error if we couldn't do a dquot allocation inside
* xfs_qm_dqusage_adjust (via bulkstat). We don't care about the
* dirty dquots that might be cached, we just want to get rid of them
@@ -1890,11 +1896,6 @@ xfs_qm_quotacheck(
xfs_qm_dqpurge_all(mp, XFS_QMOPT_QUOTALL | XFS_QMOPT_QUOTAOFF);
goto error_return;
}
- /*
- * We've made all the changes that we need to make incore.
- * Now flush_them down to disk buffers.
- */
- xfs_qm_dqflush_all(mp, XFS_QMOPT_DELWRI);
/*
* We didn't log anything, because if we crashed, we'll have to
@@ -1926,7 +1927,10 @@ xfs_qm_quotacheck(
ASSERT(mp->m_quotainfo != NULL);
ASSERT(xfs_Gqm != NULL);
xfs_qm_destroy_quotainfo(mp);
- (void)xfs_mount_reset_sbqflags(mp);
+ if (xfs_mount_reset_sbqflags(mp)) {
+ cmn_err(CE_WARN, "XFS quotacheck %s: "
+ "Failed to reset quota flags.", mp->m_fsname);
+ }
} else {
cmn_err(CE_NOTE, "XFS quotacheck %s: Done.", mp->m_fsname);
}
@@ -1968,7 +1972,7 @@ xfs_qm_init_quotainos(
if ((error = xfs_iget(mp, NULL, mp->m_sb.sb_gquotino,
0, 0, &gip, 0))) {
if (uip)
- VN_RELE(XFS_ITOV(uip));
+ IRELE(uip);
return XFS_ERROR(error);
}
}
@@ -1999,7 +2003,7 @@ xfs_qm_init_quotainos(
sbflags | XFS_SB_GQUOTINO, flags);
if (error) {
if (uip)
- VN_RELE(XFS_ITOV(uip));
+ IRELE(uip);
return XFS_ERROR(error);
}
@@ -2093,12 +2097,17 @@ xfs_qm_shake_freelist(
* dirty dquots.
*/
if (XFS_DQ_IS_DIRTY(dqp)) {
+ int error;
xfs_dqtrace_entry(dqp, "DQSHAKE: DQDIRTY");
/*
* We flush it delayed write, so don't bother
* releasing the mplock.
*/
- (void) xfs_qm_dqflush(dqp, XFS_QMOPT_DELWRI);
+ error = xfs_qm_dqflush(dqp, XFS_QMOPT_DELWRI);
+ if (error) {
+ xfs_fs_cmn_err(CE_WARN, dqp->q_mount,
+ "xfs_qm_dqflush_all: dquot %p flush failed", dqp);
+ }
xfs_dqunlock(dqp); /* dqflush unlocks dqflock */
dqp = dqp->dq_flnext;
continue;
@@ -2265,12 +2274,17 @@ xfs_qm_dqreclaim_one(void)
* dirty dquots.
*/
if (XFS_DQ_IS_DIRTY(dqp)) {
+ int error;
xfs_dqtrace_entry(dqp, "DQRECLAIM: DQDIRTY");
/*
* We flush it delayed write, so don't bother
* releasing the freelist lock.
*/
- (void) xfs_qm_dqflush(dqp, XFS_QMOPT_DELWRI);
+ error = xfs_qm_dqflush(dqp, XFS_QMOPT_DELWRI);
+ if (error) {
+ xfs_fs_cmn_err(CE_WARN, dqp->q_mount,
+ "xfs_qm_dqreclaim: dquot %p flush failed", dqp);
+ }
xfs_dqunlock(dqp); /* dqflush unlocks dqflock */
continue;
}
@@ -2378,9 +2392,9 @@ xfs_qm_write_sb_changes(
}
xfs_mod_sb(tp, flags);
- (void) xfs_trans_commit(tp, 0);
+ error = xfs_trans_commit(tp, 0);
- return 0;
+ return error;
}
diff --git a/fs/xfs/quota/xfs_qm.h b/fs/xfs/quota/xfs_qm.h
index baf537c1c17..cd2300e374a 100644
--- a/fs/xfs/quota/xfs_qm.h
+++ b/fs/xfs/quota/xfs_qm.h
@@ -165,7 +165,7 @@ typedef struct xfs_dquot_acct {
#define XFS_QM_RELE(xqm) ((xqm)->qm_nrefs--)
extern void xfs_qm_destroy_quotainfo(xfs_mount_t *);
-extern int xfs_qm_mount_quotas(xfs_mount_t *, int);
+extern void xfs_qm_mount_quotas(xfs_mount_t *, int);
extern int xfs_qm_quotacheck(xfs_mount_t *);
extern void xfs_qm_unmount_quotadestroy(xfs_mount_t *);
extern int xfs_qm_unmount_quotas(xfs_mount_t *);
diff --git a/fs/xfs/quota/xfs_qm_stats.h b/fs/xfs/quota/xfs_qm_stats.h
index a50ffabcf55..5b964fc0dc0 100644
--- a/fs/xfs/quota/xfs_qm_stats.h
+++ b/fs/xfs/quota/xfs_qm_stats.h
@@ -45,8 +45,8 @@ extern void xfs_qm_cleanup_procfs(void);
# define XQM_STATS_INC(count) do { } while (0)
-static __inline void xfs_qm_init_procfs(void) { };
-static __inline void xfs_qm_cleanup_procfs(void) { };
+static inline void xfs_qm_init_procfs(void) { };
+static inline void xfs_qm_cleanup_procfs(void) { };
#endif
diff --git a/fs/xfs/quota/xfs_qm_syscalls.c b/fs/xfs/quota/xfs_qm_syscalls.c
index d2b8be7e75f..8342823dbdc 100644
--- a/fs/xfs/quota/xfs_qm_syscalls.c
+++ b/fs/xfs/quota/xfs_qm_syscalls.c
@@ -279,9 +279,12 @@ xfs_qm_scall_quotaoff(
/*
* Write the LI_QUOTAOFF log record, and do SB changes atomically,
- * and synchronously.
+ * and synchronously. If we fail to write, we should abort the
+ * operation as it cannot be recovered safely if we crash.
*/
- xfs_qm_log_quotaoff(mp, &qoffstart, flags);
+ error = xfs_qm_log_quotaoff(mp, &qoffstart, flags);
+ if (error)
+ goto out_error;
/*
* Next we clear the XFS_MOUNT_*DQ_ACTIVE bit(s) in the mount struct
@@ -337,7 +340,12 @@ xfs_qm_scall_quotaoff(
* So, we have QUOTAOFF start and end logitems; the start
* logitem won't get overwritten until the end logitem appears...
*/
- xfs_qm_log_quotaoff_end(mp, qoffstart, flags);
+ error = xfs_qm_log_quotaoff_end(mp, qoffstart, flags);
+ if (error) {
+ /* Logging the quotaoff end item failed. Shutdown is the only option. */
+ xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
+ goto out_error;
+ }
/*
* If quotas is completely disabled, close shop.
@@ -361,6 +369,7 @@ xfs_qm_scall_quotaoff(
XFS_PURGE_INODE(XFS_QI_GQIP(mp));
XFS_QI_GQIP(mp) = NULL;
}
+out_error:
mutex_unlock(&(XFS_QI_QOFFLOCK(mp)));
return (error);
@@ -371,12 +380,11 @@ xfs_qm_scall_trunc_qfiles(
xfs_mount_t *mp,
uint flags)
{
- int error;
+ int error = 0, error2 = 0;
xfs_inode_t *qip;
if (!capable(CAP_SYS_ADMIN))
return XFS_ERROR(EPERM);
- error = 0;
if (!xfs_sb_version_hasquota(&mp->m_sb) || flags == 0) {
qdprintk("qtrunc flags=%x m_qflags=%x\n", flags, mp->m_qflags);
return XFS_ERROR(EINVAL);
@@ -384,22 +392,22 @@ xfs_qm_scall_trunc_qfiles(
if ((flags & XFS_DQ_USER) && mp->m_sb.sb_uquotino != NULLFSINO) {
error = xfs_iget(mp, NULL, mp->m_sb.sb_uquotino, 0, 0, &qip, 0);
- if (! error) {
- (void) xfs_truncate_file(mp, qip);
- VN_RELE(XFS_ITOV(qip));
+ if (!error) {
+ error = xfs_truncate_file(mp, qip);
+ IRELE(qip);
}
}
if ((flags & (XFS_DQ_GROUP|XFS_DQ_PROJ)) &&
mp->m_sb.sb_gquotino != NULLFSINO) {
- error = xfs_iget(mp, NULL, mp->m_sb.sb_gquotino, 0, 0, &qip, 0);
- if (! error) {
- (void) xfs_truncate_file(mp, qip);
- VN_RELE(XFS_ITOV(qip));
+ error2 = xfs_iget(mp, NULL, mp->m_sb.sb_gquotino, 0, 0, &qip, 0);
+ if (!error2) {
+ error2 = xfs_truncate_file(mp, qip);
+ IRELE(qip);
}
}
- return (error);
+ return error ? error : error2;
}
@@ -552,13 +560,13 @@ xfs_qm_scall_getqstat(
out->qs_uquota.qfs_nblks = uip->i_d.di_nblocks;
out->qs_uquota.qfs_nextents = uip->i_d.di_nextents;
if (tempuqip)
- VN_RELE(XFS_ITOV(uip));
+ IRELE(uip);
}
if (gip) {
out->qs_gquota.qfs_nblks = gip->i_d.di_nblocks;
out->qs_gquota.qfs_nextents = gip->i_d.di_nextents;
if (tempgqip)
- VN_RELE(XFS_ITOV(gip));
+ IRELE(gip);
}
if (mp->m_quotainfo) {
out->qs_incoredqs = XFS_QI_MPLNDQUOTS(mp);
@@ -726,12 +734,12 @@ xfs_qm_scall_setqlim(
xfs_trans_log_dquot(tp, dqp);
xfs_dqtrace_entry(dqp, "Q_SETQLIM: COMMIT");
- xfs_trans_commit(tp, 0);
+ error = xfs_trans_commit(tp, 0);
xfs_qm_dqprint(dqp);
xfs_qm_dqrele(dqp);
mutex_unlock(&(XFS_QI_QOFFLOCK(mp)));
- return (0);
+ return error;
}
STATIC int
@@ -1095,7 +1103,7 @@ again:
* inactive code in hell.
*/
if (vnode_refd)
- VN_RELE(vp);
+ IRELE(ip);
XFS_MOUNT_ILOCK(mp);
/*
* If an inode was inserted or removed, we gotta
diff --git a/fs/xfs/support/ktrace.c b/fs/xfs/support/ktrace.c
index 129067cfcb8..0b75d302508 100644
--- a/fs/xfs/support/ktrace.c
+++ b/fs/xfs/support/ktrace.c
@@ -24,7 +24,7 @@ static int ktrace_zentries;
void __init
ktrace_init(int zentries)
{
- ktrace_zentries = zentries;
+ ktrace_zentries = roundup_pow_of_two(zentries);
ktrace_hdr_zone = kmem_zone_init(sizeof(ktrace_t),
"ktrace_hdr");
@@ -47,13 +47,16 @@ ktrace_uninit(void)
* ktrace_alloc()
*
* Allocate a ktrace header and enough buffering for the given
- * number of entries.
+ * number of entries. Round the number of entries up to a
+ * power of 2 so we can do fast masking to get the index from
+ * the atomic index counter.
*/
ktrace_t *
ktrace_alloc(int nentries, unsigned int __nocast sleep)
{
ktrace_t *ktp;
ktrace_entry_t *ktep;
+ int entries;
ktp = (ktrace_t*)kmem_zone_alloc(ktrace_hdr_zone, sleep);
@@ -70,11 +73,12 @@ ktrace_alloc(int nentries, unsigned int __nocast sleep)
/*
* Special treatment for buffers with the ktrace_zentries entries
*/
- if (nentries == ktrace_zentries) {
+ entries = roundup_pow_of_two(nentries);
+ if (entries == ktrace_zentries) {
ktep = (ktrace_entry_t*)kmem_zone_zalloc(ktrace_ent_zone,
sleep);
} else {
- ktep = (ktrace_entry_t*)kmem_zalloc((nentries * sizeof(*ktep)),
+ ktep = (ktrace_entry_t*)kmem_zalloc((entries * sizeof(*ktep)),
sleep | KM_LARGE);
}
@@ -91,8 +95,10 @@ ktrace_alloc(int nentries, unsigned int __nocast sleep)
}
ktp->kt_entries = ktep;
- ktp->kt_nentries = nentries;
- ktp->kt_index = 0;
+ ktp->kt_nentries = entries;
+ ASSERT(is_power_of_2(entries));
+ ktp->kt_index_mask = entries - 1;
+ atomic_set(&ktp->kt_index, 0);
ktp->kt_rollover = 0;
return ktp;
}
@@ -151,8 +157,6 @@ ktrace_enter(
void *val14,
void *val15)
{
- static DEFINE_SPINLOCK(wrap_lock);
- unsigned long flags;
int index;
ktrace_entry_t *ktep;
@@ -161,12 +165,8 @@ ktrace_enter(
/*
* Grab an entry by pushing the index up to the next one.
*/
- spin_lock_irqsave(&wrap_lock, flags);
- index = ktp->kt_index;
- if (++ktp->kt_index == ktp->kt_nentries)
- ktp->kt_index = 0;
- spin_unlock_irqrestore(&wrap_lock, flags);
-
+ index = atomic_add_return(1, &ktp->kt_index);
+ index = (index - 1) & ktp->kt_index_mask;
if (!ktp->kt_rollover && index == ktp->kt_nentries - 1)
ktp->kt_rollover = 1;
@@ -199,11 +199,12 @@ int
ktrace_nentries(
ktrace_t *ktp)
{
- if (ktp == NULL) {
+ int index;
+ if (ktp == NULL)
return 0;
- }
- return (ktp->kt_rollover ? ktp->kt_nentries : ktp->kt_index);
+ index = atomic_read(&ktp->kt_index) & ktp->kt_index_mask;
+ return (ktp->kt_rollover ? ktp->kt_nentries : index);
}
/*
@@ -228,7 +229,7 @@ ktrace_first(ktrace_t *ktp, ktrace_snap_t *ktsp)
int nentries;
if (ktp->kt_rollover)
- index = ktp->kt_index;
+ index = atomic_read(&ktp->kt_index) & ktp->kt_index_mask;
else
index = 0;
diff --git a/fs/xfs/support/ktrace.h b/fs/xfs/support/ktrace.h
index 56e72b40a85..741d6947ca6 100644
--- a/fs/xfs/support/ktrace.h
+++ b/fs/xfs/support/ktrace.h
@@ -30,7 +30,8 @@ typedef struct ktrace_entry {
*/
typedef struct ktrace {
int kt_nentries; /* number of entries in trace buf */
- int kt_index; /* current index in entries */
+ atomic_t kt_index; /* current index in entries */
+ unsigned int kt_index_mask;
int kt_rollover;
ktrace_entry_t *kt_entries; /* buffer of entries */
} ktrace_t;
diff --git a/fs/xfs/xfs.h b/fs/xfs/xfs.h
index 540e4c98982..765aaf65e2d 100644
--- a/fs/xfs/xfs.h
+++ b/fs/xfs/xfs.h
@@ -22,7 +22,7 @@
#define STATIC
#define DEBUG 1
#define XFS_BUF_LOCK_TRACKING 1
-/* #define QUOTADEBUG 1 */
+#define QUOTADEBUG 1
#endif
#ifdef CONFIG_XFS_TRACE
diff --git a/fs/xfs/xfs_acl.c b/fs/xfs/xfs_acl.c
index 7272fe39a92..8e130b9720a 100644
--- a/fs/xfs/xfs_acl.c
+++ b/fs/xfs/xfs_acl.c
@@ -307,12 +307,13 @@ xfs_acl_vset(
VN_HOLD(vp);
error = xfs_acl_allow_set(vp, kind);
- if (error)
- goto out;
/* Incoming ACL exists, set file mode based on its value */
- if (kind == _ACL_TYPE_ACCESS)
- xfs_acl_setmode(vp, xfs_acl, &basicperms);
+ if (!error && kind == _ACL_TYPE_ACCESS)
+ error = xfs_acl_setmode(vp, xfs_acl, &basicperms);
+
+ if (error)
+ goto out;
/*
* If we have more than std unix permissions, set up the actual attr.
@@ -323,7 +324,7 @@ xfs_acl_vset(
if (!basicperms) {
xfs_acl_set_attr(vp, xfs_acl, kind, &error);
} else {
- xfs_acl_vremove(vp, _ACL_TYPE_ACCESS);
+ error = -xfs_acl_vremove(vp, _ACL_TYPE_ACCESS);
}
out:
@@ -707,7 +708,9 @@ xfs_acl_inherit(
memcpy(cacl, pdaclp, sizeof(xfs_acl_t));
xfs_acl_filter_mode(mode, cacl);
- xfs_acl_setmode(vp, cacl, &basicperms);
+ error = xfs_acl_setmode(vp, cacl, &basicperms);
+ if (error)
+ goto out_error;
/*
* Set the Default and Access ACL on the file. The mode is already
@@ -720,6 +723,7 @@ xfs_acl_inherit(
xfs_acl_set_attr(vp, pdaclp, _ACL_TYPE_DEFAULT, &error);
if (!error && !basicperms)
xfs_acl_set_attr(vp, cacl, _ACL_TYPE_ACCESS, &error);
+out_error:
_ACL_FREE(cacl);
return error;
}
diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c
index bdbfbbee495..1956f83489f 100644
--- a/fs/xfs/xfs_alloc.c
+++ b/fs/xfs/xfs_alloc.c
@@ -45,7 +45,7 @@
#define XFSA_FIXUP_BNO_OK 1
#define XFSA_FIXUP_CNT_OK 2
-STATIC int
+STATIC void
xfs_alloc_search_busy(xfs_trans_t *tp,
xfs_agnumber_t agno,
xfs_agblock_t bno,
@@ -55,24 +55,24 @@ xfs_alloc_search_busy(xfs_trans_t *tp,
ktrace_t *xfs_alloc_trace_buf;
#define TRACE_ALLOC(s,a) \
- xfs_alloc_trace_alloc(__FUNCTION__, s, a, __LINE__)
+ xfs_alloc_trace_alloc(__func__, s, a, __LINE__)
#define TRACE_FREE(s,a,b,x,f) \
- xfs_alloc_trace_free(__FUNCTION__, s, mp, a, b, x, f, __LINE__)
+ xfs_alloc_trace_free(__func__, s, mp, a, b, x, f, __LINE__)
#define TRACE_MODAGF(s,a,f) \
- xfs_alloc_trace_modagf(__FUNCTION__, s, mp, a, f, __LINE__)
-#define TRACE_BUSY(__FUNCTION__,s,ag,agb,l,sl,tp) \
- xfs_alloc_trace_busy(__FUNCTION__, s, mp, ag, agb, l, sl, tp, XFS_ALLOC_KTRACE_BUSY, __LINE__)
-#define TRACE_UNBUSY(__FUNCTION__,s,ag,sl,tp) \
- xfs_alloc_trace_busy(__FUNCTION__, s, mp, ag, -1, -1, sl, tp, XFS_ALLOC_KTRACE_UNBUSY, __LINE__)
-#define TRACE_BUSYSEARCH(__FUNCTION__,s,ag,agb,l,sl,tp) \
- xfs_alloc_trace_busy(__FUNCTION__, s, mp, ag, agb, l, sl, tp, XFS_ALLOC_KTRACE_BUSYSEARCH, __LINE__)
+ xfs_alloc_trace_modagf(__func__, s, mp, a, f, __LINE__)
+#define TRACE_BUSY(__func__,s,ag,agb,l,sl,tp) \
+ xfs_alloc_trace_busy(__func__, s, mp, ag, agb, l, sl, tp, XFS_ALLOC_KTRACE_BUSY, __LINE__)
+#define TRACE_UNBUSY(__func__,s,ag,sl,tp) \
+ xfs_alloc_trace_busy(__func__, s, mp, ag, -1, -1, sl, tp, XFS_ALLOC_KTRACE_UNBUSY, __LINE__)
+#define TRACE_BUSYSEARCH(__func__,s,ag,agb,l,tp) \
+ xfs_alloc_trace_busy(__func__, s, mp, ag, agb, l, 0, tp, XFS_ALLOC_KTRACE_BUSYSEARCH, __LINE__)
#else
#define TRACE_ALLOC(s,a)
#define TRACE_FREE(s,a,b,x,f)
#define TRACE_MODAGF(s,a,f)
#define TRACE_BUSY(s,a,ag,agb,l,sl,tp)
#define TRACE_UNBUSY(fname,s,ag,sl,tp)
-#define TRACE_BUSYSEARCH(fname,s,ag,agb,l,sl,tp)
+#define TRACE_BUSYSEARCH(fname,s,ag,agb,l,tp)
#endif /* XFS_ALLOC_TRACE */
/*
@@ -93,7 +93,7 @@ STATIC int xfs_alloc_ag_vextent_small(xfs_alloc_arg_t *,
* Compute aligned version of the found extent.
* Takes alignment and min length into account.
*/
-STATIC int /* success (>= minlen) */
+STATIC void
xfs_alloc_compute_aligned(
xfs_agblock_t foundbno, /* starting block in found extent */
xfs_extlen_t foundlen, /* length in found extent */
@@ -116,7 +116,6 @@ xfs_alloc_compute_aligned(
}
*resbno = bno;
*reslen = len;
- return len >= minlen;
}
/*
@@ -837,9 +836,9 @@ xfs_alloc_ag_vextent_near(
if ((error = xfs_alloc_get_rec(cnt_cur, &ltbno, &ltlen, &i)))
goto error0;
XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
- if (!xfs_alloc_compute_aligned(ltbno, ltlen,
- args->alignment, args->minlen,
- &ltbnoa, &ltlena))
+ xfs_alloc_compute_aligned(ltbno, ltlen, args->alignment,
+ args->minlen, &ltbnoa, &ltlena);
+ if (ltlena < args->minlen)
continue;
args->len = XFS_EXTLEN_MIN(ltlena, args->maxlen);
xfs_alloc_fix_len(args);
@@ -958,9 +957,9 @@ xfs_alloc_ag_vextent_near(
if ((error = xfs_alloc_get_rec(bno_cur_lt, &ltbno, &ltlen, &i)))
goto error0;
XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
- if (xfs_alloc_compute_aligned(ltbno, ltlen,
- args->alignment, args->minlen,
- &ltbnoa, &ltlena))
+ xfs_alloc_compute_aligned(ltbno, ltlen, args->alignment,
+ args->minlen, &ltbnoa, &ltlena);
+ if (ltlena >= args->minlen)
break;
if ((error = xfs_alloc_decrement(bno_cur_lt, 0, &i)))
goto error0;
@@ -974,9 +973,9 @@ xfs_alloc_ag_vextent_near(
if ((error = xfs_alloc_get_rec(bno_cur_gt, &gtbno, &gtlen, &i)))
goto error0;
XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
- if (xfs_alloc_compute_aligned(gtbno, gtlen,
- args->alignment, args->minlen,
- &gtbnoa, &gtlena))
+ xfs_alloc_compute_aligned(gtbno, gtlen, args->alignment,
+ args->minlen, &gtbnoa, &gtlena);
+ if (gtlena >= args->minlen)
break;
if ((error = xfs_alloc_increment(bno_cur_gt, 0, &i)))
goto error0;
@@ -2562,9 +2561,10 @@ xfs_alloc_clear_busy(xfs_trans_t *tp,
/*
- * returns non-zero if any of (agno,bno):len is in a busy list
+ * If we find the extent in the busy list, force the log out to get the
+ * extent out of the busy list so the caller can use it straight away.
*/
-STATIC int
+STATIC void
xfs_alloc_search_busy(xfs_trans_t *tp,
xfs_agnumber_t agno,
xfs_agblock_t bno,
@@ -2572,7 +2572,6 @@ xfs_alloc_search_busy(xfs_trans_t *tp,
{
xfs_mount_t *mp;
xfs_perag_busy_t *bsy;
- int n;
xfs_agblock_t uend, bend;
xfs_lsn_t lsn;
int cnt;
@@ -2585,21 +2584,18 @@ xfs_alloc_search_busy(xfs_trans_t *tp,
uend = bno + len - 1;
/* search pagb_list for this slot, skipping open slots */
- for (bsy = mp->m_perag[agno].pagb_list, n = 0;
- cnt; bsy++, n++) {
+ for (bsy = mp->m_perag[agno].pagb_list; cnt; bsy++) {
/*
* (start1,length1) within (start2, length2)
*/
if (bsy->busy_tp != NULL) {
bend = bsy->busy_start + bsy->busy_length - 1;
- if ((bno > bend) ||
- (uend < bsy->busy_start)) {
+ if ((bno > bend) || (uend < bsy->busy_start)) {
cnt--;
} else {
TRACE_BUSYSEARCH("xfs_alloc_search_busy",
- "found1", agno, bno, len, n,
- tp);
+ "found1", agno, bno, len, tp);
break;
}
}
@@ -2610,15 +2606,12 @@ xfs_alloc_search_busy(xfs_trans_t *tp,
* transaction that freed the block
*/
if (cnt) {
- TRACE_BUSYSEARCH("xfs_alloc_search_busy", "found", agno, bno, len, n, tp);
+ TRACE_BUSYSEARCH("xfs_alloc_search_busy", "found", agno, bno, len, tp);
lsn = bsy->busy_tp->t_commit_lsn;
spin_unlock(&mp->m_perag[agno].pagb_lock);
xfs_log_force(mp, lsn, XFS_LOG_FORCE|XFS_LOG_SYNC);
} else {
- TRACE_BUSYSEARCH("xfs_alloc_search_busy", "not-found", agno, bno, len, n, tp);
- n = -1;
+ TRACE_BUSYSEARCH("xfs_alloc_search_busy", "not-found", agno, bno, len, tp);
spin_unlock(&mp->m_perag[agno].pagb_lock);
}
-
- return n;
}
diff --git a/fs/xfs/xfs_attr.c b/fs/xfs/xfs_attr.c
index e58f321fdae..36d781ee5fc 100644
--- a/fs/xfs/xfs_attr.c
+++ b/fs/xfs/xfs_attr.c
@@ -2647,14 +2647,6 @@ attr_trusted_capable(
}
STATIC int
-attr_secure_capable(
- bhv_vnode_t *vp,
- cred_t *cred)
-{
- return -ENOSECURITY;
-}
-
-STATIC int
attr_system_set(
bhv_vnode_t *vp, char *name, void *data, size_t size, int xflags)
{
@@ -2724,7 +2716,7 @@ struct attrnames attr_secure = {
.attr_get = attr_generic_get,
.attr_set = attr_generic_set,
.attr_remove = attr_generic_remove,
- .attr_capable = attr_secure_capable,
+ .attr_capable = (attrcapable_t)fs_noerr,
};
struct attrnames attr_user = {
diff --git a/fs/xfs/xfs_attr_leaf.c b/fs/xfs/xfs_attr_leaf.c
index 96ba6aa4ed8..303d41e4217 100644
--- a/fs/xfs/xfs_attr_leaf.c
+++ b/fs/xfs/xfs_attr_leaf.c
@@ -166,7 +166,7 @@ xfs_attr_shortform_bytesfit(xfs_inode_t *dp, int bytes)
if (!(mp->m_flags & XFS_MOUNT_ATTR2)) {
if (bytes <= XFS_IFORK_ASIZE(dp))
- return mp->m_attroffset >> 3;
+ return dp->i_d.di_forkoff;
return 0;
}
diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c
index 2def273855a..eb198c01c35 100644
--- a/fs/xfs/xfs_bmap.c
+++ b/fs/xfs/xfs_bmap.c
@@ -323,13 +323,13 @@ xfs_bmap_trace_pre_update(
int whichfork); /* data or attr fork */
#define XFS_BMAP_TRACE_DELETE(d,ip,i,c,w) \
- xfs_bmap_trace_delete(__FUNCTION__,d,ip,i,c,w)
+ xfs_bmap_trace_delete(__func__,d,ip,i,c,w)
#define XFS_BMAP_TRACE_INSERT(d,ip,i,c,r1,r2,w) \
- xfs_bmap_trace_insert(__FUNCTION__,d,ip,i,c,r1,r2,w)
+ xfs_bmap_trace_insert(__func__,d,ip,i,c,r1,r2,w)
#define XFS_BMAP_TRACE_POST_UPDATE(d,ip,i,w) \
- xfs_bmap_trace_post_update(__FUNCTION__,d,ip,i,w)
+ xfs_bmap_trace_post_update(__func__,d,ip,i,w)
#define XFS_BMAP_TRACE_PRE_UPDATE(d,ip,i,w) \
- xfs_bmap_trace_pre_update(__FUNCTION__,d,ip,i,w)
+ xfs_bmap_trace_pre_update(__func__,d,ip,i,w)
#else
#define XFS_BMAP_TRACE_DELETE(d,ip,i,c,w)
#define XFS_BMAP_TRACE_INSERT(d,ip,i,c,r1,r2,w)
@@ -2402,7 +2402,7 @@ xfs_bmap_extsize_align(
#define XFS_ALLOC_GAP_UNITS 4
-STATIC int
+STATIC void
xfs_bmap_adjacent(
xfs_bmalloca_t *ap) /* bmap alloc argument struct */
{
@@ -2548,7 +2548,6 @@ xfs_bmap_adjacent(
ap->rval = gotbno;
}
#undef ISVALID
- return 0;
}
STATIC int
@@ -4154,16 +4153,21 @@ xfs_bmap_compute_maxlevels(
* number of leaf entries, is controlled by the type of di_nextents
* (a signed 32-bit number, xfs_extnum_t), or by di_anextents
* (a signed 16-bit number, xfs_aextnum_t).
+ *
+ * Note that we can no longer assume that if we are in ATTR1 that
+ * the fork offset of all the inodes will be (m_attroffset >> 3)
+ * because we could have mounted with ATTR2 and then mounted back
+ * with ATTR1, keeping the di_forkoff's fixed but probably at
+ * various positions. Therefore, for both ATTR1 and ATTR2
+ * we have to assume the worst case scenario of a minimum size
+ * available.
*/
if (whichfork == XFS_DATA_FORK) {
maxleafents = MAXEXTNUM;
- sz = (mp->m_flags & XFS_MOUNT_ATTR2) ?
- XFS_BMDR_SPACE_CALC(MINDBTPTRS) : mp->m_attroffset;
+ sz = XFS_BMDR_SPACE_CALC(MINDBTPTRS);
} else {
maxleafents = MAXAEXTNUM;
- sz = (mp->m_flags & XFS_MOUNT_ATTR2) ?
- XFS_BMDR_SPACE_CALC(MINABTPTRS) :
- mp->m_sb.sb_inodesize - mp->m_attroffset;
+ sz = XFS_BMDR_SPACE_CALC(MINABTPTRS);
}
maxrootrecs = (int)XFS_BTREE_BLOCK_MAXRECS(sz, xfs_bmdr, 0);
minleafrecs = mp->m_bmap_dmnr[0];
@@ -5772,7 +5776,6 @@ xfs_getbmap(
int error; /* return value */
__int64_t fixlen; /* length for -1 case */
int i; /* extent number */
- bhv_vnode_t *vp; /* corresponding vnode */
int lock; /* lock state */
xfs_bmbt_irec_t *map; /* buffer for user's data */
xfs_mount_t *mp; /* file system mount point */
@@ -5789,7 +5792,6 @@ xfs_getbmap(
int bmapi_flags; /* flags for xfs_bmapi */
__int32_t oflags; /* getbmapx bmv_oflags field */
- vp = XFS_ITOV(ip);
mp = ip->i_mount;
whichfork = interface & BMV_IF_ATTRFORK ? XFS_ATTR_FORK : XFS_DATA_FORK;
@@ -5811,7 +5813,7 @@ xfs_getbmap(
if ((interface & BMV_IF_NO_DMAPI_READ) == 0 &&
DM_EVENT_ENABLED(ip, DM_EVENT_READ) &&
whichfork == XFS_DATA_FORK) {
- error = XFS_SEND_DATA(mp, DM_EVENT_READ, vp, 0, 0, 0, NULL);
+ error = XFS_SEND_DATA(mp, DM_EVENT_READ, ip, 0, 0, 0, NULL);
if (error)
return XFS_ERROR(error);
}
@@ -5869,6 +5871,10 @@ xfs_getbmap(
/* xfs_fsize_t last_byte = xfs_file_last_byte(ip); */
error = xfs_flush_pages(ip, (xfs_off_t)0,
-1, 0, FI_REMAPF);
+ if (error) {
+ xfs_iunlock(ip, XFS_IOLOCK_SHARED);
+ return error;
+ }
}
ASSERT(whichfork == XFS_ATTR_FORK || ip->i_delayed_blks == 0);
@@ -6162,10 +6168,10 @@ xfs_check_block(
}
if (*thispa == *pp) {
cmn_err(CE_WARN, "%s: thispa(%d) == pp(%d) %Ld",
- __FUNCTION__, j, i,
+ __func__, j, i,
(unsigned long long)be64_to_cpu(*thispa));
panic("%s: ptrs are equal in node\n",
- __FUNCTION__);
+ __func__);
}
}
}
@@ -6192,7 +6198,7 @@ xfs_bmap_check_leaf_extents(
xfs_mount_t *mp; /* file system mount structure */
__be64 *pp; /* pointer to block address */
xfs_bmbt_rec_t *ep; /* pointer to current extent */
- xfs_bmbt_rec_t *lastp; /* pointer to previous extent */
+ xfs_bmbt_rec_t last = {0, 0}; /* last extent in prev block */
xfs_bmbt_rec_t *nextp; /* pointer to next extent */
int bp_release = 0;
@@ -6262,7 +6268,6 @@ xfs_bmap_check_leaf_extents(
/*
* Loop over all leaf nodes checking that all extents are in the right order.
*/
- lastp = NULL;
for (;;) {
xfs_fsblock_t nextbno;
xfs_extnum_t num_recs;
@@ -6283,18 +6288,16 @@ xfs_bmap_check_leaf_extents(
*/
ep = XFS_BTREE_REC_ADDR(xfs_bmbt, block, 1);
+ if (i) {
+ xfs_btree_check_rec(XFS_BTNUM_BMAP, &last, ep);
+ }
for (j = 1; j < num_recs; j++) {
nextp = XFS_BTREE_REC_ADDR(xfs_bmbt, block, j + 1);
- if (lastp) {
- xfs_btree_check_rec(XFS_BTNUM_BMAP,
- (void *)lastp, (void *)ep);
- }
- xfs_btree_check_rec(XFS_BTNUM_BMAP, (void *)ep,
- (void *)(nextp));
- lastp = ep;
+ xfs_btree_check_rec(XFS_BTNUM_BMAP, ep, nextp);
ep = nextp;
}
+ last = *ep;
i += num_recs;
if (bp_release) {
bp_release = 0;
@@ -6325,13 +6328,13 @@ xfs_bmap_check_leaf_extents(
return;
error0:
- cmn_err(CE_WARN, "%s: at error0", __FUNCTION__);
+ cmn_err(CE_WARN, "%s: at error0", __func__);
if (bp_release)
xfs_trans_brelse(NULL, bp);
error_norelse:
cmn_err(CE_WARN, "%s: BAD after btree leaves for %d extents",
- __FUNCTION__, i);
- panic("%s: CORRUPTED BTREE OR SOMETHING", __FUNCTION__);
+ __func__, i);
+ panic("%s: CORRUPTED BTREE OR SOMETHING", __func__);
return;
}
#endif
diff --git a/fs/xfs/xfs_bmap.h b/fs/xfs/xfs_bmap.h
index 87224b7d798..6ff70cda451 100644
--- a/fs/xfs/xfs_bmap.h
+++ b/fs/xfs/xfs_bmap.h
@@ -151,7 +151,7 @@ xfs_bmap_trace_exlist(
xfs_extnum_t cnt, /* count of entries in list */
int whichfork); /* data or attr fork */
#define XFS_BMAP_TRACE_EXLIST(ip,c,w) \
- xfs_bmap_trace_exlist(__FUNCTION__,ip,c,w)
+ xfs_bmap_trace_exlist(__func__,ip,c,w)
#else
#define XFS_BMAP_TRACE_EXLIST(ip,c,w)
#endif
diff --git a/fs/xfs/xfs_bmap_btree.c b/fs/xfs/xfs_bmap_btree.c
index bd18987326a..4f0e849d973 100644
--- a/fs/xfs/xfs_bmap_btree.c
+++ b/fs/xfs/xfs_bmap_btree.c
@@ -275,21 +275,21 @@ xfs_bmbt_trace_cursor(
}
#define XFS_BMBT_TRACE_ARGBI(c,b,i) \
- xfs_bmbt_trace_argbi(__FUNCTION__, c, b, i, __LINE__)
+ xfs_bmbt_trace_argbi(__func__, c, b, i, __LINE__)
#define XFS_BMBT_TRACE_ARGBII(c,b,i,j) \
- xfs_bmbt_trace_argbii(__FUNCTION__, c, b, i, j, __LINE__)
+ xfs_bmbt_trace_argbii(__func__, c, b, i, j, __LINE__)
#define XFS_BMBT_TRACE_ARGFFFI(c,o,b,i,j) \
- xfs_bmbt_trace_argfffi(__FUNCTION__, c, o, b, i, j, __LINE__)
+ xfs_bmbt_trace_argfffi(__func__, c, o, b, i, j, __LINE__)
#define XFS_BMBT_TRACE_ARGI(c,i) \
- xfs_bmbt_trace_argi(__FUNCTION__, c, i, __LINE__)
+ xfs_bmbt_trace_argi(__func__, c, i, __LINE__)
#define XFS_BMBT_TRACE_ARGIFK(c,i,f,s) \
- xfs_bmbt_trace_argifk(__FUNCTION__, c, i, f, s, __LINE__)
+ xfs_bmbt_trace_argifk(__func__, c, i, f, s, __LINE__)
#define XFS_BMBT_TRACE_ARGIFR(c,i,f,r) \
- xfs_bmbt_trace_argifr(__FUNCTION__, c, i, f, r, __LINE__)
+ xfs_bmbt_trace_argifr(__func__, c, i, f, r, __LINE__)
#define XFS_BMBT_TRACE_ARGIK(c,i,k) \
- xfs_bmbt_trace_argik(__FUNCTION__, c, i, k, __LINE__)
+ xfs_bmbt_trace_argik(__func__, c, i, k, __LINE__)
#define XFS_BMBT_TRACE_CURSOR(c,s) \
- xfs_bmbt_trace_cursor(__FUNCTION__, c, s, __LINE__)
+ xfs_bmbt_trace_cursor(__func__, c, s, __LINE__)
#else
#define XFS_BMBT_TRACE_ARGBI(c,b,i)
#define XFS_BMBT_TRACE_ARGBII(c,b,i,j)
@@ -2027,6 +2027,24 @@ xfs_bmbt_increment(
/*
* Insert the current record at the point referenced by cur.
+ *
+ * A multi-level split of the tree on insert will invalidate the original
+ * cursor. It appears, however, that some callers assume that the cursor is
+ * always valid. Hence if we do a multi-level split we need to revalidate the
+ * cursor.
+ *
+ * When a split occurs, we will see a new cursor returned. Use that as a
+ * trigger to determine if we need to revalidate the original cursor. If we get
+ * a split, then use the original irec to look up the path of the record we
+ * just inserted.
+ *
+ * Note that the fact that the btree root is in the inode means that we can
+ * have the level of the tree change without a "split" occurring at the root
+ * level. What happens is that the root is migrated to an allocated block and
+ * the inode root is pointed to it. This means a single split can change the
+ * level of the tree (level 2 -> level 3) and invalidate the old cursor. Hence
+ * the level change should be accounted as a split so as to correctly trigger a
+ * revalidation of the old cursor.
*/
int /* error */
xfs_bmbt_insert(
@@ -2039,11 +2057,14 @@ xfs_bmbt_insert(
xfs_fsblock_t nbno;
xfs_btree_cur_t *ncur;
xfs_bmbt_rec_t nrec;
+ xfs_bmbt_irec_t oirec; /* original irec */
xfs_btree_cur_t *pcur;
+ int splits = 0;
XFS_BMBT_TRACE_CURSOR(cur, ENTRY);
level = 0;
nbno = NULLFSBLOCK;
+ oirec = cur->bc_rec.b;
xfs_bmbt_disk_set_all(&nrec, &cur->bc_rec.b);
ncur = NULL;
pcur = cur;
@@ -2052,11 +2073,13 @@ xfs_bmbt_insert(
&i))) {
if (pcur != cur)
xfs_btree_del_cursor(pcur, XFS_BTREE_ERROR);
- XFS_BMBT_TRACE_CURSOR(cur, ERROR);
- return error;
+ goto error0;
}
XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
if (pcur != cur && (ncur || nbno == NULLFSBLOCK)) {
+ /* allocating a new root is effectively a split */
+ if (cur->bc_nlevels != pcur->bc_nlevels)
+ splits++;
cur->bc_nlevels = pcur->bc_nlevels;
cur->bc_private.b.allocated +=
pcur->bc_private.b.allocated;
@@ -2070,10 +2093,21 @@ xfs_bmbt_insert(
xfs_btree_del_cursor(pcur, XFS_BTREE_NOERROR);
}
if (ncur) {
+ splits++;
pcur = ncur;
ncur = NULL;
}
} while (nbno != NULLFSBLOCK);
+
+ if (splits > 1) {
+ /* revalidate the old cursor as we had a multi-level split */
+ error = xfs_bmbt_lookup_eq(cur, oirec.br_startoff,
+ oirec.br_startblock, oirec.br_blockcount, &i);
+ if (error)
+ goto error0;
+ ASSERT(i == 1);
+ }
+
XFS_BMBT_TRACE_CURSOR(cur, EXIT);
*stat = i;
return 0;
diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
index 63debd147eb..53a71c62025 100644
--- a/fs/xfs/xfs_buf_item.c
+++ b/fs/xfs/xfs_buf_item.c
@@ -645,7 +645,12 @@ xfs_buf_item_push(
bp = bip->bli_buf;
if (XFS_BUF_ISDELAYWRITE(bp)) {
- xfs_bawrite(bip->bli_item.li_mountp, bp);
+ int error;
+ error = xfs_bawrite(bip->bli_item.li_mountp, bp);
+ if (error)
+ xfs_fs_cmn_err(CE_WARN, bip->bli_item.li_mountp,
+ "xfs_buf_item_push: pushbuf error %d on bip %p, bp %p",
+ error, bip, bp);
} else {
xfs_buf_relse(bp);
}
diff --git a/fs/xfs/xfs_dir2.c b/fs/xfs/xfs_dir2.c
index e92e73f0e6a..7cb26529766 100644
--- a/fs/xfs/xfs_dir2.c
+++ b/fs/xfs/xfs_dir2.c
@@ -44,6 +44,7 @@
#include "xfs_error.h"
#include "xfs_vnodeops.h"
+struct xfs_name xfs_name_dotdot = {"..", 2};
void
xfs_dir_mount(
@@ -146,8 +147,7 @@ int
xfs_dir_createname(
xfs_trans_t *tp,
xfs_inode_t *dp,
- char *name,
- int namelen,
+ struct xfs_name *name,
xfs_ino_t inum, /* new entry inode number */
xfs_fsblock_t *first, /* bmap's firstblock */
xfs_bmap_free_t *flist, /* bmap's freeblock list */
@@ -162,9 +162,9 @@ xfs_dir_createname(
return rval;
XFS_STATS_INC(xs_dir_create);
- args.name = name;
- args.namelen = namelen;
- args.hashval = xfs_da_hashname(name, namelen);
+ args.name = name->name;
+ args.namelen = name->len;
+ args.hashval = xfs_da_hashname(name->name, name->len);
args.inumber = inum;
args.dp = dp;
args.firstblock = first;
@@ -197,8 +197,7 @@ int
xfs_dir_lookup(
xfs_trans_t *tp,
xfs_inode_t *dp,
- char *name,
- int namelen,
+ struct xfs_name *name,
xfs_ino_t *inum) /* out: inode number */
{
xfs_da_args_t args;
@@ -207,18 +206,14 @@ xfs_dir_lookup(
ASSERT((dp->i_d.di_mode & S_IFMT) == S_IFDIR);
XFS_STATS_INC(xs_dir_lookup);
+ memset(&args, 0, sizeof(xfs_da_args_t));
- args.name = name;
- args.namelen = namelen;
- args.hashval = xfs_da_hashname(name, namelen);
- args.inumber = 0;
+ args.name = name->name;
+ args.namelen = name->len;
+ args.hashval = xfs_da_hashname(name->name, name->len);
args.dp = dp;
- args.firstblock = NULL;
- args.flist = NULL;
- args.total = 0;
args.whichfork = XFS_DATA_FORK;
args.trans = tp;
- args.justcheck = args.addname = 0;
args.oknoent = 1;
if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL)
@@ -247,8 +242,7 @@ int
xfs_dir_removename(
xfs_trans_t *tp,
xfs_inode_t *dp,
- char *name,
- int namelen,
+ struct xfs_name *name,
xfs_ino_t ino,
xfs_fsblock_t *first, /* bmap's firstblock */
xfs_bmap_free_t *flist, /* bmap's freeblock list */
@@ -261,9 +255,9 @@ xfs_dir_removename(
ASSERT((dp->i_d.di_mode & S_IFMT) == S_IFDIR);
XFS_STATS_INC(xs_dir_remove);
- args.name = name;
- args.namelen = namelen;
- args.hashval = xfs_da_hashname(name, namelen);
+ args.name = name->name;
+ args.namelen = name->len;
+ args.hashval = xfs_da_hashname(name->name, name->len);
args.inumber = ino;
args.dp = dp;
args.firstblock = first;
@@ -329,8 +323,7 @@ int
xfs_dir_replace(
xfs_trans_t *tp,
xfs_inode_t *dp,
- char *name, /* name of entry to replace */
- int namelen,
+ struct xfs_name *name, /* name of entry to replace */
xfs_ino_t inum, /* new inode number */
xfs_fsblock_t *first, /* bmap's firstblock */
xfs_bmap_free_t *flist, /* bmap's freeblock list */
@@ -345,9 +338,9 @@ xfs_dir_replace(
if ((rval = xfs_dir_ino_validate(tp->t_mountp, inum)))
return rval;
- args.name = name;
- args.namelen = namelen;
- args.hashval = xfs_da_hashname(name, namelen);
+ args.name = name->name;
+ args.namelen = name->len;
+ args.hashval = xfs_da_hashname(name->name, name->len);
args.inumber = inum;
args.dp = dp;
args.firstblock = first;
@@ -374,28 +367,29 @@ xfs_dir_replace(
/*
* See if this entry can be added to the directory without allocating space.
+ * First checks that the caller couldn't reserve enough space (resblks = 0).
*/
int
xfs_dir_canenter(
xfs_trans_t *tp,
xfs_inode_t *dp,
- char *name, /* name of entry to add */
- int namelen)
+ struct xfs_name *name, /* name of entry to add */
+ uint resblks)
{
xfs_da_args_t args;
int rval;
int v; /* type-checking value */
+ if (resblks)
+ return 0;
+
ASSERT((dp->i_d.di_mode & S_IFMT) == S_IFDIR);
+ memset(&args, 0, sizeof(xfs_da_args_t));
- args.name = name;
- args.namelen = namelen;
- args.hashval = xfs_da_hashname(name, namelen);
- args.inumber = 0;
+ args.name = name->name;
+ args.namelen = name->len;
+ args.hashval = xfs_da_hashname(name->name, name->len);
args.dp = dp;
- args.firstblock = NULL;
- args.flist = NULL;
- args.total = 0;
args.whichfork = XFS_DATA_FORK;
args.trans = tp;
args.justcheck = args.addname = args.oknoent = 1;
diff --git a/fs/xfs/xfs_dir2.h b/fs/xfs/xfs_dir2.h
index b265197e74c..6392f939029 100644
--- a/fs/xfs/xfs_dir2.h
+++ b/fs/xfs/xfs_dir2.h
@@ -59,6 +59,8 @@ typedef __uint32_t xfs_dir2_db_t;
*/
typedef xfs_off_t xfs_dir2_off_t;
+extern struct xfs_name xfs_name_dotdot;
+
/*
* Generic directory interface routines
*/
@@ -68,21 +70,21 @@ extern int xfs_dir_isempty(struct xfs_inode *dp);
extern int xfs_dir_init(struct xfs_trans *tp, struct xfs_inode *dp,
struct xfs_inode *pdp);
extern int xfs_dir_createname(struct xfs_trans *tp, struct xfs_inode *dp,
- char *name, int namelen, xfs_ino_t inum,
+ struct xfs_name *name, xfs_ino_t inum,
xfs_fsblock_t *first,
struct xfs_bmap_free *flist, xfs_extlen_t tot);
extern int xfs_dir_lookup(struct xfs_trans *tp, struct xfs_inode *dp,
- char *name, int namelen, xfs_ino_t *inum);
+ struct xfs_name *name, xfs_ino_t *inum);
extern int xfs_dir_removename(struct xfs_trans *tp, struct xfs_inode *dp,
- char *name, int namelen, xfs_ino_t ino,
+ struct xfs_name *name, xfs_ino_t ino,
xfs_fsblock_t *first,
struct xfs_bmap_free *flist, xfs_extlen_t tot);
extern int xfs_dir_replace(struct xfs_trans *tp, struct xfs_inode *dp,
- char *name, int namelen, xfs_ino_t inum,
+ struct xfs_name *name, xfs_ino_t inum,
xfs_fsblock_t *first,
struct xfs_bmap_free *flist, xfs_extlen_t tot);
extern int xfs_dir_canenter(struct xfs_trans *tp, struct xfs_inode *dp,
- char *name, int namelen);
+ struct xfs_name *name, uint resblks);
extern int xfs_dir_ino_validate(struct xfs_mount *mp, xfs_ino_t ino);
/*
diff --git a/fs/xfs/xfs_filestream.c b/fs/xfs/xfs_filestream.c
index eb03eab5ca5..3f3785b1080 100644
--- a/fs/xfs/xfs_filestream.c
+++ b/fs/xfs/xfs_filestream.c
@@ -73,7 +73,7 @@ xfs_filestreams_trace(
#define TRACE4(mp,t,a0,a1,a2,a3) TRACE6(mp,t,a0,a1,a2,a3,0,0)
#define TRACE5(mp,t,a0,a1,a2,a3,a4) TRACE6(mp,t,a0,a1,a2,a3,a4,0)
#define TRACE6(mp,t,a0,a1,a2,a3,a4,a5) \
- xfs_filestreams_trace(mp, t, __FUNCTION__, __LINE__, \
+ xfs_filestreams_trace(mp, t, __func__, __LINE__, \
(__psunsigned_t)a0, (__psunsigned_t)a1, \
(__psunsigned_t)a2, (__psunsigned_t)a3, \
(__psunsigned_t)a4, (__psunsigned_t)a5)
diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c
index 5a146cb2298..a64dfbd565a 100644
--- a/fs/xfs/xfs_ialloc.c
+++ b/fs/xfs/xfs_ialloc.c
@@ -107,6 +107,16 @@ xfs_ialloc_log_di(
/*
* Allocation group level functions.
*/
+static inline int
+xfs_ialloc_cluster_alignment(
+ xfs_alloc_arg_t *args)
+{
+ if (xfs_sb_version_hasalign(&args->mp->m_sb) &&
+ args->mp->m_sb.sb_inoalignmt >=
+ XFS_B_TO_FSBT(args->mp, XFS_INODE_CLUSTER_SIZE(args->mp)))
+ return args->mp->m_sb.sb_inoalignmt;
+ return 1;
+}
/*
* Allocate new inodes in the allocation group specified by agbp.
@@ -167,10 +177,24 @@ xfs_ialloc_ag_alloc(
args.mod = args.total = args.wasdel = args.isfl =
args.userdata = args.minalignslop = 0;
args.prod = 1;
- args.alignment = 1;
+
/*
- * Allow space for the inode btree to split.
+ * We need to take into account alignment here to ensure that
+ * we don't modify the free list if we fail to have an exact
+ * block. If we don't have an exact match, and every other
+ * allocation attempt fails, we'll end up cancelling
+ * a dirty transaction and shutting down.
+ *
+ * For an exact allocation, alignment must be 1,
+ * however we need to take cluster alignment into account when
+ * fixing up the freelist. Use the minalignslop field to
+ * indicate that extra blocks might be required for alignment,
+ * but not to use them in the actual exact allocation.
*/
+ args.alignment = 1;
+ args.minalignslop = xfs_ialloc_cluster_alignment(&args) - 1;
+
+ /* Allow space for the inode btree to split. */
args.minleft = XFS_IN_MAXLEVELS(args.mp) - 1;
if ((error = xfs_alloc_vextent(&args)))
return error;
@@ -191,13 +215,8 @@ xfs_ialloc_ag_alloc(
ASSERT(!(args.mp->m_flags & XFS_MOUNT_NOALIGN));
args.alignment = args.mp->m_dalign;
isaligned = 1;
- } else if (xfs_sb_version_hasalign(&args.mp->m_sb) &&
- args.mp->m_sb.sb_inoalignmt >=
- XFS_B_TO_FSBT(args.mp,
- XFS_INODE_CLUSTER_SIZE(args.mp)))
- args.alignment = args.mp->m_sb.sb_inoalignmt;
- else
- args.alignment = 1;
+ } else
+ args.alignment = xfs_ialloc_cluster_alignment(&args);
/*
* Need to figure out where to allocate the inode blocks.
* Ideally they should be spaced out through the a.g.
@@ -230,12 +249,7 @@ xfs_ialloc_ag_alloc(
args.agbno = be32_to_cpu(agi->agi_root);
args.fsbno = XFS_AGB_TO_FSB(args.mp,
be32_to_cpu(agi->agi_seqno), args.agbno);
- if (xfs_sb_version_hasalign(&args.mp->m_sb) &&
- args.mp->m_sb.sb_inoalignmt >=
- XFS_B_TO_FSBT(args.mp, XFS_INODE_CLUSTER_SIZE(args.mp)))
- args.alignment = args.mp->m_sb.sb_inoalignmt;
- else
- args.alignment = 1;
+ args.alignment = xfs_ialloc_cluster_alignment(&args);
if ((error = xfs_alloc_vextent(&args)))
return error;
}
diff --git a/fs/xfs/xfs_iget.c b/fs/xfs/xfs_iget.c
index 8e09b71f410..e657c512846 100644
--- a/fs/xfs/xfs_iget.c
+++ b/fs/xfs/xfs_iget.c
@@ -78,7 +78,6 @@ xfs_iget_core(
xfs_inode_t *ip;
xfs_inode_t *iq;
int error;
- xfs_icluster_t *icl, *new_icl = NULL;
unsigned long first_index, mask;
xfs_perag_t *pag;
xfs_agino_t agino;
@@ -229,11 +228,9 @@ finish_inode:
}
/*
- * This is a bit messy - we preallocate everything we _might_
- * need before we pick up the ici lock. That way we don't have to
- * juggle locks and go all the way back to the start.
+ * Preload the radix tree so we can insert safely under the
+ * write spinlock.
*/
- new_icl = kmem_zone_alloc(xfs_icluster_zone, KM_SLEEP);
if (radix_tree_preload(GFP_KERNEL)) {
xfs_idestroy(ip);
delay(1);
@@ -242,17 +239,6 @@ finish_inode:
mask = ~(((XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_inodelog)) - 1);
first_index = agino & mask;
write_lock(&pag->pag_ici_lock);
-
- /*
- * Find the cluster if it exists
- */
- icl = NULL;
- if (radix_tree_gang_lookup(&pag->pag_ici_root, (void**)&iq,
- first_index, 1)) {
- if ((XFS_INO_TO_AGINO(mp, iq->i_ino) & mask) == first_index)
- icl = iq->i_cluster;
- }
-
/*
* insert the new inode
*/
@@ -267,30 +253,13 @@ finish_inode:
}
/*
- * These values _must_ be set before releasing ihlock!
+ * These values _must_ be set before releasing the radix tree lock!
*/
ip->i_udquot = ip->i_gdquot = NULL;
xfs_iflags_set(ip, XFS_INEW);
- ASSERT(ip->i_cluster == NULL);
-
- if (!icl) {
- spin_lock_init(&new_icl->icl_lock);
- INIT_HLIST_HEAD(&new_icl->icl_inodes);
- icl = new_icl;
- new_icl = NULL;
- } else {
- ASSERT(!hlist_empty(&icl->icl_inodes));
- }
- spin_lock(&icl->icl_lock);
- hlist_add_head(&ip->i_cnode, &icl->icl_inodes);
- ip->i_cluster = icl;
- spin_unlock(&icl->icl_lock);
-
write_unlock(&pag->pag_ici_lock);
radix_tree_preload_end();
- if (new_icl)
- kmem_zone_free(xfs_icluster_zone, new_icl);
/*
* Link ip to its mount and thread it on the mount's inode list.
@@ -529,18 +498,6 @@ xfs_iextract(
xfs_put_perag(mp, pag);
/*
- * Remove from cluster list
- */
- mp = ip->i_mount;
- spin_lock(&ip->i_cluster->icl_lock);
- hlist_del(&ip->i_cnode);
- spin_unlock(&ip->i_cluster->icl_lock);
-
- /* was last inode in cluster? */
- if (hlist_empty(&ip->i_cluster->icl_inodes))
- kmem_zone_free(xfs_icluster_zone, ip->i_cluster);
-
- /*
* Remove from mount's inode list.
*/
XFS_MOUNT_ILOCK(mp);
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index f43a6e01d68..ca12acb9039 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -55,7 +55,6 @@
kmem_zone_t *xfs_ifork_zone;
kmem_zone_t *xfs_inode_zone;
-kmem_zone_t *xfs_icluster_zone;
/*
* Used in xfs_itruncate(). This is the maximum number of extents
@@ -126,6 +125,90 @@ xfs_inobp_check(
#endif
/*
+ * Find the buffer associated with the given inode map
+ * We do basic validation checks on the buffer once it has been
+ * retrieved from disk.
+ */
+STATIC int
+xfs_imap_to_bp(
+ xfs_mount_t *mp,
+ xfs_trans_t *tp,
+ xfs_imap_t *imap,
+ xfs_buf_t **bpp,
+ uint buf_flags,
+ uint imap_flags)
+{
+ int error;
+ int i;
+ int ni;
+ xfs_buf_t *bp;
+
+ error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, imap->im_blkno,
+ (int)imap->im_len, buf_flags, &bp);
+ if (error) {
+ if (error != EAGAIN) {
+ cmn_err(CE_WARN,
+ "xfs_imap_to_bp: xfs_trans_read_buf()returned "
+ "an error %d on %s. Returning error.",
+ error, mp->m_fsname);
+ } else {
+ ASSERT(buf_flags & XFS_BUF_TRYLOCK);
+ }
+ return error;
+ }
+
+ /*
+ * Validate the magic number and version of every inode in the buffer
+ * (if DEBUG kernel) or the first inode in the buffer, otherwise.
+ */
+#ifdef DEBUG
+ ni = BBTOB(imap->im_len) >> mp->m_sb.sb_inodelog;
+#else /* usual case */
+ ni = 1;
+#endif
+
+ for (i = 0; i < ni; i++) {
+ int di_ok;
+ xfs_dinode_t *dip;
+
+ dip = (xfs_dinode_t *)xfs_buf_offset(bp,
+ (i << mp->m_sb.sb_inodelog));
+ di_ok = be16_to_cpu(dip->di_core.di_magic) == XFS_DINODE_MAGIC &&
+ XFS_DINODE_GOOD_VERSION(dip->di_core.di_version);
+ if (unlikely(XFS_TEST_ERROR(!di_ok, mp,
+ XFS_ERRTAG_ITOBP_INOTOBP,
+ XFS_RANDOM_ITOBP_INOTOBP))) {
+ if (imap_flags & XFS_IMAP_BULKSTAT) {
+ xfs_trans_brelse(tp, bp);
+ return XFS_ERROR(EINVAL);
+ }
+ XFS_CORRUPTION_ERROR("xfs_imap_to_bp",
+ XFS_ERRLEVEL_HIGH, mp, dip);
+#ifdef DEBUG
+ cmn_err(CE_PANIC,
+ "Device %s - bad inode magic/vsn "
+ "daddr %lld #%d (magic=%x)",
+ XFS_BUFTARG_NAME(mp->m_ddev_targp),
+ (unsigned long long)imap->im_blkno, i,
+ be16_to_cpu(dip->di_core.di_magic));
+#endif
+ xfs_trans_brelse(tp, bp);
+ return XFS_ERROR(EFSCORRUPTED);
+ }
+ }
+
+ xfs_inobp_check(mp, bp);
+
+ /*
+ * Mark the buffer as an inode buffer now that it looks good
+ */
+ XFS_BUF_SET_VTYPE(bp, B_FS_INO);
+
+ *bpp = bp;
+ return 0;
+}
+
+/*
* This routine is called to map an inode number within a file
* system to the buffer containing the on-disk version of the
* inode. It returns a pointer to the buffer containing the
@@ -147,72 +230,19 @@ xfs_inotobp(
xfs_buf_t **bpp,
int *offset)
{
- int di_ok;
xfs_imap_t imap;
xfs_buf_t *bp;
int error;
- xfs_dinode_t *dip;
- /*
- * Call the space management code to find the location of the
- * inode on disk.
- */
imap.im_blkno = 0;
error = xfs_imap(mp, tp, ino, &imap, XFS_IMAP_LOOKUP);
- if (error != 0) {
- cmn_err(CE_WARN,
- "xfs_inotobp: xfs_imap() returned an "
- "error %d on %s. Returning error.", error, mp->m_fsname);
+ if (error)
return error;
- }
- /*
- * If the inode number maps to a block outside the bounds of the
- * file system then return NULL rather than calling read_buf
- * and panicing when we get an error from the driver.
- */
- if ((imap.im_blkno + imap.im_len) >
- XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks)) {
- cmn_err(CE_WARN,
- "xfs_inotobp: inode number (%llu + %d) maps to a block outside the bounds "
- "of the file system %s. Returning EINVAL.",
- (unsigned long long)imap.im_blkno,
- imap.im_len, mp->m_fsname);
- return XFS_ERROR(EINVAL);
- }
-
- /*
- * Read in the buffer. If tp is NULL, xfs_trans_read_buf() will
- * default to just a read_buf() call.
- */
- error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, imap.im_blkno,
- (int)imap.im_len, XFS_BUF_LOCK, &bp);
-
- if (error) {
- cmn_err(CE_WARN,
- "xfs_inotobp: xfs_trans_read_buf() returned an "
- "error %d on %s. Returning error.", error, mp->m_fsname);
+ error = xfs_imap_to_bp(mp, tp, &imap, &bp, XFS_BUF_LOCK, 0);
+ if (error)
return error;
- }
- dip = (xfs_dinode_t *)xfs_buf_offset(bp, 0);
- di_ok =
- be16_to_cpu(dip->di_core.di_magic) == XFS_DINODE_MAGIC &&
- XFS_DINODE_GOOD_VERSION(dip->di_core.di_version);
- if (unlikely(XFS_TEST_ERROR(!di_ok, mp, XFS_ERRTAG_ITOBP_INOTOBP,
- XFS_RANDOM_ITOBP_INOTOBP))) {
- XFS_CORRUPTION_ERROR("xfs_inotobp", XFS_ERRLEVEL_LOW, mp, dip);
- xfs_trans_brelse(tp, bp);
- cmn_err(CE_WARN,
- "xfs_inotobp: XFS_TEST_ERROR() returned an "
- "error on %s. Returning EFSCORRUPTED.", mp->m_fsname);
- return XFS_ERROR(EFSCORRUPTED);
- }
- xfs_inobp_check(mp, bp);
-
- /*
- * Set *dipp to point to the on-disk inode in the buffer.
- */
*dipp = (xfs_dinode_t *)xfs_buf_offset(bp, imap.im_boffset);
*bpp = bp;
*offset = imap.im_boffset;
@@ -248,46 +278,21 @@ xfs_itobp(
xfs_dinode_t **dipp,
xfs_buf_t **bpp,
xfs_daddr_t bno,
- uint imap_flags)
+ uint imap_flags,
+ uint buf_flags)
{
xfs_imap_t imap;
xfs_buf_t *bp;
int error;
- int i;
- int ni;
if (ip->i_blkno == (xfs_daddr_t)0) {
- /*
- * Call the space management code to find the location of the
- * inode on disk.
- */
imap.im_blkno = bno;
- if ((error = xfs_imap(mp, tp, ip->i_ino, &imap,
- XFS_IMAP_LOOKUP | imap_flags)))
+ error = xfs_imap(mp, tp, ip->i_ino, &imap,
+ XFS_IMAP_LOOKUP | imap_flags);
+ if (error)
return error;
/*
- * If the inode number maps to a block outside the bounds
- * of the file system then return NULL rather than calling
- * read_buf and panicing when we get an error from the
- * driver.
- */
- if ((imap.im_blkno + imap.im_len) >
- XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks)) {
-#ifdef DEBUG
- xfs_fs_cmn_err(CE_ALERT, mp, "xfs_itobp: "
- "(imap.im_blkno (0x%llx) "
- "+ imap.im_len (0x%llx)) > "
- " XFS_FSB_TO_BB(mp, "
- "mp->m_sb.sb_dblocks) (0x%llx)",
- (unsigned long long) imap.im_blkno,
- (unsigned long long) imap.im_len,
- XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks));
-#endif /* DEBUG */
- return XFS_ERROR(EINVAL);
- }
-
- /*
* Fill in the fields in the inode that will be used to
* map the inode to its buffer from now on.
*/
@@ -305,76 +310,17 @@ xfs_itobp(
}
ASSERT(bno == 0 || bno == imap.im_blkno);
- /*
- * Read in the buffer. If tp is NULL, xfs_trans_read_buf() will
- * default to just a read_buf() call.
- */
- error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, imap.im_blkno,
- (int)imap.im_len, XFS_BUF_LOCK, &bp);
- if (error) {
-#ifdef DEBUG
- xfs_fs_cmn_err(CE_ALERT, mp, "xfs_itobp: "
- "xfs_trans_read_buf() returned error %d, "
- "imap.im_blkno 0x%llx, imap.im_len 0x%llx",
- error, (unsigned long long) imap.im_blkno,
- (unsigned long long) imap.im_len);
-#endif /* DEBUG */
+ error = xfs_imap_to_bp(mp, tp, &imap, &bp, buf_flags, imap_flags);
+ if (error)
return error;
- }
-
- /*
- * Validate the magic number and version of every inode in the buffer
- * (if DEBUG kernel) or the first inode in the buffer, otherwise.
- * No validation is done here in userspace (xfs_repair).
- */
-#if !defined(__KERNEL__)
- ni = 0;
-#elif defined(DEBUG)
- ni = BBTOB(imap.im_len) >> mp->m_sb.sb_inodelog;
-#else /* usual case */
- ni = 1;
-#endif
-
- for (i = 0; i < ni; i++) {
- int di_ok;
- xfs_dinode_t *dip;
- dip = (xfs_dinode_t *)xfs_buf_offset(bp,
- (i << mp->m_sb.sb_inodelog));
- di_ok = be16_to_cpu(dip->di_core.di_magic) == XFS_DINODE_MAGIC &&
- XFS_DINODE_GOOD_VERSION(dip->di_core.di_version);
- if (unlikely(XFS_TEST_ERROR(!di_ok, mp,
- XFS_ERRTAG_ITOBP_INOTOBP,
- XFS_RANDOM_ITOBP_INOTOBP))) {
- if (imap_flags & XFS_IMAP_BULKSTAT) {
- xfs_trans_brelse(tp, bp);
- return XFS_ERROR(EINVAL);
- }
-#ifdef DEBUG
- cmn_err(CE_ALERT,
- "Device %s - bad inode magic/vsn "
- "daddr %lld #%d (magic=%x)",
- XFS_BUFTARG_NAME(mp->m_ddev_targp),
- (unsigned long long)imap.im_blkno, i,
- be16_to_cpu(dip->di_core.di_magic));
-#endif
- XFS_CORRUPTION_ERROR("xfs_itobp", XFS_ERRLEVEL_HIGH,
- mp, dip);
- xfs_trans_brelse(tp, bp);
- return XFS_ERROR(EFSCORRUPTED);
- }
+ if (!bp) {
+ ASSERT(buf_flags & XFS_BUF_TRYLOCK);
+ ASSERT(tp == NULL);
+ *bpp = NULL;
+ return EAGAIN;
}
- xfs_inobp_check(mp, bp);
-
- /*
- * Mark the buffer as an inode buffer now that it looks good
- */
- XFS_BUF_SET_VTYPE(bp, B_FS_INO);
-
- /*
- * Set *dipp to point to the on-disk inode in the buffer.
- */
*dipp = (xfs_dinode_t *)xfs_buf_offset(bp, imap.im_boffset);
*bpp = bp;
return 0;
@@ -878,7 +824,7 @@ xfs_iread(
* return NULL as well. Set i_blkno to 0 so that xfs_itobp() will
* know that this is a new incore inode.
*/
- error = xfs_itobp(mp, tp, ip, &dip, &bp, bno, imap_flags);
+ error = xfs_itobp(mp, tp, ip, &dip, &bp, bno, imap_flags, XFS_BUF_LOCK);
if (error) {
kmem_zone_free(xfs_inode_zone, ip);
return error;
@@ -1518,51 +1464,50 @@ xfs_itruncate_start(
}
/*
- * Shrink the file to the given new_size. The new
- * size must be smaller than the current size.
- * This will free up the underlying blocks
- * in the removed range after a call to xfs_itruncate_start()
- * or xfs_atruncate_start().
+ * Shrink the file to the given new_size. The new size must be smaller than
+ * the current size. This will free up the underlying blocks in the removed
+ * range after a call to xfs_itruncate_start() or xfs_atruncate_start().
*
- * The transaction passed to this routine must have made
- * a permanent log reservation of at least XFS_ITRUNCATE_LOG_RES.
- * This routine may commit the given transaction and
- * start new ones, so make sure everything involved in
- * the transaction is tidy before calling here.
- * Some transaction will be returned to the caller to be
- * committed. The incoming transaction must already include
- * the inode, and both inode locks must be held exclusively.
- * The inode must also be "held" within the transaction. On
- * return the inode will be "held" within the returned transaction.
- * This routine does NOT require any disk space to be reserved
- * for it within the transaction.
+ * The transaction passed to this routine must have made a permanent log
+ * reservation of at least XFS_ITRUNCATE_LOG_RES. This routine may commit the
+ * given transaction and start new ones, so make sure everything involved in
+ * the transaction is tidy before calling here. Some transaction will be
+ * returned to the caller to be committed. The incoming transaction must
+ * already include the inode, and both inode locks must be held exclusively.
+ * The inode must also be "held" within the transaction. On return the inode
+ * will be "held" within the returned transaction. This routine does NOT
+ * require any disk space to be reserved for it within the transaction.
*
- * The fork parameter must be either xfs_attr_fork or xfs_data_fork,
- * and it indicates the fork which is to be truncated. For the
- * attribute fork we only support truncation to size 0.
+ * The fork parameter must be either xfs_attr_fork or xfs_data_fork, and it
+ * indicates the fork which is to be truncated. For the attribute fork we only
+ * support truncation to size 0.
*
- * We use the sync parameter to indicate whether or not the first
- * transaction we perform might have to be synchronous. For the attr fork,
- * it needs to be so if the unlink of the inode is not yet known to be
- * permanent in the log. This keeps us from freeing and reusing the
- * blocks of the attribute fork before the unlink of the inode becomes
- * permanent.
+ * We use the sync parameter to indicate whether or not the first transaction
+ * we perform might have to be synchronous. For the attr fork, it needs to be
+ * so if the unlink of the inode is not yet known to be permanent in the log.
+ * This keeps us from freeing and reusing the blocks of the attribute fork
+ * before the unlink of the inode becomes permanent.
*
- * For the data fork, we normally have to run synchronously if we're
- * being called out of the inactive path or we're being called
- * out of the create path where we're truncating an existing file.
- * Either way, the truncate needs to be sync so blocks don't reappear
- * in the file with altered data in case of a crash. wsync filesystems
- * can run the first case async because anything that shrinks the inode
- * has to run sync so by the time we're called here from inactive, the
- * inode size is permanently set to 0.
+ * For the data fork, we normally have to run synchronously if we're being
+ * called out of the inactive path or we're being called out of the create path
+ * where we're truncating an existing file. Either way, the truncate needs to
+ * be sync so blocks don't reappear in the file with altered data in case of a
+ * crash. wsync filesystems can run the first case async because anything that
+ * shrinks the inode has to run sync so by the time we're called here from
+ * inactive, the inode size is permanently set to 0.
*
- * Calls from the truncate path always need to be sync unless we're
- * in a wsync filesystem and the file has already been unlinked.
+ * Calls from the truncate path always need to be sync unless we're in a wsync
+ * filesystem and the file has already been unlinked.
*
- * The caller is responsible for correctly setting the sync parameter.
- * It gets too hard for us to guess here which path we're being called
- * out of just based on inode state.
+ * The caller is responsible for correctly setting the sync parameter. It gets
+ * too hard for us to guess here which path we're being called out of just
+ * based on inode state.
+ *
+ * If we get an error, we must return with the inode locked and linked into the
+ * current transaction. This keeps things simple for the higher level code,
+ * because it always knows that the inode is locked and held in the transaction
+ * that returns to it whether errors occur or not. We don't mark the inode
+ * dirty on error so that transactions can be easily aborted if possible.
*/
int
xfs_itruncate_finish(
@@ -1741,65 +1686,51 @@ xfs_itruncate_finish(
*/
error = xfs_bmap_finish(tp, &free_list, &committed);
ntp = *tp;
+ if (committed) {
+ /* link the inode into the next xact in the chain */
+ xfs_trans_ijoin(ntp, ip,
+ XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
+ xfs_trans_ihold(ntp, ip);
+ }
+
if (error) {
/*
- * If the bmap finish call encounters an error,
- * return to the caller where the transaction
- * can be properly aborted. We just need to
- * make sure we're not holding any resources
- * that we were not when we came in.
+ * If the bmap finish call encounters an error, return
+ * to the caller where the transaction can be properly
+ * aborted. We just need to make sure we're not
+ * holding any resources that we were not when we came
+ * in.
*
- * Aborting from this point might lose some
- * blocks in the file system, but oh well.
+ * Aborting from this point might lose some blocks in
+ * the file system, but oh well.
*/
xfs_bmap_cancel(&free_list);
- if (committed) {
- /*
- * If the passed in transaction committed
- * in xfs_bmap_finish(), then we want to
- * add the inode to this one before returning.
- * This keeps things simple for the higher
- * level code, because it always knows that
- * the inode is locked and held in the
- * transaction that returns to it whether
- * errors occur or not. We don't mark the
- * inode dirty so that this transaction can
- * be easily aborted if possible.
- */
- xfs_trans_ijoin(ntp, ip,
- XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
- xfs_trans_ihold(ntp, ip);
- }
return error;
}
if (committed) {
/*
- * The first xact was committed,
- * so add the inode to the new one.
- * Mark it dirty so it will be logged
- * and moved forward in the log as
- * part of every commit.
+ * Mark the inode dirty so it will be logged and
+ * moved forward in the log as part of every commit.
*/
- xfs_trans_ijoin(ntp, ip,
- XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
- xfs_trans_ihold(ntp, ip);
xfs_trans_log_inode(ntp, ip, XFS_ILOG_CORE);
}
+
ntp = xfs_trans_dup(ntp);
- (void) xfs_trans_commit(*tp, 0);
+ error = xfs_trans_commit(*tp, 0);
*tp = ntp;
- error = xfs_trans_reserve(ntp, 0, XFS_ITRUNCATE_LOG_RES(mp), 0,
- XFS_TRANS_PERM_LOG_RES,
- XFS_ITRUNCATE_LOG_COUNT);
- /*
- * Add the inode being truncated to the next chained
- * transaction.
- */
+
+ /* link the inode into the next transaction in the chain */
xfs_trans_ijoin(ntp, ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
xfs_trans_ihold(ntp, ip);
+
+ if (!error)
+ error = xfs_trans_reserve(ntp, 0,
+ XFS_ITRUNCATE_LOG_RES(mp), 0,
+ XFS_TRANS_PERM_LOG_RES,
+ XFS_ITRUNCATE_LOG_COUNT);
if (error)
- return (error);
+ return error;
}
/*
* Only update the size in the case of the data fork, but
@@ -1967,7 +1898,7 @@ xfs_iunlink(
* Here we put the head pointer into our next pointer,
* and then we fall through to point the head at us.
*/
- error = xfs_itobp(mp, tp, ip, &dip, &ibp, 0, 0);
+ error = xfs_itobp(mp, tp, ip, &dip, &ibp, 0, 0, XFS_BUF_LOCK);
if (error)
return error;
@@ -2075,7 +2006,7 @@ xfs_iunlink_remove(
* of dealing with the buffer when there is no need to
* change it.
*/
- error = xfs_itobp(mp, tp, ip, &dip, &ibp, 0, 0);
+ error = xfs_itobp(mp, tp, ip, &dip, &ibp, 0, 0, XFS_BUF_LOCK);
if (error) {
cmn_err(CE_WARN,
"xfs_iunlink_remove: xfs_itobp() returned an error %d on %s. Returning error.",
@@ -2137,7 +2068,7 @@ xfs_iunlink_remove(
* Now last_ibp points to the buffer previous to us on
* the unlinked list. Pull us from the list.
*/
- error = xfs_itobp(mp, tp, ip, &dip, &ibp, 0, 0);
+ error = xfs_itobp(mp, tp, ip, &dip, &ibp, 0, 0, XFS_BUF_LOCK);
if (error) {
cmn_err(CE_WARN,
"xfs_iunlink_remove: xfs_itobp() returned an error %d on %s. Returning error.",
@@ -2172,13 +2103,6 @@ xfs_iunlink_remove(
return 0;
}
-STATIC_INLINE int xfs_inode_clean(xfs_inode_t *ip)
-{
- return (((ip->i_itemp == NULL) ||
- !(ip->i_itemp->ili_format.ilf_fields & XFS_ILOG_ALL)) &&
- (ip->i_update_core == 0));
-}
-
STATIC void
xfs_ifree_cluster(
xfs_inode_t *free_ip,
@@ -2400,7 +2324,7 @@ xfs_ifree(
xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
- error = xfs_itobp(ip->i_mount, tp, ip, &dip, &ibp, 0, 0);
+ error = xfs_itobp(ip->i_mount, tp, ip, &dip, &ibp, 0, 0, XFS_BUF_LOCK);
if (error)
return error;
@@ -2678,14 +2602,31 @@ xfs_imap(
fsbno = imap->im_blkno ?
XFS_DADDR_TO_FSB(mp, imap->im_blkno) : NULLFSBLOCK;
error = xfs_dilocate(mp, tp, ino, &fsbno, &len, &off, flags);
- if (error != 0) {
+ if (error)
return error;
- }
+
imap->im_blkno = XFS_FSB_TO_DADDR(mp, fsbno);
imap->im_len = XFS_FSB_TO_BB(mp, len);
imap->im_agblkno = XFS_FSB_TO_AGBNO(mp, fsbno);
imap->im_ioffset = (ushort)off;
imap->im_boffset = (ushort)(off << mp->m_sb.sb_inodelog);
+
+ /*
+ * If the inode number maps to a block outside the bounds
+ * of the file system then return NULL rather than calling
+ * read_buf and panicing when we get an error from the
+ * driver.
+ */
+ if ((imap->im_blkno + imap->im_len) >
+ XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks)) {
+ xfs_fs_cmn_err(CE_ALERT, mp, "xfs_imap: "
+ "(imap->im_blkno (0x%llx) + imap->im_len (0x%llx)) > "
+ " XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks) (0x%llx)",
+ (unsigned long long) imap->im_blkno,
+ (unsigned long long) imap->im_len,
+ XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks));
+ return EINVAL;
+ }
return 0;
}
@@ -2826,38 +2767,41 @@ xfs_iunpin(
}
/*
- * This is called to wait for the given inode to be unpinned.
- * It will sleep until this happens. The caller must have the
- * inode locked in at least shared mode so that the buffer cannot
- * be subsequently pinned once someone is waiting for it to be
- * unpinned.
+ * This is called to unpin an inode. It can be directed to wait or to return
+ * immediately without waiting for the inode to be unpinned. The caller must
+ * have the inode locked in at least shared mode so that the buffer cannot be
+ * subsequently pinned once someone is waiting for it to be unpinned.
*/
STATIC void
-xfs_iunpin_wait(
- xfs_inode_t *ip)
+__xfs_iunpin_wait(
+ xfs_inode_t *ip,
+ int wait)
{
- xfs_inode_log_item_t *iip;
- xfs_lsn_t lsn;
+ xfs_inode_log_item_t *iip = ip->i_itemp;
ASSERT(ismrlocked(&ip->i_lock, MR_UPDATE | MR_ACCESS));
-
- if (atomic_read(&ip->i_pincount) == 0) {
+ if (atomic_read(&ip->i_pincount) == 0)
return;
- }
- iip = ip->i_itemp;
- if (iip && iip->ili_last_lsn) {
- lsn = iip->ili_last_lsn;
- } else {
- lsn = (xfs_lsn_t)0;
- }
+ /* Give the log a push to start the unpinning I/O */
+ xfs_log_force(ip->i_mount, (iip && iip->ili_last_lsn) ?
+ iip->ili_last_lsn : 0, XFS_LOG_FORCE);
+ if (wait)
+ wait_event(ip->i_ipin_wait, (atomic_read(&ip->i_pincount) == 0));
+}
- /*
- * Give the log a push so we don't wait here too long.
- */
- xfs_log_force(ip->i_mount, lsn, XFS_LOG_FORCE);
+static inline void
+xfs_iunpin_wait(
+ xfs_inode_t *ip)
+{
+ __xfs_iunpin_wait(ip, 1);
+}
- wait_event(ip->i_ipin_wait, (atomic_read(&ip->i_pincount) == 0));
+static inline void
+xfs_iunpin_nowait(
+ xfs_inode_t *ip)
+{
+ __xfs_iunpin_wait(ip, 0);
}
@@ -2932,7 +2876,7 @@ xfs_iextents_copy(
* format indicates the current state of the fork.
*/
/*ARGSUSED*/
-STATIC int
+STATIC void
xfs_iflush_fork(
xfs_inode_t *ip,
xfs_dinode_t *dip,
@@ -2953,16 +2897,16 @@ xfs_iflush_fork(
static const short extflag[2] =
{ XFS_ILOG_DEXT, XFS_ILOG_AEXT };
- if (iip == NULL)
- return 0;
+ if (!iip)
+ return;
ifp = XFS_IFORK_PTR(ip, whichfork);
/*
* This can happen if we gave up in iformat in an error path,
* for the attribute fork.
*/
- if (ifp == NULL) {
+ if (!ifp) {
ASSERT(whichfork == XFS_ATTR_FORK);
- return 0;
+ return;
}
cp = XFS_DFORK_PTR(dip, whichfork);
mp = ip->i_mount;
@@ -3023,8 +2967,145 @@ xfs_iflush_fork(
ASSERT(0);
break;
}
+}
+
+STATIC int
+xfs_iflush_cluster(
+ xfs_inode_t *ip,
+ xfs_buf_t *bp)
+{
+ xfs_mount_t *mp = ip->i_mount;
+ xfs_perag_t *pag = xfs_get_perag(mp, ip->i_ino);
+ unsigned long first_index, mask;
+ int ilist_size;
+ xfs_inode_t **ilist;
+ xfs_inode_t *iq;
+ int nr_found;
+ int clcount = 0;
+ int bufwasdelwri;
+ int i;
+
+ ASSERT(pag->pagi_inodeok);
+ ASSERT(pag->pag_ici_init);
+
+ ilist_size = XFS_INODE_CLUSTER_SIZE(mp) * sizeof(xfs_inode_t *);
+ ilist = kmem_alloc(ilist_size, KM_MAYFAIL);
+ if (!ilist)
+ return 0;
+
+ mask = ~(((XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_inodelog)) - 1);
+ first_index = XFS_INO_TO_AGINO(mp, ip->i_ino) & mask;
+ read_lock(&pag->pag_ici_lock);
+ /* really need a gang lookup range call here */
+ nr_found = radix_tree_gang_lookup(&pag->pag_ici_root, (void**)ilist,
+ first_index,
+ XFS_INODE_CLUSTER_SIZE(mp));
+ if (nr_found == 0)
+ goto out_free;
+
+ for (i = 0; i < nr_found; i++) {
+ iq = ilist[i];
+ if (iq == ip)
+ continue;
+ /* if the inode lies outside this cluster, we're done. */
+ if ((XFS_INO_TO_AGINO(mp, iq->i_ino) & mask) != first_index)
+ break;
+ /*
+ * Do an un-protected check to see if the inode is dirty and
+ * is a candidate for flushing. These checks will be repeated
+ * later after the appropriate locks are acquired.
+ */
+ if (xfs_inode_clean(iq) && xfs_ipincount(iq) == 0)
+ continue;
+
+ /*
+ * Try to get locks. If any are unavailable or it is pinned,
+ * then this inode cannot be flushed and is skipped.
+ */
+
+ if (!xfs_ilock_nowait(iq, XFS_ILOCK_SHARED))
+ continue;
+ if (!xfs_iflock_nowait(iq)) {
+ xfs_iunlock(iq, XFS_ILOCK_SHARED);
+ continue;
+ }
+ if (xfs_ipincount(iq)) {
+ xfs_ifunlock(iq);
+ xfs_iunlock(iq, XFS_ILOCK_SHARED);
+ continue;
+ }
+
+ /*
+ * arriving here means that this inode can be flushed. First
+ * re-check that it's dirty before flushing.
+ */
+ if (!xfs_inode_clean(iq)) {
+ int error;
+ error = xfs_iflush_int(iq, bp);
+ if (error) {
+ xfs_iunlock(iq, XFS_ILOCK_SHARED);
+ goto cluster_corrupt_out;
+ }
+ clcount++;
+ } else {
+ xfs_ifunlock(iq);
+ }
+ xfs_iunlock(iq, XFS_ILOCK_SHARED);
+ }
+
+ if (clcount) {
+ XFS_STATS_INC(xs_icluster_flushcnt);
+ XFS_STATS_ADD(xs_icluster_flushinode, clcount);
+ }
+out_free:
+ read_unlock(&pag->pag_ici_lock);
+ kmem_free(ilist, ilist_size);
return 0;
+
+
+cluster_corrupt_out:
+ /*
+ * Corruption detected in the clustering loop. Invalidate the
+ * inode buffer and shut down the filesystem.
+ */
+ read_unlock(&pag->pag_ici_lock);
+ /*
+ * Clean up the buffer. If it was B_DELWRI, just release it --
+ * brelse can handle it with no problems. If not, shut down the
+ * filesystem before releasing the buffer.
+ */
+ bufwasdelwri = XFS_BUF_ISDELAYWRITE(bp);
+ if (bufwasdelwri)
+ xfs_buf_relse(bp);
+
+ xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
+
+ if (!bufwasdelwri) {
+ /*
+ * Just like incore_relse: if we have b_iodone functions,
+ * mark the buffer as an error and call them. Otherwise
+ * mark it as stale and brelse.
+ */
+ if (XFS_BUF_IODONE_FUNC(bp)) {
+ XFS_BUF_CLR_BDSTRAT_FUNC(bp);
+ XFS_BUF_UNDONE(bp);
+ XFS_BUF_STALE(bp);
+ XFS_BUF_SHUT(bp);
+ XFS_BUF_ERROR(bp,EIO);
+ xfs_biodone(bp);
+ } else {
+ XFS_BUF_STALE(bp);
+ xfs_buf_relse(bp);
+ }
+ }
+
+ /*
+ * Unlocks the flush lock
+ */
+ xfs_iflush_abort(iq);
+ kmem_free(ilist, ilist_size);
+ return XFS_ERROR(EFSCORRUPTED);
}
/*
@@ -3046,11 +3127,7 @@ xfs_iflush(
xfs_dinode_t *dip;
xfs_mount_t *mp;
int error;
- /* REFERENCED */
- xfs_inode_t *iq;
- int clcount; /* count of inodes clustered */
- int bufwasdelwri;
- struct hlist_node *entry;
+ int noblock = (flags == XFS_IFLUSH_ASYNC_NOBLOCK);
enum { INT_DELWRI = (1 << 0), INT_ASYNC = (1 << 1) };
XFS_STATS_INC(xs_iflush_count);
@@ -3067,8 +3144,7 @@ xfs_iflush(
* If the inode isn't dirty, then just release the inode
* flush lock and do nothing.
*/
- if ((ip->i_update_core == 0) &&
- ((iip == NULL) || !(iip->ili_format.ilf_fields & XFS_ILOG_ALL))) {
+ if (xfs_inode_clean(ip)) {
ASSERT((iip != NULL) ?
!(iip->ili_item.li_flags & XFS_LI_IN_AIL) : 1);
xfs_ifunlock(ip);
@@ -3076,11 +3152,21 @@ xfs_iflush(
}
/*
- * We can't flush the inode until it is unpinned, so
- * wait for it. We know noone new can pin it, because
- * we are holding the inode lock shared and you need
- * to hold it exclusively to pin the inode.
+ * We can't flush the inode until it is unpinned, so wait for it if we
+ * are allowed to block. We know noone new can pin it, because we are
+ * holding the inode lock shared and you need to hold it exclusively to
+ * pin the inode.
+ *
+ * If we are not allowed to block, force the log out asynchronously so
+ * that when we come back the inode will be unpinned. If other inodes
+ * in the same cluster are dirty, they will probably write the inode
+ * out for us if they occur after the log force completes.
*/
+ if (noblock && xfs_ipincount(ip)) {
+ xfs_iunpin_nowait(ip);
+ xfs_ifunlock(ip);
+ return EAGAIN;
+ }
xfs_iunpin_wait(ip);
/*
@@ -3097,15 +3183,6 @@ xfs_iflush(
}
/*
- * Get the buffer containing the on-disk inode.
- */
- error = xfs_itobp(mp, NULL, ip, &dip, &bp, 0, 0);
- if (error) {
- xfs_ifunlock(ip);
- return error;
- }
-
- /*
* Decide how buffer will be flushed out. This is done before
* the call to xfs_iflush_int because this field is zeroed by it.
*/
@@ -3121,6 +3198,7 @@ xfs_iflush(
case XFS_IFLUSH_DELWRI_ELSE_SYNC:
flags = 0;
break;
+ case XFS_IFLUSH_ASYNC_NOBLOCK:
case XFS_IFLUSH_ASYNC:
case XFS_IFLUSH_DELWRI_ELSE_ASYNC:
flags = INT_ASYNC;
@@ -3140,6 +3218,7 @@ xfs_iflush(
case XFS_IFLUSH_DELWRI:
flags = INT_DELWRI;
break;
+ case XFS_IFLUSH_ASYNC_NOBLOCK:
case XFS_IFLUSH_ASYNC:
flags = INT_ASYNC;
break;
@@ -3154,94 +3233,41 @@ xfs_iflush(
}
/*
- * First flush out the inode that xfs_iflush was called with.
+ * Get the buffer containing the on-disk inode.
*/
- error = xfs_iflush_int(ip, bp);
- if (error) {
- goto corrupt_out;
+ error = xfs_itobp(mp, NULL, ip, &dip, &bp, 0, 0,
+ noblock ? XFS_BUF_TRYLOCK : XFS_BUF_LOCK);
+ if (error || !bp) {
+ xfs_ifunlock(ip);
+ return error;
}
/*
- * inode clustering:
- * see if other inodes can be gathered into this write
+ * First flush out the inode that xfs_iflush was called with.
*/
- spin_lock(&ip->i_cluster->icl_lock);
- ip->i_cluster->icl_buf = bp;
-
- clcount = 0;
- hlist_for_each_entry(iq, entry, &ip->i_cluster->icl_inodes, i_cnode) {
- if (iq == ip)
- continue;
-
- /*
- * Do an un-protected check to see if the inode is dirty and
- * is a candidate for flushing. These checks will be repeated
- * later after the appropriate locks are acquired.
- */
- iip = iq->i_itemp;
- if ((iq->i_update_core == 0) &&
- ((iip == NULL) ||
- !(iip->ili_format.ilf_fields & XFS_ILOG_ALL)) &&
- xfs_ipincount(iq) == 0) {
- continue;
- }
-
- /*
- * Try to get locks. If any are unavailable,
- * then this inode cannot be flushed and is skipped.
- */
-
- /* get inode locks (just i_lock) */
- if (xfs_ilock_nowait(iq, XFS_ILOCK_SHARED)) {
- /* get inode flush lock */
- if (xfs_iflock_nowait(iq)) {
- /* check if pinned */
- if (xfs_ipincount(iq) == 0) {
- /* arriving here means that
- * this inode can be flushed.
- * first re-check that it's
- * dirty
- */
- iip = iq->i_itemp;
- if ((iq->i_update_core != 0)||
- ((iip != NULL) &&
- (iip->ili_format.ilf_fields & XFS_ILOG_ALL))) {
- clcount++;
- error = xfs_iflush_int(iq, bp);
- if (error) {
- xfs_iunlock(iq,
- XFS_ILOCK_SHARED);
- goto cluster_corrupt_out;
- }
- } else {
- xfs_ifunlock(iq);
- }
- } else {
- xfs_ifunlock(iq);
- }
- }
- xfs_iunlock(iq, XFS_ILOCK_SHARED);
- }
- }
- spin_unlock(&ip->i_cluster->icl_lock);
-
- if (clcount) {
- XFS_STATS_INC(xs_icluster_flushcnt);
- XFS_STATS_ADD(xs_icluster_flushinode, clcount);
- }
+ error = xfs_iflush_int(ip, bp);
+ if (error)
+ goto corrupt_out;
/*
- * If the buffer is pinned then push on the log so we won't
+ * If the buffer is pinned then push on the log now so we won't
* get stuck waiting in the write for too long.
*/
- if (XFS_BUF_ISPINNED(bp)){
+ if (XFS_BUF_ISPINNED(bp))
xfs_log_force(mp, (xfs_lsn_t)0, XFS_LOG_FORCE);
- }
+
+ /*
+ * inode clustering:
+ * see if other inodes can be gathered into this write
+ */
+ error = xfs_iflush_cluster(ip, bp);
+ if (error)
+ goto cluster_corrupt_out;
if (flags & INT_DELWRI) {
xfs_bdwrite(mp, bp);
} else if (flags & INT_ASYNC) {
- xfs_bawrite(mp, bp);
+ error = xfs_bawrite(mp, bp);
} else {
error = xfs_bwrite(mp, bp);
}
@@ -3250,52 +3276,11 @@ xfs_iflush(
corrupt_out:
xfs_buf_relse(bp);
xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
- xfs_iflush_abort(ip);
- /*
- * Unlocks the flush lock
- */
- return XFS_ERROR(EFSCORRUPTED);
-
cluster_corrupt_out:
- /* Corruption detected in the clustering loop. Invalidate the
- * inode buffer and shut down the filesystem.
- */
- spin_unlock(&ip->i_cluster->icl_lock);
-
- /*
- * Clean up the buffer. If it was B_DELWRI, just release it --
- * brelse can handle it with no problems. If not, shut down the
- * filesystem before releasing the buffer.
- */
- if ((bufwasdelwri= XFS_BUF_ISDELAYWRITE(bp))) {
- xfs_buf_relse(bp);
- }
-
- xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
-
- if(!bufwasdelwri) {
- /*
- * Just like incore_relse: if we have b_iodone functions,
- * mark the buffer as an error and call them. Otherwise
- * mark it as stale and brelse.
- */
- if (XFS_BUF_IODONE_FUNC(bp)) {
- XFS_BUF_CLR_BDSTRAT_FUNC(bp);
- XFS_BUF_UNDONE(bp);
- XFS_BUF_STALE(bp);
- XFS_BUF_SHUT(bp);
- XFS_BUF_ERROR(bp,EIO);
- xfs_biodone(bp);
- } else {
- XFS_BUF_STALE(bp);
- xfs_buf_relse(bp);
- }
- }
-
- xfs_iflush_abort(iq);
/*
* Unlocks the flush lock
*/
+ xfs_iflush_abort(ip);
return XFS_ERROR(EFSCORRUPTED);
}
@@ -3325,8 +3310,7 @@ xfs_iflush_int(
* If the inode isn't dirty, then just release the inode
* flush lock and do nothing.
*/
- if ((ip->i_update_core == 0) &&
- ((iip == NULL) || !(iip->ili_format.ilf_fields & XFS_ILOG_ALL))) {
+ if (xfs_inode_clean(ip)) {
xfs_ifunlock(ip);
return 0;
}
@@ -3459,16 +3443,9 @@ xfs_iflush_int(
}
}
- if (xfs_iflush_fork(ip, dip, iip, XFS_DATA_FORK, bp) == EFSCORRUPTED) {
- goto corrupt_out;
- }
-
- if (XFS_IFORK_Q(ip)) {
- /*
- * The only error from xfs_iflush_fork is on the data fork.
- */
- (void) xfs_iflush_fork(ip, dip, iip, XFS_ATTR_FORK, bp);
- }
+ xfs_iflush_fork(ip, dip, iip, XFS_DATA_FORK, bp);
+ if (XFS_IFORK_Q(ip))
+ xfs_iflush_fork(ip, dip, iip, XFS_ATTR_FORK, bp);
xfs_inobp_check(mp, bp);
/*
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index bfcd72cbaee..93c37697a72 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -133,19 +133,6 @@ typedef struct dm_attrs_s {
} dm_attrs_t;
/*
- * This is the xfs inode cluster structure. This structure is used by
- * xfs_iflush to find inodes that share a cluster and can be flushed to disk at
- * the same time.
- */
-typedef struct xfs_icluster {
- struct hlist_head icl_inodes; /* list of inodes on cluster */
- xfs_daddr_t icl_blkno; /* starting block number of
- * the cluster */
- struct xfs_buf *icl_buf; /* the inode buffer */
- spinlock_t icl_lock; /* inode list lock */
-} xfs_icluster_t;
-
-/*
* This is the xfs in-core inode structure.
* Most of the on-disk inode is embedded in the i_d field.
*
@@ -240,10 +227,6 @@ typedef struct xfs_inode {
atomic_t i_pincount; /* inode pin count */
wait_queue_head_t i_ipin_wait; /* inode pinning wait queue */
spinlock_t i_flags_lock; /* inode i_flags lock */
-#ifdef HAVE_REFCACHE
- struct xfs_inode **i_refcache; /* ptr to entry in ref cache */
- struct xfs_inode *i_release; /* inode to unref */
-#endif
/* Miscellaneous state. */
unsigned short i_flags; /* see defined flags below */
unsigned char i_update_core; /* timestamps/size is dirty */
@@ -252,8 +235,6 @@ typedef struct xfs_inode {
unsigned int i_delayed_blks; /* count of delay alloc blks */
xfs_icdinode_t i_d; /* most of ondisk inode */
- xfs_icluster_t *i_cluster; /* cluster list header */
- struct hlist_node i_cnode; /* cluster link node */
xfs_fsize_t i_size; /* in-memory size */
xfs_fsize_t i_new_size; /* size when write completes */
@@ -461,6 +442,7 @@ xfs_iflags_test_and_clear(xfs_inode_t *ip, unsigned short flags)
#define XFS_IFLUSH_SYNC 3
#define XFS_IFLUSH_ASYNC 4
#define XFS_IFLUSH_DELWRI 5
+#define XFS_IFLUSH_ASYNC_NOBLOCK 6
/*
* Flags for xfs_itruncate_start().
@@ -515,7 +497,7 @@ int xfs_finish_reclaim_all(struct xfs_mount *, int);
*/
int xfs_itobp(struct xfs_mount *, struct xfs_trans *,
xfs_inode_t *, struct xfs_dinode **, struct xfs_buf **,
- xfs_daddr_t, uint);
+ xfs_daddr_t, uint, uint);
int xfs_iread(struct xfs_mount *, struct xfs_trans *, xfs_ino_t,
xfs_inode_t **, xfs_daddr_t, uint);
int xfs_iread_extents(struct xfs_trans *, xfs_inode_t *, int);
@@ -597,7 +579,6 @@ void xfs_inobp_check(struct xfs_mount *, struct xfs_buf *);
#define xfs_inobp_check(mp, bp)
#endif /* DEBUG */
-extern struct kmem_zone *xfs_icluster_zone;
extern struct kmem_zone *xfs_ifork_zone;
extern struct kmem_zone *xfs_inode_zone;
extern struct kmem_zone *xfs_ili_zone;
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index 2c775b4ae9e..93b5db453ea 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -40,6 +40,7 @@
#include "xfs_btree.h"
#include "xfs_ialloc.h"
#include "xfs_rw.h"
+#include "xfs_error.h"
kmem_zone_t *xfs_ili_zone; /* inode log item zone */
@@ -813,7 +814,12 @@ xfs_inode_item_pushbuf(
XFS_LOG_FORCE);
}
if (dopush) {
- xfs_bawrite(mp, bp);
+ int error;
+ error = xfs_bawrite(mp, bp);
+ if (error)
+ xfs_fs_cmn_err(CE_WARN, mp,
+ "xfs_inode_item_pushbuf: pushbuf error %d on iip %p, bp %p",
+ error, iip, bp);
} else {
xfs_buf_relse(bp);
}
diff --git a/fs/xfs/xfs_inode_item.h b/fs/xfs/xfs_inode_item.h
index bfe92ea1795..40513077ab3 100644
--- a/fs/xfs/xfs_inode_item.h
+++ b/fs/xfs/xfs_inode_item.h
@@ -168,6 +168,14 @@ static inline int xfs_ilog_fext(int w)
return (w == XFS_DATA_FORK ? XFS_ILOG_DEXT : XFS_ILOG_AEXT);
}
+static inline int xfs_inode_clean(xfs_inode_t *ip)
+{
+ return (!ip->i_itemp ||
+ !(ip->i_itemp->ili_format.ilf_fields & XFS_ILOG_ALL)) &&
+ !ip->i_update_core;
+}
+
+
#ifdef __KERNEL__
extern void xfs_inode_item_init(struct xfs_inode *, struct xfs_mount *);
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index fde37f87d52..fb3cf119141 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -802,8 +802,11 @@ xfs_iomap_write_allocate(
*/
nimaps = 1;
end_fsb = XFS_B_TO_FSB(mp, ip->i_size);
- xfs_bmap_last_offset(NULL, ip, &last_block,
- XFS_DATA_FORK);
+ error = xfs_bmap_last_offset(NULL, ip, &last_block,
+ XFS_DATA_FORK);
+ if (error)
+ goto trans_cancel;
+
last_block = XFS_FILEOFF_MAX(last_block, end_fsb);
if ((map_start_fsb + count_fsb) > last_block) {
count_fsb = last_block - map_start_fsb;
diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c
index f615e04364f..eb85bdedad0 100644
--- a/fs/xfs/xfs_itable.c
+++ b/fs/xfs/xfs_itable.c
@@ -129,7 +129,7 @@ xfs_bulkstat_one_iget(
return error;
}
-STATIC int
+STATIC void
xfs_bulkstat_one_dinode(
xfs_mount_t *mp, /* mount point for filesystem */
xfs_ino_t ino, /* inode number to get data for */
@@ -198,8 +198,6 @@ xfs_bulkstat_one_dinode(
buf->bs_blocks = be64_to_cpu(dic->di_nblocks);
break;
}
-
- return 0;
}
STATIC int
@@ -614,7 +612,8 @@ xfs_bulkstat(
xfs_buf_relse(bp);
error = xfs_itobp(mp, NULL, ip,
&dip, &bp, bno,
- XFS_IMAP_BULKSTAT);
+ XFS_IMAP_BULKSTAT,
+ XFS_BUF_LOCK);
if (!error)
clustidx = ip->i_boffset / mp->m_sb.sb_inodesize;
kmem_zone_free(xfs_inode_zone, ip);
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index 31f2b04f2c9..afaee301b0e 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -41,6 +41,7 @@
#include "xfs_inode.h"
#include "xfs_rw.h"
+kmem_zone_t *xfs_log_ticket_zone;
#define xlog_write_adv_cnt(ptr, len, off, bytes) \
{ (ptr) += (bytes); \
@@ -73,8 +74,6 @@ STATIC int xlog_state_get_iclog_space(xlog_t *log,
xlog_ticket_t *ticket,
int *continued_write,
int *logoffsetp);
-STATIC void xlog_state_put_ticket(xlog_t *log,
- xlog_ticket_t *tic);
STATIC int xlog_state_release_iclog(xlog_t *log,
xlog_in_core_t *iclog);
STATIC void xlog_state_switch_iclogs(xlog_t *log,
@@ -101,7 +100,6 @@ STATIC void xlog_ungrant_log_space(xlog_t *log,
/* local ticket functions */
-STATIC void xlog_state_ticket_alloc(xlog_t *log);
STATIC xlog_ticket_t *xlog_ticket_get(xlog_t *log,
int unit_bytes,
int count,
@@ -330,7 +328,7 @@ xfs_log_done(xfs_mount_t *mp,
*/
xlog_trace_loggrant(log, ticket, "xfs_log_done: (non-permanent)");
xlog_ungrant_log_space(log, ticket);
- xlog_state_put_ticket(log, ticket);
+ xlog_ticket_put(log, ticket);
} else {
xlog_trace_loggrant(log, ticket, "xfs_log_done: (permanent)");
xlog_regrant_reserve_log_space(log, ticket);
@@ -384,7 +382,27 @@ _xfs_log_force(
return xlog_state_sync_all(log, flags, log_flushed);
else
return xlog_state_sync(log, lsn, flags, log_flushed);
-} /* xfs_log_force */
+} /* _xfs_log_force */
+
+/*
+ * Wrapper for _xfs_log_force(), to be used when caller doesn't care
+ * about errors or whether the log was flushed or not. This is the normal
+ * interface to use when trying to unpin items or move the log forward.
+ */
+void
+xfs_log_force(
+ xfs_mount_t *mp,
+ xfs_lsn_t lsn,
+ uint flags)
+{
+ int error;
+ error = _xfs_log_force(mp, lsn, flags, NULL);
+ if (error) {
+ xfs_fs_cmn_err(CE_WARN, mp, "xfs_log_force: "
+ "error %d returned.", error);
+ }
+}
+
/*
* Attaches a new iclog I/O completion callback routine during
@@ -397,12 +415,10 @@ xfs_log_notify(xfs_mount_t *mp, /* mount of partition */
void *iclog_hndl, /* iclog to hang callback off */
xfs_log_callback_t *cb)
{
- xlog_t *log = mp->m_log;
xlog_in_core_t *iclog = (xlog_in_core_t *)iclog_hndl;
int abortflg;
- cb->cb_next = NULL;
- spin_lock(&log->l_icloglock);
+ spin_lock(&iclog->ic_callback_lock);
abortflg = (iclog->ic_state & XLOG_STATE_IOERROR);
if (!abortflg) {
ASSERT_ALWAYS((iclog->ic_state == XLOG_STATE_ACTIVE) ||
@@ -411,7 +427,7 @@ xfs_log_notify(xfs_mount_t *mp, /* mount of partition */
*(iclog->ic_callback_tail) = cb;
iclog->ic_callback_tail = &(cb->cb_next);
}
- spin_unlock(&log->l_icloglock);
+ spin_unlock(&iclog->ic_callback_lock);
return abortflg;
} /* xfs_log_notify */
@@ -471,6 +487,8 @@ xfs_log_reserve(xfs_mount_t *mp,
/* may sleep if need to allocate more tickets */
internal_ticket = xlog_ticket_get(log, unit_bytes, cnt,
client, flags);
+ if (!internal_ticket)
+ return XFS_ERROR(ENOMEM);
internal_ticket->t_trans_type = t_type;
*ticket = internal_ticket;
xlog_trace_loggrant(log, internal_ticket,
@@ -636,7 +654,8 @@ xfs_log_unmount_write(xfs_mount_t *mp)
if (mp->m_flags & XFS_MOUNT_RDONLY)
return 0;
- xfs_log_force(mp, 0, XFS_LOG_FORCE|XFS_LOG_SYNC);
+ error = _xfs_log_force(mp, 0, XFS_LOG_FORCE|XFS_LOG_SYNC, NULL);
+ ASSERT(error || !(XLOG_FORCED_SHUTDOWN(log)));
#ifdef DEBUG
first_iclog = iclog = log->l_iclog;
@@ -675,10 +694,10 @@ xfs_log_unmount_write(xfs_mount_t *mp)
spin_lock(&log->l_icloglock);
iclog = log->l_iclog;
- iclog->ic_refcnt++;
+ atomic_inc(&iclog->ic_refcnt);
spin_unlock(&log->l_icloglock);
xlog_state_want_sync(log, iclog);
- (void) xlog_state_release_iclog(log, iclog);
+ error = xlog_state_release_iclog(log, iclog);
spin_lock(&log->l_icloglock);
if (!(iclog->ic_state == XLOG_STATE_ACTIVE ||
@@ -695,7 +714,7 @@ xfs_log_unmount_write(xfs_mount_t *mp)
if (tic) {
xlog_trace_loggrant(log, tic, "unmount rec");
xlog_ungrant_log_space(log, tic);
- xlog_state_put_ticket(log, tic);
+ xlog_ticket_put(log, tic);
}
} else {
/*
@@ -713,11 +732,11 @@ xfs_log_unmount_write(xfs_mount_t *mp)
*/
spin_lock(&log->l_icloglock);
iclog = log->l_iclog;
- iclog->ic_refcnt++;
+ atomic_inc(&iclog->ic_refcnt);
spin_unlock(&log->l_icloglock);
xlog_state_want_sync(log, iclog);
- (void) xlog_state_release_iclog(log, iclog);
+ error = xlog_state_release_iclog(log, iclog);
spin_lock(&log->l_icloglock);
@@ -732,7 +751,7 @@ xfs_log_unmount_write(xfs_mount_t *mp)
}
}
- return 0;
+ return error;
} /* xfs_log_unmount_write */
/*
@@ -1210,7 +1229,6 @@ xlog_alloc_log(xfs_mount_t *mp,
spin_lock_init(&log->l_icloglock);
spin_lock_init(&log->l_grant_lock);
initnsema(&log->l_flushsema, 0, "ic-flush");
- xlog_state_ticket_alloc(log); /* wait until after icloglock inited */
/* log record size must be multiple of BBSIZE; see xlog_rec_header_t */
ASSERT((XFS_BUF_SIZE(bp) & BBMASK) == 0);
@@ -1240,9 +1258,9 @@ xlog_alloc_log(xfs_mount_t *mp,
XFS_BUF_SET_FSPRIVATE2(bp, (unsigned long)1);
iclog->ic_bp = bp;
iclog->hic_data = bp->b_addr;
-
+#ifdef DEBUG
log->l_iclog_bak[i] = (xfs_caddr_t)&(iclog->ic_header);
-
+#endif
head = &iclog->ic_header;
memset(head, 0, sizeof(xlog_rec_header_t));
head->h_magicno = cpu_to_be32(XLOG_HEADER_MAGIC_NUM);
@@ -1253,10 +1271,11 @@ xlog_alloc_log(xfs_mount_t *mp,
head->h_fmt = cpu_to_be32(XLOG_FMT);
memcpy(&head->h_fs_uuid, &mp->m_sb.sb_uuid, sizeof(uuid_t));
-
iclog->ic_size = XFS_BUF_SIZE(bp) - log->l_iclog_hsize;
iclog->ic_state = XLOG_STATE_ACTIVE;
iclog->ic_log = log;
+ atomic_set(&iclog->ic_refcnt, 0);
+ spin_lock_init(&iclog->ic_callback_lock);
iclog->ic_callback_tail = &(iclog->ic_callback);
iclog->ic_datap = (char *)iclog->hic_data + log->l_iclog_hsize;
@@ -1405,7 +1424,7 @@ xlog_sync(xlog_t *log,
int v2 = xfs_sb_version_haslogv2(&log->l_mp->m_sb);
XFS_STATS_INC(xs_log_writes);
- ASSERT(iclog->ic_refcnt == 0);
+ ASSERT(atomic_read(&iclog->ic_refcnt) == 0);
/* Add for LR header */
count_init = log->l_iclog_hsize + iclog->ic_offset;
@@ -1538,7 +1557,6 @@ STATIC void
xlog_dealloc_log(xlog_t *log)
{
xlog_in_core_t *iclog, *next_iclog;
- xlog_ticket_t *tic, *next_tic;
int i;
iclog = log->l_iclog;
@@ -1559,22 +1577,6 @@ xlog_dealloc_log(xlog_t *log)
spinlock_destroy(&log->l_icloglock);
spinlock_destroy(&log->l_grant_lock);
- /* XXXsup take a look at this again. */
- if ((log->l_ticket_cnt != log->l_ticket_tcnt) &&
- !XLOG_FORCED_SHUTDOWN(log)) {
- xfs_fs_cmn_err(CE_WARN, log->l_mp,
- "xlog_dealloc_log: (cnt: %d, total: %d)",
- log->l_ticket_cnt, log->l_ticket_tcnt);
- /* ASSERT(log->l_ticket_cnt == log->l_ticket_tcnt); */
-
- } else {
- tic = log->l_unmount_free;
- while (tic) {
- next_tic = tic->t_next;
- kmem_free(tic, PAGE_SIZE);
- tic = next_tic;
- }
- }
xfs_buf_free(log->l_xbuf);
#ifdef XFS_LOG_TRACE
if (log->l_trace != NULL) {
@@ -1987,7 +1989,7 @@ xlog_state_clean_log(xlog_t *log)
if (iclog->ic_state == XLOG_STATE_DIRTY) {
iclog->ic_state = XLOG_STATE_ACTIVE;
iclog->ic_offset = 0;
- iclog->ic_callback = NULL; /* don't need to free */
+ ASSERT(iclog->ic_callback == NULL);
/*
* If the number of ops in this iclog indicate it just
* contains the dummy transaction, we can
@@ -2190,37 +2192,40 @@ xlog_state_do_callback(
be64_to_cpu(iclog->ic_header.h_lsn);
spin_unlock(&log->l_grant_lock);
- /*
- * Keep processing entries in the callback list
- * until we come around and it is empty. We
- * need to atomically see that the list is
- * empty and change the state to DIRTY so that
- * we don't miss any more callbacks being added.
- */
- spin_lock(&log->l_icloglock);
} else {
+ spin_unlock(&log->l_icloglock);
ioerrors++;
}
- cb = iclog->ic_callback;
+ /*
+ * Keep processing entries in the callback list until
+ * we come around and it is empty. We need to
+ * atomically see that the list is empty and change the
+ * state to DIRTY so that we don't miss any more
+ * callbacks being added.
+ */
+ spin_lock(&iclog->ic_callback_lock);
+ cb = iclog->ic_callback;
while (cb) {
iclog->ic_callback_tail = &(iclog->ic_callback);
iclog->ic_callback = NULL;
- spin_unlock(&log->l_icloglock);
+ spin_unlock(&iclog->ic_callback_lock);
/* perform callbacks in the order given */
for (; cb; cb = cb_next) {
cb_next = cb->cb_next;
cb->cb_func(cb->cb_arg, aborted);
}
- spin_lock(&log->l_icloglock);
+ spin_lock(&iclog->ic_callback_lock);
cb = iclog->ic_callback;
}
loopdidcallbacks++;
funcdidcallbacks++;
+ spin_lock(&log->l_icloglock);
ASSERT(iclog->ic_callback == NULL);
+ spin_unlock(&iclog->ic_callback_lock);
if (!(iclog->ic_state & XLOG_STATE_IOERROR))
iclog->ic_state = XLOG_STATE_DIRTY;
@@ -2241,7 +2246,7 @@ xlog_state_do_callback(
repeats = 0;
xfs_fs_cmn_err(CE_WARN, log->l_mp,
"%s: possible infinite loop (%d iterations)",
- __FUNCTION__, flushcnt);
+ __func__, flushcnt);
}
} while (!ioerrors && loopdidcallbacks);
@@ -2309,7 +2314,7 @@ xlog_state_done_syncing(
ASSERT(iclog->ic_state == XLOG_STATE_SYNCING ||
iclog->ic_state == XLOG_STATE_IOERROR);
- ASSERT(iclog->ic_refcnt == 0);
+ ASSERT(atomic_read(&iclog->ic_refcnt) == 0);
ASSERT(iclog->ic_bwritecnt == 1 || iclog->ic_bwritecnt == 2);
@@ -2391,7 +2396,7 @@ restart:
ASSERT(iclog->ic_state == XLOG_STATE_ACTIVE);
head = &iclog->ic_header;
- iclog->ic_refcnt++; /* prevents sync */
+ atomic_inc(&iclog->ic_refcnt); /* prevents sync */
log_offset = iclog->ic_offset;
/* On the 1st write to an iclog, figure out lsn. This works
@@ -2423,12 +2428,12 @@ restart:
xlog_state_switch_iclogs(log, iclog, iclog->ic_size);
/* If I'm the only one writing to this iclog, sync it to disk */
- if (iclog->ic_refcnt == 1) {
+ if (atomic_read(&iclog->ic_refcnt) == 1) {
spin_unlock(&log->l_icloglock);
if ((error = xlog_state_release_iclog(log, iclog)))
return error;
} else {
- iclog->ic_refcnt--;
+ atomic_dec(&iclog->ic_refcnt);
spin_unlock(&log->l_icloglock);
}
goto restart;
@@ -2792,18 +2797,6 @@ xlog_ungrant_log_space(xlog_t *log,
/*
- * Atomically put back used ticket.
- */
-STATIC void
-xlog_state_put_ticket(xlog_t *log,
- xlog_ticket_t *tic)
-{
- spin_lock(&log->l_icloglock);
- xlog_ticket_put(log, tic);
- spin_unlock(&log->l_icloglock);
-} /* xlog_state_put_ticket */
-
-/*
* Flush iclog to disk if this is the last reference to the given iclog and
* the WANT_SYNC bit is set.
*
@@ -2813,33 +2806,35 @@ xlog_state_put_ticket(xlog_t *log,
*
*/
STATIC int
-xlog_state_release_iclog(xlog_t *log,
- xlog_in_core_t *iclog)
+xlog_state_release_iclog(
+ xlog_t *log,
+ xlog_in_core_t *iclog)
{
int sync = 0; /* do we sync? */
- xlog_assign_tail_lsn(log->l_mp);
+ if (iclog->ic_state & XLOG_STATE_IOERROR)
+ return XFS_ERROR(EIO);
- spin_lock(&log->l_icloglock);
+ ASSERT(atomic_read(&iclog->ic_refcnt) > 0);
+ if (!atomic_dec_and_lock(&iclog->ic_refcnt, &log->l_icloglock))
+ return 0;
if (iclog->ic_state & XLOG_STATE_IOERROR) {
spin_unlock(&log->l_icloglock);
return XFS_ERROR(EIO);
}
-
- ASSERT(iclog->ic_refcnt > 0);
ASSERT(iclog->ic_state == XLOG_STATE_ACTIVE ||
iclog->ic_state == XLOG_STATE_WANT_SYNC);
- if (--iclog->ic_refcnt == 0 &&
- iclog->ic_state == XLOG_STATE_WANT_SYNC) {
+ if (iclog->ic_state == XLOG_STATE_WANT_SYNC) {
+ /* update tail before writing to iclog */
+ xlog_assign_tail_lsn(log->l_mp);
sync++;
iclog->ic_state = XLOG_STATE_SYNCING;
iclog->ic_header.h_tail_lsn = cpu_to_be64(log->l_tail_lsn);
xlog_verify_tail_lsn(log, iclog, log->l_tail_lsn);
/* cycle incremented when incrementing curr_block */
}
-
spin_unlock(&log->l_icloglock);
/*
@@ -2849,11 +2844,9 @@ xlog_state_release_iclog(xlog_t *log,
* this iclog has consistent data, so we ignore IOERROR
* flags after this point.
*/
- if (sync) {
+ if (sync)
return xlog_sync(log, iclog);
- }
return 0;
-
} /* xlog_state_release_iclog */
@@ -2953,7 +2946,8 @@ xlog_state_sync_all(xlog_t *log, uint flags, int *log_flushed)
* previous iclog and go to sleep.
*/
if (iclog->ic_state == XLOG_STATE_DIRTY ||
- (iclog->ic_refcnt == 0 && iclog->ic_offset == 0)) {
+ (atomic_read(&iclog->ic_refcnt) == 0
+ && iclog->ic_offset == 0)) {
iclog = iclog->ic_prev;
if (iclog->ic_state == XLOG_STATE_ACTIVE ||
iclog->ic_state == XLOG_STATE_DIRTY)
@@ -2961,14 +2955,14 @@ xlog_state_sync_all(xlog_t *log, uint flags, int *log_flushed)
else
goto maybe_sleep;
} else {
- if (iclog->ic_refcnt == 0) {
+ if (atomic_read(&iclog->ic_refcnt) == 0) {
/* We are the only one with access to this
* iclog. Flush it out now. There should
* be a roundoff of zero to show that someone
* has already taken care of the roundoff from
* the previous sync.
*/
- iclog->ic_refcnt++;
+ atomic_inc(&iclog->ic_refcnt);
lsn = be64_to_cpu(iclog->ic_header.h_lsn);
xlog_state_switch_iclogs(log, iclog, 0);
spin_unlock(&log->l_icloglock);
@@ -3100,7 +3094,7 @@ try_again:
already_slept = 1;
goto try_again;
} else {
- iclog->ic_refcnt++;
+ atomic_inc(&iclog->ic_refcnt);
xlog_state_switch_iclogs(log, iclog, 0);
spin_unlock(&log->l_icloglock);
if (xlog_state_release_iclog(log, iclog))
@@ -3172,92 +3166,19 @@ xlog_state_want_sync(xlog_t *log, xlog_in_core_t *iclog)
*/
/*
- * Algorithm doesn't take into account page size. ;-(
- */
-STATIC void
-xlog_state_ticket_alloc(xlog_t *log)
-{
- xlog_ticket_t *t_list;
- xlog_ticket_t *next;
- xfs_caddr_t buf;
- uint i = (PAGE_SIZE / sizeof(xlog_ticket_t)) - 2;
-
- /*
- * The kmem_zalloc may sleep, so we shouldn't be holding the
- * global lock. XXXmiken: may want to use zone allocator.
- */
- buf = (xfs_caddr_t) kmem_zalloc(PAGE_SIZE, KM_SLEEP);
-
- spin_lock(&log->l_icloglock);
-
- /* Attach 1st ticket to Q, so we can keep track of allocated memory */
- t_list = (xlog_ticket_t *)buf;
- t_list->t_next = log->l_unmount_free;
- log->l_unmount_free = t_list++;
- log->l_ticket_cnt++;
- log->l_ticket_tcnt++;
-
- /* Next ticket becomes first ticket attached to ticket free list */
- if (log->l_freelist != NULL) {
- ASSERT(log->l_tail != NULL);
- log->l_tail->t_next = t_list;
- } else {
- log->l_freelist = t_list;
- }
- log->l_ticket_cnt++;
- log->l_ticket_tcnt++;
-
- /* Cycle through rest of alloc'ed memory, building up free Q */
- for ( ; i > 0; i--) {
- next = t_list + 1;
- t_list->t_next = next;
- t_list = next;
- log->l_ticket_cnt++;
- log->l_ticket_tcnt++;
- }
- t_list->t_next = NULL;
- log->l_tail = t_list;
- spin_unlock(&log->l_icloglock);
-} /* xlog_state_ticket_alloc */
-
-
-/*
- * Put ticket into free list
- *
- * Assumption: log lock is held around this call.
+ * Free a used ticket.
*/
STATIC void
xlog_ticket_put(xlog_t *log,
xlog_ticket_t *ticket)
{
sv_destroy(&ticket->t_sema);
-
- /*
- * Don't think caching will make that much difference. It's
- * more important to make debug easier.
- */
-#if 0
- /* real code will want to use LIFO for caching */
- ticket->t_next = log->l_freelist;
- log->l_freelist = ticket;
- /* no need to clear fields */
-#else
- /* When we debug, it is easier if tickets are cycled */
- ticket->t_next = NULL;
- if (log->l_tail) {
- log->l_tail->t_next = ticket;
- } else {
- ASSERT(log->l_freelist == NULL);
- log->l_freelist = ticket;
- }
- log->l_tail = ticket;
-#endif /* DEBUG */
- log->l_ticket_cnt++;
+ kmem_zone_free(xfs_log_ticket_zone, ticket);
} /* xlog_ticket_put */
/*
- * Grab ticket off freelist or allocation some more
+ * Allocate and initialise a new log ticket.
*/
STATIC xlog_ticket_t *
xlog_ticket_get(xlog_t *log,
@@ -3269,21 +3190,9 @@ xlog_ticket_get(xlog_t *log,
xlog_ticket_t *tic;
uint num_headers;
- alloc:
- if (log->l_freelist == NULL)
- xlog_state_ticket_alloc(log); /* potentially sleep */
-
- spin_lock(&log->l_icloglock);
- if (log->l_freelist == NULL) {
- spin_unlock(&log->l_icloglock);
- goto alloc;
- }
- tic = log->l_freelist;
- log->l_freelist = tic->t_next;
- if (log->l_freelist == NULL)
- log->l_tail = NULL;
- log->l_ticket_cnt--;
- spin_unlock(&log->l_icloglock);
+ tic = kmem_zone_zalloc(xfs_log_ticket_zone, KM_SLEEP|KM_MAYFAIL);
+ if (!tic)
+ return NULL;
/*
* Permanent reservations have up to 'cnt'-1 active log operations
@@ -3611,8 +3520,8 @@ xfs_log_force_umount(
* before we mark the filesystem SHUTDOWN and wake
* everybody up to tell the bad news.
*/
- spin_lock(&log->l_grant_lock);
spin_lock(&log->l_icloglock);
+ spin_lock(&log->l_grant_lock);
mp->m_flags |= XFS_MOUNT_FS_SHUTDOWN;
XFS_BUF_DONE(mp->m_sb_bp);
/*
diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h
index 4cdac048df5..d1d678ecb63 100644
--- a/fs/xfs/xfs_log.h
+++ b/fs/xfs/xfs_log.h
@@ -142,8 +142,9 @@ int _xfs_log_force(struct xfs_mount *mp,
xfs_lsn_t lsn,
uint flags,
int *log_forced);
-#define xfs_log_force(mp, lsn, flags) \
- _xfs_log_force(mp, lsn, flags, NULL);
+void xfs_log_force(struct xfs_mount *mp,
+ xfs_lsn_t lsn,
+ uint flags);
int xfs_log_mount(struct xfs_mount *mp,
struct xfs_buftarg *log_target,
xfs_daddr_t start_block,
diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h
index c6244cc733c..8952a392b5f 100644
--- a/fs/xfs/xfs_log_priv.h
+++ b/fs/xfs/xfs_log_priv.h
@@ -242,7 +242,7 @@ typedef struct xlog_res {
typedef struct xlog_ticket {
sv_t t_sema; /* sleep on this semaphore : 20 */
- struct xlog_ticket *t_next; /* :4|8 */
+ struct xlog_ticket *t_next; /* :4|8 */
struct xlog_ticket *t_prev; /* :4|8 */
xlog_tid_t t_tid; /* transaction identifier : 4 */
int t_curr_res; /* current reservation in bytes : 4 */
@@ -324,6 +324,19 @@ typedef struct xlog_rec_ext_header {
* - ic_offset is the current number of bytes written to in this iclog.
* - ic_refcnt is bumped when someone is writing to the log.
* - ic_state is the state of the iclog.
+ *
+ * Because of cacheline contention on large machines, we need to separate
+ * various resources onto different cachelines. To start with, make the
+ * structure cacheline aligned. The following fields can be contended on
+ * by independent processes:
+ *
+ * - ic_callback_*
+ * - ic_refcnt
+ * - fields protected by the global l_icloglock
+ *
+ * so we need to ensure that these fields are located in separate cachelines.
+ * We'll put all the read-only and l_icloglock fields in the first cacheline,
+ * and move everything else out to subsequent cachelines.
*/
typedef struct xlog_iclog_fields {
sv_t ic_forcesema;
@@ -332,17 +345,22 @@ typedef struct xlog_iclog_fields {
struct xlog_in_core *ic_prev;
struct xfs_buf *ic_bp;
struct log *ic_log;
- xfs_log_callback_t *ic_callback;
- xfs_log_callback_t **ic_callback_tail;
-#ifdef XFS_LOG_TRACE
- struct ktrace *ic_trace;
-#endif
int ic_size;
int ic_offset;
- int ic_refcnt;
int ic_bwritecnt;
ushort_t ic_state;
char *ic_datap; /* pointer to iclog data */
+#ifdef XFS_LOG_TRACE
+ struct ktrace *ic_trace;
+#endif
+
+ /* Callback structures need their own cacheline */
+ spinlock_t ic_callback_lock ____cacheline_aligned_in_smp;
+ xfs_log_callback_t *ic_callback;
+ xfs_log_callback_t **ic_callback_tail;
+
+ /* reference counts need their own cacheline */
+ atomic_t ic_refcnt ____cacheline_aligned_in_smp;
} xlog_iclog_fields_t;
typedef union xlog_in_core2 {
@@ -366,6 +384,7 @@ typedef struct xlog_in_core {
#define ic_bp hic_fields.ic_bp
#define ic_log hic_fields.ic_log
#define ic_callback hic_fields.ic_callback
+#define ic_callback_lock hic_fields.ic_callback_lock
#define ic_callback_tail hic_fields.ic_callback_tail
#define ic_trace hic_fields.ic_trace
#define ic_size hic_fields.ic_size
@@ -383,43 +402,46 @@ typedef struct xlog_in_core {
* that round off problems won't occur when releasing partial reservations.
*/
typedef struct log {
+ /* The following fields don't need locking */
+ struct xfs_mount *l_mp; /* mount point */
+ struct xfs_buf *l_xbuf; /* extra buffer for log
+ * wrapping */
+ struct xfs_buftarg *l_targ; /* buftarg of log */
+ uint l_flags;
+ uint l_quotaoffs_flag; /* XFS_DQ_*, for QUOTAOFFs */
+ struct xfs_buf_cancel **l_buf_cancel_table;
+ int l_iclog_hsize; /* size of iclog header */
+ int l_iclog_heads; /* # of iclog header sectors */
+ uint l_sectbb_log; /* log2 of sector size in BBs */
+ uint l_sectbb_mask; /* sector size (in BBs)
+ * alignment mask */
+ int l_iclog_size; /* size of log in bytes */
+ int l_iclog_size_log; /* log power size of log */
+ int l_iclog_bufs; /* number of iclog buffers */
+ xfs_daddr_t l_logBBstart; /* start block of log */
+ int l_logsize; /* size of log in bytes */
+ int l_logBBsize; /* size of log in BB chunks */
+
/* The following block of fields are changed while holding icloglock */
- sema_t l_flushsema; /* iclog flushing semaphore */
+ sema_t l_flushsema ____cacheline_aligned_in_smp;
+ /* iclog flushing semaphore */
int l_flushcnt; /* # of procs waiting on this
* sema */
- int l_ticket_cnt; /* free ticket count */
- int l_ticket_tcnt; /* total ticket count */
int l_covered_state;/* state of "covering disk
* log entries" */
- xlog_ticket_t *l_freelist; /* free list of tickets */
- xlog_ticket_t *l_unmount_free;/* kmem_free these addresses */
- xlog_ticket_t *l_tail; /* free list of tickets */
xlog_in_core_t *l_iclog; /* head log queue */
spinlock_t l_icloglock; /* grab to change iclog state */
xfs_lsn_t l_tail_lsn; /* lsn of 1st LR with unflushed
* buffers */
xfs_lsn_t l_last_sync_lsn;/* lsn of last LR on disk */
- struct xfs_mount *l_mp; /* mount point */
- struct xfs_buf *l_xbuf; /* extra buffer for log
- * wrapping */
- struct xfs_buftarg *l_targ; /* buftarg of log */
- xfs_daddr_t l_logBBstart; /* start block of log */
- int l_logsize; /* size of log in bytes */
- int l_logBBsize; /* size of log in BB chunks */
int l_curr_cycle; /* Cycle number of log writes */
int l_prev_cycle; /* Cycle number before last
* block increment */
int l_curr_block; /* current logical log block */
int l_prev_block; /* previous logical log block */
- int l_iclog_size; /* size of log in bytes */
- int l_iclog_size_log; /* log power size of log */
- int l_iclog_bufs; /* number of iclog buffers */
-
- /* The following field are used for debugging; need to hold icloglock */
- char *l_iclog_bak[XLOG_MAX_ICLOGS];
/* The following block of fields are changed while holding grant_lock */
- spinlock_t l_grant_lock;
+ spinlock_t l_grant_lock ____cacheline_aligned_in_smp;
xlog_ticket_t *l_reserve_headq;
xlog_ticket_t *l_write_headq;
int l_grant_reserve_cycle;
@@ -427,19 +449,16 @@ typedef struct log {
int l_grant_write_cycle;
int l_grant_write_bytes;
- /* The following fields don't need locking */
#ifdef XFS_LOG_TRACE
struct ktrace *l_trace;
struct ktrace *l_grant_trace;
#endif
- uint l_flags;
- uint l_quotaoffs_flag; /* XFS_DQ_*, for QUOTAOFFs */
- struct xfs_buf_cancel **l_buf_cancel_table;
- int l_iclog_hsize; /* size of iclog header */
- int l_iclog_heads; /* # of iclog header sectors */
- uint l_sectbb_log; /* log2 of sector size in BBs */
- uint l_sectbb_mask; /* sector size (in BBs)
- * alignment mask */
+
+ /* The following field are used for debugging; need to hold icloglock */
+#ifdef DEBUG
+ char *l_iclog_bak[XLOG_MAX_ICLOGS];
+#endif
+
} xlog_t;
#define XLOG_FORCED_SHUTDOWN(log) ((log)->l_flags & XLOG_IO_ERROR)
@@ -459,6 +478,8 @@ extern struct xfs_buf *xlog_get_bp(xlog_t *, int);
extern void xlog_put_bp(struct xfs_buf *);
extern int xlog_bread(xlog_t *, xfs_daddr_t, int, struct xfs_buf *);
+extern kmem_zone_t *xfs_log_ticket_zone;
+
/* iclog tracing */
#define XLOG_TRACE_GRAB_FLUSH 1
#define XLOG_TRACE_REL_FLUSH 2
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index b2b70eba282..e65ab4af095 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -46,6 +46,7 @@
#include "xfs_trans_priv.h"
#include "xfs_quota.h"
#include "xfs_rw.h"
+#include "xfs_utils.h"
STATIC int xlog_find_zeroed(xlog_t *, xfs_daddr_t *);
STATIC int xlog_clear_stale_blocks(xlog_t *, xfs_lsn_t);
@@ -120,7 +121,8 @@ xlog_bread(
XFS_BUF_SET_TARGET(bp, log->l_mp->m_logdev_targp);
xfsbdstrat(log->l_mp, bp);
- if ((error = xfs_iowait(bp)))
+ error = xfs_iowait(bp);
+ if (error)
xfs_ioerror_alert("xlog_bread", log->l_mp,
bp, XFS_BUF_ADDR(bp));
return error;
@@ -191,7 +193,7 @@ xlog_header_check_dump(
{
int b;
- cmn_err(CE_DEBUG, "%s: SB : uuid = ", __FUNCTION__);
+ cmn_err(CE_DEBUG, "%s: SB : uuid = ", __func__);
for (b = 0; b < 16; b++)
cmn_err(CE_DEBUG, "%02x", ((uchar_t *)&mp->m_sb.sb_uuid)[b]);
cmn_err(CE_DEBUG, ", fmt = %d\n", XLOG_FMT);
@@ -1160,10 +1162,14 @@ xlog_write_log_records(
if (j == 0 && (start_block + endcount > ealign)) {
offset = XFS_BUF_PTR(bp);
balign = BBTOB(ealign - start_block);
- XFS_BUF_SET_PTR(bp, offset + balign, BBTOB(sectbb));
- if ((error = xlog_bread(log, ealign, sectbb, bp)))
+ error = XFS_BUF_SET_PTR(bp, offset + balign,
+ BBTOB(sectbb));
+ if (!error)
+ error = xlog_bread(log, ealign, sectbb, bp);
+ if (!error)
+ error = XFS_BUF_SET_PTR(bp, offset, bufblks);
+ if (error)
break;
- XFS_BUF_SET_PTR(bp, offset, bufblks);
}
offset = xlog_align(log, start_block, endcount, bp);
@@ -2280,7 +2286,9 @@ xlog_recover_do_inode_trans(
* invalidate the buffer when we write it out below.
*/
imap.im_blkno = 0;
- xfs_imap(log->l_mp, NULL, ino, &imap, 0);
+ error = xfs_imap(log->l_mp, NULL, ino, &imap, 0);
+ if (error)
+ goto error;
}
/*
@@ -2964,7 +2972,7 @@ xlog_recover_process_data(
* Process an extent free intent item that was recovered from
* the log. We need to free the extents that it describes.
*/
-STATIC void
+STATIC int
xlog_recover_process_efi(
xfs_mount_t *mp,
xfs_efi_log_item_t *efip)
@@ -2972,6 +2980,7 @@ xlog_recover_process_efi(
xfs_efd_log_item_t *efdp;
xfs_trans_t *tp;
int i;
+ int error = 0;
xfs_extent_t *extp;
xfs_fsblock_t startblock_fsb;
@@ -2995,23 +3004,32 @@ xlog_recover_process_efi(
* free the memory associated with it.
*/
xfs_efi_release(efip, efip->efi_format.efi_nextents);
- return;
+ return XFS_ERROR(EIO);
}
}
tp = xfs_trans_alloc(mp, 0);
- xfs_trans_reserve(tp, 0, XFS_ITRUNCATE_LOG_RES(mp), 0, 0, 0);
+ error = xfs_trans_reserve(tp, 0, XFS_ITRUNCATE_LOG_RES(mp), 0, 0, 0);
+ if (error)
+ goto abort_error;
efdp = xfs_trans_get_efd(tp, efip, efip->efi_format.efi_nextents);
for (i = 0; i < efip->efi_format.efi_nextents; i++) {
extp = &(efip->efi_format.efi_extents[i]);
- xfs_free_extent(tp, extp->ext_start, extp->ext_len);
+ error = xfs_free_extent(tp, extp->ext_start, extp->ext_len);
+ if (error)
+ goto abort_error;
xfs_trans_log_efd_extent(tp, efdp, extp->ext_start,
extp->ext_len);
}
efip->efi_flags |= XFS_EFI_RECOVERED;
- xfs_trans_commit(tp, 0);
+ error = xfs_trans_commit(tp, 0);
+ return error;
+
+abort_error:
+ xfs_trans_cancel(tp, XFS_TRANS_ABORT);
+ return error;
}
/*
@@ -3059,7 +3077,7 @@ xlog_recover_check_ail(
* everything already in the AIL, we stop processing as soon as
* we see something other than an EFI in the AIL.
*/
-STATIC void
+STATIC int
xlog_recover_process_efis(
xlog_t *log)
{
@@ -3067,6 +3085,7 @@ xlog_recover_process_efis(
xfs_efi_log_item_t *efip;
int gen;
xfs_mount_t *mp;
+ int error = 0;
mp = log->l_mp;
spin_lock(&mp->m_ail_lock);
@@ -3091,11 +3110,14 @@ xlog_recover_process_efis(
}
spin_unlock(&mp->m_ail_lock);
- xlog_recover_process_efi(mp, efip);
+ error = xlog_recover_process_efi(mp, efip);
+ if (error)
+ return error;
spin_lock(&mp->m_ail_lock);
lip = xfs_trans_next_ail(mp, lip, &gen, NULL);
}
spin_unlock(&mp->m_ail_lock);
+ return error;
}
/*
@@ -3115,21 +3137,18 @@ xlog_recover_clear_agi_bucket(
int error;
tp = xfs_trans_alloc(mp, XFS_TRANS_CLEAR_AGI_BUCKET);
- xfs_trans_reserve(tp, 0, XFS_CLEAR_AGI_BUCKET_LOG_RES(mp), 0, 0, 0);
-
- error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
+ error = xfs_trans_reserve(tp, 0, XFS_CLEAR_AGI_BUCKET_LOG_RES(mp), 0, 0, 0);
+ if (!error)
+ error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR(mp)),
XFS_FSS_TO_BB(mp, 1), 0, &agibp);
- if (error) {
- xfs_trans_cancel(tp, XFS_TRANS_ABORT);
- return;
- }
+ if (error)
+ goto out_abort;
+ error = EINVAL;
agi = XFS_BUF_TO_AGI(agibp);
- if (be32_to_cpu(agi->agi_magicnum) != XFS_AGI_MAGIC) {
- xfs_trans_cancel(tp, XFS_TRANS_ABORT);
- return;
- }
+ if (be32_to_cpu(agi->agi_magicnum) != XFS_AGI_MAGIC)
+ goto out_abort;
agi->agi_unlinked[bucket] = cpu_to_be32(NULLAGINO);
offset = offsetof(xfs_agi_t, agi_unlinked) +
@@ -3137,7 +3156,17 @@ xlog_recover_clear_agi_bucket(
xfs_trans_log_buf(tp, agibp, offset,
(offset + sizeof(xfs_agino_t) - 1));
- (void) xfs_trans_commit(tp, 0);
+ error = xfs_trans_commit(tp, 0);
+ if (error)
+ goto out_error;
+ return;
+
+out_abort:
+ xfs_trans_cancel(tp, XFS_TRANS_ABORT);
+out_error:
+ xfs_fs_cmn_err(CE_WARN, mp, "xlog_recover_clear_agi_bucket: "
+ "failed to clear agi %d. Continuing.", agno);
+ return;
}
/*
@@ -3214,7 +3243,8 @@ xlog_recover_process_iunlinks(
* next inode in the bucket.
*/
error = xfs_itobp(mp, NULL, ip, &dip,
- &ibp, 0, 0);
+ &ibp, 0, 0,
+ XFS_BUF_LOCK);
ASSERT(error || (dip != NULL));
}
@@ -3247,7 +3277,7 @@ xlog_recover_process_iunlinks(
if (ip->i_d.di_mode == 0)
xfs_iput_new(ip, 0);
else
- VN_RELE(XFS_ITOV(ip));
+ IRELE(ip);
} else {
/*
* We can't read in the inode
@@ -3445,7 +3475,7 @@ xlog_valid_rec_header(
(!rhead->h_version ||
(be32_to_cpu(rhead->h_version) & (~XLOG_VERSION_OKBITS))))) {
xlog_warn("XFS: %s: unrecognised log version (%d).",
- __FUNCTION__, be32_to_cpu(rhead->h_version));
+ __func__, be32_to_cpu(rhead->h_version));
return XFS_ERROR(EIO);
}
@@ -3604,15 +3634,19 @@ xlog_do_recovery_pass(
* _first_, then the log start (LR header end)
* - order is important.
*/
+ wrapped_hblks = hblks - split_hblks;
bufaddr = XFS_BUF_PTR(hbp);
- XFS_BUF_SET_PTR(hbp,
+ error = XFS_BUF_SET_PTR(hbp,
bufaddr + BBTOB(split_hblks),
BBTOB(hblks - split_hblks));
- wrapped_hblks = hblks - split_hblks;
- error = xlog_bread(log, 0, wrapped_hblks, hbp);
+ if (!error)
+ error = xlog_bread(log, 0,
+ wrapped_hblks, hbp);
+ if (!error)
+ error = XFS_BUF_SET_PTR(hbp, bufaddr,
+ BBTOB(hblks));
if (error)
goto bread_err2;
- XFS_BUF_SET_PTR(hbp, bufaddr, BBTOB(hblks));
if (!offset)
offset = xlog_align(log, 0,
wrapped_hblks, hbp);
@@ -3664,13 +3698,18 @@ xlog_do_recovery_pass(
* - order is important.
*/
bufaddr = XFS_BUF_PTR(dbp);
- XFS_BUF_SET_PTR(dbp,
+ error = XFS_BUF_SET_PTR(dbp,
bufaddr + BBTOB(split_bblks),
BBTOB(bblks - split_bblks));
- if ((error = xlog_bread(log, wrapped_hblks,
- bblks - split_bblks, dbp)))
+ if (!error)
+ error = xlog_bread(log, wrapped_hblks,
+ bblks - split_bblks,
+ dbp);
+ if (!error)
+ error = XFS_BUF_SET_PTR(dbp, bufaddr,
+ h_size);
+ if (error)
goto bread_err2;
- XFS_BUF_SET_PTR(dbp, bufaddr, h_size);
if (!offset)
offset = xlog_align(log, wrapped_hblks,
bblks - split_bblks, dbp);
@@ -3826,7 +3865,8 @@ xlog_do_recover(
XFS_BUF_READ(bp);
XFS_BUF_UNASYNC(bp);
xfsbdstrat(log->l_mp, bp);
- if ((error = xfs_iowait(bp))) {
+ error = xfs_iowait(bp);
+ if (error) {
xfs_ioerror_alert("xlog_do_recover",
log->l_mp, bp, XFS_BUF_ADDR(bp));
ASSERT(0);
@@ -3917,7 +3957,14 @@ xlog_recover_finish(
* rather than accepting new requests.
*/
if (log->l_flags & XLOG_RECOVERY_NEEDED) {
- xlog_recover_process_efis(log);
+ int error;
+ error = xlog_recover_process_efis(log);
+ if (error) {
+ cmn_err(CE_ALERT,
+ "Failed to recover EFIs on filesystem: %s",
+ log->l_mp->m_fsname);
+ return error;
+ }
/*
* Sync the log to get all the EFIs out of the AIL.
* This isn't absolutely necessary, but it helps in
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index 8ed164eb954..2fec452afbc 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -43,8 +43,9 @@
#include "xfs_rw.h"
#include "xfs_quota.h"
#include "xfs_fsops.h"
+#include "xfs_utils.h"
-STATIC void xfs_mount_log_sb(xfs_mount_t *, __int64_t);
+STATIC int xfs_mount_log_sb(xfs_mount_t *, __int64_t);
STATIC int xfs_uuid_mount(xfs_mount_t *);
STATIC void xfs_uuid_unmount(xfs_mount_t *mp);
STATIC void xfs_unmountfs_wait(xfs_mount_t *);
@@ -57,7 +58,7 @@ STATIC void xfs_icsb_balance_counter(xfs_mount_t *, xfs_sb_field_t,
STATIC void xfs_icsb_sync_counters(xfs_mount_t *);
STATIC int xfs_icsb_modify_counters(xfs_mount_t *, xfs_sb_field_t,
int64_t, int);
-STATIC int xfs_icsb_disable_counter(xfs_mount_t *, xfs_sb_field_t);
+STATIC void xfs_icsb_disable_counter(xfs_mount_t *, xfs_sb_field_t);
#else
@@ -956,7 +957,6 @@ xfs_mountfs(
{
xfs_sb_t *sbp = &(mp->m_sb);
xfs_inode_t *rip;
- bhv_vnode_t *rvp = NULL;
__uint64_t resblks;
__int64_t update_flags = 0LL;
uint quotamount, quotaflags;
@@ -964,11 +964,6 @@ xfs_mountfs(
int uuid_mounted = 0;
int error = 0;
- if (mp->m_sb_bp == NULL) {
- error = xfs_readsb(mp, mfsi_flags);
- if (error)
- return error;
- }
xfs_mount_common(mp, sbp);
/*
@@ -1163,7 +1158,6 @@ xfs_mountfs(
}
ASSERT(rip != NULL);
- rvp = XFS_ITOV(rip);
if (unlikely((rip->i_d.di_mode & S_IFMT) != S_IFDIR)) {
cmn_err(CE_WARN, "XFS: corrupted root inode");
@@ -1195,8 +1189,13 @@ xfs_mountfs(
/*
* If fs is not mounted readonly, then update the superblock changes.
*/
- if (update_flags && !(mp->m_flags & XFS_MOUNT_RDONLY))
- xfs_mount_log_sb(mp, update_flags);
+ if (update_flags && !(mp->m_flags & XFS_MOUNT_RDONLY)) {
+ error = xfs_mount_log_sb(mp, update_flags);
+ if (error) {
+ cmn_err(CE_WARN, "XFS: failed to write sb changes");
+ goto error4;
+ }
+ }
/*
* Initialise the XFS quota management subsystem for this mount
@@ -1233,12 +1232,15 @@ xfs_mountfs(
*
* We default to 5% or 1024 fsbs of space reserved, whichever is smaller.
* This may drive us straight to ENOSPC on mount, but that implies
- * we were already there on the last unmount.
+ * we were already there on the last unmount. Warn if this occurs.
*/
resblks = mp->m_sb.sb_dblocks;
do_div(resblks, 20);
resblks = min_t(__uint64_t, resblks, 1024);
- xfs_reserve_blocks(mp, &resblks, NULL);
+ error = xfs_reserve_blocks(mp, &resblks, NULL);
+ if (error)
+ cmn_err(CE_WARN, "XFS: Unable to allocate reserve blocks. "
+ "Continuing without a reserve pool.");
return 0;
@@ -1246,7 +1248,7 @@ xfs_mountfs(
/*
* Free up the root inode.
*/
- VN_RELE(rvp);
+ IRELE(rip);
error3:
xfs_log_unmount_dealloc(mp);
error2:
@@ -1274,6 +1276,7 @@ int
xfs_unmountfs(xfs_mount_t *mp, struct cred *cr)
{
__uint64_t resblks;
+ int error = 0;
/*
* We can potentially deadlock here if we have an inode cluster
@@ -1317,9 +1320,15 @@ xfs_unmountfs(xfs_mount_t *mp, struct cred *cr)
* value does not matter....
*/
resblks = 0;
- xfs_reserve_blocks(mp, &resblks, NULL);
+ error = xfs_reserve_blocks(mp, &resblks, NULL);
+ if (error)
+ cmn_err(CE_WARN, "XFS: Unable to free reserved block pool. "
+ "Freespace may not be correct on next mount.");
- xfs_log_sbcount(mp, 1);
+ error = xfs_log_sbcount(mp, 1);
+ if (error)
+ cmn_err(CE_WARN, "XFS: Unable to update superblock counters. "
+ "Freespace may not be correct on next mount.");
xfs_unmountfs_writesb(mp);
xfs_unmountfs_wait(mp); /* wait for async bufs */
xfs_log_unmount(mp); /* Done! No more fs ops. */
@@ -1411,9 +1420,8 @@ xfs_log_sbcount(
xfs_mod_sb(tp, XFS_SB_IFREE | XFS_SB_ICOUNT | XFS_SB_FDBLOCKS);
if (sync)
xfs_trans_set_sync(tp);
- xfs_trans_commit(tp, 0);
-
- return 0;
+ error = xfs_trans_commit(tp, 0);
+ return error;
}
STATIC void
@@ -1462,7 +1470,6 @@ xfs_unmountfs_writesb(xfs_mount_t *mp)
XFS_BUF_UNASYNC(sbp);
ASSERT(XFS_BUF_TARGET(sbp) == mp->m_ddev_targp);
xfsbdstrat(mp, sbp);
- /* Nevermind errors we might get here. */
error = xfs_iowait(sbp);
if (error)
xfs_ioerror_alert("xfs_unmountfs_writesb",
@@ -1911,24 +1918,27 @@ xfs_uuid_unmount(
* be altered by the mount options, as well as any potential sb_features2
* fixup. Only the first superblock is updated.
*/
-STATIC void
+STATIC int
xfs_mount_log_sb(
xfs_mount_t *mp,
__int64_t fields)
{
xfs_trans_t *tp;
+ int error;
ASSERT(fields & (XFS_SB_UNIT | XFS_SB_WIDTH | XFS_SB_UUID |
XFS_SB_FEATURES2 | XFS_SB_BAD_FEATURES2));
tp = xfs_trans_alloc(mp, XFS_TRANS_SB_UNIT);
- if (xfs_trans_reserve(tp, 0, mp->m_sb.sb_sectsize + 128, 0, 0,
- XFS_DEFAULT_LOG_COUNT)) {
+ error = xfs_trans_reserve(tp, 0, mp->m_sb.sb_sectsize + 128, 0, 0,
+ XFS_DEFAULT_LOG_COUNT);
+ if (error) {
xfs_trans_cancel(tp, 0);
- return;
+ return error;
}
xfs_mod_sb(tp, fields);
- xfs_trans_commit(tp, 0);
+ error = xfs_trans_commit(tp, 0);
+ return error;
}
@@ -2189,7 +2199,7 @@ xfs_icsb_counter_disabled(
return test_bit(field, &mp->m_icsb_counters);
}
-STATIC int
+STATIC void
xfs_icsb_disable_counter(
xfs_mount_t *mp,
xfs_sb_field_t field)
@@ -2207,7 +2217,7 @@ xfs_icsb_disable_counter(
* the m_icsb_mutex.
*/
if (xfs_icsb_counter_disabled(mp, field))
- return 0;
+ return;
xfs_icsb_lock_all_counters(mp);
if (!test_and_set_bit(field, &mp->m_icsb_counters)) {
@@ -2230,8 +2240,6 @@ xfs_icsb_disable_counter(
}
xfs_icsb_unlock_all_counters(mp);
-
- return 0;
}
STATIC void
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index 1d8a4728d84..1ed575110ff 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -66,17 +66,17 @@ struct xfs_mru_cache;
* Prototypes and functions for the Data Migration subsystem.
*/
-typedef int (*xfs_send_data_t)(int, bhv_vnode_t *,
- xfs_off_t, size_t, int, bhv_vrwlock_t *);
+typedef int (*xfs_send_data_t)(int, struct xfs_inode *,
+ xfs_off_t, size_t, int, int *);
typedef int (*xfs_send_mmap_t)(struct vm_area_struct *, uint);
-typedef int (*xfs_send_destroy_t)(bhv_vnode_t *, dm_right_t);
+typedef int (*xfs_send_destroy_t)(struct xfs_inode *, dm_right_t);
typedef int (*xfs_send_namesp_t)(dm_eventtype_t, struct xfs_mount *,
- bhv_vnode_t *,
- dm_right_t, bhv_vnode_t *, dm_right_t,
- char *, char *, mode_t, int, int);
+ struct xfs_inode *, dm_right_t,
+ struct xfs_inode *, dm_right_t,
+ const char *, const char *, mode_t, int, int);
typedef int (*xfs_send_mount_t)(struct xfs_mount *, dm_right_t,
char *, char *);
-typedef void (*xfs_send_unmount_t)(struct xfs_mount *, bhv_vnode_t *,
+typedef void (*xfs_send_unmount_t)(struct xfs_mount *, struct xfs_inode *,
dm_right_t, mode_t, int, int);
typedef struct xfs_dmops {
@@ -88,20 +88,20 @@ typedef struct xfs_dmops {
xfs_send_unmount_t xfs_send_unmount;
} xfs_dmops_t;
-#define XFS_SEND_DATA(mp, ev,vp,off,len,fl,lock) \
- (*(mp)->m_dm_ops->xfs_send_data)(ev,vp,off,len,fl,lock)
+#define XFS_SEND_DATA(mp, ev,ip,off,len,fl,lock) \
+ (*(mp)->m_dm_ops->xfs_send_data)(ev,ip,off,len,fl,lock)
#define XFS_SEND_MMAP(mp, vma,fl) \
(*(mp)->m_dm_ops->xfs_send_mmap)(vma,fl)
-#define XFS_SEND_DESTROY(mp, vp,right) \
- (*(mp)->m_dm_ops->xfs_send_destroy)(vp,right)
+#define XFS_SEND_DESTROY(mp, ip,right) \
+ (*(mp)->m_dm_ops->xfs_send_destroy)(ip,right)
#define XFS_SEND_NAMESP(mp, ev,b1,r1,b2,r2,n1,n2,mode,rval,fl) \
(*(mp)->m_dm_ops->xfs_send_namesp)(ev,NULL,b1,r1,b2,r2,n1,n2,mode,rval,fl)
#define XFS_SEND_PREUNMOUNT(mp,b1,r1,b2,r2,n1,n2,mode,rval,fl) \
(*(mp)->m_dm_ops->xfs_send_namesp)(DM_EVENT_PREUNMOUNT,mp,b1,r1,b2,r2,n1,n2,mode,rval,fl)
#define XFS_SEND_MOUNT(mp,right,path,name) \
(*(mp)->m_dm_ops->xfs_send_mount)(mp,right,path,name)
-#define XFS_SEND_UNMOUNT(mp, vp,right,mode,rval,fl) \
- (*(mp)->m_dm_ops->xfs_send_unmount)(mp,vp,right,mode,rval,fl)
+#define XFS_SEND_UNMOUNT(mp, ip,right,mode,rval,fl) \
+ (*(mp)->m_dm_ops->xfs_send_unmount)(mp,ip,right,mode,rval,fl)
/*
@@ -220,7 +220,7 @@ extern void xfs_icsb_sync_counters_flags(struct xfs_mount *, int);
#endif
typedef struct xfs_ail {
- xfs_ail_entry_t xa_ail;
+ struct list_head xa_ail;
uint xa_gen;
struct task_struct *xa_task;
xfs_lsn_t xa_target;
@@ -401,7 +401,7 @@ typedef struct xfs_mount {
/*
* Allow large block sizes to be reported to userspace programs if the
- * "largeio" mount option is used.
+ * "largeio" mount option is used.
*
* If compatibility mode is specified, simply return the basic unit of caching
* so that we don't get inefficient read/modify/write I/O from user apps.
diff --git a/fs/xfs/xfs_rename.c b/fs/xfs/xfs_rename.c
index 7eb157a59f9..ee371890d85 100644
--- a/fs/xfs/xfs_rename.c
+++ b/fs/xfs/xfs_rename.c
@@ -36,7 +36,6 @@
#include "xfs_bmap.h"
#include "xfs_error.h"
#include "xfs_quota.h"
-#include "xfs_refcache.h"
#include "xfs_utils.h"
#include "xfs_trans_space.h"
#include "xfs_vnodeops.h"
@@ -84,25 +83,23 @@ int xfs_rename_skip, xfs_rename_nskip;
*/
STATIC int
xfs_lock_for_rename(
- xfs_inode_t *dp1, /* old (source) directory inode */
- xfs_inode_t *dp2, /* new (target) directory inode */
- bhv_vname_t *vname1,/* old entry name */
- bhv_vname_t *vname2,/* new entry name */
- xfs_inode_t **ipp1, /* inode of old entry */
- xfs_inode_t **ipp2, /* inode of new entry, if it
+ xfs_inode_t *dp1, /* in: old (source) directory inode */
+ xfs_inode_t *dp2, /* in: new (target) directory inode */
+ xfs_inode_t *ip1, /* in: inode of old entry */
+ struct xfs_name *name2, /* in: new entry name */
+ xfs_inode_t **ipp2, /* out: inode of new entry, if it
already exists, NULL otherwise. */
- xfs_inode_t **i_tab,/* array of inode returned, sorted */
- int *num_inodes) /* number of inodes in array */
+ xfs_inode_t **i_tab,/* out: array of inode returned, sorted */
+ int *num_inodes) /* out: number of inodes in array */
{
- xfs_inode_t *ip1, *ip2, *temp;
+ xfs_inode_t *ip2 = NULL;
+ xfs_inode_t *temp;
xfs_ino_t inum1, inum2;
int error;
int i, j;
uint lock_mode;
int diff_dirs = (dp1 != dp2);
- ip2 = NULL;
-
/*
* First, find out the current inums of the entries so that we
* can determine the initial locking order. We'll have to
@@ -110,27 +107,20 @@ xfs_lock_for_rename(
* to see if we still have the right inodes, directories, etc.
*/
lock_mode = xfs_ilock_map_shared(dp1);
- error = xfs_get_dir_entry(vname1, &ip1);
- if (error) {
- xfs_iunlock_map_shared(dp1, lock_mode);
- return error;
- }
+ IHOLD(ip1);
+ xfs_itrace_ref(ip1);
inum1 = ip1->i_ino;
- ASSERT(ip1);
- xfs_itrace_ref(ip1);
-
/*
* Unlock dp1 and lock dp2 if they are different.
*/
-
if (diff_dirs) {
xfs_iunlock_map_shared(dp1, lock_mode);
lock_mode = xfs_ilock_map_shared(dp2);
}
- error = xfs_dir_lookup_int(dp2, lock_mode, vname2, &inum2, &ip2);
+ error = xfs_dir_lookup_int(dp2, lock_mode, name2, &inum2, &ip2);
if (error == ENOENT) { /* target does not need to exist. */
inum2 = 0;
} else if (error) {
@@ -162,6 +152,7 @@ xfs_lock_for_rename(
*num_inodes = 4;
i_tab[3] = ip2;
}
+ *ipp2 = i_tab[3];
/*
* Sort the elements via bubble sort. (Remember, there are at
@@ -199,21 +190,6 @@ xfs_lock_for_rename(
xfs_lock_inodes(i_tab, *num_inodes, 0, XFS_ILOCK_SHARED);
}
- /*
- * Set the return value. Null out any unused entries in i_tab.
- */
- *ipp1 = *ipp2 = NULL;
- for (i=0; i < *num_inodes; i++) {
- if (i_tab[i]->i_ino == inum1) {
- *ipp1 = i_tab[i];
- }
- if (i_tab[i]->i_ino == inum2) {
- *ipp2 = i_tab[i];
- }
- }
- for (;i < 4; i++) {
- i_tab[i] = NULL;
- }
return 0;
}
@@ -223,13 +199,13 @@ xfs_lock_for_rename(
int
xfs_rename(
xfs_inode_t *src_dp,
- bhv_vname_t *src_vname,
- bhv_vnode_t *target_dir_vp,
- bhv_vname_t *target_vname)
+ struct xfs_name *src_name,
+ xfs_inode_t *src_ip,
+ xfs_inode_t *target_dp,
+ struct xfs_name *target_name)
{
- bhv_vnode_t *src_dir_vp = XFS_ITOV(src_dp);
xfs_trans_t *tp;
- xfs_inode_t *target_dp, *src_ip, *target_ip;
+ xfs_inode_t *target_ip;
xfs_mount_t *mp = src_dp->i_mount;
int new_parent; /* moving to a new dir */
int src_is_directory; /* src_name is a directory */
@@ -243,29 +219,16 @@ xfs_rename(
int spaceres;
int target_link_zero = 0;
int num_inodes;
- char *src_name = VNAME(src_vname);
- char *target_name = VNAME(target_vname);
- int src_namelen = VNAMELEN(src_vname);
- int target_namelen = VNAMELEN(target_vname);
xfs_itrace_entry(src_dp);
- xfs_itrace_entry(xfs_vtoi(target_dir_vp));
-
- /*
- * Find the XFS behavior descriptor for the target directory
- * vnode since it was not handed to us.
- */
- target_dp = xfs_vtoi(target_dir_vp);
- if (target_dp == NULL) {
- return XFS_ERROR(EXDEV);
- }
+ xfs_itrace_entry(target_dp);
if (DM_EVENT_ENABLED(src_dp, DM_EVENT_RENAME) ||
DM_EVENT_ENABLED(target_dp, DM_EVENT_RENAME)) {
error = XFS_SEND_NAMESP(mp, DM_EVENT_RENAME,
- src_dir_vp, DM_RIGHT_NULL,
- target_dir_vp, DM_RIGHT_NULL,
- src_name, target_name,
+ src_dp, DM_RIGHT_NULL,
+ target_dp, DM_RIGHT_NULL,
+ src_name->name, target_name->name,
0, 0, 0);
if (error) {
return error;
@@ -282,10 +245,8 @@ xfs_rename(
* does not exist in the source directory.
*/
tp = NULL;
- error = xfs_lock_for_rename(src_dp, target_dp, src_vname,
- target_vname, &src_ip, &target_ip, inodes,
- &num_inodes);
-
+ error = xfs_lock_for_rename(src_dp, target_dp, src_ip, target_name,
+ &target_ip, inodes, &num_inodes);
if (error) {
/*
* We have nothing locked, no inode references, and
@@ -331,7 +292,7 @@ xfs_rename(
XFS_BMAP_INIT(&free_list, &first_block);
tp = xfs_trans_alloc(mp, XFS_TRANS_RENAME);
cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
- spaceres = XFS_RENAME_SPACE_RES(mp, target_namelen);
+ spaceres = XFS_RENAME_SPACE_RES(mp, target_name->len);
error = xfs_trans_reserve(tp, spaceres, XFS_RENAME_LOG_RES(mp), 0,
XFS_TRANS_PERM_LOG_RES, XFS_RENAME_LOG_COUNT);
if (error == ENOSPC) {
@@ -365,10 +326,10 @@ xfs_rename(
* them when they unlock the inodes. Also, we need to be careful
* not to add an inode to the transaction more than once.
*/
- VN_HOLD(src_dir_vp);
+ IHOLD(src_dp);
xfs_trans_ijoin(tp, src_dp, XFS_ILOCK_EXCL);
if (new_parent) {
- VN_HOLD(target_dir_vp);
+ IHOLD(target_dp);
xfs_trans_ijoin(tp, target_dp, XFS_ILOCK_EXCL);
}
if ((src_ip != src_dp) && (src_ip != target_dp)) {
@@ -389,9 +350,8 @@ xfs_rename(
* If there's no space reservation, check the entry will
* fit before actually inserting it.
*/
- if (spaceres == 0 &&
- (error = xfs_dir_canenter(tp, target_dp, target_name,
- target_namelen)))
+ error = xfs_dir_canenter(tp, target_dp, target_name, spaceres);
+ if (error)
goto error_return;
/*
* If target does not exist and the rename crosses
@@ -399,8 +359,8 @@ xfs_rename(
* to account for the ".." reference from the new entry.
*/
error = xfs_dir_createname(tp, target_dp, target_name,
- target_namelen, src_ip->i_ino,
- &first_block, &free_list, spaceres);
+ src_ip->i_ino, &first_block,
+ &free_list, spaceres);
if (error == ENOSPC)
goto error_return;
if (error)
@@ -439,7 +399,7 @@ xfs_rename(
* name at the destination directory, remove it first.
*/
error = xfs_dir_replace(tp, target_dp, target_name,
- target_namelen, src_ip->i_ino,
+ src_ip->i_ino,
&first_block, &free_list, spaceres);
if (error)
goto abort_return;
@@ -476,7 +436,8 @@ xfs_rename(
* Rewrite the ".." entry to point to the new
* directory.
*/
- error = xfs_dir_replace(tp, src_ip, "..", 2, target_dp->i_ino,
+ error = xfs_dir_replace(tp, src_ip, &xfs_name_dotdot,
+ target_dp->i_ino,
&first_block, &free_list, spaceres);
ASSERT(error != EEXIST);
if (error)
@@ -512,8 +473,8 @@ xfs_rename(
goto abort_return;
}
- error = xfs_dir_removename(tp, src_dp, src_name, src_namelen,
- src_ip->i_ino, &first_block, &free_list, spaceres);
+ error = xfs_dir_removename(tp, src_dp, src_name, src_ip->i_ino,
+ &first_block, &free_list, spaceres);
if (error)
goto abort_return;
xfs_ichgtime(src_dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
@@ -580,10 +541,8 @@ xfs_rename(
* the vnode references.
*/
error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
- if (target_ip != NULL) {
- xfs_refcache_purge_ip(target_ip);
+ if (target_ip != NULL)
IRELE(target_ip);
- }
/*
* Let interposed file systems know about removed links.
*/
@@ -598,9 +557,9 @@ std_return:
if (DM_EVENT_ENABLED(src_dp, DM_EVENT_POSTRENAME) ||
DM_EVENT_ENABLED(target_dp, DM_EVENT_POSTRENAME)) {
(void) XFS_SEND_NAMESP (mp, DM_EVENT_POSTRENAME,
- src_dir_vp, DM_RIGHT_NULL,
- target_dir_vp, DM_RIGHT_NULL,
- src_name, target_name,
+ src_dp, DM_RIGHT_NULL,
+ target_dp, DM_RIGHT_NULL,
+ src_name->name, target_name->name,
0, error, 0);
}
return error;
diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c
index 47082c01872..a0dc6e5bc5b 100644
--- a/fs/xfs/xfs_rtalloc.c
+++ b/fs/xfs/xfs_rtalloc.c
@@ -44,6 +44,7 @@
#include "xfs_rw.h"
#include "xfs_inode_item.h"
#include "xfs_trans_space.h"
+#include "xfs_utils.h"
/*
@@ -123,14 +124,14 @@ xfs_growfs_rt_alloc(
XFS_GROWRTALLOC_LOG_RES(mp), 0,
XFS_TRANS_PERM_LOG_RES,
XFS_DEFAULT_PERM_LOG_COUNT)))
- goto error_exit;
+ goto error_cancel;
cancelflags = XFS_TRANS_RELEASE_LOG_RES;
/*
* Lock the inode.
*/
if ((error = xfs_trans_iget(mp, tp, ino, 0,
XFS_ILOCK_EXCL, &ip)))
- goto error_exit;
+ goto error_cancel;
XFS_BMAP_INIT(&flist, &firstblock);
/*
* Allocate blocks to the bitmap file.
@@ -143,14 +144,16 @@ xfs_growfs_rt_alloc(
if (!error && nmap < 1)
error = XFS_ERROR(ENOSPC);
if (error)
- goto error_exit;
+ goto error_cancel;
/*
* Free any blocks freed up in the transaction, then commit.
*/
error = xfs_bmap_finish(&tp, &flist, &committed);
if (error)
- goto error_exit;
- xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
+ goto error_cancel;
+ error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
+ if (error)
+ goto error;
/*
* Now we need to clear the allocated blocks.
* Do this one block per transaction, to keep it simple.
@@ -165,13 +168,13 @@ xfs_growfs_rt_alloc(
*/
if ((error = xfs_trans_reserve(tp, 0,
XFS_GROWRTZERO_LOG_RES(mp), 0, 0, 0)))
- goto error_exit;
+ goto error_cancel;
/*
* Lock the bitmap inode.
*/
if ((error = xfs_trans_iget(mp, tp, ino, 0,
XFS_ILOCK_EXCL, &ip)))
- goto error_exit;
+ goto error_cancel;
/*
* Get a buffer for the block.
*/
@@ -180,14 +183,16 @@ xfs_growfs_rt_alloc(
mp->m_bsize, 0);
if (bp == NULL) {
error = XFS_ERROR(EIO);
- goto error_exit;
+ goto error_cancel;
}
memset(XFS_BUF_PTR(bp), 0, mp->m_sb.sb_blocksize);
xfs_trans_log_buf(tp, bp, 0, mp->m_sb.sb_blocksize - 1);
/*
* Commit the transaction.
*/
- xfs_trans_commit(tp, 0);
+ error = xfs_trans_commit(tp, 0);
+ if (error)
+ goto error;
}
/*
* Go on to the next extent, if any.
@@ -195,8 +200,9 @@ xfs_growfs_rt_alloc(
oblocks = map.br_startoff + map.br_blockcount;
}
return 0;
-error_exit:
+error_cancel:
xfs_trans_cancel(tp, cancelflags);
+error:
return error;
}
@@ -1875,6 +1881,7 @@ xfs_growfs_rt(
xfs_trans_t *tp; /* transaction pointer */
sbp = &mp->m_sb;
+ cancelflags = 0;
/*
* Initial error checking.
*/
@@ -2041,13 +2048,15 @@ xfs_growfs_rt(
*/
mp->m_rsumlevels = nrsumlevels;
mp->m_rsumsize = nrsumsize;
- /*
- * Commit the transaction.
- */
- xfs_trans_commit(tp, 0);
+
+ error = xfs_trans_commit(tp, 0);
+ if (error) {
+ tp = NULL;
+ break;
+ }
}
- if (error)
+ if (error && tp)
xfs_trans_cancel(tp, cancelflags);
/*
@@ -2278,7 +2287,7 @@ xfs_rtmount_inodes(
ASSERT(sbp->sb_rsumino != NULLFSINO);
error = xfs_iget(mp, NULL, sbp->sb_rsumino, 0, 0, &mp->m_rsumip, 0);
if (error) {
- VN_RELE(XFS_ITOV(mp->m_rbmip));
+ IRELE(mp->m_rbmip);
return error;
}
ASSERT(mp->m_rsumip != NULL);
diff --git a/fs/xfs/xfs_rw.c b/fs/xfs/xfs_rw.c
index cd3ece6cc91..b0f31c09a76 100644
--- a/fs/xfs/xfs_rw.c
+++ b/fs/xfs/xfs_rw.c
@@ -126,11 +126,11 @@ xfs_write_sync_logforce(
* when we return.
*/
if (iip && iip->ili_last_lsn) {
- xfs_log_force(mp, iip->ili_last_lsn,
- XFS_LOG_FORCE | XFS_LOG_SYNC);
+ error = _xfs_log_force(mp, iip->ili_last_lsn,
+ XFS_LOG_FORCE | XFS_LOG_SYNC, NULL);
} else if (xfs_ipincount(ip) > 0) {
- xfs_log_force(mp, (xfs_lsn_t)0,
- XFS_LOG_FORCE | XFS_LOG_SYNC);
+ error = _xfs_log_force(mp, (xfs_lsn_t)0,
+ XFS_LOG_FORCE | XFS_LOG_SYNC, NULL);
}
} else {
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
index 7f40628d85c..0804207c739 100644
--- a/fs/xfs/xfs_trans.h
+++ b/fs/xfs/xfs_trans.h
@@ -113,13 +113,8 @@ struct xfs_mount;
struct xfs_trans;
struct xfs_dquot_acct;
-typedef struct xfs_ail_entry {
- struct xfs_log_item *ail_forw; /* AIL forw pointer */
- struct xfs_log_item *ail_back; /* AIL back pointer */
-} xfs_ail_entry_t;
-
typedef struct xfs_log_item {
- xfs_ail_entry_t li_ail; /* AIL pointers */
+ struct list_head li_ail; /* AIL pointers */
xfs_lsn_t li_lsn; /* last on-disk lsn */
struct xfs_log_item_desc *li_desc; /* ptr to current desc*/
struct xfs_mount *li_mountp; /* ptr to fs mount */
@@ -341,7 +336,6 @@ typedef struct xfs_trans {
unsigned int t_rtx_res; /* # of rt extents resvd */
unsigned int t_rtx_res_used; /* # of resvd rt extents used */
xfs_log_ticket_t t_ticket; /* log mgr ticket */
- sema_t t_sema; /* sema for commit completion */
xfs_lsn_t t_lsn; /* log seq num of start of
* transaction. */
xfs_lsn_t t_commit_lsn; /* log seq num of end of
diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c
index 76d470d8a1e..1f77c00af56 100644
--- a/fs/xfs/xfs_trans_ail.c
+++ b/fs/xfs/xfs_trans_ail.c
@@ -28,13 +28,13 @@
#include "xfs_trans_priv.h"
#include "xfs_error.h"
-STATIC void xfs_ail_insert(xfs_ail_entry_t *, xfs_log_item_t *);
-STATIC xfs_log_item_t * xfs_ail_delete(xfs_ail_entry_t *, xfs_log_item_t *);
-STATIC xfs_log_item_t * xfs_ail_min(xfs_ail_entry_t *);
-STATIC xfs_log_item_t * xfs_ail_next(xfs_ail_entry_t *, xfs_log_item_t *);
+STATIC void xfs_ail_insert(xfs_ail_t *, xfs_log_item_t *);
+STATIC xfs_log_item_t * xfs_ail_delete(xfs_ail_t *, xfs_log_item_t *);
+STATIC xfs_log_item_t * xfs_ail_min(xfs_ail_t *);
+STATIC xfs_log_item_t * xfs_ail_next(xfs_ail_t *, xfs_log_item_t *);
#ifdef DEBUG
-STATIC void xfs_ail_check(xfs_ail_entry_t *, xfs_log_item_t *);
+STATIC void xfs_ail_check(xfs_ail_t *, xfs_log_item_t *);
#else
#define xfs_ail_check(a,l)
#endif /* DEBUG */
@@ -57,7 +57,7 @@ xfs_trans_tail_ail(
xfs_log_item_t *lip;
spin_lock(&mp->m_ail_lock);
- lip = xfs_ail_min(&(mp->m_ail.xa_ail));
+ lip = xfs_ail_min(&mp->m_ail);
if (lip == NULL) {
lsn = (xfs_lsn_t)0;
} else {
@@ -91,7 +91,7 @@ xfs_trans_push_ail(
{
xfs_log_item_t *lip;
- lip = xfs_ail_min(&mp->m_ail.xa_ail);
+ lip = xfs_ail_min(&mp->m_ail);
if (lip && !XFS_FORCED_SHUTDOWN(mp)) {
if (XFS_LSN_CMP(threshold_lsn, mp->m_ail.xa_target) > 0)
xfsaild_wakeup(mp, threshold_lsn);
@@ -111,15 +111,17 @@ xfs_trans_first_push_ail(
{
xfs_log_item_t *lip;
- lip = xfs_ail_min(&(mp->m_ail.xa_ail));
+ lip = xfs_ail_min(&mp->m_ail);
*gen = (int)mp->m_ail.xa_gen;
if (lsn == 0)
return lip;
- while (lip && (XFS_LSN_CMP(lip->li_lsn, lsn) < 0))
- lip = lip->li_ail.ail_forw;
+ list_for_each_entry(lip, &mp->m_ail.xa_ail, li_ail) {
+ if (XFS_LSN_CMP(lip->li_lsn, lsn) >= 0)
+ return lip;
+ }
- return lip;
+ return NULL;
}
/*
@@ -329,7 +331,7 @@ xfs_trans_unlocked_item(
* the call to xfs_log_move_tail() doesn't do anything if there's
* not enough free space to wake people up so we're safe calling it.
*/
- min_lip = xfs_ail_min(&mp->m_ail.xa_ail);
+ min_lip = xfs_ail_min(&mp->m_ail);
if (min_lip == lip)
xfs_log_move_tail(mp, 1);
@@ -357,15 +359,13 @@ xfs_trans_update_ail(
xfs_log_item_t *lip,
xfs_lsn_t lsn) __releases(mp->m_ail_lock)
{
- xfs_ail_entry_t *ailp;
xfs_log_item_t *dlip=NULL;
xfs_log_item_t *mlip; /* ptr to minimum lip */
- ailp = &(mp->m_ail.xa_ail);
- mlip = xfs_ail_min(ailp);
+ mlip = xfs_ail_min(&mp->m_ail);
if (lip->li_flags & XFS_LI_IN_AIL) {
- dlip = xfs_ail_delete(ailp, lip);
+ dlip = xfs_ail_delete(&mp->m_ail, lip);
ASSERT(dlip == lip);
} else {
lip->li_flags |= XFS_LI_IN_AIL;
@@ -373,11 +373,11 @@ xfs_trans_update_ail(
lip->li_lsn = lsn;
- xfs_ail_insert(ailp, lip);
+ xfs_ail_insert(&mp->m_ail, lip);
mp->m_ail.xa_gen++;
if (mlip == dlip) {
- mlip = xfs_ail_min(&(mp->m_ail.xa_ail));
+ mlip = xfs_ail_min(&mp->m_ail);
spin_unlock(&mp->m_ail_lock);
xfs_log_move_tail(mp, mlip->li_lsn);
} else {
@@ -407,14 +407,12 @@ xfs_trans_delete_ail(
xfs_mount_t *mp,
xfs_log_item_t *lip) __releases(mp->m_ail_lock)
{
- xfs_ail_entry_t *ailp;
xfs_log_item_t *dlip;
xfs_log_item_t *mlip;
if (lip->li_flags & XFS_LI_IN_AIL) {
- ailp = &(mp->m_ail.xa_ail);
- mlip = xfs_ail_min(ailp);
- dlip = xfs_ail_delete(ailp, lip);
+ mlip = xfs_ail_min(&mp->m_ail);
+ dlip = xfs_ail_delete(&mp->m_ail, lip);
ASSERT(dlip == lip);
@@ -423,7 +421,7 @@ xfs_trans_delete_ail(
mp->m_ail.xa_gen++;
if (mlip == dlip) {
- mlip = xfs_ail_min(&(mp->m_ail.xa_ail));
+ mlip = xfs_ail_min(&mp->m_ail);
spin_unlock(&mp->m_ail_lock);
xfs_log_move_tail(mp, (mlip ? mlip->li_lsn : 0));
} else {
@@ -440,7 +438,7 @@ xfs_trans_delete_ail(
else {
xfs_cmn_err(XFS_PTAG_AILDELETE, CE_ALERT, mp,
"%s: attempting to delete a log item that is not in the AIL",
- __FUNCTION__);
+ __func__);
spin_unlock(&mp->m_ail_lock);
xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
}
@@ -461,7 +459,7 @@ xfs_trans_first_ail(
{
xfs_log_item_t *lip;
- lip = xfs_ail_min(&(mp->m_ail.xa_ail));
+ lip = xfs_ail_min(&mp->m_ail);
*gen = (int)mp->m_ail.xa_gen;
return lip;
@@ -485,9 +483,9 @@ xfs_trans_next_ail(
ASSERT(mp && lip && gen);
if (mp->m_ail.xa_gen == *gen) {
- nlip = xfs_ail_next(&(mp->m_ail.xa_ail), lip);
+ nlip = xfs_ail_next(&mp->m_ail, lip);
} else {
- nlip = xfs_ail_min(&(mp->m_ail).xa_ail);
+ nlip = xfs_ail_min(&mp->m_ail);
*gen = (int)mp->m_ail.xa_gen;
if (restarts != NULL) {
XFS_STATS_INC(xs_push_ail_restarts);
@@ -517,8 +515,7 @@ int
xfs_trans_ail_init(
xfs_mount_t *mp)
{
- mp->m_ail.xa_ail.ail_forw = (xfs_log_item_t*)&mp->m_ail.xa_ail;
- mp->m_ail.xa_ail.ail_back = (xfs_log_item_t*)&mp->m_ail.xa_ail;
+ INIT_LIST_HEAD(&mp->m_ail.xa_ail);
return xfsaild_start(mp);
}
@@ -537,7 +534,7 @@ xfs_trans_ail_destroy(
*/
STATIC void
xfs_ail_insert(
- xfs_ail_entry_t *base,
+ xfs_ail_t *ailp,
xfs_log_item_t *lip)
/* ARGSUSED */
{
@@ -546,27 +543,22 @@ xfs_ail_insert(
/*
* If the list is empty, just insert the item.
*/
- if (base->ail_back == (xfs_log_item_t*)base) {
- base->ail_forw = lip;
- base->ail_back = lip;
- lip->li_ail.ail_forw = (xfs_log_item_t*)base;
- lip->li_ail.ail_back = (xfs_log_item_t*)base;
+ if (list_empty(&ailp->xa_ail)) {
+ list_add(&lip->li_ail, &ailp->xa_ail);
return;
}
- next_lip = base->ail_back;
- while ((next_lip != (xfs_log_item_t*)base) &&
- (XFS_LSN_CMP(next_lip->li_lsn, lip->li_lsn) > 0)) {
- next_lip = next_lip->li_ail.ail_back;
+ list_for_each_entry_reverse(next_lip, &ailp->xa_ail, li_ail) {
+ if (XFS_LSN_CMP(next_lip->li_lsn, lip->li_lsn) <= 0)
+ break;
}
- ASSERT((next_lip == (xfs_log_item_t*)base) ||
+
+ ASSERT((&next_lip->li_ail == &ailp->xa_ail) ||
(XFS_LSN_CMP(next_lip->li_lsn, lip->li_lsn) <= 0));
- lip->li_ail.ail_forw = next_lip->li_ail.ail_forw;
- lip->li_ail.ail_back = next_lip;
- next_lip->li_ail.ail_forw = lip;
- lip->li_ail.ail_forw->li_ail.ail_back = lip;
- xfs_ail_check(base, lip);
+ list_add(&lip->li_ail, &next_lip->li_ail);
+
+ xfs_ail_check(ailp, lip);
return;
}
@@ -576,15 +568,13 @@ xfs_ail_insert(
/*ARGSUSED*/
STATIC xfs_log_item_t *
xfs_ail_delete(
- xfs_ail_entry_t *base,
+ xfs_ail_t *ailp,
xfs_log_item_t *lip)
/* ARGSUSED */
{
- xfs_ail_check(base, lip);
- lip->li_ail.ail_forw->li_ail.ail_back = lip->li_ail.ail_back;
- lip->li_ail.ail_back->li_ail.ail_forw = lip->li_ail.ail_forw;
- lip->li_ail.ail_forw = NULL;
- lip->li_ail.ail_back = NULL;
+ xfs_ail_check(ailp, lip);
+
+ list_del(&lip->li_ail);
return lip;
}
@@ -595,14 +585,13 @@ xfs_ail_delete(
*/
STATIC xfs_log_item_t *
xfs_ail_min(
- xfs_ail_entry_t *base)
+ xfs_ail_t *ailp)
/* ARGSUSED */
{
- register xfs_log_item_t *forw = base->ail_forw;
- if (forw == (xfs_log_item_t*)base) {
+ if (list_empty(&ailp->xa_ail))
return NULL;
- }
- return forw;
+
+ return list_first_entry(&ailp->xa_ail, xfs_log_item_t, li_ail);
}
/*
@@ -612,15 +601,14 @@ xfs_ail_min(
*/
STATIC xfs_log_item_t *
xfs_ail_next(
- xfs_ail_entry_t *base,
+ xfs_ail_t *ailp,
xfs_log_item_t *lip)
/* ARGSUSED */
{
- if (lip->li_ail.ail_forw == (xfs_log_item_t*)base) {
+ if (lip->li_ail.next == &ailp->xa_ail)
return NULL;
- }
- return lip->li_ail.ail_forw;
+ return list_first_entry(&lip->li_ail, xfs_log_item_t, li_ail);
}
#ifdef DEBUG
@@ -629,57 +617,40 @@ xfs_ail_next(
*/
STATIC void
xfs_ail_check(
- xfs_ail_entry_t *base,
+ xfs_ail_t *ailp,
xfs_log_item_t *lip)
{
xfs_log_item_t *prev_lip;
- prev_lip = base->ail_forw;
- if (prev_lip == (xfs_log_item_t*)base) {
- /*
- * Make sure the pointers are correct when the list
- * is empty.
- */
- ASSERT(base->ail_back == (xfs_log_item_t*)base);
+ if (list_empty(&ailp->xa_ail))
return;
- }
/*
* Check the next and previous entries are valid.
*/
ASSERT((lip->li_flags & XFS_LI_IN_AIL) != 0);
- prev_lip = lip->li_ail.ail_back;
- if (prev_lip != (xfs_log_item_t*)base) {
- ASSERT(prev_lip->li_ail.ail_forw == lip);
+ prev_lip = list_entry(lip->li_ail.prev, xfs_log_item_t, li_ail);
+ if (&prev_lip->li_ail != &ailp->xa_ail)
ASSERT(XFS_LSN_CMP(prev_lip->li_lsn, lip->li_lsn) <= 0);
- }
- prev_lip = lip->li_ail.ail_forw;
- if (prev_lip != (xfs_log_item_t*)base) {
- ASSERT(prev_lip->li_ail.ail_back == lip);
+
+ prev_lip = list_entry(lip->li_ail.next, xfs_log_item_t, li_ail);
+ if (&prev_lip->li_ail != &ailp->xa_ail)
ASSERT(XFS_LSN_CMP(prev_lip->li_lsn, lip->li_lsn) >= 0);
- }
#ifdef XFS_TRANS_DEBUG
/*
- * Walk the list checking forward and backward pointers,
- * lsn ordering, and that every entry has the XFS_LI_IN_AIL
- * flag set. This is really expensive, so only do it when
- * specifically debugging the transaction subsystem.
+ * Walk the list checking lsn ordering, and that every entry has the
+ * XFS_LI_IN_AIL flag set. This is really expensive, so only do it
+ * when specifically debugging the transaction subsystem.
*/
- prev_lip = (xfs_log_item_t*)base;
- while (lip != (xfs_log_item_t*)base) {
- if (prev_lip != (xfs_log_item_t*)base) {
- ASSERT(prev_lip->li_ail.ail_forw == lip);
+ prev_lip = list_entry(&ailp->xa_ail, xfs_log_item_t, li_ail);
+ list_for_each_entry(lip, &ailp->xa_ail, li_ail) {
+ if (&prev_lip->li_ail != &ailp->xa_ail)
ASSERT(XFS_LSN_CMP(prev_lip->li_lsn, lip->li_lsn) <= 0);
- }
- ASSERT(lip->li_ail.ail_back == prev_lip);
ASSERT((lip->li_flags & XFS_LI_IN_AIL) != 0);
prev_lip = lip;
- lip = lip->li_ail.ail_forw;
}
- ASSERT(lip == (xfs_log_item_t*)base);
- ASSERT(base->ail_back == prev_lip);
#endif /* XFS_TRANS_DEBUG */
}
#endif /* DEBUG */
diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c
index 60b6b898022..cb0c5839154 100644
--- a/fs/xfs/xfs_trans_buf.c
+++ b/fs/xfs/xfs_trans_buf.c
@@ -304,7 +304,8 @@ xfs_trans_read_buf(
if (tp == NULL) {
bp = xfs_buf_read_flags(target, blkno, len, flags | BUF_BUSY);
if (!bp)
- return XFS_ERROR(ENOMEM);
+ return (flags & XFS_BUF_TRYLOCK) ?
+ EAGAIN : XFS_ERROR(ENOMEM);
if ((bp != NULL) && (XFS_BUF_GETERROR(bp) != 0)) {
xfs_ioerror_alert("xfs_trans_read_buf", mp,
@@ -353,17 +354,15 @@ xfs_trans_read_buf(
ASSERT(!XFS_BUF_ISASYNC(bp));
XFS_BUF_READ(bp);
xfsbdstrat(tp->t_mountp, bp);
- xfs_iowait(bp);
- if (XFS_BUF_GETERROR(bp) != 0) {
+ error = xfs_iowait(bp);
+ if (error) {
xfs_ioerror_alert("xfs_trans_read_buf", mp,
bp, blkno);
- error = XFS_BUF_GETERROR(bp);
xfs_buf_relse(bp);
/*
- * We can gracefully recover from most
- * read errors. Ones we can't are those
- * that happen after the transaction's
- * already dirty.
+ * We can gracefully recover from most read
+ * errors. Ones we can't are those that happen
+ * after the transaction's already dirty.
*/
if (tp->t_flags & XFS_TRANS_DIRTY)
xfs_force_shutdown(tp->t_mountp,
diff --git a/fs/xfs/xfs_types.h b/fs/xfs/xfs_types.h
index 5c89be47546..0f5191644ab 100644
--- a/fs/xfs/xfs_types.h
+++ b/fs/xfs/xfs_types.h
@@ -160,4 +160,9 @@ typedef enum {
XFS_BTNUM_MAX
} xfs_btnum_t;
+struct xfs_name {
+ const char *name;
+ int len;
+};
+
#endif /* __XFS_TYPES_H__ */
diff --git a/fs/xfs/xfs_utils.c b/fs/xfs/xfs_utils.c
index 18a85e74668..2b8dc7e4077 100644
--- a/fs/xfs/xfs_utils.c
+++ b/fs/xfs/xfs_utils.c
@@ -40,34 +40,12 @@
#include "xfs_itable.h"
#include "xfs_utils.h"
-/*
- * xfs_get_dir_entry is used to get a reference to an inode given
- * its parent directory inode and the name of the file. It does
- * not lock the child inode, and it unlocks the directory before
- * returning. The directory's generation number is returned for
- * use by a later call to xfs_lock_dir_and_entry.
- */
-int
-xfs_get_dir_entry(
- bhv_vname_t *dentry,
- xfs_inode_t **ipp)
-{
- bhv_vnode_t *vp;
-
- vp = VNAME_TO_VNODE(dentry);
-
- *ipp = xfs_vtoi(vp);
- if (!*ipp)
- return XFS_ERROR(ENOENT);
- VN_HOLD(vp);
- return 0;
-}
int
xfs_dir_lookup_int(
xfs_inode_t *dp,
uint lock_mode,
- bhv_vname_t *dentry,
+ struct xfs_name *name,
xfs_ino_t *inum,
xfs_inode_t **ipp)
{
@@ -75,7 +53,7 @@ xfs_dir_lookup_int(
xfs_itrace_entry(dp);
- error = xfs_dir_lookup(NULL, dp, VNAME(dentry), VNAMELEN(dentry), inum);
+ error = xfs_dir_lookup(NULL, dp, name, inum);
if (!error) {
/*
* Unlock the directory. We do this because we can't
diff --git a/fs/xfs/xfs_utils.h b/fs/xfs/xfs_utils.h
index f857fcccb72..175b126d2ca 100644
--- a/fs/xfs/xfs_utils.h
+++ b/fs/xfs/xfs_utils.h
@@ -21,15 +21,14 @@
#define IRELE(ip) VN_RELE(XFS_ITOV(ip))
#define IHOLD(ip) VN_HOLD(XFS_ITOV(ip))
-extern int xfs_get_dir_entry (bhv_vname_t *, xfs_inode_t **);
-extern int xfs_dir_lookup_int (xfs_inode_t *, uint, bhv_vname_t *, xfs_ino_t *,
- xfs_inode_t **);
-extern int xfs_truncate_file (xfs_mount_t *, xfs_inode_t *);
-extern int xfs_dir_ialloc (xfs_trans_t **, xfs_inode_t *, mode_t, xfs_nlink_t,
+extern int xfs_dir_lookup_int(xfs_inode_t *, uint, struct xfs_name *,
+ xfs_ino_t *, xfs_inode_t **);
+extern int xfs_truncate_file(xfs_mount_t *, xfs_inode_t *);
+extern int xfs_dir_ialloc(xfs_trans_t **, xfs_inode_t *, mode_t, xfs_nlink_t,
xfs_dev_t, cred_t *, prid_t, int,
xfs_inode_t **, int *);
-extern int xfs_droplink (xfs_trans_t *, xfs_inode_t *);
-extern int xfs_bumplink (xfs_trans_t *, xfs_inode_t *);
-extern void xfs_bump_ino_vers2 (xfs_trans_t *, xfs_inode_t *);
+extern int xfs_droplink(xfs_trans_t *, xfs_inode_t *);
+extern int xfs_bumplink(xfs_trans_t *, xfs_inode_t *);
+extern void xfs_bump_ino_vers2(xfs_trans_t *, xfs_inode_t *);
#endif /* __XFS_UTILS_H__ */
diff --git a/fs/xfs/xfs_vfsops.c b/fs/xfs/xfs_vfsops.c
index 7094caff13c..fc48158fe47 100644
--- a/fs/xfs/xfs_vfsops.c
+++ b/fs/xfs/xfs_vfsops.c
@@ -43,7 +43,6 @@
#include "xfs_error.h"
#include "xfs_bmap.h"
#include "xfs_rw.h"
-#include "xfs_refcache.h"
#include "xfs_buf_item.h"
#include "xfs_log_priv.h"
#include "xfs_dir2_trace.h"
@@ -56,6 +55,7 @@
#include "xfs_fsops.h"
#include "xfs_vnodeops.h"
#include "xfs_vfsops.h"
+#include "xfs_utils.h"
int __init
@@ -69,15 +69,17 @@ xfs_init(void)
/*
* Initialize all of the zone allocators we use.
*/
+ xfs_log_ticket_zone = kmem_zone_init(sizeof(xlog_ticket_t),
+ "xfs_log_ticket");
xfs_bmap_free_item_zone = kmem_zone_init(sizeof(xfs_bmap_free_item_t),
- "xfs_bmap_free_item");
+ "xfs_bmap_free_item");
xfs_btree_cur_zone = kmem_zone_init(sizeof(xfs_btree_cur_t),
- "xfs_btree_cur");
- xfs_trans_zone = kmem_zone_init(sizeof(xfs_trans_t), "xfs_trans");
- xfs_da_state_zone =
- kmem_zone_init(sizeof(xfs_da_state_t), "xfs_da_state");
+ "xfs_btree_cur");
+ xfs_da_state_zone = kmem_zone_init(sizeof(xfs_da_state_t),
+ "xfs_da_state");
xfs_dabuf_zone = kmem_zone_init(sizeof(xfs_dabuf_t), "xfs_dabuf");
xfs_ifork_zone = kmem_zone_init(sizeof(xfs_ifork_t), "xfs_ifork");
+ xfs_trans_zone = kmem_zone_init(sizeof(xfs_trans_t), "xfs_trans");
xfs_acl_zone_init(xfs_acl_zone, "xfs_acl");
xfs_mru_cache_init();
xfs_filestream_init();
@@ -113,9 +115,6 @@ xfs_init(void)
xfs_ili_zone =
kmem_zone_init_flags(sizeof(xfs_inode_log_item_t), "xfs_ili",
KM_ZONE_SPREAD, NULL);
- xfs_icluster_zone =
- kmem_zone_init_flags(sizeof(xfs_icluster_t), "xfs_icluster",
- KM_ZONE_SPREAD, NULL);
/*
* Allocate global trace buffers.
@@ -153,11 +152,9 @@ xfs_cleanup(void)
extern kmem_zone_t *xfs_inode_zone;
extern kmem_zone_t *xfs_efd_zone;
extern kmem_zone_t *xfs_efi_zone;
- extern kmem_zone_t *xfs_icluster_zone;
xfs_cleanup_procfs();
xfs_sysctl_unregister();
- xfs_refcache_destroy();
xfs_filestream_uninit();
xfs_mru_cache_uninit();
xfs_acl_zone_destroy(xfs_acl_zone);
@@ -189,7 +186,6 @@ xfs_cleanup(void)
kmem_zone_destroy(xfs_efi_zone);
kmem_zone_destroy(xfs_ifork_zone);
kmem_zone_destroy(xfs_ili_zone);
- kmem_zone_destroy(xfs_icluster_zone);
}
/*
@@ -573,7 +569,7 @@ xfs_unmount(
#ifdef HAVE_DMAPI
if (mp->m_flags & XFS_MOUNT_DMAPI) {
error = XFS_SEND_PREUNMOUNT(mp,
- rvp, DM_RIGHT_NULL, rvp, DM_RIGHT_NULL,
+ rip, DM_RIGHT_NULL, rip, DM_RIGHT_NULL,
NULL, NULL, 0, 0,
(mp->m_dmevmask & (1<<DM_EVENT_PREUNMOUNT))?
0:DM_FLAGS_UNWANTED);
@@ -584,11 +580,6 @@ xfs_unmount(
0 : DM_FLAGS_UNWANTED;
}
#endif
- /*
- * First blow any referenced inode from this file system
- * out of the reference cache, and delete the timer.
- */
- xfs_refcache_purge_mp(mp);
/*
* Blow away any referenced inode in the filestreams cache.
@@ -607,7 +598,7 @@ xfs_unmount(
/*
* Drop the reference count
*/
- VN_RELE(rvp);
+ IRELE(rip);
/*
* If we're forcing a shutdown, typically because of a media error,
@@ -629,7 +620,7 @@ out:
/* Note: mp structure must still exist for
* XFS_SEND_UNMOUNT() call.
*/
- XFS_SEND_UNMOUNT(mp, error == 0 ? rvp : NULL,
+ XFS_SEND_UNMOUNT(mp, error == 0 ? rip : NULL,
DM_RIGHT_NULL, 0, error, unmount_event_flags);
}
if (xfs_unmountfs_needed) {
@@ -646,13 +637,12 @@ out:
return XFS_ERROR(error);
}
-STATIC int
+STATIC void
xfs_quiesce_fs(
xfs_mount_t *mp)
{
int count = 0, pincount;
- xfs_refcache_purge_mp(mp);
xfs_flush_buftarg(mp->m_ddev_targp, 0);
xfs_finish_reclaim_all(mp, 0);
@@ -671,8 +661,6 @@ xfs_quiesce_fs(
count++;
}
} while (count < 2);
-
- return 0;
}
/*
@@ -684,6 +672,8 @@ void
xfs_attr_quiesce(
xfs_mount_t *mp)
{
+ int error = 0;
+
/* wait for all modifications to complete */
while (atomic_read(&mp->m_active_trans) > 0)
delay(100);
@@ -694,7 +684,11 @@ xfs_attr_quiesce(
ASSERT_ALWAYS(atomic_read(&mp->m_active_trans) == 0);
/* Push the superblock and write an unmount record */
- xfs_log_sbcount(mp, 1);
+ error = xfs_log_sbcount(mp, 1);
+ if (error)
+ xfs_fs_cmn_err(CE_WARN, mp,
+ "xfs_attr_quiesce: failed to log sb changes. "
+ "Frozen image may not be consistent.");
xfs_log_unmount_write(mp);
xfs_unmountfs_writesb(mp);
}
@@ -790,8 +784,8 @@ xfs_unmount_flush(
goto fscorrupt_out2;
if (rbmip) {
- VN_RELE(XFS_ITOV(rbmip));
- VN_RELE(XFS_ITOV(rsumip));
+ IRELE(rbmip);
+ IRELE(rsumip);
}
xfs_iunlock(rip, XFS_ILOCK_EXCL);
@@ -1169,10 +1163,10 @@ xfs_sync_inodes(
* above, then wait until after we've unlocked
* the inode to release the reference. This is
* because we can be already holding the inode
- * lock when VN_RELE() calls xfs_inactive().
+ * lock when IRELE() calls xfs_inactive().
*
* Make sure to drop the mount lock before calling
- * VN_RELE() so that we don't trip over ourselves if
+ * IRELE() so that we don't trip over ourselves if
* we have to go for the mount lock again in the
* inactive code.
*/
@@ -1180,7 +1174,7 @@ xfs_sync_inodes(
IPOINTER_INSERT(ip, mp);
}
- VN_RELE(vp);
+ IRELE(ip);
vnode_refed = B_FALSE;
}
@@ -1323,30 +1317,8 @@ xfs_syncsub(
}
/*
- * If this is the periodic sync, then kick some entries out of
- * the reference cache. This ensures that idle entries are
- * eventually kicked out of the cache.
- */
- if (flags & SYNC_REFCACHE) {
- if (flags & SYNC_WAIT)
- xfs_refcache_purge_mp(mp);
- else
- xfs_refcache_purge_some(mp);
- }
-
- /*
- * If asked, update the disk superblock with incore counter values if we
- * are using non-persistent counters so that they don't get too far out
- * of sync if we crash or get a forced shutdown. We don't want to force
- * this to disk, just get a transaction into the iclogs....
- */
- if (flags & SYNC_SUPER)
- xfs_log_sbcount(mp, 0);
-
- /*
* Now check to see if the log needs a "dummy" transaction.
*/
-
if (!(flags & SYNC_REMOUNT) && xfs_log_need_covered(mp)) {
xfs_trans_t *tp;
xfs_inode_t *ip;
diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c
index 64c5953feca..6650601c64f 100644
--- a/fs/xfs/xfs_vnodeops.c
+++ b/fs/xfs/xfs_vnodeops.c
@@ -48,7 +48,6 @@
#include "xfs_quota.h"
#include "xfs_utils.h"
#include "xfs_rtalloc.h"
-#include "xfs_refcache.h"
#include "xfs_trans_space.h"
#include "xfs_log_priv.h"
#include "xfs_filestream.h"
@@ -327,7 +326,7 @@ xfs_setattr(
if (DM_EVENT_ENABLED(ip, DM_EVENT_TRUNCATE) &&
!(flags & ATTR_DMI)) {
int dmflags = AT_DELAY_FLAG(flags) | DM_SEM_FLAG_WR;
- code = XFS_SEND_DATA(mp, DM_EVENT_TRUNCATE, vp,
+ code = XFS_SEND_DATA(mp, DM_EVENT_TRUNCATE, ip,
vap->va_size, 0, dmflags, NULL);
if (code) {
lock_flags = 0;
@@ -634,6 +633,15 @@ xfs_setattr(
* Truncate file. Must have write permission and not be a directory.
*/
if (mask & XFS_AT_SIZE) {
+ /*
+ * Only change the c/mtime if we are changing the size
+ * or we are explicitly asked to change it. This handles
+ * the semantic difference between truncate() and ftruncate()
+ * as implemented in the VFS.
+ */
+ if (vap->va_size != ip->i_size || (mask & XFS_AT_CTIME))
+ timeflags |= XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG;
+
if (vap->va_size > ip->i_size) {
xfs_igrow_finish(tp, ip, vap->va_size,
!(flags & ATTR_DMI));
@@ -662,10 +670,6 @@ xfs_setattr(
*/
xfs_iflags_set(ip, XFS_ITRUNCATED);
}
- /*
- * Have to do this even if the file's size doesn't change.
- */
- timeflags |= XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG;
}
/*
@@ -877,7 +881,7 @@ xfs_setattr(
if (DM_EVENT_ENABLED(ip, DM_EVENT_ATTRIBUTE) &&
!(flags & ATTR_DMI)) {
- (void) XFS_SEND_NAMESP(mp, DM_EVENT_ATTRIBUTE, vp, DM_RIGHT_NULL,
+ (void) XFS_SEND_NAMESP(mp, DM_EVENT_ATTRIBUTE, ip, DM_RIGHT_NULL,
NULL, DM_RIGHT_NULL, NULL, NULL,
0, 0, AT_DELAY_FLAG(flags));
}
@@ -1443,28 +1447,22 @@ xfs_inactive_attrs(
tp = *tpp;
mp = ip->i_mount;
ASSERT(ip->i_d.di_forkoff != 0);
- xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
+ error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ if (error)
+ goto error_unlock;
error = xfs_attr_inactive(ip);
- if (error) {
- *tpp = NULL;
- xfs_iunlock(ip, XFS_IOLOCK_EXCL);
- return error; /* goto out */
- }
+ if (error)
+ goto error_unlock;
tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE);
error = xfs_trans_reserve(tp, 0,
XFS_IFREE_LOG_RES(mp),
0, XFS_TRANS_PERM_LOG_RES,
XFS_INACTIVE_LOG_COUNT);
- if (error) {
- ASSERT(XFS_FORCED_SHUTDOWN(mp));
- xfs_trans_cancel(tp, 0);
- *tpp = NULL;
- xfs_iunlock(ip, XFS_IOLOCK_EXCL);
- return error;
- }
+ if (error)
+ goto error_cancel;
xfs_ilock(ip, XFS_ILOCK_EXCL);
xfs_trans_ijoin(tp, ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
@@ -1475,6 +1473,14 @@ xfs_inactive_attrs(
*tpp = tp;
return 0;
+
+error_cancel:
+ ASSERT(XFS_FORCED_SHUTDOWN(mp));
+ xfs_trans_cancel(tp, 0);
+error_unlock:
+ *tpp = NULL;
+ xfs_iunlock(ip, XFS_IOLOCK_EXCL);
+ return error;
}
int
@@ -1520,12 +1526,6 @@ xfs_release(
xfs_flush_pages(ip, 0, -1, XFS_B_ASYNC, FI_NONE);
}
-#ifdef HAVE_REFCACHE
- /* If we are in the NFS reference cache then don't do this now */
- if (ip->i_refcache)
- return 0;
-#endif
-
if (ip->i_d.di_nlink != 0) {
if ((((ip->i_d.di_mode & S_IFMT) == S_IFREG) &&
((ip->i_size > 0) || (VN_CACHED(vp) > 0 ||
@@ -1588,9 +1588,8 @@ xfs_inactive(
mp = ip->i_mount;
- if (ip->i_d.di_nlink == 0 && DM_EVENT_ENABLED(ip, DM_EVENT_DESTROY)) {
- (void) XFS_SEND_DESTROY(mp, vp, DM_RIGHT_NULL);
- }
+ if (ip->i_d.di_nlink == 0 && DM_EVENT_ENABLED(ip, DM_EVENT_DESTROY))
+ XFS_SEND_DESTROY(mp, ip, DM_RIGHT_NULL);
error = 0;
@@ -1744,11 +1743,18 @@ xfs_inactive(
XFS_TRANS_MOD_DQUOT_BYINO(mp, tp, ip, XFS_TRANS_DQ_ICOUNT, -1);
/*
- * Just ignore errors at this point. There is
- * nothing we can do except to try to keep going.
+ * Just ignore errors at this point. There is nothing we can
+ * do except to try to keep going. Make sure it's not a silent
+ * error.
*/
- (void) xfs_bmap_finish(&tp, &free_list, &committed);
- (void) xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
+ error = xfs_bmap_finish(&tp, &free_list, &committed);
+ if (error)
+ xfs_fs_cmn_err(CE_NOTE, mp, "xfs_inactive: "
+ "xfs_bmap_finish() returned error %d", error);
+ error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
+ if (error)
+ xfs_fs_cmn_err(CE_NOTE, mp, "xfs_inactive: "
+ "xfs_trans_commit() returned error %d", error);
}
/*
* Release the dquots held by inode, if any.
@@ -1765,8 +1771,8 @@ xfs_inactive(
int
xfs_lookup(
xfs_inode_t *dp,
- bhv_vname_t *dentry,
- bhv_vnode_t **vpp)
+ struct xfs_name *name,
+ xfs_inode_t **ipp)
{
xfs_inode_t *ip;
xfs_ino_t e_inum;
@@ -1779,9 +1785,9 @@ xfs_lookup(
return XFS_ERROR(EIO);
lock_mode = xfs_ilock_map_shared(dp);
- error = xfs_dir_lookup_int(dp, lock_mode, dentry, &e_inum, &ip);
+ error = xfs_dir_lookup_int(dp, lock_mode, name, &e_inum, &ip);
if (!error) {
- *vpp = XFS_ITOV(ip);
+ *ipp = ip;
xfs_itrace_ref(ip);
}
xfs_iunlock_map_shared(dp, lock_mode);
@@ -1791,19 +1797,16 @@ xfs_lookup(
int
xfs_create(
xfs_inode_t *dp,
- bhv_vname_t *dentry,
+ struct xfs_name *name,
mode_t mode,
xfs_dev_t rdev,
- bhv_vnode_t **vpp,
+ xfs_inode_t **ipp,
cred_t *credp)
{
- char *name = VNAME(dentry);
- xfs_mount_t *mp = dp->i_mount;
- bhv_vnode_t *dir_vp = XFS_ITOV(dp);
+ xfs_mount_t *mp = dp->i_mount;
xfs_inode_t *ip;
- bhv_vnode_t *vp = NULL;
xfs_trans_t *tp;
- int error;
+ int error;
xfs_bmap_free_t free_list;
xfs_fsblock_t first_block;
boolean_t unlock_dp_on_error = B_FALSE;
@@ -1813,17 +1816,14 @@ xfs_create(
xfs_prid_t prid;
struct xfs_dquot *udqp, *gdqp;
uint resblks;
- int namelen;
- ASSERT(!*vpp);
+ ASSERT(!*ipp);
xfs_itrace_entry(dp);
- namelen = VNAMELEN(dentry);
-
if (DM_EVENT_ENABLED(dp, DM_EVENT_CREATE)) {
error = XFS_SEND_NAMESP(mp, DM_EVENT_CREATE,
- dir_vp, DM_RIGHT_NULL, NULL,
- DM_RIGHT_NULL, name, NULL,
+ dp, DM_RIGHT_NULL, NULL,
+ DM_RIGHT_NULL, name->name, NULL,
mode, 0, 0);
if (error)
@@ -1855,7 +1855,7 @@ xfs_create(
tp = xfs_trans_alloc(mp, XFS_TRANS_CREATE);
cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
- resblks = XFS_CREATE_SPACE_RES(mp, namelen);
+ resblks = XFS_CREATE_SPACE_RES(mp, name->len);
/*
* Initially assume that the file does not exist and
* reserve the resources for that case. If that is not
@@ -1888,7 +1888,8 @@ xfs_create(
if (error)
goto error_return;
- if (resblks == 0 && (error = xfs_dir_canenter(tp, dp, name, namelen)))
+ error = xfs_dir_canenter(tp, dp, name, resblks);
+ if (error)
goto error_return;
error = xfs_dir_ialloc(&tp, dp, mode, 1,
rdev, credp, prid, resblks > 0,
@@ -1914,11 +1915,11 @@ xfs_create(
* the transaction cancel unlocking dp so don't do it explicitly in the
* error path.
*/
- VN_HOLD(dir_vp);
+ IHOLD(dp);
xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
unlock_dp_on_error = B_FALSE;
- error = xfs_dir_createname(tp, dp, name, namelen, ip->i_ino,
+ error = xfs_dir_createname(tp, dp, name, ip->i_ino,
&first_block, &free_list, resblks ?
resblks - XFS_IALLOC_SPACE_RES(mp) : 0);
if (error) {
@@ -1952,7 +1953,6 @@ xfs_create(
* vnode to the caller, we bump the vnode ref count now.
*/
IHOLD(ip);
- vp = XFS_ITOV(ip);
error = xfs_bmap_finish(&tp, &free_list, &committed);
if (error) {
@@ -1970,17 +1970,17 @@ xfs_create(
XFS_QM_DQRELE(mp, udqp);
XFS_QM_DQRELE(mp, gdqp);
- *vpp = vp;
+ *ipp = ip;
/* Fallthrough to std_return with error = 0 */
std_return:
- if ((*vpp || (error != 0 && dm_event_sent != 0)) &&
+ if ((*ipp || (error != 0 && dm_event_sent != 0)) &&
DM_EVENT_ENABLED(dp, DM_EVENT_POSTCREATE)) {
(void) XFS_SEND_NAMESP(mp, DM_EVENT_POSTCREATE,
- dir_vp, DM_RIGHT_NULL,
- *vpp ? vp:NULL,
- DM_RIGHT_NULL, name, NULL,
+ dp, DM_RIGHT_NULL,
+ *ipp ? ip : NULL,
+ DM_RIGHT_NULL, name->name, NULL,
mode, error, 0);
}
return error;
@@ -2272,46 +2272,32 @@ int remove_which_error_return = 0;
int
xfs_remove(
xfs_inode_t *dp,
- bhv_vname_t *dentry)
+ struct xfs_name *name,
+ xfs_inode_t *ip)
{
- bhv_vnode_t *dir_vp = XFS_ITOV(dp);
- char *name = VNAME(dentry);
xfs_mount_t *mp = dp->i_mount;
- xfs_inode_t *ip;
xfs_trans_t *tp = NULL;
int error = 0;
xfs_bmap_free_t free_list;
xfs_fsblock_t first_block;
int cancel_flags;
int committed;
- int dm_di_mode = 0;
int link_zero;
uint resblks;
- int namelen;
xfs_itrace_entry(dp);
if (XFS_FORCED_SHUTDOWN(mp))
return XFS_ERROR(EIO);
- namelen = VNAMELEN(dentry);
-
- if (!xfs_get_dir_entry(dentry, &ip)) {
- dm_di_mode = ip->i_d.di_mode;
- IRELE(ip);
- }
-
if (DM_EVENT_ENABLED(dp, DM_EVENT_REMOVE)) {
- error = XFS_SEND_NAMESP(mp, DM_EVENT_REMOVE, dir_vp,
- DM_RIGHT_NULL, NULL, DM_RIGHT_NULL,
- name, NULL, dm_di_mode, 0, 0);
+ error = XFS_SEND_NAMESP(mp, DM_EVENT_REMOVE, dp, DM_RIGHT_NULL,
+ NULL, DM_RIGHT_NULL, name->name, NULL,
+ ip->i_d.di_mode, 0, 0);
if (error)
return error;
}
- /* From this point on, return through std_return */
- ip = NULL;
-
/*
* We need to get a reference to ip before we get our log
* reservation. The reason for this is that we cannot call
@@ -2324,13 +2310,7 @@ xfs_remove(
* when we call xfs_iget. Instead we get an unlocked reference
* to the inode before getting our log reservation.
*/
- error = xfs_get_dir_entry(dentry, &ip);
- if (error) {
- REMOVE_DEBUG_TRACE(__LINE__);
- goto std_return;
- }
-
- dm_di_mode = ip->i_d.di_mode;
+ IHOLD(ip);
xfs_itrace_entry(ip);
xfs_itrace_ref(ip);
@@ -2398,7 +2378,7 @@ xfs_remove(
* Entry must exist since we did a lookup in xfs_lock_dir_and_entry.
*/
XFS_BMAP_INIT(&free_list, &first_block);
- error = xfs_dir_removename(tp, dp, name, namelen, ip->i_ino,
+ error = xfs_dir_removename(tp, dp, name, ip->i_ino,
&first_block, &free_list, 0);
if (error) {
ASSERT(error != ENOENT);
@@ -2449,14 +2429,6 @@ xfs_remove(
}
/*
- * Before we drop our extra reference to the inode, purge it
- * from the refcache if it is there. By waiting until afterwards
- * to do the IRELE, we ensure that we won't go inactive in the
- * xfs_refcache_purge_ip routine (although that would be OK).
- */
- xfs_refcache_purge_ip(ip);
-
- /*
* If we are using filestreams, kill the stream association.
* If the file is still open it may get a new one but that
* will get killed on last close in xfs_close() so we don't
@@ -2472,9 +2444,9 @@ xfs_remove(
std_return:
if (DM_EVENT_ENABLED(dp, DM_EVENT_POSTREMOVE)) {
(void) XFS_SEND_NAMESP(mp, DM_EVENT_POSTREMOVE,
- dir_vp, DM_RIGHT_NULL,
+ dp, DM_RIGHT_NULL,
NULL, DM_RIGHT_NULL,
- name, NULL, dm_di_mode, error, 0);
+ name->name, NULL, ip->i_d.di_mode, error, 0);
}
return error;
@@ -2495,14 +2467,6 @@ xfs_remove(
cancel_flags |= XFS_TRANS_ABORT;
xfs_trans_cancel(tp, cancel_flags);
- /*
- * Before we drop our extra reference to the inode, purge it
- * from the refcache if it is there. By waiting until afterwards
- * to do the IRELE, we ensure that we won't go inactive in the
- * xfs_refcache_purge_ip routine (although that would be OK).
- */
- xfs_refcache_purge_ip(ip);
-
IRELE(ip);
goto std_return;
@@ -2511,12 +2475,10 @@ xfs_remove(
int
xfs_link(
xfs_inode_t *tdp,
- bhv_vnode_t *src_vp,
- bhv_vname_t *dentry)
+ xfs_inode_t *sip,
+ struct xfs_name *target_name)
{
- bhv_vnode_t *target_dir_vp = XFS_ITOV(tdp);
xfs_mount_t *mp = tdp->i_mount;
- xfs_inode_t *sip = xfs_vtoi(src_vp);
xfs_trans_t *tp;
xfs_inode_t *ips[2];
int error;
@@ -2525,23 +2487,20 @@ xfs_link(
int cancel_flags;
int committed;
int resblks;
- char *target_name = VNAME(dentry);
- int target_namelen;
xfs_itrace_entry(tdp);
- xfs_itrace_entry(xfs_vtoi(src_vp));
+ xfs_itrace_entry(sip);
- target_namelen = VNAMELEN(dentry);
- ASSERT(!VN_ISDIR(src_vp));
+ ASSERT(!S_ISDIR(sip->i_d.di_mode));
if (XFS_FORCED_SHUTDOWN(mp))
return XFS_ERROR(EIO);
if (DM_EVENT_ENABLED(tdp, DM_EVENT_LINK)) {
error = XFS_SEND_NAMESP(mp, DM_EVENT_LINK,
- target_dir_vp, DM_RIGHT_NULL,
- src_vp, DM_RIGHT_NULL,
- target_name, NULL, 0, 0, 0);
+ tdp, DM_RIGHT_NULL,
+ sip, DM_RIGHT_NULL,
+ target_name->name, NULL, 0, 0, 0);
if (error)
return error;
}
@@ -2556,7 +2515,7 @@ xfs_link(
tp = xfs_trans_alloc(mp, XFS_TRANS_LINK);
cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
- resblks = XFS_LINK_SPACE_RES(mp, target_namelen);
+ resblks = XFS_LINK_SPACE_RES(mp, target_name->len);
error = xfs_trans_reserve(tp, resblks, XFS_LINK_LOG_RES(mp), 0,
XFS_TRANS_PERM_LOG_RES, XFS_LINK_LOG_COUNT);
if (error == ENOSPC) {
@@ -2584,8 +2543,8 @@ xfs_link(
* xfs_trans_cancel will both unlock the inodes and
* decrement the associated ref counts.
*/
- VN_HOLD(src_vp);
- VN_HOLD(target_dir_vp);
+ IHOLD(sip);
+ IHOLD(tdp);
xfs_trans_ijoin(tp, sip, XFS_ILOCK_EXCL);
xfs_trans_ijoin(tp, tdp, XFS_ILOCK_EXCL);
@@ -2608,15 +2567,14 @@ xfs_link(
goto error_return;
}
- if (resblks == 0 &&
- (error = xfs_dir_canenter(tp, tdp, target_name, target_namelen)))
+ error = xfs_dir_canenter(tp, tdp, target_name, resblks);
+ if (error)
goto error_return;
XFS_BMAP_INIT(&free_list, &first_block);
- error = xfs_dir_createname(tp, tdp, target_name, target_namelen,
- sip->i_ino, &first_block, &free_list,
- resblks);
+ error = xfs_dir_createname(tp, tdp, target_name, sip->i_ino,
+ &first_block, &free_list, resblks);
if (error)
goto abort_return;
xfs_ichgtime(tdp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
@@ -2650,9 +2608,9 @@ xfs_link(
std_return:
if (DM_EVENT_ENABLED(sip, DM_EVENT_POSTLINK)) {
(void) XFS_SEND_NAMESP(mp, DM_EVENT_POSTLINK,
- target_dir_vp, DM_RIGHT_NULL,
- src_vp, DM_RIGHT_NULL,
- target_name, NULL, 0, error, 0);
+ tdp, DM_RIGHT_NULL,
+ sip, DM_RIGHT_NULL,
+ target_name->name, NULL, 0, error, 0);
}
return error;
@@ -2669,17 +2627,13 @@ std_return:
int
xfs_mkdir(
xfs_inode_t *dp,
- bhv_vname_t *dentry,
+ struct xfs_name *dir_name,
mode_t mode,
- bhv_vnode_t **vpp,
+ xfs_inode_t **ipp,
cred_t *credp)
{
- bhv_vnode_t *dir_vp = XFS_ITOV(dp);
- char *dir_name = VNAME(dentry);
- int dir_namelen = VNAMELEN(dentry);
xfs_mount_t *mp = dp->i_mount;
xfs_inode_t *cdp; /* inode of created dir */
- bhv_vnode_t *cvp; /* vnode of created dir */
xfs_trans_t *tp;
int cancel_flags;
int error;
@@ -2700,8 +2654,8 @@ xfs_mkdir(
if (DM_EVENT_ENABLED(dp, DM_EVENT_CREATE)) {
error = XFS_SEND_NAMESP(mp, DM_EVENT_CREATE,
- dir_vp, DM_RIGHT_NULL, NULL,
- DM_RIGHT_NULL, dir_name, NULL,
+ dp, DM_RIGHT_NULL, NULL,
+ DM_RIGHT_NULL, dir_name->name, NULL,
mode, 0, 0);
if (error)
return error;
@@ -2730,7 +2684,7 @@ xfs_mkdir(
tp = xfs_trans_alloc(mp, XFS_TRANS_MKDIR);
cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
- resblks = XFS_MKDIR_SPACE_RES(mp, dir_namelen);
+ resblks = XFS_MKDIR_SPACE_RES(mp, dir_name->len);
error = xfs_trans_reserve(tp, resblks, XFS_MKDIR_LOG_RES(mp), 0,
XFS_TRANS_PERM_LOG_RES, XFS_MKDIR_LOG_COUNT);
if (error == ENOSPC) {
@@ -2762,8 +2716,8 @@ xfs_mkdir(
if (error)
goto error_return;
- if (resblks == 0 &&
- (error = xfs_dir_canenter(tp, dp, dir_name, dir_namelen)))
+ error = xfs_dir_canenter(tp, dp, dir_name, resblks);
+ if (error)
goto error_return;
/*
* create the directory inode.
@@ -2786,15 +2740,15 @@ xfs_mkdir(
* from here on will result in the transaction cancel
* unlocking dp so don't do it explicitly in the error path.
*/
- VN_HOLD(dir_vp);
+ IHOLD(dp);
xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
unlock_dp_on_error = B_FALSE;
XFS_BMAP_INIT(&free_list, &first_block);
- error = xfs_dir_createname(tp, dp, dir_name, dir_namelen, cdp->i_ino,
- &first_block, &free_list, resblks ?
- resblks - XFS_IALLOC_SPACE_RES(mp) : 0);
+ error = xfs_dir_createname(tp, dp, dir_name, cdp->i_ino,
+ &first_block, &free_list, resblks ?
+ resblks - XFS_IALLOC_SPACE_RES(mp) : 0);
if (error) {
ASSERT(error != ENOSPC);
goto error1;
@@ -2817,11 +2771,9 @@ xfs_mkdir(
if (error)
goto error2;
- cvp = XFS_ITOV(cdp);
-
created = B_TRUE;
- *vpp = cvp;
+ *ipp = cdp;
IHOLD(cdp);
/*
@@ -2858,10 +2810,10 @@ std_return:
if ((created || (error != 0 && dm_event_sent != 0)) &&
DM_EVENT_ENABLED(dp, DM_EVENT_POSTCREATE)) {
(void) XFS_SEND_NAMESP(mp, DM_EVENT_POSTCREATE,
- dir_vp, DM_RIGHT_NULL,
- created ? XFS_ITOV(cdp):NULL,
+ dp, DM_RIGHT_NULL,
+ created ? cdp : NULL,
DM_RIGHT_NULL,
- dir_name, NULL,
+ dir_name->name, NULL,
mode, error, 0);
}
return error;
@@ -2885,20 +2837,17 @@ std_return:
int
xfs_rmdir(
xfs_inode_t *dp,
- bhv_vname_t *dentry)
+ struct xfs_name *name,
+ xfs_inode_t *cdp)
{
bhv_vnode_t *dir_vp = XFS_ITOV(dp);
- char *name = VNAME(dentry);
- int namelen = VNAMELEN(dentry);
xfs_mount_t *mp = dp->i_mount;
- xfs_inode_t *cdp; /* child directory */
xfs_trans_t *tp;
int error;
xfs_bmap_free_t free_list;
xfs_fsblock_t first_block;
int cancel_flags;
int committed;
- int dm_di_mode = S_IFDIR;
int last_cdp_link;
uint resblks;
@@ -2907,24 +2856,15 @@ xfs_rmdir(
if (XFS_FORCED_SHUTDOWN(mp))
return XFS_ERROR(EIO);
- if (!xfs_get_dir_entry(dentry, &cdp)) {
- dm_di_mode = cdp->i_d.di_mode;
- IRELE(cdp);
- }
-
if (DM_EVENT_ENABLED(dp, DM_EVENT_REMOVE)) {
error = XFS_SEND_NAMESP(mp, DM_EVENT_REMOVE,
- dir_vp, DM_RIGHT_NULL,
- NULL, DM_RIGHT_NULL,
- name, NULL, dm_di_mode, 0, 0);
+ dp, DM_RIGHT_NULL,
+ NULL, DM_RIGHT_NULL, name->name,
+ NULL, cdp->i_d.di_mode, 0, 0);
if (error)
return XFS_ERROR(error);
}
- /* Return through std_return after this point. */
-
- cdp = NULL;
-
/*
* We need to get a reference to cdp before we get our log
* reservation. The reason for this is that we cannot call
@@ -2937,13 +2877,7 @@ xfs_rmdir(
* when we call xfs_iget. Instead we get an unlocked reference
* to the inode before getting our log reservation.
*/
- error = xfs_get_dir_entry(dentry, &cdp);
- if (error) {
- REMOVE_DEBUG_TRACE(__LINE__);
- goto std_return;
- }
- mp = dp->i_mount;
- dm_di_mode = cdp->i_d.di_mode;
+ IHOLD(cdp);
/*
* Get the dquots for the inodes.
@@ -3020,7 +2954,7 @@ xfs_rmdir(
goto error_return;
}
- error = xfs_dir_removename(tp, dp, name, namelen, cdp->i_ino,
+ error = xfs_dir_removename(tp, dp, name, cdp->i_ino,
&first_block, &free_list, resblks);
if (error)
goto error1;
@@ -3098,9 +3032,9 @@ xfs_rmdir(
std_return:
if (DM_EVENT_ENABLED(dp, DM_EVENT_POSTREMOVE)) {
(void) XFS_SEND_NAMESP(mp, DM_EVENT_POSTREMOVE,
- dir_vp, DM_RIGHT_NULL,
+ dp, DM_RIGHT_NULL,
NULL, DM_RIGHT_NULL,
- name, NULL, dm_di_mode,
+ name->name, NULL, cdp->i_d.di_mode,
error, 0);
}
return error;
@@ -3118,13 +3052,12 @@ xfs_rmdir(
int
xfs_symlink(
xfs_inode_t *dp,
- bhv_vname_t *dentry,
- char *target_path,
+ struct xfs_name *link_name,
+ const char *target_path,
mode_t mode,
- bhv_vnode_t **vpp,
+ xfs_inode_t **ipp,
cred_t *credp)
{
- bhv_vnode_t *dir_vp = XFS_ITOV(dp);
xfs_mount_t *mp = dp->i_mount;
xfs_trans_t *tp;
xfs_inode_t *ip;
@@ -3140,17 +3073,15 @@ xfs_symlink(
int nmaps;
xfs_bmbt_irec_t mval[SYMLINK_MAPS];
xfs_daddr_t d;
- char *cur_chunk;
+ const char *cur_chunk;
int byte_cnt;
int n;
xfs_buf_t *bp;
xfs_prid_t prid;
struct xfs_dquot *udqp, *gdqp;
uint resblks;
- char *link_name = VNAME(dentry);
- int link_namelen;
- *vpp = NULL;
+ *ipp = NULL;
error = 0;
ip = NULL;
tp = NULL;
@@ -3160,44 +3091,17 @@ xfs_symlink(
if (XFS_FORCED_SHUTDOWN(mp))
return XFS_ERROR(EIO);
- link_namelen = VNAMELEN(dentry);
-
/*
* Check component lengths of the target path name.
*/
pathlen = strlen(target_path);
if (pathlen >= MAXPATHLEN) /* total string too long */
return XFS_ERROR(ENAMETOOLONG);
- if (pathlen >= MAXNAMELEN) { /* is any component too long? */
- int len, total;
- char *path;
-
- for (total = 0, path = target_path; total < pathlen;) {
- /*
- * Skip any slashes.
- */
- while(*path == '/') {
- total++;
- path++;
- }
-
- /*
- * Count up to the next slash or end of path.
- * Error out if the component is bigger than MAXNAMELEN.
- */
- for(len = 0; *path != '/' && total < pathlen;total++, path++) {
- if (++len >= MAXNAMELEN) {
- error = ENAMETOOLONG;
- return error;
- }
- }
- }
- }
if (DM_EVENT_ENABLED(dp, DM_EVENT_SYMLINK)) {
- error = XFS_SEND_NAMESP(mp, DM_EVENT_SYMLINK, dir_vp,
+ error = XFS_SEND_NAMESP(mp, DM_EVENT_SYMLINK, dp,
DM_RIGHT_NULL, NULL, DM_RIGHT_NULL,
- link_name, target_path, 0, 0, 0);
+ link_name->name, target_path, 0, 0, 0);
if (error)
return error;
}
@@ -3229,7 +3133,7 @@ xfs_symlink(
fs_blocks = 0;
else
fs_blocks = XFS_B_TO_FSB(mp, pathlen);
- resblks = XFS_SYMLINK_SPACE_RES(mp, link_namelen, fs_blocks);
+ resblks = XFS_SYMLINK_SPACE_RES(mp, link_name->len, fs_blocks);
error = xfs_trans_reserve(tp, resblks, XFS_SYMLINK_LOG_RES(mp), 0,
XFS_TRANS_PERM_LOG_RES, XFS_SYMLINK_LOG_COUNT);
if (error == ENOSPC && fs_blocks == 0) {
@@ -3263,8 +3167,8 @@ xfs_symlink(
/*
* Check for ability to enter directory entry, if no space reserved.
*/
- if (resblks == 0 &&
- (error = xfs_dir_canenter(tp, dp, link_name, link_namelen)))
+ error = xfs_dir_canenter(tp, dp, link_name, resblks);
+ if (error)
goto error_return;
/*
* Initialize the bmap freelist prior to calling either
@@ -3289,7 +3193,7 @@ xfs_symlink(
* transaction cancel unlocking dp so don't do it explicitly in the
* error path.
*/
- VN_HOLD(dir_vp);
+ IHOLD(dp);
xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
unlock_dp_on_error = B_FALSE;
@@ -3356,8 +3260,8 @@ xfs_symlink(
/*
* Create the directory entry for the symlink.
*/
- error = xfs_dir_createname(tp, dp, link_name, link_namelen, ip->i_ino,
- &first_block, &free_list, resblks);
+ error = xfs_dir_createname(tp, dp, link_name, ip->i_ino,
+ &first_block, &free_list, resblks);
if (error)
goto error1;
xfs_ichgtime(dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
@@ -3399,19 +3303,14 @@ xfs_symlink(
std_return:
if (DM_EVENT_ENABLED(dp, DM_EVENT_POSTSYMLINK)) {
(void) XFS_SEND_NAMESP(mp, DM_EVENT_POSTSYMLINK,
- dir_vp, DM_RIGHT_NULL,
- error ? NULL : XFS_ITOV(ip),
- DM_RIGHT_NULL, link_name, target_path,
- 0, error, 0);
+ dp, DM_RIGHT_NULL,
+ error ? NULL : ip,
+ DM_RIGHT_NULL, link_name->name,
+ target_path, 0, error, 0);
}
- if (!error) {
- bhv_vnode_t *vp;
-
- ASSERT(ip);
- vp = XFS_ITOV(ip);
- *vpp = vp;
- }
+ if (!error)
+ *ipp = ip;
return error;
error2:
@@ -3431,60 +3330,11 @@ std_return:
}
int
-xfs_rwlock(
- xfs_inode_t *ip,
- bhv_vrwlock_t locktype)
-{
- if (S_ISDIR(ip->i_d.di_mode))
- return 1;
- if (locktype == VRWLOCK_WRITE) {
- xfs_ilock(ip, XFS_IOLOCK_EXCL);
- } else if (locktype == VRWLOCK_TRY_READ) {
- return xfs_ilock_nowait(ip, XFS_IOLOCK_SHARED);
- } else if (locktype == VRWLOCK_TRY_WRITE) {
- return xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL);
- } else {
- ASSERT((locktype == VRWLOCK_READ) ||
- (locktype == VRWLOCK_WRITE_DIRECT));
- xfs_ilock(ip, XFS_IOLOCK_SHARED);
- }
-
- return 1;
-}
-
-
-void
-xfs_rwunlock(
- xfs_inode_t *ip,
- bhv_vrwlock_t locktype)
-{
- if (S_ISDIR(ip->i_d.di_mode))
- return;
- if (locktype == VRWLOCK_WRITE) {
- /*
- * In the write case, we may have added a new entry to
- * the reference cache. This might store a pointer to
- * an inode to be released in this inode. If it is there,
- * clear the pointer and release the inode after unlocking
- * this one.
- */
- xfs_refcache_iunlock(ip, XFS_IOLOCK_EXCL);
- } else {
- ASSERT((locktype == VRWLOCK_READ) ||
- (locktype == VRWLOCK_WRITE_DIRECT));
- xfs_iunlock(ip, XFS_IOLOCK_SHARED);
- }
- return;
-}
-
-
-int
xfs_inode_flush(
xfs_inode_t *ip,
int flags)
{
xfs_mount_t *mp = ip->i_mount;
- xfs_inode_log_item_t *iip = ip->i_itemp;
int error = 0;
if (XFS_FORCED_SHUTDOWN(mp))
@@ -3494,33 +3344,9 @@ xfs_inode_flush(
* Bypass inodes which have already been cleaned by
* the inode flush clustering code inside xfs_iflush
*/
- if ((ip->i_update_core == 0) &&
- ((iip == NULL) || !(iip->ili_format.ilf_fields & XFS_ILOG_ALL)))
+ if (xfs_inode_clean(ip))
return 0;
- if (flags & FLUSH_LOG) {
- if (iip && iip->ili_last_lsn) {
- xlog_t *log = mp->m_log;
- xfs_lsn_t sync_lsn;
- int log_flags = XFS_LOG_FORCE;
-
- spin_lock(&log->l_grant_lock);
- sync_lsn = log->l_last_sync_lsn;
- spin_unlock(&log->l_grant_lock);
-
- if ((XFS_LSN_CMP(iip->ili_last_lsn, sync_lsn) > 0)) {
- if (flags & FLUSH_SYNC)
- log_flags |= XFS_LOG_SYNC;
- error = xfs_log_force(mp, iip->ili_last_lsn, log_flags);
- if (error)
- return error;
- }
-
- if (ip->i_update_core == 0)
- return 0;
- }
- }
-
/*
* We make this non-blocking if the inode is contended,
* return EAGAIN to indicate to the caller that they
@@ -3528,30 +3354,22 @@ xfs_inode_flush(
* blocking on inodes inside another operation right
* now, they get caught later by xfs_sync.
*/
- if (flags & FLUSH_INODE) {
- int flush_flags;
-
- if (flags & FLUSH_SYNC) {
- xfs_ilock(ip, XFS_ILOCK_SHARED);
- xfs_iflock(ip);
- } else if (xfs_ilock_nowait(ip, XFS_ILOCK_SHARED)) {
- if (xfs_ipincount(ip) || !xfs_iflock_nowait(ip)) {
- xfs_iunlock(ip, XFS_ILOCK_SHARED);
- return EAGAIN;
- }
- } else {
+ if (flags & FLUSH_SYNC) {
+ xfs_ilock(ip, XFS_ILOCK_SHARED);
+ xfs_iflock(ip);
+ } else if (xfs_ilock_nowait(ip, XFS_ILOCK_SHARED)) {
+ if (xfs_ipincount(ip) || !xfs_iflock_nowait(ip)) {
+ xfs_iunlock(ip, XFS_ILOCK_SHARED);
return EAGAIN;
}
-
- if (flags & FLUSH_SYNC)
- flush_flags = XFS_IFLUSH_SYNC;
- else
- flush_flags = XFS_IFLUSH_ASYNC;
-
- error = xfs_iflush(ip, flush_flags);
- xfs_iunlock(ip, XFS_ILOCK_SHARED);
+ } else {
+ return EAGAIN;
}
+ error = xfs_iflush(ip, (flags & FLUSH_SYNC) ? XFS_IFLUSH_SYNC
+ : XFS_IFLUSH_ASYNC_NOBLOCK);
+ xfs_iunlock(ip, XFS_ILOCK_SHARED);
+
return error;
}
@@ -3694,12 +3512,12 @@ xfs_finish_reclaim(
* We get the flush lock regardless, though, just to make sure
* we don't free it while it is being flushed.
*/
- if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) {
- if (!locked) {
- xfs_ilock(ip, XFS_ILOCK_EXCL);
- xfs_iflock(ip);
- }
+ if (!locked) {
+ xfs_ilock(ip, XFS_ILOCK_EXCL);
+ xfs_iflock(ip);
+ }
+ if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) {
if (ip->i_update_core ||
((ip->i_itemp != NULL) &&
(ip->i_itemp->ili_format.ilf_fields != 0))) {
@@ -3719,17 +3537,11 @@ xfs_finish_reclaim(
ASSERT(ip->i_update_core == 0);
ASSERT(ip->i_itemp == NULL ||
ip->i_itemp->ili_format.ilf_fields == 0);
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
- } else if (locked) {
- /*
- * We are not interested in doing an iflush if we're
- * in the process of shutting down the filesystem forcibly.
- * So, just reclaim the inode.
- */
- xfs_ifunlock(ip);
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
}
+ xfs_ifunlock(ip);
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+
reclaim:
xfs_ireclaim(ip);
return 0;
@@ -3845,9 +3657,8 @@ xfs_alloc_file_space(
end_dmi_offset = offset+len;
if (end_dmi_offset > ip->i_size)
end_dmi_offset = ip->i_size;
- error = XFS_SEND_DATA(mp, DM_EVENT_WRITE, XFS_ITOV(ip),
- offset, end_dmi_offset - offset,
- 0, NULL);
+ error = XFS_SEND_DATA(mp, DM_EVENT_WRITE, ip, offset,
+ end_dmi_offset - offset, 0, NULL);
if (error)
return error;
}
@@ -3956,8 +3767,8 @@ dmapi_enospc_check:
if (error == ENOSPC && (attr_flags & ATTR_DMI) == 0 &&
DM_EVENT_ENABLED(ip, DM_EVENT_NOSPACE)) {
error = XFS_SEND_NAMESP(mp, DM_EVENT_NOSPACE,
- XFS_ITOV(ip), DM_RIGHT_NULL,
- XFS_ITOV(ip), DM_RIGHT_NULL,
+ ip, DM_RIGHT_NULL,
+ ip, DM_RIGHT_NULL,
NULL, NULL, 0, 0, 0); /* Delay flag intentionally unused */
if (error == 0)
goto retry; /* Maybe DMAPI app. has made space */
@@ -4021,7 +3832,8 @@ xfs_zero_remaining_bytes(
XFS_BUF_READ(bp);
XFS_BUF_SET_ADDR(bp, XFS_FSB_TO_DB(ip, imap.br_startblock));
xfsbdstrat(mp, bp);
- if ((error = xfs_iowait(bp))) {
+ error = xfs_iowait(bp);
+ if (error) {
xfs_ioerror_alert("xfs_zero_remaining_bytes(read)",
mp, bp, XFS_BUF_ADDR(bp));
break;
@@ -4033,7 +3845,8 @@ xfs_zero_remaining_bytes(
XFS_BUF_UNREAD(bp);
XFS_BUF_WRITE(bp);
xfsbdstrat(mp, bp);
- if ((error = xfs_iowait(bp))) {
+ error = xfs_iowait(bp);
+ if (error) {
xfs_ioerror_alert("xfs_zero_remaining_bytes(write)",
mp, bp, XFS_BUF_ADDR(bp));
break;
@@ -4102,7 +3915,7 @@ xfs_free_file_space(
DM_EVENT_ENABLED(ip, DM_EVENT_WRITE)) {
if (end_dmi_offset > ip->i_size)
end_dmi_offset = ip->i_size;
- error = XFS_SEND_DATA(mp, DM_EVENT_WRITE, vp,
+ error = XFS_SEND_DATA(mp, DM_EVENT_WRITE, ip,
offset, end_dmi_offset - offset,
AT_DELAY_FLAG(attr_flags), NULL);
if (error)
diff --git a/fs/xfs/xfs_vnodeops.h b/fs/xfs/xfs_vnodeops.h
index 4e3970f0e5e..24c53923dc2 100644
--- a/fs/xfs/xfs_vnodeops.h
+++ b/fs/xfs/xfs_vnodeops.h
@@ -23,31 +23,32 @@ int xfs_fsync(struct xfs_inode *ip, int flag, xfs_off_t start,
xfs_off_t stop);
int xfs_release(struct xfs_inode *ip);
int xfs_inactive(struct xfs_inode *ip);
-int xfs_lookup(struct xfs_inode *dp, bhv_vname_t *dentry,
- bhv_vnode_t **vpp);
-int xfs_create(struct xfs_inode *dp, bhv_vname_t *dentry, mode_t mode,
- xfs_dev_t rdev, bhv_vnode_t **vpp, struct cred *credp);
-int xfs_remove(struct xfs_inode *dp, bhv_vname_t *dentry);
-int xfs_link(struct xfs_inode *tdp, bhv_vnode_t *src_vp,
- bhv_vname_t *dentry);
-int xfs_mkdir(struct xfs_inode *dp, bhv_vname_t *dentry,
- mode_t mode, bhv_vnode_t **vpp, struct cred *credp);
-int xfs_rmdir(struct xfs_inode *dp, bhv_vname_t *dentry);
+int xfs_lookup(struct xfs_inode *dp, struct xfs_name *name,
+ struct xfs_inode **ipp);
+int xfs_create(struct xfs_inode *dp, struct xfs_name *name, mode_t mode,
+ xfs_dev_t rdev, struct xfs_inode **ipp, struct cred *credp);
+int xfs_remove(struct xfs_inode *dp, struct xfs_name *name,
+ struct xfs_inode *ip);
+int xfs_link(struct xfs_inode *tdp, struct xfs_inode *sip,
+ struct xfs_name *target_name);
+int xfs_mkdir(struct xfs_inode *dp, struct xfs_name *dir_name,
+ mode_t mode, struct xfs_inode **ipp, struct cred *credp);
+int xfs_rmdir(struct xfs_inode *dp, struct xfs_name *name,
+ struct xfs_inode *cdp);
int xfs_readdir(struct xfs_inode *dp, void *dirent, size_t bufsize,
xfs_off_t *offset, filldir_t filldir);
-int xfs_symlink(struct xfs_inode *dp, bhv_vname_t *dentry,
- char *target_path, mode_t mode, bhv_vnode_t **vpp,
+int xfs_symlink(struct xfs_inode *dp, struct xfs_name *link_name,
+ const char *target_path, mode_t mode, struct xfs_inode **ipp,
struct cred *credp);
-int xfs_rwlock(struct xfs_inode *ip, bhv_vrwlock_t locktype);
-void xfs_rwunlock(struct xfs_inode *ip, bhv_vrwlock_t locktype);
int xfs_inode_flush(struct xfs_inode *ip, int flags);
int xfs_set_dmattrs(struct xfs_inode *ip, u_int evmask, u_int16_t state);
int xfs_reclaim(struct xfs_inode *ip);
int xfs_change_file_space(struct xfs_inode *ip, int cmd,
xfs_flock64_t *bf, xfs_off_t offset,
struct cred *credp, int attr_flags);
-int xfs_rename(struct xfs_inode *src_dp, bhv_vname_t *src_vname,
- bhv_vnode_t *target_dir_vp, bhv_vname_t *target_vname);
+int xfs_rename(struct xfs_inode *src_dp, struct xfs_name *src_name,
+ struct xfs_inode *src_ip, struct xfs_inode *target_dp,
+ struct xfs_name *target_name);
int xfs_attr_get(struct xfs_inode *ip, const char *name, char *value,
int *valuelenp, int flags, cred_t *cred);
int xfs_attr_set(struct xfs_inode *dp, const char *name, char *value,