summaryrefslogtreecommitdiffstats
path: root/fs
diff options
context:
space:
mode:
Diffstat (limited to 'fs')
-rw-r--r--fs/Kconfig.binfmt2
-rw-r--r--fs/aio.c30
-rw-r--r--fs/attr.c8
-rw-r--r--fs/binfmt_elf.c12
-rw-r--r--fs/binfmt_elf_fdpic.c12
-rw-r--r--fs/btrfs/disk-io.c2
-rw-r--r--fs/compat.c4
-rw-r--r--fs/dcache.c188
-rw-r--r--fs/debugfs/file.c128
-rw-r--r--fs/devpts/inode.c24
-rw-r--r--fs/dlm/ast.c3
-rw-r--r--fs/dlm/dlm_internal.h16
-rw-r--r--fs/dlm/lock.c541
-rw-r--r--fs/dlm/lock.h7
-rw-r--r--fs/dlm/lockspace.c20
-rw-r--r--fs/dlm/lowcomms.c28
-rw-r--r--fs/dlm/memory.c8
-rw-r--r--fs/dlm/rcom.c61
-rw-r--r--fs/dlm/recover.c73
-rw-r--r--fs/dlm/recoverd.c15
-rw-r--r--fs/dlm/requestqueue.c43
-rw-r--r--fs/ecryptfs/messaging.c2
-rw-r--r--fs/eventpoll.c90
-rw-r--r--fs/exec.c40
-rw-r--r--fs/ext2/balloc.c9
-rw-r--r--fs/ext2/ext2.h8
-rw-r--r--fs/ext2/ialloc.c2
-rw-r--r--fs/ext2/inode.c20
-rw-r--r--fs/ext2/namei.c2
-rw-r--r--fs/ext2/super.c49
-rw-r--r--fs/ext2/xattr.c1
-rw-r--r--fs/ext3/balloc.c5
-rw-r--r--fs/ext3/dir.c167
-rw-r--r--fs/ext3/ext3.h14
-rw-r--r--fs/ext3/hash.c4
-rw-r--r--fs/ext3/ialloc.c20
-rw-r--r--fs/ext3/inode.c32
-rw-r--r--fs/ext3/namei.c2
-rw-r--r--fs/ext3/super.c41
-rw-r--r--fs/ext4/balloc.c4
-rw-r--r--fs/ext4/ext4.h4
-rw-r--r--fs/ext4/ialloc.c4
-rw-r--r--fs/ext4/inode.c34
-rw-r--r--fs/ext4/migrate.c4
-rw-r--r--fs/ext4/namei.c5
-rw-r--r--fs/ext4/super.c44
-rw-r--r--fs/fcntl.c6
-rw-r--r--fs/gfs2/acl.c12
-rw-r--r--fs/gfs2/aops.c18
-rw-r--r--fs/gfs2/bmap.c10
-rw-r--r--fs/gfs2/dir.c2
-rw-r--r--fs/gfs2/file.c12
-rw-r--r--fs/gfs2/glops.c6
-rw-r--r--fs/gfs2/incore.h27
-rw-r--r--fs/gfs2/inode.h3
-rw-r--r--fs/gfs2/lock_dlm.c2
-rw-r--r--fs/gfs2/log.c103
-rw-r--r--fs/gfs2/log.h2
-rw-r--r--fs/gfs2/lops.c520
-rw-r--r--fs/gfs2/lops.h14
-rw-r--r--fs/gfs2/main.c26
-rw-r--r--fs/gfs2/meta_io.c28
-rw-r--r--fs/gfs2/meta_io.h4
-rw-r--r--fs/gfs2/ops_fstype.c8
-rw-r--r--fs/gfs2/quota.c6
-rw-r--r--fs/gfs2/rgrp.c102
-rw-r--r--fs/gfs2/sys.c10
-rw-r--r--fs/gfs2/trace_gfs2.h16
-rw-r--r--fs/gfs2/trans.c44
-rw-r--r--fs/gfs2/util.c3
-rw-r--r--fs/gfs2/util.h3
-rw-r--r--fs/inode.c12
-rw-r--r--fs/ioprio.c18
-rw-r--r--fs/jbd/checkpoint.c23
-rw-r--r--fs/jbd/commit.c21
-rw-r--r--fs/jbd/journal.c206
-rw-r--r--fs/jbd/transaction.c2
-rw-r--r--fs/libfs.c4
-rw-r--r--fs/locks.c2
-rw-r--r--fs/namei.c129
-rw-r--r--fs/nfs/dir.c5
-rw-r--r--fs/nfs/nfs3proc.c3
-rw-r--r--fs/nfs/nfs4proc.c3
-rw-r--r--fs/nfs/proc.c3
-rw-r--r--fs/nfsd/auth.c5
-rw-r--r--fs/nilfs2/namei.c2
-rw-r--r--fs/ocfs2/cluster/tcp.c2
-rw-r--r--fs/open.c18
-rw-r--r--fs/proc/array.c15
-rw-r--r--fs/proc/base.c93
-rw-r--r--fs/proc/inode.c4
-rw-r--r--fs/proc/proc_sysctl.c4
-rw-r--r--fs/proc/root.c2
-rw-r--r--fs/pstore/Kconfig17
-rw-r--r--fs/pstore/Makefile3
-rw-r--r--fs/pstore/ram.c383
-rw-r--r--fs/pstore/ram_core.c532
-rw-r--r--fs/quota/dquot.c32
-rw-r--r--fs/reiserfs/super.c6
-rw-r--r--fs/stat.c61
-rw-r--r--fs/sysfs/dir.c37
-rw-r--r--fs/sysfs/inode.c4
-rw-r--r--fs/ubifs/Kconfig23
-rw-r--r--fs/ubifs/Makefile5
-rw-r--r--fs/ubifs/commit.c14
-rw-r--r--fs/ubifs/debug.c158
-rw-r--r--fs/ubifs/debug.h217
-rw-r--r--fs/ubifs/dir.c10
-rw-r--r--fs/ubifs/file.c4
-rw-r--r--fs/ubifs/gc.c2
-rw-r--r--fs/ubifs/io.c74
-rw-r--r--fs/ubifs/journal.c10
-rw-r--r--fs/ubifs/log.c18
-rw-r--r--fs/ubifs/lprops.c18
-rw-r--r--fs/ubifs/lpt.c78
-rw-r--r--fs/ubifs/lpt_commit.c86
-rw-r--r--fs/ubifs/master.c8
-rw-r--r--fs/ubifs/orphan.c25
-rw-r--r--fs/ubifs/recovery.c43
-rw-r--r--fs/ubifs/replay.c27
-rw-r--r--fs/ubifs/sb.c26
-rw-r--r--fs/ubifs/scan.c14
-rw-r--r--fs/ubifs/super.c31
-rw-r--r--fs/ubifs/tnc.c28
-rw-r--r--fs/ubifs/tnc_commit.c28
-rw-r--r--fs/ubifs/tnc_misc.c36
-rw-r--r--fs/ubifs/ubifs.h26
-rw-r--r--fs/ubifs/xattr.c10
-rw-r--r--fs/udf/namei.c2
-rw-r--r--fs/ufs/super.c5
-rw-r--r--fs/xfs/Makefile2
-rw-r--r--fs/xfs/xfs_ag.h18
-rw-r--r--fs/xfs/xfs_alloc.c585
-rw-r--r--fs/xfs/xfs_alloc.h28
-rw-r--r--fs/xfs/xfs_alloc_btree.c9
-rw-r--r--fs/xfs/xfs_aops.c218
-rw-r--r--fs/xfs/xfs_attr.c25
-rw-r--r--fs/xfs/xfs_attr_leaf.c3
-rw-r--r--fs/xfs/xfs_bmap.c32
-rw-r--r--fs/xfs/xfs_bmap.h3
-rw-r--r--fs/xfs/xfs_bmap_btree.c1
-rw-r--r--fs/xfs/xfs_btree.c1
-rw-r--r--fs/xfs/xfs_buf.c593
-rw-r--r--fs/xfs/xfs_buf.h96
-rw-r--r--fs/xfs/xfs_buf_item.c123
-rw-r--r--fs/xfs/xfs_da_btree.c17
-rw-r--r--fs/xfs/xfs_dfrag.c2
-rw-r--r--fs/xfs/xfs_dir2.c1
-rw-r--r--fs/xfs/xfs_dir2_block.c1
-rw-r--r--fs/xfs/xfs_dir2_data.c1
-rw-r--r--fs/xfs/xfs_dir2_leaf.c1
-rw-r--r--fs/xfs/xfs_dir2_node.c1
-rw-r--r--fs/xfs/xfs_dir2_sf.c1
-rw-r--r--fs/xfs/xfs_discard.c6
-rw-r--r--fs/xfs/xfs_dquot.c91
-rw-r--r--fs/xfs/xfs_dquot.h3
-rw-r--r--fs/xfs/xfs_dquot_item.c162
-rw-r--r--fs/xfs/xfs_error.c1
-rw-r--r--fs/xfs/xfs_export.c1
-rw-r--r--fs/xfs/xfs_extent_busy.c603
-rw-r--r--fs/xfs/xfs_extent_busy.h69
-rw-r--r--fs/xfs/xfs_extfree_item.c59
-rw-r--r--fs/xfs/xfs_file.c327
-rw-r--r--fs/xfs/xfs_fsops.c82
-rw-r--r--fs/xfs/xfs_ialloc.c10
-rw-r--r--fs/xfs/xfs_ialloc.h9
-rw-r--r--fs/xfs/xfs_ialloc_btree.c1
-rw-r--r--fs/xfs/xfs_iget.c24
-rw-r--r--fs/xfs/xfs_inode.c132
-rw-r--r--fs/xfs/xfs_inode.h5
-rw-r--r--fs/xfs/xfs_inode_item.c176
-rw-r--r--fs/xfs/xfs_inode_item.h2
-rw-r--r--fs/xfs/xfs_inum.h5
-rw-r--r--fs/xfs/xfs_ioctl.c2
-rw-r--r--fs/xfs/xfs_ioctl32.c2
-rw-r--r--fs/xfs/xfs_iomap.c59
-rw-r--r--fs/xfs/xfs_iops.c15
-rw-r--r--fs/xfs/xfs_itable.c1
-rw-r--r--fs/xfs/xfs_log.c49
-rw-r--r--fs/xfs/xfs_log.h1
-rw-r--r--fs/xfs/xfs_log_cil.c253
-rw-r--r--fs/xfs/xfs_log_priv.h2
-rw-r--r--fs/xfs/xfs_log_recover.c103
-rw-r--r--fs/xfs/xfs_message.c1
-rw-r--r--fs/xfs/xfs_mount.c77
-rw-r--r--fs/xfs/xfs_mount.h2
-rw-r--r--fs/xfs/xfs_qm.c196
-rw-r--r--fs/xfs/xfs_qm_bhv.c2
-rw-r--r--fs/xfs/xfs_qm_syscalls.c1
-rw-r--r--fs/xfs/xfs_quotaops.c1
-rw-r--r--fs/xfs/xfs_rename.c1
-rw-r--r--fs/xfs/xfs_rtalloc.c10
-rw-r--r--fs/xfs/xfs_rw.c156
-rw-r--r--fs/xfs/xfs_rw.h47
-rw-r--r--fs/xfs/xfs_super.c49
-rw-r--r--fs/xfs/xfs_sync.c281
-rw-r--r--fs/xfs/xfs_trace.c2
-rw-r--r--fs/xfs/xfs_trace.h53
-rw-r--r--fs/xfs/xfs_trans.c7
-rw-r--r--fs/xfs/xfs_trans.h18
-rw-r--r--fs/xfs/xfs_trans_ail.c207
-rw-r--r--fs/xfs/xfs_trans_buf.c126
-rw-r--r--fs/xfs/xfs_trans_dquot.c2
-rw-r--r--fs/xfs/xfs_trans_extfree.c1
-rw-r--r--fs/xfs/xfs_trans_inode.c2
-rw-r--r--fs/xfs/xfs_trans_priv.h12
-rw-r--r--fs/xfs/xfs_types.h5
-rw-r--r--fs/xfs/xfs_utils.c2
-rw-r--r--fs/xfs/xfs_vnodeops.c31
209 files changed, 5940 insertions, 4865 deletions
diff --git a/fs/Kconfig.binfmt b/fs/Kconfig.binfmt
index e95d1b64082..02257420274 100644
--- a/fs/Kconfig.binfmt
+++ b/fs/Kconfig.binfmt
@@ -33,7 +33,7 @@ config ARCH_BINFMT_ELF_RANDOMIZE_PIE
config BINFMT_ELF_FDPIC
bool "Kernel support for FDPIC ELF binaries"
default y
- depends on (FRV || BLACKFIN || (SUPERH32 && !MMU))
+ depends on (FRV || BLACKFIN || (SUPERH32 && !MMU) || C6X)
help
ELF FDPIC binaries are based on ELF, but allow the individual load
segments of a binary to be located in memory independently of each
diff --git a/fs/aio.c b/fs/aio.c
index 67a6db3e1b6..e7f2fad7b4c 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -1456,6 +1456,10 @@ static ssize_t aio_setup_vectored_rw(int type, struct kiocb *kiocb, bool compat)
if (ret < 0)
goto out;
+ ret = rw_verify_area(type, kiocb->ki_filp, &kiocb->ki_pos, ret);
+ if (ret < 0)
+ goto out;
+
kiocb->ki_nr_segs = kiocb->ki_nbytes;
kiocb->ki_cur_seg = 0;
/* ki_nbytes/left now reflect bytes instead of segs */
@@ -1467,11 +1471,17 @@ out:
return ret;
}
-static ssize_t aio_setup_single_vector(struct kiocb *kiocb)
+static ssize_t aio_setup_single_vector(int type, struct file * file, struct kiocb *kiocb)
{
+ int bytes;
+
+ bytes = rw_verify_area(type, file, &kiocb->ki_pos, kiocb->ki_left);
+ if (bytes < 0)
+ return bytes;
+
kiocb->ki_iovec = &kiocb->ki_inline_vec;
kiocb->ki_iovec->iov_base = kiocb->ki_buf;
- kiocb->ki_iovec->iov_len = kiocb->ki_left;
+ kiocb->ki_iovec->iov_len = bytes;
kiocb->ki_nr_segs = 1;
kiocb->ki_cur_seg = 0;
return 0;
@@ -1496,10 +1506,7 @@ static ssize_t aio_setup_iocb(struct kiocb *kiocb, bool compat)
if (unlikely(!access_ok(VERIFY_WRITE, kiocb->ki_buf,
kiocb->ki_left)))
break;
- ret = security_file_permission(file, MAY_READ);
- if (unlikely(ret))
- break;
- ret = aio_setup_single_vector(kiocb);
+ ret = aio_setup_single_vector(READ, file, kiocb);
if (ret)
break;
ret = -EINVAL;
@@ -1514,10 +1521,7 @@ static ssize_t aio_setup_iocb(struct kiocb *kiocb, bool compat)
if (unlikely(!access_ok(VERIFY_READ, kiocb->ki_buf,
kiocb->ki_left)))
break;
- ret = security_file_permission(file, MAY_WRITE);
- if (unlikely(ret))
- break;
- ret = aio_setup_single_vector(kiocb);
+ ret = aio_setup_single_vector(WRITE, file, kiocb);
if (ret)
break;
ret = -EINVAL;
@@ -1528,9 +1532,6 @@ static ssize_t aio_setup_iocb(struct kiocb *kiocb, bool compat)
ret = -EBADF;
if (unlikely(!(file->f_mode & FMODE_READ)))
break;
- ret = security_file_permission(file, MAY_READ);
- if (unlikely(ret))
- break;
ret = aio_setup_vectored_rw(READ, kiocb, compat);
if (ret)
break;
@@ -1542,9 +1543,6 @@ static ssize_t aio_setup_iocb(struct kiocb *kiocb, bool compat)
ret = -EBADF;
if (unlikely(!(file->f_mode & FMODE_WRITE)))
break;
- ret = security_file_permission(file, MAY_WRITE);
- if (unlikely(ret))
- break;
ret = aio_setup_vectored_rw(WRITE, kiocb, compat);
if (ret)
break;
diff --git a/fs/attr.c b/fs/attr.c
index 73f69a6ce9e..584620e5dee 100644
--- a/fs/attr.c
+++ b/fs/attr.c
@@ -47,14 +47,14 @@ int inode_change_ok(const struct inode *inode, struct iattr *attr)
/* Make sure a caller can chown. */
if ((ia_valid & ATTR_UID) &&
- (current_fsuid() != inode->i_uid ||
- attr->ia_uid != inode->i_uid) && !capable(CAP_CHOWN))
+ (!uid_eq(current_fsuid(), inode->i_uid) ||
+ !uid_eq(attr->ia_uid, inode->i_uid)) && !capable(CAP_CHOWN))
return -EPERM;
/* Make sure caller can chgrp. */
if ((ia_valid & ATTR_GID) &&
- (current_fsuid() != inode->i_uid ||
- (!in_group_p(attr->ia_gid) && attr->ia_gid != inode->i_gid)) &&
+ (!uid_eq(current_fsuid(), inode->i_uid) ||
+ (!in_group_p(attr->ia_gid) && !gid_eq(attr->ia_gid, inode->i_gid))) &&
!capable(CAP_CHOWN))
return -EPERM;
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 16f73541707..e658dd134b9 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -226,10 +226,10 @@ create_elf_tables(struct linux_binprm *bprm, struct elfhdr *exec,
NEW_AUX_ENT(AT_BASE, interp_load_addr);
NEW_AUX_ENT(AT_FLAGS, 0);
NEW_AUX_ENT(AT_ENTRY, exec->e_entry);
- NEW_AUX_ENT(AT_UID, cred->uid);
- NEW_AUX_ENT(AT_EUID, cred->euid);
- NEW_AUX_ENT(AT_GID, cred->gid);
- NEW_AUX_ENT(AT_EGID, cred->egid);
+ NEW_AUX_ENT(AT_UID, from_kuid_munged(cred->user_ns, cred->uid));
+ NEW_AUX_ENT(AT_EUID, from_kuid_munged(cred->user_ns, cred->euid));
+ NEW_AUX_ENT(AT_GID, from_kgid_munged(cred->user_ns, cred->gid));
+ NEW_AUX_ENT(AT_EGID, from_kgid_munged(cred->user_ns, cred->egid));
NEW_AUX_ENT(AT_SECURE, security_bprm_secureexec(bprm));
NEW_AUX_ENT(AT_RANDOM, (elf_addr_t)(unsigned long)u_rand_bytes);
NEW_AUX_ENT(AT_EXECFN, bprm->exec);
@@ -1356,8 +1356,8 @@ static int fill_psinfo(struct elf_prpsinfo *psinfo, struct task_struct *p,
psinfo->pr_flag = p->flags;
rcu_read_lock();
cred = __task_cred(p);
- SET_UID(psinfo->pr_uid, cred->uid);
- SET_GID(psinfo->pr_gid, cred->gid);
+ SET_UID(psinfo->pr_uid, from_kuid_munged(cred->user_ns, cred->uid));
+ SET_GID(psinfo->pr_gid, from_kgid_munged(cred->user_ns, cred->gid));
rcu_read_unlock();
strncpy(psinfo->pr_fname, p->comm, sizeof(psinfo->pr_fname));
diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c
index d390a0fffc6..3d77cf81ba3 100644
--- a/fs/binfmt_elf_fdpic.c
+++ b/fs/binfmt_elf_fdpic.c
@@ -627,10 +627,10 @@ static int create_elf_fdpic_tables(struct linux_binprm *bprm,
NEW_AUX_ENT(AT_BASE, interp_params->elfhdr_addr);
NEW_AUX_ENT(AT_FLAGS, 0);
NEW_AUX_ENT(AT_ENTRY, exec_params->entry_addr);
- NEW_AUX_ENT(AT_UID, (elf_addr_t) cred->uid);
- NEW_AUX_ENT(AT_EUID, (elf_addr_t) cred->euid);
- NEW_AUX_ENT(AT_GID, (elf_addr_t) cred->gid);
- NEW_AUX_ENT(AT_EGID, (elf_addr_t) cred->egid);
+ NEW_AUX_ENT(AT_UID, (elf_addr_t) from_kuid_munged(cred->user_ns, cred->uid));
+ NEW_AUX_ENT(AT_EUID, (elf_addr_t) from_kuid_munged(cred->user_ns, cred->euid));
+ NEW_AUX_ENT(AT_GID, (elf_addr_t) from_kgid_munged(cred->user_ns, cred->gid));
+ NEW_AUX_ENT(AT_EGID, (elf_addr_t) from_kgid_munged(cred->user_ns, cred->egid));
NEW_AUX_ENT(AT_SECURE, security_bprm_secureexec(bprm));
NEW_AUX_ENT(AT_EXECFN, bprm->exec);
@@ -1421,8 +1421,8 @@ static int fill_psinfo(struct elf_prpsinfo *psinfo, struct task_struct *p,
psinfo->pr_flag = p->flags;
rcu_read_lock();
cred = __task_cred(p);
- SET_UID(psinfo->pr_uid, cred->uid);
- SET_GID(psinfo->pr_gid, cred->gid);
+ SET_UID(psinfo->pr_uid, from_kuid_munged(cred->user_ns, cred->uid));
+ SET_GID(psinfo->pr_gid, from_kgid_munged(cred->user_ns, cred->gid));
rcu_read_unlock();
strncpy(psinfo->pr_fname, p->comm, sizeof(psinfo->pr_fname));
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index a7ffc88a7db..e1fe74a2ce1 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -2753,7 +2753,7 @@ static int write_dev_flush(struct btrfs_device *device, int wait)
* one reference for us, and we leave it for the
* caller
*/
- device->flush_bio = NULL;;
+ device->flush_bio = NULL;
bio = bio_alloc(GFP_NOFS, 0);
if (!bio)
return -ENOMEM;
diff --git a/fs/compat.c b/fs/compat.c
index f2944ace7a7..0781e619a62 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -144,8 +144,8 @@ static int cp_compat_stat(struct kstat *stat, struct compat_stat __user *ubuf)
tmp.st_nlink = stat->nlink;
if (tmp.st_nlink != stat->nlink)
return -EOVERFLOW;
- SET_UID(tmp.st_uid, stat->uid);
- SET_GID(tmp.st_gid, stat->gid);
+ SET_UID(tmp.st_uid, from_kuid_munged(current_user_ns(), stat->uid));
+ SET_GID(tmp.st_gid, from_kgid_munged(current_user_ns(), stat->gid));
tmp.st_rdev = old_encode_dev(stat->rdev);
if ((u64) stat->size > MAX_NON_LFS)
return -EOVERFLOW;
diff --git a/fs/dcache.c b/fs/dcache.c
index b80531c9177..4435d8b3290 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -153,16 +153,12 @@ int proc_nr_dentry(ctl_table *table, int write, void __user *buffer,
* In contrast, 'ct' and 'tcount' can be from a pathname, and do
* need the careful unaligned handling.
*/
-static inline int dentry_cmp(const unsigned char *cs, size_t scount,
- const unsigned char *ct, size_t tcount)
+static inline int dentry_string_cmp(const unsigned char *cs, const unsigned char *ct, unsigned tcount)
{
unsigned long a,b,mask;
- if (unlikely(scount != tcount))
- return 1;
-
for (;;) {
- a = load_unaligned_zeropad(cs);
+ a = *(unsigned long *)cs;
b = load_unaligned_zeropad(ct);
if (tcount < sizeof(unsigned long))
break;
@@ -180,12 +176,8 @@ static inline int dentry_cmp(const unsigned char *cs, size_t scount,
#else
-static inline int dentry_cmp(const unsigned char *cs, size_t scount,
- const unsigned char *ct, size_t tcount)
+static inline int dentry_string_cmp(const unsigned char *cs, const unsigned char *ct, unsigned tcount)
{
- if (scount != tcount)
- return 1;
-
do {
if (*cs != *ct)
return 1;
@@ -198,6 +190,30 @@ static inline int dentry_cmp(const unsigned char *cs, size_t scount,
#endif
+static inline int dentry_cmp(const struct dentry *dentry, const unsigned char *ct, unsigned tcount)
+{
+ const unsigned char *cs;
+ /*
+ * Be careful about RCU walk racing with rename:
+ * use ACCESS_ONCE to fetch the name pointer.
+ *
+ * NOTE! Even if a rename will mean that the length
+ * was not loaded atomically, we don't care. The
+ * RCU walk will check the sequence count eventually,
+ * and catch it. And we won't overrun the buffer,
+ * because we're reading the name pointer atomically,
+ * and a dentry name is guaranteed to be properly
+ * terminated with a NUL byte.
+ *
+ * End result: even if 'len' is wrong, we'll exit
+ * early because the data cannot match (there can
+ * be no NUL in the ct/tcount data)
+ */
+ cs = ACCESS_ONCE(dentry->d_name.name);
+ smp_read_barrier_depends();
+ return dentry_string_cmp(cs, ct, tcount);
+}
+
static void __d_free(struct rcu_head *head)
{
struct dentry *dentry = container_of(head, struct dentry, d_u.d_rcu);
@@ -1258,6 +1274,13 @@ struct dentry *__d_alloc(struct super_block *sb, const struct qstr *name)
if (!dentry)
return NULL;
+ /*
+ * We guarantee that the inline name is always NUL-terminated.
+ * This way the memcpy() done by the name switching in rename
+ * will still always have a NUL at the end, even if we might
+ * be overwriting an internal NUL character
+ */
+ dentry->d_iname[DNAME_INLINE_LEN-1] = 0;
if (name->len > DNAME_INLINE_LEN-1) {
dname = kmalloc(name->len + 1, GFP_KERNEL);
if (!dname) {
@@ -1267,13 +1290,16 @@ struct dentry *__d_alloc(struct super_block *sb, const struct qstr *name)
} else {
dname = dentry->d_iname;
}
- dentry->d_name.name = dname;
dentry->d_name.len = name->len;
dentry->d_name.hash = name->hash;
memcpy(dname, name->name, name->len);
dname[name->len] = 0;
+ /* Make sure we always see the terminating NUL character */
+ smp_wmb();
+ dentry->d_name.name = dname;
+
dentry->d_count = 1;
dentry->d_flags = 0;
spin_lock_init(&dentry->d_lock);
@@ -1439,18 +1465,18 @@ static struct dentry *__d_instantiate_unique(struct dentry *entry,
}
list_for_each_entry(alias, &inode->i_dentry, d_alias) {
- struct qstr *qstr = &alias->d_name;
-
/*
* Don't need alias->d_lock here, because aliases with
* d_parent == entry->d_parent are not subject to name or
* parent changes, because the parent inode i_mutex is held.
*/
- if (qstr->hash != hash)
+ if (alias->d_name.hash != hash)
continue;
if (alias->d_parent != entry->d_parent)
continue;
- if (dentry_cmp(qstr->name, qstr->len, name, len))
+ if (alias->d_name.len != len)
+ continue;
+ if (dentry_cmp(alias, name, len))
continue;
__dget(alias);
return alias;
@@ -1489,7 +1515,7 @@ struct dentry *d_make_root(struct inode *root_inode)
struct dentry *res = NULL;
if (root_inode) {
- static const struct qstr name = { .name = "/", .len = 1 };
+ static const struct qstr name = QSTR_INIT("/", 1);
res = __d_alloc(root_inode->i_sb, &name);
if (res)
@@ -1727,6 +1753,48 @@ err_out:
}
EXPORT_SYMBOL(d_add_ci);
+/*
+ * Do the slow-case of the dentry name compare.
+ *
+ * Unlike the dentry_cmp() function, we need to atomically
+ * load the name, length and inode information, so that the
+ * filesystem can rely on them, and can use the 'name' and
+ * 'len' information without worrying about walking off the
+ * end of memory etc.
+ *
+ * Thus the read_seqcount_retry() and the "duplicate" info
+ * in arguments (the low-level filesystem should not look
+ * at the dentry inode or name contents directly, since
+ * rename can change them while we're in RCU mode).
+ */
+enum slow_d_compare {
+ D_COMP_OK,
+ D_COMP_NOMATCH,
+ D_COMP_SEQRETRY,
+};
+
+static noinline enum slow_d_compare slow_dentry_cmp(
+ const struct dentry *parent,
+ struct inode *inode,
+ struct dentry *dentry,
+ unsigned int seq,
+ const struct qstr *name)
+{
+ int tlen = dentry->d_name.len;
+ const char *tname = dentry->d_name.name;
+ struct inode *i = dentry->d_inode;
+
+ if (read_seqcount_retry(&dentry->d_seq, seq)) {
+ cpu_relax();
+ return D_COMP_SEQRETRY;
+ }
+ if (parent->d_op->d_compare(parent, inode,
+ dentry, i,
+ tlen, tname, name))
+ return D_COMP_NOMATCH;
+ return D_COMP_OK;
+}
+
/**
* __d_lookup_rcu - search for a dentry (racy, store-free)
* @parent: parent dentry
@@ -1753,15 +1821,17 @@ EXPORT_SYMBOL(d_add_ci);
* the returned dentry, so long as its parent's seqlock is checked after the
* child is looked up. Thus, an interlocking stepping of sequence lock checks
* is formed, giving integrity down the path walk.
+ *
+ * NOTE! The caller *has* to check the resulting dentry against the sequence
+ * number we've returned before using any of the resulting dentry state!
*/
struct dentry *__d_lookup_rcu(const struct dentry *parent,
const struct qstr *name,
- unsigned *seqp, struct inode **inode)
+ unsigned *seqp, struct inode *inode)
{
- unsigned int len = name->len;
- unsigned int hash = name->hash;
+ u64 hashlen = name->hash_len;
const unsigned char *str = name->name;
- struct hlist_bl_head *b = d_hash(parent, hash);
+ struct hlist_bl_head *b = d_hash(parent, hashlen_hash(hashlen));
struct hlist_bl_node *node;
struct dentry *dentry;
@@ -1787,49 +1857,47 @@ struct dentry *__d_lookup_rcu(const struct dentry *parent,
*/
hlist_bl_for_each_entry_rcu(dentry, node, b, d_hash) {
unsigned seq;
- struct inode *i;
- const char *tname;
- int tlen;
-
- if (dentry->d_name.hash != hash)
- continue;
seqretry:
- seq = read_seqcount_begin(&dentry->d_seq);
+ /*
+ * The dentry sequence count protects us from concurrent
+ * renames, and thus protects inode, parent and name fields.
+ *
+ * The caller must perform a seqcount check in order
+ * to do anything useful with the returned dentry,
+ * including using the 'd_inode' pointer.
+ *
+ * NOTE! We do a "raw" seqcount_begin here. That means that
+ * we don't wait for the sequence count to stabilize if it
+ * is in the middle of a sequence change. If we do the slow
+ * dentry compare, we will do seqretries until it is stable,
+ * and if we end up with a successful lookup, we actually
+ * want to exit RCU lookup anyway.
+ */
+ seq = raw_seqcount_begin(&dentry->d_seq);
if (dentry->d_parent != parent)
continue;
if (d_unhashed(dentry))
continue;
- tlen = dentry->d_name.len;
- tname = dentry->d_name.name;
- i = dentry->d_inode;
- prefetch(tname);
- /*
- * This seqcount check is required to ensure name and
- * len are loaded atomically, so as not to walk off the
- * edge of memory when walking. If we could load this
- * atomically some other way, we could drop this check.
- */
- if (read_seqcount_retry(&dentry->d_seq, seq))
- goto seqretry;
+ *seqp = seq;
+
if (unlikely(parent->d_flags & DCACHE_OP_COMPARE)) {
- if (parent->d_op->d_compare(parent, *inode,
- dentry, i,
- tlen, tname, name))
+ if (dentry->d_name.hash != hashlen_hash(hashlen))
continue;
- } else {
- if (dentry_cmp(tname, tlen, str, len))
+ switch (slow_dentry_cmp(parent, inode, dentry, seq, name)) {
+ case D_COMP_OK:
+ return dentry;
+ case D_COMP_NOMATCH:
continue;
+ default:
+ goto seqretry;
+ }
}
- /*
- * No extra seqcount check is required after the name
- * compare. The caller must perform a seqcount check in
- * order to do anything useful with the returned dentry
- * anyway.
- */
- *seqp = seq;
- *inode = i;
- return dentry;
+
+ if (dentry->d_name.hash_len != hashlen)
+ continue;
+ if (!dentry_cmp(dentry, str, hashlen_len(hashlen)))
+ return dentry;
}
return NULL;
}
@@ -1908,8 +1976,6 @@ struct dentry *__d_lookup(struct dentry *parent, struct qstr *name)
rcu_read_lock();
hlist_bl_for_each_entry_rcu(dentry, node, b, d_hash) {
- const char *tname;
- int tlen;
if (dentry->d_name.hash != hash)
continue;
@@ -1924,15 +1990,17 @@ struct dentry *__d_lookup(struct dentry *parent, struct qstr *name)
* It is safe to compare names since d_move() cannot
* change the qstr (protected by d_lock).
*/
- tlen = dentry->d_name.len;
- tname = dentry->d_name.name;
if (parent->d_flags & DCACHE_OP_COMPARE) {
+ int tlen = dentry->d_name.len;
+ const char *tname = dentry->d_name.name;
if (parent->d_op->d_compare(parent, parent->d_inode,
dentry, dentry->d_inode,
tlen, tname, name))
goto next;
} else {
- if (dentry_cmp(tname, tlen, str, len))
+ if (dentry->d_name.len != len)
+ goto next;
+ if (dentry_cmp(dentry, str, len))
goto next;
}
@@ -3025,6 +3093,7 @@ static void __init dcache_init_early(void)
HASH_EARLY,
&d_hash_shift,
&d_hash_mask,
+ 0,
0);
for (loop = 0; loop < (1U << d_hash_shift); loop++)
@@ -3055,6 +3124,7 @@ static void __init dcache_init(void)
0,
&d_hash_shift,
&d_hash_mask,
+ 0,
0);
for (loop = 0; loop < (1U << d_hash_shift); loop++)
diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c
index 5dfafdd1dbd..2340f6978d6 100644
--- a/fs/debugfs/file.c
+++ b/fs/debugfs/file.c
@@ -20,6 +20,7 @@
#include <linux/namei.h>
#include <linux/debugfs.h>
#include <linux/io.h>
+#include <linux/slab.h>
static ssize_t default_read_file(struct file *file, char __user *buf,
size_t count, loff_t *ppos)
@@ -520,6 +521,133 @@ struct dentry *debugfs_create_blob(const char *name, umode_t mode,
}
EXPORT_SYMBOL_GPL(debugfs_create_blob);
+struct array_data {
+ void *array;
+ u32 elements;
+};
+
+static int u32_array_open(struct inode *inode, struct file *file)
+{
+ file->private_data = NULL;
+ return nonseekable_open(inode, file);
+}
+
+static size_t format_array(char *buf, size_t bufsize, const char *fmt,
+ u32 *array, u32 array_size)
+{
+ size_t ret = 0;
+ u32 i;
+
+ for (i = 0; i < array_size; i++) {
+ size_t len;
+
+ len = snprintf(buf, bufsize, fmt, array[i]);
+ len++; /* ' ' or '\n' */
+ ret += len;
+
+ if (buf) {
+ buf += len;
+ bufsize -= len;
+ buf[-1] = (i == array_size-1) ? '\n' : ' ';
+ }
+ }
+
+ ret++; /* \0 */
+ if (buf)
+ *buf = '\0';
+
+ return ret;
+}
+
+static char *format_array_alloc(const char *fmt, u32 *array,
+ u32 array_size)
+{
+ size_t len = format_array(NULL, 0, fmt, array, array_size);
+ char *ret;
+
+ ret = kmalloc(len, GFP_KERNEL);
+ if (ret == NULL)
+ return NULL;
+
+ format_array(ret, len, fmt, array, array_size);
+ return ret;
+}
+
+static ssize_t u32_array_read(struct file *file, char __user *buf, size_t len,
+ loff_t *ppos)
+{
+ struct inode *inode = file->f_path.dentry->d_inode;
+ struct array_data *data = inode->i_private;
+ size_t size;
+
+ if (*ppos == 0) {
+ if (file->private_data) {
+ kfree(file->private_data);
+ file->private_data = NULL;
+ }
+
+ file->private_data = format_array_alloc("%u", data->array,
+ data->elements);
+ }
+
+ size = 0;
+ if (file->private_data)
+ size = strlen(file->private_data);
+
+ return simple_read_from_buffer(buf, len, ppos,
+ file->private_data, size);
+}
+
+static int u32_array_release(struct inode *inode, struct file *file)
+{
+ kfree(file->private_data);
+
+ return 0;
+}
+
+static const struct file_operations u32_array_fops = {
+ .owner = THIS_MODULE,
+ .open = u32_array_open,
+ .release = u32_array_release,
+ .read = u32_array_read,
+ .llseek = no_llseek,
+};
+
+/**
+ * debugfs_create_u32_array - create a debugfs file that is used to read a u32
+ * array.
+ * @name: a pointer to a string containing the name of the file to create.
+ * @mode: the permission that the file should have.
+ * @parent: a pointer to the parent dentry for this file. This should be a
+ * directory dentry if set. If this parameter is %NULL, then the
+ * file will be created in the root of the debugfs filesystem.
+ * @array: u32 array that provides data.
+ * @elements: total number of elements in the array.
+ *
+ * This function creates a file in debugfs with the given name that exports
+ * @array as data. If @mode permits it, the file can be read from.
+ * Writing is not supported. Seeking within the file is also not supported.
+ * Once the array is created, its size cannot be changed.
+ *
+ * The function returns a pointer to dentry on success, or %NULL if allocating
+ * its private data fails. If debugfs is not enabled, -%ENODEV is returned.
+ */
+struct dentry *debugfs_create_u32_array(const char *name, umode_t mode,
+ struct dentry *parent,
+ u32 *array, u32 elements)
+{
+ struct array_data *data = kmalloc(sizeof(*data), GFP_KERNEL);
+
+ if (data == NULL)
+ return NULL;
+
+ data->array = array;
+ data->elements = elements;
+
+ return debugfs_create_file(name, mode, parent, data, &u32_array_fops);
+}
+EXPORT_SYMBOL_GPL(debugfs_create_u32_array);
+
#ifdef CONFIG_HAS_IOMEM
/*
diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c
index 10f5e0b484d..979c1e309c7 100644
--- a/fs/devpts/inode.c
+++ b/fs/devpts/inode.c
@@ -98,8 +98,8 @@ static struct vfsmount *devpts_mnt;
struct pts_mount_opts {
int setuid;
int setgid;
- uid_t uid;
- gid_t gid;
+ kuid_t uid;
+ kgid_t gid;
umode_t mode;
umode_t ptmxmode;
int newinstance;
@@ -158,11 +158,13 @@ static inline struct super_block *pts_sb_from_inode(struct inode *inode)
static int parse_mount_options(char *data, int op, struct pts_mount_opts *opts)
{
char *p;
+ kuid_t uid;
+ kgid_t gid;
opts->setuid = 0;
opts->setgid = 0;
- opts->uid = 0;
- opts->gid = 0;
+ opts->uid = GLOBAL_ROOT_UID;
+ opts->gid = GLOBAL_ROOT_GID;
opts->mode = DEVPTS_DEFAULT_MODE;
opts->ptmxmode = DEVPTS_DEFAULT_PTMX_MODE;
opts->max = NR_UNIX98_PTY_MAX;
@@ -184,13 +186,19 @@ static int parse_mount_options(char *data, int op, struct pts_mount_opts *opts)
case Opt_uid:
if (match_int(&args[0], &option))
return -EINVAL;
- opts->uid = option;
+ uid = make_kuid(current_user_ns(), option);
+ if (!uid_valid(uid))
+ return -EINVAL;
+ opts->uid = uid;
opts->setuid = 1;
break;
case Opt_gid:
if (match_int(&args[0], &option))
return -EINVAL;
- opts->gid = option;
+ gid = make_kgid(current_user_ns(), option);
+ if (!gid_valid(gid))
+ return -EINVAL;
+ opts->gid = gid;
opts->setgid = 1;
break;
case Opt_mode:
@@ -315,9 +323,9 @@ static int devpts_show_options(struct seq_file *seq, struct dentry *root)
struct pts_mount_opts *opts = &fsi->mount_opts;
if (opts->setuid)
- seq_printf(seq, ",uid=%u", opts->uid);
+ seq_printf(seq, ",uid=%u", from_kuid_munged(&init_user_ns, opts->uid));
if (opts->setgid)
- seq_printf(seq, ",gid=%u", opts->gid);
+ seq_printf(seq, ",gid=%u", from_kgid_munged(&init_user_ns, opts->gid));
seq_printf(seq, ",mode=%03o", opts->mode);
#ifdef CONFIG_DEVPTS_MULTIPLE_INSTANCES
seq_printf(seq, ",ptmxmode=%03o", opts->ptmxmode);
diff --git a/fs/dlm/ast.c b/fs/dlm/ast.c
index 90e5997262e..63dc19c54d5 100644
--- a/fs/dlm/ast.c
+++ b/fs/dlm/ast.c
@@ -310,6 +310,7 @@ void dlm_callback_resume(struct dlm_ls *ls)
}
mutex_unlock(&ls->ls_cb_mutex);
- log_debug(ls, "dlm_callback_resume %d", count);
+ if (count)
+ log_debug(ls, "dlm_callback_resume %d", count);
}
diff --git a/fs/dlm/dlm_internal.h b/fs/dlm/dlm_internal.h
index 3a564d197e9..bc342f7ac3a 100644
--- a/fs/dlm/dlm_internal.h
+++ b/fs/dlm/dlm_internal.h
@@ -38,6 +38,7 @@
#include <linux/miscdevice.h>
#include <linux/mutex.h>
#include <linux/idr.h>
+#include <linux/ratelimit.h>
#include <asm/uaccess.h>
#include <linux/dlm.h>
@@ -74,6 +75,13 @@ do { \
(ls)->ls_name , ##args); \
} while (0)
+#define log_limit(ls, fmt, args...) \
+do { \
+ if (dlm_config.ci_log_debug) \
+ printk_ratelimited(KERN_DEBUG "dlm: %s: " fmt "\n", \
+ (ls)->ls_name , ##args); \
+} while (0)
+
#define DLM_ASSERT(x, do) \
{ \
if (!(x)) \
@@ -263,6 +271,8 @@ struct dlm_lkb {
ktime_t lkb_last_cast_time; /* for debugging */
ktime_t lkb_last_bast_time; /* for debugging */
+ uint64_t lkb_recover_seq; /* from ls_recover_seq */
+
char *lkb_lvbptr;
struct dlm_lksb *lkb_lksb; /* caller's status block */
void (*lkb_astfn) (void *astparam);
@@ -317,7 +327,7 @@ enum rsb_flags {
RSB_NEW_MASTER,
RSB_NEW_MASTER2,
RSB_RECOVER_CONVERT,
- RSB_LOCKS_PURGED,
+ RSB_RECOVER_GRANT,
};
static inline void rsb_set_flag(struct dlm_rsb *r, enum rsb_flags flag)
@@ -563,6 +573,7 @@ struct dlm_ls {
struct mutex ls_requestqueue_mutex;
struct dlm_rcom *ls_recover_buf;
int ls_recover_nodeid; /* for debugging */
+ unsigned int ls_recover_locks_in; /* for log info */
uint64_t ls_rcom_seq;
spinlock_t ls_rcom_spin;
struct list_head ls_recover_list;
@@ -589,6 +600,7 @@ struct dlm_ls {
#define LSFL_UEVENT_WAIT 5
#define LSFL_TIMEWARN 6
#define LSFL_CB_DELAY 7
+#define LSFL_NODIR 8
/* much of this is just saving user space pointers associated with the
lock that we pass back to the user lib with an ast */
@@ -636,7 +648,7 @@ static inline int dlm_recovery_stopped(struct dlm_ls *ls)
static inline int dlm_no_directory(struct dlm_ls *ls)
{
- return (ls->ls_exflags & DLM_LSFL_NODIR) ? 1 : 0;
+ return test_bit(LSFL_NODIR, &ls->ls_flags);
}
int dlm_netlink_init(void);
diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c
index 4c58d4a3adc..bdafb65a523 100644
--- a/fs/dlm/lock.c
+++ b/fs/dlm/lock.c
@@ -160,11 +160,12 @@ static const int __quecvt_compat_matrix[8][8] = {
void dlm_print_lkb(struct dlm_lkb *lkb)
{
- printk(KERN_ERR "lkb: nodeid %d id %x remid %x exflags %x flags %x\n"
- " status %d rqmode %d grmode %d wait_type %d\n",
+ printk(KERN_ERR "lkb: nodeid %d id %x remid %x exflags %x flags %x "
+ "sts %d rq %d gr %d wait_type %d wait_nodeid %d seq %llu\n",
lkb->lkb_nodeid, lkb->lkb_id, lkb->lkb_remid, lkb->lkb_exflags,
lkb->lkb_flags, lkb->lkb_status, lkb->lkb_rqmode,
- lkb->lkb_grmode, lkb->lkb_wait_type);
+ lkb->lkb_grmode, lkb->lkb_wait_type, lkb->lkb_wait_nodeid,
+ (unsigned long long)lkb->lkb_recover_seq);
}
static void dlm_print_rsb(struct dlm_rsb *r)
@@ -251,8 +252,6 @@ static inline int is_process_copy(struct dlm_lkb *lkb)
static inline int is_master_copy(struct dlm_lkb *lkb)
{
- if (lkb->lkb_flags & DLM_IFL_MSTCPY)
- DLM_ASSERT(lkb->lkb_nodeid, dlm_print_lkb(lkb););
return (lkb->lkb_flags & DLM_IFL_MSTCPY) ? 1 : 0;
}
@@ -479,6 +478,9 @@ static int _search_rsb(struct dlm_ls *ls, char *name, int len, int b,
kref_get(&r->res_ref);
goto out;
}
+ if (error == -ENOTBLK)
+ goto out;
+
error = dlm_search_rsb_tree(&ls->ls_rsbtbl[b].toss, name, len, flags, &r);
if (error)
goto out;
@@ -586,6 +588,23 @@ static int find_rsb(struct dlm_ls *ls, char *name, int namelen,
return error;
}
+static void dlm_dump_rsb_hash(struct dlm_ls *ls, uint32_t hash)
+{
+ struct rb_node *n;
+ struct dlm_rsb *r;
+ int i;
+
+ for (i = 0; i < ls->ls_rsbtbl_size; i++) {
+ spin_lock(&ls->ls_rsbtbl[i].lock);
+ for (n = rb_first(&ls->ls_rsbtbl[i].keep); n; n = rb_next(n)) {
+ r = rb_entry(n, struct dlm_rsb, res_hashnode);
+ if (r->res_hash == hash)
+ dlm_dump_rsb(r);
+ }
+ spin_unlock(&ls->ls_rsbtbl[i].lock);
+ }
+}
+
/* This is only called to add a reference when the code already holds
a valid reference to the rsb, so there's no need for locking. */
@@ -1064,8 +1083,9 @@ static int _remove_from_waiters(struct dlm_lkb *lkb, int mstype,
goto out_del;
}
- log_error(ls, "remwait error %x reply %d flags %x no wait_type",
- lkb->lkb_id, mstype, lkb->lkb_flags);
+ log_error(ls, "remwait error %x remote %d %x msg %d flags %x no wait",
+ lkb->lkb_id, ms ? ms->m_header.h_nodeid : 0, lkb->lkb_remid,
+ mstype, lkb->lkb_flags);
return -1;
out_del:
@@ -1498,13 +1518,13 @@ static void _grant_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
}
lkb->lkb_rqmode = DLM_LOCK_IV;
+ lkb->lkb_highbast = 0;
}
static void grant_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
{
set_lvb_lock(r, lkb);
_grant_lock(r, lkb);
- lkb->lkb_highbast = 0;
}
static void grant_lock_pc(struct dlm_rsb *r, struct dlm_lkb *lkb,
@@ -1866,7 +1886,8 @@ static int can_be_granted(struct dlm_rsb *r, struct dlm_lkb *lkb, int now,
/* Returns the highest requested mode of all blocked conversions; sets
cw if there's a blocked conversion to DLM_LOCK_CW. */
-static int grant_pending_convert(struct dlm_rsb *r, int high, int *cw)
+static int grant_pending_convert(struct dlm_rsb *r, int high, int *cw,
+ unsigned int *count)
{
struct dlm_lkb *lkb, *s;
int hi, demoted, quit, grant_restart, demote_restart;
@@ -1885,6 +1906,8 @@ static int grant_pending_convert(struct dlm_rsb *r, int high, int *cw)
if (can_be_granted(r, lkb, 0, &deadlk)) {
grant_lock_pending(r, lkb);
grant_restart = 1;
+ if (count)
+ (*count)++;
continue;
}
@@ -1918,14 +1941,17 @@ static int grant_pending_convert(struct dlm_rsb *r, int high, int *cw)
return max_t(int, high, hi);
}
-static int grant_pending_wait(struct dlm_rsb *r, int high, int *cw)
+static int grant_pending_wait(struct dlm_rsb *r, int high, int *cw,
+ unsigned int *count)
{
struct dlm_lkb *lkb, *s;
list_for_each_entry_safe(lkb, s, &r->res_waitqueue, lkb_statequeue) {
- if (can_be_granted(r, lkb, 0, NULL))
+ if (can_be_granted(r, lkb, 0, NULL)) {
grant_lock_pending(r, lkb);
- else {
+ if (count)
+ (*count)++;
+ } else {
high = max_t(int, lkb->lkb_rqmode, high);
if (lkb->lkb_rqmode == DLM_LOCK_CW)
*cw = 1;
@@ -1954,16 +1980,20 @@ static int lock_requires_bast(struct dlm_lkb *gr, int high, int cw)
return 0;
}
-static void grant_pending_locks(struct dlm_rsb *r)
+static void grant_pending_locks(struct dlm_rsb *r, unsigned int *count)
{
struct dlm_lkb *lkb, *s;
int high = DLM_LOCK_IV;
int cw = 0;
- DLM_ASSERT(is_master(r), dlm_dump_rsb(r););
+ if (!is_master(r)) {
+ log_print("grant_pending_locks r nodeid %d", r->res_nodeid);
+ dlm_dump_rsb(r);
+ return;
+ }
- high = grant_pending_convert(r, high, &cw);
- high = grant_pending_wait(r, high, &cw);
+ high = grant_pending_convert(r, high, &cw, count);
+ high = grant_pending_wait(r, high, &cw, count);
if (high == DLM_LOCK_IV)
return;
@@ -2499,7 +2529,7 @@ static int do_convert(struct dlm_rsb *r, struct dlm_lkb *lkb)
before we try again to grant this one. */
if (is_demoted(lkb)) {
- grant_pending_convert(r, DLM_LOCK_IV, NULL);
+ grant_pending_convert(r, DLM_LOCK_IV, NULL, NULL);
if (_can_be_granted(r, lkb, 1)) {
grant_lock(r, lkb);
queue_cast(r, lkb, 0);
@@ -2527,7 +2557,7 @@ static void do_convert_effects(struct dlm_rsb *r, struct dlm_lkb *lkb,
{
switch (error) {
case 0:
- grant_pending_locks(r);
+ grant_pending_locks(r, NULL);
/* grant_pending_locks also sends basts */
break;
case -EAGAIN:
@@ -2550,7 +2580,7 @@ static int do_unlock(struct dlm_rsb *r, struct dlm_lkb *lkb)
static void do_unlock_effects(struct dlm_rsb *r, struct dlm_lkb *lkb,
int error)
{
- grant_pending_locks(r);
+ grant_pending_locks(r, NULL);
}
/* returns: 0 did nothing, -DLM_ECANCEL canceled lock */
@@ -2571,7 +2601,7 @@ static void do_cancel_effects(struct dlm_rsb *r, struct dlm_lkb *lkb,
int error)
{
if (error)
- grant_pending_locks(r);
+ grant_pending_locks(r, NULL);
}
/*
@@ -3372,7 +3402,7 @@ static int validate_message(struct dlm_lkb *lkb, struct dlm_message *ms)
return error;
}
-static void receive_request(struct dlm_ls *ls, struct dlm_message *ms)
+static int receive_request(struct dlm_ls *ls, struct dlm_message *ms)
{
struct dlm_lkb *lkb;
struct dlm_rsb *r;
@@ -3412,14 +3442,15 @@ static void receive_request(struct dlm_ls *ls, struct dlm_message *ms)
error = 0;
if (error)
dlm_put_lkb(lkb);
- return;
+ return 0;
fail:
setup_stub_lkb(ls, ms);
send_request_reply(&ls->ls_stub_rsb, &ls->ls_stub_lkb, error);
+ return error;
}
-static void receive_convert(struct dlm_ls *ls, struct dlm_message *ms)
+static int receive_convert(struct dlm_ls *ls, struct dlm_message *ms)
{
struct dlm_lkb *lkb;
struct dlm_rsb *r;
@@ -3429,6 +3460,15 @@ static void receive_convert(struct dlm_ls *ls, struct dlm_message *ms)
if (error)
goto fail;
+ if (lkb->lkb_remid != ms->m_lkid) {
+ log_error(ls, "receive_convert %x remid %x recover_seq %llu "
+ "remote %d %x", lkb->lkb_id, lkb->lkb_remid,
+ (unsigned long long)lkb->lkb_recover_seq,
+ ms->m_header.h_nodeid, ms->m_lkid);
+ error = -ENOENT;
+ goto fail;
+ }
+
r = lkb->lkb_resource;
hold_rsb(r);
@@ -3456,14 +3496,15 @@ static void receive_convert(struct dlm_ls *ls, struct dlm_message *ms)
unlock_rsb(r);
put_rsb(r);
dlm_put_lkb(lkb);
- return;
+ return 0;
fail:
setup_stub_lkb(ls, ms);
send_convert_reply(&ls->ls_stub_rsb, &ls->ls_stub_lkb, error);
+ return error;
}
-static void receive_unlock(struct dlm_ls *ls, struct dlm_message *ms)
+static int receive_unlock(struct dlm_ls *ls, struct dlm_message *ms)
{
struct dlm_lkb *lkb;
struct dlm_rsb *r;
@@ -3473,6 +3514,14 @@ static void receive_unlock(struct dlm_ls *ls, struct dlm_message *ms)
if (error)
goto fail;
+ if (lkb->lkb_remid != ms->m_lkid) {
+ log_error(ls, "receive_unlock %x remid %x remote %d %x",
+ lkb->lkb_id, lkb->lkb_remid,
+ ms->m_header.h_nodeid, ms->m_lkid);
+ error = -ENOENT;
+ goto fail;
+ }
+
r = lkb->lkb_resource;
hold_rsb(r);
@@ -3497,14 +3546,15 @@ static void receive_unlock(struct dlm_ls *ls, struct dlm_message *ms)
unlock_rsb(r);
put_rsb(r);
dlm_put_lkb(lkb);
- return;
+ return 0;
fail:
setup_stub_lkb(ls, ms);
send_unlock_reply(&ls->ls_stub_rsb, &ls->ls_stub_lkb, error);
+ return error;
}
-static void receive_cancel(struct dlm_ls *ls, struct dlm_message *ms)
+static int receive_cancel(struct dlm_ls *ls, struct dlm_message *ms)
{
struct dlm_lkb *lkb;
struct dlm_rsb *r;
@@ -3532,25 +3582,23 @@ static void receive_cancel(struct dlm_ls *ls, struct dlm_message *ms)
unlock_rsb(r);
put_rsb(r);
dlm_put_lkb(lkb);
- return;
+ return 0;
fail:
setup_stub_lkb(ls, ms);
send_cancel_reply(&ls->ls_stub_rsb, &ls->ls_stub_lkb, error);
+ return error;
}
-static void receive_grant(struct dlm_ls *ls, struct dlm_message *ms)
+static int receive_grant(struct dlm_ls *ls, struct dlm_message *ms)
{
struct dlm_lkb *lkb;
struct dlm_rsb *r;
int error;
error = find_lkb(ls, ms->m_remid, &lkb);
- if (error) {
- log_debug(ls, "receive_grant from %d no lkb %x",
- ms->m_header.h_nodeid, ms->m_remid);
- return;
- }
+ if (error)
+ return error;
r = lkb->lkb_resource;
@@ -3570,20 +3618,18 @@ static void receive_grant(struct dlm_ls *ls, struct dlm_message *ms)
unlock_rsb(r);
put_rsb(r);
dlm_put_lkb(lkb);
+ return 0;
}
-static void receive_bast(struct dlm_ls *ls, struct dlm_message *ms)
+static int receive_bast(struct dlm_ls *ls, struct dlm_message *ms)
{
struct dlm_lkb *lkb;
struct dlm_rsb *r;
int error;
error = find_lkb(ls, ms->m_remid, &lkb);
- if (error) {
- log_debug(ls, "receive_bast from %d no lkb %x",
- ms->m_header.h_nodeid, ms->m_remid);
- return;
- }
+ if (error)
+ return error;
r = lkb->lkb_resource;
@@ -3595,10 +3641,12 @@ static void receive_bast(struct dlm_ls *ls, struct dlm_message *ms)
goto out;
queue_bast(r, lkb, ms->m_bastmode);
+ lkb->lkb_highbast = ms->m_bastmode;
out:
unlock_rsb(r);
put_rsb(r);
dlm_put_lkb(lkb);
+ return 0;
}
static void receive_lookup(struct dlm_ls *ls, struct dlm_message *ms)
@@ -3653,18 +3701,15 @@ static void receive_purge(struct dlm_ls *ls, struct dlm_message *ms)
do_purge(ls, ms->m_nodeid, ms->m_pid);
}
-static void receive_request_reply(struct dlm_ls *ls, struct dlm_message *ms)
+static int receive_request_reply(struct dlm_ls *ls, struct dlm_message *ms)
{
struct dlm_lkb *lkb;
struct dlm_rsb *r;
int error, mstype, result;
error = find_lkb(ls, ms->m_remid, &lkb);
- if (error) {
- log_debug(ls, "receive_request_reply from %d no lkb %x",
- ms->m_header.h_nodeid, ms->m_remid);
- return;
- }
+ if (error)
+ return error;
r = lkb->lkb_resource;
hold_rsb(r);
@@ -3676,8 +3721,13 @@ static void receive_request_reply(struct dlm_ls *ls, struct dlm_message *ms)
mstype = lkb->lkb_wait_type;
error = remove_from_waiters(lkb, DLM_MSG_REQUEST_REPLY);
- if (error)
+ if (error) {
+ log_error(ls, "receive_request_reply %x remote %d %x result %d",
+ lkb->lkb_id, ms->m_header.h_nodeid, ms->m_lkid,
+ ms->m_result);
+ dlm_dump_rsb(r);
goto out;
+ }
/* Optimization: the dir node was also the master, so it took our
lookup as a request and sent request reply instead of lookup reply */
@@ -3755,6 +3805,7 @@ static void receive_request_reply(struct dlm_ls *ls, struct dlm_message *ms)
unlock_rsb(r);
put_rsb(r);
dlm_put_lkb(lkb);
+ return 0;
}
static void __receive_convert_reply(struct dlm_rsb *r, struct dlm_lkb *lkb,
@@ -3793,8 +3844,11 @@ static void __receive_convert_reply(struct dlm_rsb *r, struct dlm_lkb *lkb,
break;
default:
- log_error(r->res_ls, "receive_convert_reply %x error %d",
- lkb->lkb_id, ms->m_result);
+ log_error(r->res_ls, "receive_convert_reply %x remote %d %x %d",
+ lkb->lkb_id, ms->m_header.h_nodeid, ms->m_lkid,
+ ms->m_result);
+ dlm_print_rsb(r);
+ dlm_print_lkb(lkb);
}
}
@@ -3821,20 +3875,18 @@ static void _receive_convert_reply(struct dlm_lkb *lkb, struct dlm_message *ms)
put_rsb(r);
}
-static void receive_convert_reply(struct dlm_ls *ls, struct dlm_message *ms)
+static int receive_convert_reply(struct dlm_ls *ls, struct dlm_message *ms)
{
struct dlm_lkb *lkb;
int error;
error = find_lkb(ls, ms->m_remid, &lkb);
- if (error) {
- log_debug(ls, "receive_convert_reply from %d no lkb %x",
- ms->m_header.h_nodeid, ms->m_remid);
- return;
- }
+ if (error)
+ return error;
_receive_convert_reply(lkb, ms);
dlm_put_lkb(lkb);
+ return 0;
}
static void _receive_unlock_reply(struct dlm_lkb *lkb, struct dlm_message *ms)
@@ -3873,20 +3925,18 @@ static void _receive_unlock_reply(struct dlm_lkb *lkb, struct dlm_message *ms)
put_rsb(r);
}
-static void receive_unlock_reply(struct dlm_ls *ls, struct dlm_message *ms)
+static int receive_unlock_reply(struct dlm_ls *ls, struct dlm_message *ms)
{
struct dlm_lkb *lkb;
int error;
error = find_lkb(ls, ms->m_remid, &lkb);
- if (error) {
- log_debug(ls, "receive_unlock_reply from %d no lkb %x",
- ms->m_header.h_nodeid, ms->m_remid);
- return;
- }
+ if (error)
+ return error;
_receive_unlock_reply(lkb, ms);
dlm_put_lkb(lkb);
+ return 0;
}
static void _receive_cancel_reply(struct dlm_lkb *lkb, struct dlm_message *ms)
@@ -3925,20 +3975,18 @@ static void _receive_cancel_reply(struct dlm_lkb *lkb, struct dlm_message *ms)
put_rsb(r);
}
-static void receive_cancel_reply(struct dlm_ls *ls, struct dlm_message *ms)
+static int receive_cancel_reply(struct dlm_ls *ls, struct dlm_message *ms)
{
struct dlm_lkb *lkb;
int error;
error = find_lkb(ls, ms->m_remid, &lkb);
- if (error) {
- log_debug(ls, "receive_cancel_reply from %d no lkb %x",
- ms->m_header.h_nodeid, ms->m_remid);
- return;
- }
+ if (error)
+ return error;
_receive_cancel_reply(lkb, ms);
dlm_put_lkb(lkb);
+ return 0;
}
static void receive_lookup_reply(struct dlm_ls *ls, struct dlm_message *ms)
@@ -3949,7 +3997,7 @@ static void receive_lookup_reply(struct dlm_ls *ls, struct dlm_message *ms)
error = find_lkb(ls, ms->m_lkid, &lkb);
if (error) {
- log_error(ls, "receive_lookup_reply no lkb");
+ log_error(ls, "receive_lookup_reply no lkid %x", ms->m_lkid);
return;
}
@@ -3993,8 +4041,11 @@ static void receive_lookup_reply(struct dlm_ls *ls, struct dlm_message *ms)
dlm_put_lkb(lkb);
}
-static void _receive_message(struct dlm_ls *ls, struct dlm_message *ms)
+static void _receive_message(struct dlm_ls *ls, struct dlm_message *ms,
+ uint32_t saved_seq)
{
+ int error = 0, noent = 0;
+
if (!dlm_is_member(ls, ms->m_header.h_nodeid)) {
log_debug(ls, "ignore non-member message %d from %d %x %x %d",
ms->m_type, ms->m_header.h_nodeid, ms->m_lkid,
@@ -4007,47 +4058,50 @@ static void _receive_message(struct dlm_ls *ls, struct dlm_message *ms)
/* messages sent to a master node */
case DLM_MSG_REQUEST:
- receive_request(ls, ms);
+ error = receive_request(ls, ms);
break;
case DLM_MSG_CONVERT:
- receive_convert(ls, ms);
+ error = receive_convert(ls, ms);
break;
case DLM_MSG_UNLOCK:
- receive_unlock(ls, ms);
+ error = receive_unlock(ls, ms);
break;
case DLM_MSG_CANCEL:
- receive_cancel(ls, ms);
+ noent = 1;
+ error = receive_cancel(ls, ms);
break;
/* messages sent from a master node (replies to above) */
case DLM_MSG_REQUEST_REPLY:
- receive_request_reply(ls, ms);
+ error = receive_request_reply(ls, ms);
break;
case DLM_MSG_CONVERT_REPLY:
- receive_convert_reply(ls, ms);
+ error = receive_convert_reply(ls, ms);
break;
case DLM_MSG_UNLOCK_REPLY:
- receive_unlock_reply(ls, ms);
+ error = receive_unlock_reply(ls, ms);
break;
case DLM_MSG_CANCEL_REPLY:
- receive_cancel_reply(ls, ms);
+ error = receive_cancel_reply(ls, ms);
break;
/* messages sent from a master node (only two types of async msg) */
case DLM_MSG_GRANT:
- receive_grant(ls, ms);
+ noent = 1;
+ error = receive_grant(ls, ms);
break;
case DLM_MSG_BAST:
- receive_bast(ls, ms);
+ noent = 1;
+ error = receive_bast(ls, ms);
break;
/* messages sent to a dir node */
@@ -4075,6 +4129,37 @@ static void _receive_message(struct dlm_ls *ls, struct dlm_message *ms)
default:
log_error(ls, "unknown message type %d", ms->m_type);
}
+
+ /*
+ * When checking for ENOENT, we're checking the result of
+ * find_lkb(m_remid):
+ *
+ * The lock id referenced in the message wasn't found. This may
+ * happen in normal usage for the async messages and cancel, so
+ * only use log_debug for them.
+ *
+ * Some errors are expected and normal.
+ */
+
+ if (error == -ENOENT && noent) {
+ log_debug(ls, "receive %d no %x remote %d %x saved_seq %u",
+ ms->m_type, ms->m_remid, ms->m_header.h_nodeid,
+ ms->m_lkid, saved_seq);
+ } else if (error == -ENOENT) {
+ log_error(ls, "receive %d no %x remote %d %x saved_seq %u",
+ ms->m_type, ms->m_remid, ms->m_header.h_nodeid,
+ ms->m_lkid, saved_seq);
+
+ if (ms->m_type == DLM_MSG_CONVERT)
+ dlm_dump_rsb_hash(ls, ms->m_hash);
+ }
+
+ if (error == -EINVAL) {
+ log_error(ls, "receive %d inval from %d lkid %x remid %x "
+ "saved_seq %u",
+ ms->m_type, ms->m_header.h_nodeid,
+ ms->m_lkid, ms->m_remid, saved_seq);
+ }
}
/* If the lockspace is in recovery mode (locking stopped), then normal
@@ -4092,16 +4177,17 @@ static void dlm_receive_message(struct dlm_ls *ls, struct dlm_message *ms,
dlm_add_requestqueue(ls, nodeid, ms);
} else {
dlm_wait_requestqueue(ls);
- _receive_message(ls, ms);
+ _receive_message(ls, ms, 0);
}
}
/* This is called by dlm_recoverd to process messages that were saved on
the requestqueue. */
-void dlm_receive_message_saved(struct dlm_ls *ls, struct dlm_message *ms)
+void dlm_receive_message_saved(struct dlm_ls *ls, struct dlm_message *ms,
+ uint32_t saved_seq)
{
- _receive_message(ls, ms);
+ _receive_message(ls, ms, saved_seq);
}
/* This is called by the midcomms layer when something is received for
@@ -4137,9 +4223,11 @@ void dlm_receive_buffer(union dlm_packet *p, int nodeid)
ls = dlm_find_lockspace_global(hd->h_lockspace);
if (!ls) {
- if (dlm_config.ci_log_debug)
- log_print("invalid lockspace %x from %d cmd %d type %d",
- hd->h_lockspace, nodeid, hd->h_cmd, type);
+ if (dlm_config.ci_log_debug) {
+ printk_ratelimited(KERN_DEBUG "dlm: invalid lockspace "
+ "%u from %d cmd %d type %d\n",
+ hd->h_lockspace, nodeid, hd->h_cmd, type);
+ }
if (hd->h_cmd == DLM_RCOM && type == DLM_RCOM_STATUS)
dlm_send_ls_not_ready(nodeid, &p->rcom);
@@ -4187,15 +4275,13 @@ static void recover_convert_waiter(struct dlm_ls *ls, struct dlm_lkb *lkb,
/* A waiting lkb needs recovery if the master node has failed, or
the master node is changing (only when no directory is used) */
-static int waiter_needs_recovery(struct dlm_ls *ls, struct dlm_lkb *lkb)
+static int waiter_needs_recovery(struct dlm_ls *ls, struct dlm_lkb *lkb,
+ int dir_nodeid)
{
- if (dlm_is_removed(ls, lkb->lkb_nodeid))
+ if (dlm_no_directory(ls))
return 1;
- if (!dlm_no_directory(ls))
- return 0;
-
- if (dlm_dir_nodeid(lkb->lkb_resource) != lkb->lkb_nodeid)
+ if (dlm_is_removed(ls, lkb->lkb_wait_nodeid))
return 1;
return 0;
@@ -4212,6 +4298,7 @@ void dlm_recover_waiters_pre(struct dlm_ls *ls)
struct dlm_lkb *lkb, *safe;
struct dlm_message *ms_stub;
int wait_type, stub_unlock_result, stub_cancel_result;
+ int dir_nodeid;
ms_stub = kmalloc(sizeof(struct dlm_message), GFP_KERNEL);
if (!ms_stub) {
@@ -4223,13 +4310,21 @@ void dlm_recover_waiters_pre(struct dlm_ls *ls)
list_for_each_entry_safe(lkb, safe, &ls->ls_waiters, lkb_wait_reply) {
+ dir_nodeid = dlm_dir_nodeid(lkb->lkb_resource);
+
/* exclude debug messages about unlocks because there can be so
many and they aren't very interesting */
if (lkb->lkb_wait_type != DLM_MSG_UNLOCK) {
- log_debug(ls, "recover_waiter %x nodeid %d "
- "msg %d to %d", lkb->lkb_id, lkb->lkb_nodeid,
- lkb->lkb_wait_type, lkb->lkb_wait_nodeid);
+ log_debug(ls, "waiter %x remote %x msg %d r_nodeid %d "
+ "lkb_nodeid %d wait_nodeid %d dir_nodeid %d",
+ lkb->lkb_id,
+ lkb->lkb_remid,
+ lkb->lkb_wait_type,
+ lkb->lkb_resource->res_nodeid,
+ lkb->lkb_nodeid,
+ lkb->lkb_wait_nodeid,
+ dir_nodeid);
}
/* all outstanding lookups, regardless of destination will be
@@ -4240,7 +4335,7 @@ void dlm_recover_waiters_pre(struct dlm_ls *ls)
continue;
}
- if (!waiter_needs_recovery(ls, lkb))
+ if (!waiter_needs_recovery(ls, lkb, dir_nodeid))
continue;
wait_type = lkb->lkb_wait_type;
@@ -4373,8 +4468,11 @@ int dlm_recover_waiters_post(struct dlm_ls *ls)
ou = is_overlap_unlock(lkb);
err = 0;
- log_debug(ls, "recover_waiter %x nodeid %d msg %d r_nodeid %d",
- lkb->lkb_id, lkb->lkb_nodeid, mstype, r->res_nodeid);
+ log_debug(ls, "waiter %x remote %x msg %d r_nodeid %d "
+ "lkb_nodeid %d wait_nodeid %d dir_nodeid %d "
+ "overlap %d %d", lkb->lkb_id, lkb->lkb_remid, mstype,
+ r->res_nodeid, lkb->lkb_nodeid, lkb->lkb_wait_nodeid,
+ dlm_dir_nodeid(r), oc, ou);
/* At this point we assume that we won't get a reply to any
previous op or overlap op on this lock. First, do a big
@@ -4426,9 +4524,12 @@ int dlm_recover_waiters_post(struct dlm_ls *ls)
}
}
- if (err)
- log_error(ls, "recover_waiters_post %x %d %x %d %d",
- lkb->lkb_id, mstype, lkb->lkb_flags, oc, ou);
+ if (err) {
+ log_error(ls, "waiter %x msg %d r_nodeid %d "
+ "dir_nodeid %d overlap %d %d",
+ lkb->lkb_id, mstype, r->res_nodeid,
+ dlm_dir_nodeid(r), oc, ou);
+ }
unlock_rsb(r);
put_rsb(r);
dlm_put_lkb(lkb);
@@ -4437,112 +4538,177 @@ int dlm_recover_waiters_post(struct dlm_ls *ls)
return error;
}
-static void purge_queue(struct dlm_rsb *r, struct list_head *queue,
- int (*test)(struct dlm_ls *ls, struct dlm_lkb *lkb))
+static void purge_mstcpy_list(struct dlm_ls *ls, struct dlm_rsb *r,
+ struct list_head *list)
{
- struct dlm_ls *ls = r->res_ls;
struct dlm_lkb *lkb, *safe;
- list_for_each_entry_safe(lkb, safe, queue, lkb_statequeue) {
- if (test(ls, lkb)) {
- rsb_set_flag(r, RSB_LOCKS_PURGED);
- del_lkb(r, lkb);
- /* this put should free the lkb */
- if (!dlm_put_lkb(lkb))
- log_error(ls, "purged lkb not released");
- }
+ list_for_each_entry_safe(lkb, safe, list, lkb_statequeue) {
+ if (!is_master_copy(lkb))
+ continue;
+
+ /* don't purge lkbs we've added in recover_master_copy for
+ the current recovery seq */
+
+ if (lkb->lkb_recover_seq == ls->ls_recover_seq)
+ continue;
+
+ del_lkb(r, lkb);
+
+ /* this put should free the lkb */
+ if (!dlm_put_lkb(lkb))
+ log_error(ls, "purged mstcpy lkb not released");
}
}
-static int purge_dead_test(struct dlm_ls *ls, struct dlm_lkb *lkb)
+void dlm_purge_mstcpy_locks(struct dlm_rsb *r)
{
- return (is_master_copy(lkb) && dlm_is_removed(ls, lkb->lkb_nodeid));
-}
+ struct dlm_ls *ls = r->res_ls;
-static int purge_mstcpy_test(struct dlm_ls *ls, struct dlm_lkb *lkb)
-{
- return is_master_copy(lkb);
+ purge_mstcpy_list(ls, r, &r->res_grantqueue);
+ purge_mstcpy_list(ls, r, &r->res_convertqueue);
+ purge_mstcpy_list(ls, r, &r->res_waitqueue);
}
-static void purge_dead_locks(struct dlm_rsb *r)
+static void purge_dead_list(struct dlm_ls *ls, struct dlm_rsb *r,
+ struct list_head *list,
+ int nodeid_gone, unsigned int *count)
{
- purge_queue(r, &r->res_grantqueue, &purge_dead_test);
- purge_queue(r, &r->res_convertqueue, &purge_dead_test);
- purge_queue(r, &r->res_waitqueue, &purge_dead_test);
-}
+ struct dlm_lkb *lkb, *safe;
-void dlm_purge_mstcpy_locks(struct dlm_rsb *r)
-{
- purge_queue(r, &r->res_grantqueue, &purge_mstcpy_test);
- purge_queue(r, &r->res_convertqueue, &purge_mstcpy_test);
- purge_queue(r, &r->res_waitqueue, &purge_mstcpy_test);
+ list_for_each_entry_safe(lkb, safe, list, lkb_statequeue) {
+ if (!is_master_copy(lkb))
+ continue;
+
+ if ((lkb->lkb_nodeid == nodeid_gone) ||
+ dlm_is_removed(ls, lkb->lkb_nodeid)) {
+
+ del_lkb(r, lkb);
+
+ /* this put should free the lkb */
+ if (!dlm_put_lkb(lkb))
+ log_error(ls, "purged dead lkb not released");
+
+ rsb_set_flag(r, RSB_RECOVER_GRANT);
+
+ (*count)++;
+ }
+ }
}
/* Get rid of locks held by nodes that are gone. */
-int dlm_purge_locks(struct dlm_ls *ls)
+void dlm_recover_purge(struct dlm_ls *ls)
{
struct dlm_rsb *r;
+ struct dlm_member *memb;
+ int nodes_count = 0;
+ int nodeid_gone = 0;
+ unsigned int lkb_count = 0;
- log_debug(ls, "dlm_purge_locks");
+ /* cache one removed nodeid to optimize the common
+ case of a single node removed */
+
+ list_for_each_entry(memb, &ls->ls_nodes_gone, list) {
+ nodes_count++;
+ nodeid_gone = memb->nodeid;
+ }
+
+ if (!nodes_count)
+ return;
down_write(&ls->ls_root_sem);
list_for_each_entry(r, &ls->ls_root_list, res_root_list) {
hold_rsb(r);
lock_rsb(r);
- if (is_master(r))
- purge_dead_locks(r);
+ if (is_master(r)) {
+ purge_dead_list(ls, r, &r->res_grantqueue,
+ nodeid_gone, &lkb_count);
+ purge_dead_list(ls, r, &r->res_convertqueue,
+ nodeid_gone, &lkb_count);
+ purge_dead_list(ls, r, &r->res_waitqueue,
+ nodeid_gone, &lkb_count);
+ }
unlock_rsb(r);
unhold_rsb(r);
-
- schedule();
+ cond_resched();
}
up_write(&ls->ls_root_sem);
- return 0;
+ if (lkb_count)
+ log_debug(ls, "dlm_recover_purge %u locks for %u nodes",
+ lkb_count, nodes_count);
}
-static struct dlm_rsb *find_purged_rsb(struct dlm_ls *ls, int bucket)
+static struct dlm_rsb *find_grant_rsb(struct dlm_ls *ls, int bucket)
{
struct rb_node *n;
- struct dlm_rsb *r, *r_ret = NULL;
+ struct dlm_rsb *r;
spin_lock(&ls->ls_rsbtbl[bucket].lock);
for (n = rb_first(&ls->ls_rsbtbl[bucket].keep); n; n = rb_next(n)) {
r = rb_entry(n, struct dlm_rsb, res_hashnode);
- if (!rsb_flag(r, RSB_LOCKS_PURGED))
+
+ if (!rsb_flag(r, RSB_RECOVER_GRANT))
+ continue;
+ rsb_clear_flag(r, RSB_RECOVER_GRANT);
+ if (!is_master(r))
continue;
hold_rsb(r);
- rsb_clear_flag(r, RSB_LOCKS_PURGED);
- r_ret = r;
- break;
+ spin_unlock(&ls->ls_rsbtbl[bucket].lock);
+ return r;
}
spin_unlock(&ls->ls_rsbtbl[bucket].lock);
- return r_ret;
+ return NULL;
}
-void dlm_grant_after_purge(struct dlm_ls *ls)
+/*
+ * Attempt to grant locks on resources that we are the master of.
+ * Locks may have become grantable during recovery because locks
+ * from departed nodes have been purged (or not rebuilt), allowing
+ * previously blocked locks to now be granted. The subset of rsb's
+ * we are interested in are those with lkb's on either the convert or
+ * waiting queues.
+ *
+ * Simplest would be to go through each master rsb and check for non-empty
+ * convert or waiting queues, and attempt to grant on those rsbs.
+ * Checking the queues requires lock_rsb, though, for which we'd need
+ * to release the rsbtbl lock. This would make iterating through all
+ * rsb's very inefficient. So, we rely on earlier recovery routines
+ * to set RECOVER_GRANT on any rsb's that we should attempt to grant
+ * locks for.
+ */
+
+void dlm_recover_grant(struct dlm_ls *ls)
{
struct dlm_rsb *r;
int bucket = 0;
+ unsigned int count = 0;
+ unsigned int rsb_count = 0;
+ unsigned int lkb_count = 0;
while (1) {
- r = find_purged_rsb(ls, bucket);
+ r = find_grant_rsb(ls, bucket);
if (!r) {
if (bucket == ls->ls_rsbtbl_size - 1)
break;
bucket++;
continue;
}
+ rsb_count++;
+ count = 0;
lock_rsb(r);
- if (is_master(r)) {
- grant_pending_locks(r);
- confirm_master(r, 0);
- }
+ grant_pending_locks(r, &count);
+ lkb_count += count;
+ confirm_master(r, 0);
unlock_rsb(r);
put_rsb(r);
- schedule();
+ cond_resched();
}
+
+ if (lkb_count)
+ log_debug(ls, "dlm_recover_grant %u locks on %u resources",
+ lkb_count, rsb_count);
}
static struct dlm_lkb *search_remid_list(struct list_head *head, int nodeid,
@@ -4631,6 +4797,7 @@ int dlm_recover_master_copy(struct dlm_ls *ls, struct dlm_rcom *rc)
struct rcom_lock *rl = (struct rcom_lock *) rc->rc_buf;
struct dlm_rsb *r;
struct dlm_lkb *lkb;
+ uint32_t remid = 0;
int error;
if (rl->rl_parent_lkid) {
@@ -4638,14 +4805,31 @@ int dlm_recover_master_copy(struct dlm_ls *ls, struct dlm_rcom *rc)
goto out;
}
- error = find_rsb(ls, rl->rl_name, le16_to_cpu(rl->rl_namelen),
- R_MASTER, &r);
+ remid = le32_to_cpu(rl->rl_lkid);
+
+ /* In general we expect the rsb returned to be R_MASTER, but we don't
+ have to require it. Recovery of masters on one node can overlap
+ recovery of locks on another node, so one node can send us MSTCPY
+ locks before we've made ourselves master of this rsb. We can still
+ add new MSTCPY locks that we receive here without any harm; when
+ we make ourselves master, dlm_recover_masters() won't touch the
+ MSTCPY locks we've received early. */
+
+ error = find_rsb(ls, rl->rl_name, le16_to_cpu(rl->rl_namelen), 0, &r);
if (error)
goto out;
+ if (dlm_no_directory(ls) && (dlm_dir_nodeid(r) != dlm_our_nodeid())) {
+ log_error(ls, "dlm_recover_master_copy remote %d %x not dir",
+ rc->rc_header.h_nodeid, remid);
+ error = -EBADR;
+ put_rsb(r);
+ goto out;
+ }
+
lock_rsb(r);
- lkb = search_remid(r, rc->rc_header.h_nodeid, le32_to_cpu(rl->rl_lkid));
+ lkb = search_remid(r, rc->rc_header.h_nodeid, remid);
if (lkb) {
error = -EEXIST;
goto out_remid;
@@ -4664,19 +4848,25 @@ int dlm_recover_master_copy(struct dlm_ls *ls, struct dlm_rcom *rc)
attach_lkb(r, lkb);
add_lkb(r, lkb, rl->rl_status);
error = 0;
+ ls->ls_recover_locks_in++;
+
+ if (!list_empty(&r->res_waitqueue) || !list_empty(&r->res_convertqueue))
+ rsb_set_flag(r, RSB_RECOVER_GRANT);
out_remid:
/* this is the new value returned to the lock holder for
saving in its process-copy lkb */
rl->rl_remid = cpu_to_le32(lkb->lkb_id);
+ lkb->lkb_recover_seq = ls->ls_recover_seq;
+
out_unlock:
unlock_rsb(r);
put_rsb(r);
out:
- if (error)
- log_debug(ls, "recover_master_copy %d %x", error,
- le32_to_cpu(rl->rl_lkid));
+ if (error && error != -EEXIST)
+ log_debug(ls, "dlm_recover_master_copy remote %d %x error %d",
+ rc->rc_header.h_nodeid, remid, error);
rl->rl_result = cpu_to_le32(error);
return error;
}
@@ -4687,41 +4877,52 @@ int dlm_recover_process_copy(struct dlm_ls *ls, struct dlm_rcom *rc)
struct rcom_lock *rl = (struct rcom_lock *) rc->rc_buf;
struct dlm_rsb *r;
struct dlm_lkb *lkb;
- int error;
+ uint32_t lkid, remid;
+ int error, result;
+
+ lkid = le32_to_cpu(rl->rl_lkid);
+ remid = le32_to_cpu(rl->rl_remid);
+ result = le32_to_cpu(rl->rl_result);
- error = find_lkb(ls, le32_to_cpu(rl->rl_lkid), &lkb);
+ error = find_lkb(ls, lkid, &lkb);
if (error) {
- log_error(ls, "recover_process_copy no lkid %x",
- le32_to_cpu(rl->rl_lkid));
+ log_error(ls, "dlm_recover_process_copy no %x remote %d %x %d",
+ lkid, rc->rc_header.h_nodeid, remid, result);
return error;
}
- DLM_ASSERT(is_process_copy(lkb), dlm_print_lkb(lkb););
-
- error = le32_to_cpu(rl->rl_result);
-
r = lkb->lkb_resource;
hold_rsb(r);
lock_rsb(r);
- switch (error) {
+ if (!is_process_copy(lkb)) {
+ log_error(ls, "dlm_recover_process_copy bad %x remote %d %x %d",
+ lkid, rc->rc_header.h_nodeid, remid, result);
+ dlm_dump_rsb(r);
+ unlock_rsb(r);
+ put_rsb(r);
+ dlm_put_lkb(lkb);
+ return -EINVAL;
+ }
+
+ switch (result) {
case -EBADR:
/* There's a chance the new master received our lock before
dlm_recover_master_reply(), this wouldn't happen if we did
a barrier between recover_masters and recover_locks. */
- log_debug(ls, "master copy not ready %x r %lx %s", lkb->lkb_id,
- (unsigned long)r, r->res_name);
+
+ log_debug(ls, "dlm_recover_process_copy %x remote %d %x %d",
+ lkid, rc->rc_header.h_nodeid, remid, result);
+
dlm_send_rcom_lock(r, lkb);
goto out;
case -EEXIST:
- log_debug(ls, "master copy exists %x", lkb->lkb_id);
- /* fall through */
case 0:
- lkb->lkb_remid = le32_to_cpu(rl->rl_remid);
+ lkb->lkb_remid = remid;
break;
default:
- log_error(ls, "dlm_recover_process_copy unknown error %d %x",
- error, lkb->lkb_id);
+ log_error(ls, "dlm_recover_process_copy %x remote %d %x %d unk",
+ lkid, rc->rc_header.h_nodeid, remid, result);
}
/* an ack for dlm_recover_locks() which waits for replies from
diff --git a/fs/dlm/lock.h b/fs/dlm/lock.h
index 1a255307f6f..c8b226c6280 100644
--- a/fs/dlm/lock.h
+++ b/fs/dlm/lock.h
@@ -15,7 +15,8 @@
void dlm_dump_rsb(struct dlm_rsb *r);
void dlm_print_lkb(struct dlm_lkb *lkb);
-void dlm_receive_message_saved(struct dlm_ls *ls, struct dlm_message *ms);
+void dlm_receive_message_saved(struct dlm_ls *ls, struct dlm_message *ms,
+ uint32_t saved_seq);
void dlm_receive_buffer(union dlm_packet *p, int nodeid);
int dlm_modes_compat(int mode1, int mode2);
void dlm_put_rsb(struct dlm_rsb *r);
@@ -31,9 +32,9 @@ void dlm_adjust_timeouts(struct dlm_ls *ls);
int dlm_search_rsb_tree(struct rb_root *tree, char *name, int len,
unsigned int flags, struct dlm_rsb **r_ret);
-int dlm_purge_locks(struct dlm_ls *ls);
+void dlm_recover_purge(struct dlm_ls *ls);
void dlm_purge_mstcpy_locks(struct dlm_rsb *r);
-void dlm_grant_after_purge(struct dlm_ls *ls);
+void dlm_recover_grant(struct dlm_ls *ls);
int dlm_recover_waiters_post(struct dlm_ls *ls);
void dlm_recover_waiters_pre(struct dlm_ls *ls);
int dlm_recover_master_copy(struct dlm_ls *ls, struct dlm_rcom *rc);
diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c
index a1ea25face8..ca506abbdd3 100644
--- a/fs/dlm/lockspace.c
+++ b/fs/dlm/lockspace.c
@@ -74,6 +74,19 @@ static ssize_t dlm_id_store(struct dlm_ls *ls, const char *buf, size_t len)
return len;
}
+static ssize_t dlm_nodir_show(struct dlm_ls *ls, char *buf)
+{
+ return snprintf(buf, PAGE_SIZE, "%u\n", dlm_no_directory(ls));
+}
+
+static ssize_t dlm_nodir_store(struct dlm_ls *ls, const char *buf, size_t len)
+{
+ int val = simple_strtoul(buf, NULL, 0);
+ if (val == 1)
+ set_bit(LSFL_NODIR, &ls->ls_flags);
+ return len;
+}
+
static ssize_t dlm_recover_status_show(struct dlm_ls *ls, char *buf)
{
uint32_t status = dlm_recover_status(ls);
@@ -107,6 +120,12 @@ static struct dlm_attr dlm_attr_id = {
.store = dlm_id_store
};
+static struct dlm_attr dlm_attr_nodir = {
+ .attr = {.name = "nodir", .mode = S_IRUGO | S_IWUSR},
+ .show = dlm_nodir_show,
+ .store = dlm_nodir_store
+};
+
static struct dlm_attr dlm_attr_recover_status = {
.attr = {.name = "recover_status", .mode = S_IRUGO},
.show = dlm_recover_status_show
@@ -121,6 +140,7 @@ static struct attribute *dlm_attrs[] = {
&dlm_attr_control.attr,
&dlm_attr_event.attr,
&dlm_attr_id.attr,
+ &dlm_attr_nodir.attr,
&dlm_attr_recover_status.attr,
&dlm_attr_recover_nodeid.attr,
NULL,
diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c
index 133ef6dc7cb..5c1b0e38c7a 100644
--- a/fs/dlm/lowcomms.c
+++ b/fs/dlm/lowcomms.c
@@ -142,6 +142,7 @@ struct writequeue_entry {
static struct sockaddr_storage *dlm_local_addr[DLM_MAX_ADDR_COUNT];
static int dlm_local_count;
+static int dlm_allow_conn;
/* Work queues */
static struct workqueue_struct *recv_workqueue;
@@ -710,6 +711,13 @@ static int tcp_accept_from_sock(struct connection *con)
struct connection *newcon;
struct connection *addcon;
+ mutex_lock(&connections_lock);
+ if (!dlm_allow_conn) {
+ mutex_unlock(&connections_lock);
+ return -1;
+ }
+ mutex_unlock(&connections_lock);
+
memset(&peeraddr, 0, sizeof(peeraddr));
result = sock_create_kern(dlm_local_addr[0]->ss_family, SOCK_STREAM,
IPPROTO_TCP, &newsock);
@@ -1503,6 +1511,7 @@ void dlm_lowcomms_stop(void)
socket activity.
*/
mutex_lock(&connections_lock);
+ dlm_allow_conn = 0;
foreach_conn(stop_conn);
mutex_unlock(&connections_lock);
@@ -1530,7 +1539,7 @@ int dlm_lowcomms_start(void)
if (!dlm_local_count) {
error = -ENOTCONN;
log_print("no local IP address has been set");
- goto out;
+ goto fail;
}
error = -ENOMEM;
@@ -1538,7 +1547,13 @@ int dlm_lowcomms_start(void)
__alignof__(struct connection), 0,
NULL);
if (!con_cache)
- goto out;
+ goto fail;
+
+ error = work_start();
+ if (error)
+ goto fail_destroy;
+
+ dlm_allow_conn = 1;
/* Start listening */
if (dlm_config.ci_protocol == 0)
@@ -1548,20 +1563,17 @@ int dlm_lowcomms_start(void)
if (error)
goto fail_unlisten;
- error = work_start();
- if (error)
- goto fail_unlisten;
-
return 0;
fail_unlisten:
+ dlm_allow_conn = 0;
con = nodeid2con(0,0);
if (con) {
close_connection(con, false);
kmem_cache_free(con_cache, con);
}
+fail_destroy:
kmem_cache_destroy(con_cache);
-
-out:
+fail:
return error;
}
diff --git a/fs/dlm/memory.c b/fs/dlm/memory.c
index da64df7576e..7cd24bccd4f 100644
--- a/fs/dlm/memory.c
+++ b/fs/dlm/memory.c
@@ -21,21 +21,19 @@ static struct kmem_cache *rsb_cache;
int __init dlm_memory_init(void)
{
- int ret = 0;
-
lkb_cache = kmem_cache_create("dlm_lkb", sizeof(struct dlm_lkb),
__alignof__(struct dlm_lkb), 0, NULL);
if (!lkb_cache)
- ret = -ENOMEM;
+ return -ENOMEM;
rsb_cache = kmem_cache_create("dlm_rsb", sizeof(struct dlm_rsb),
__alignof__(struct dlm_rsb), 0, NULL);
if (!rsb_cache) {
kmem_cache_destroy(lkb_cache);
- ret = -ENOMEM;
+ return -ENOMEM;
}
- return ret;
+ return 0;
}
void dlm_memory_exit(void)
diff --git a/fs/dlm/rcom.c b/fs/dlm/rcom.c
index ac5c616c969..64d3e2b958c 100644
--- a/fs/dlm/rcom.c
+++ b/fs/dlm/rcom.c
@@ -486,47 +486,50 @@ int dlm_send_ls_not_ready(int nodeid, struct dlm_rcom *rc_in)
return 0;
}
-static int is_old_reply(struct dlm_ls *ls, struct dlm_rcom *rc)
+/* Called by dlm_recv; corresponds to dlm_receive_message() but special
+ recovery-only comms are sent through here. */
+
+void dlm_receive_rcom(struct dlm_ls *ls, struct dlm_rcom *rc, int nodeid)
{
+ int lock_size = sizeof(struct dlm_rcom) + sizeof(struct rcom_lock);
+ int stop, reply = 0, lock = 0;
+ uint32_t status;
uint64_t seq;
- int rv = 0;
switch (rc->rc_type) {
+ case DLM_RCOM_LOCK:
+ lock = 1;
+ break;
+ case DLM_RCOM_LOCK_REPLY:
+ lock = 1;
+ reply = 1;
+ break;
case DLM_RCOM_STATUS_REPLY:
case DLM_RCOM_NAMES_REPLY:
case DLM_RCOM_LOOKUP_REPLY:
- case DLM_RCOM_LOCK_REPLY:
- spin_lock(&ls->ls_recover_lock);
- seq = ls->ls_recover_seq;
- spin_unlock(&ls->ls_recover_lock);
- if (rc->rc_seq_reply != seq) {
- log_debug(ls, "ignoring old reply %x from %d "
- "seq_reply %llx expect %llx",
- rc->rc_type, rc->rc_header.h_nodeid,
- (unsigned long long)rc->rc_seq_reply,
- (unsigned long long)seq);
- rv = 1;
- }
- }
- return rv;
-}
-
-/* Called by dlm_recv; corresponds to dlm_receive_message() but special
- recovery-only comms are sent through here. */
+ reply = 1;
+ };
-void dlm_receive_rcom(struct dlm_ls *ls, struct dlm_rcom *rc, int nodeid)
-{
- int lock_size = sizeof(struct dlm_rcom) + sizeof(struct rcom_lock);
+ spin_lock(&ls->ls_recover_lock);
+ status = ls->ls_recover_status;
+ stop = test_bit(LSFL_RECOVERY_STOP, &ls->ls_flags);
+ seq = ls->ls_recover_seq;
+ spin_unlock(&ls->ls_recover_lock);
- if (dlm_recovery_stopped(ls) && (rc->rc_type != DLM_RCOM_STATUS)) {
- log_debug(ls, "ignoring recovery message %x from %d",
- rc->rc_type, nodeid);
+ if ((stop && (rc->rc_type != DLM_RCOM_STATUS)) ||
+ (reply && (rc->rc_seq_reply != seq)) ||
+ (lock && !(status & DLM_RS_DIR))) {
+ log_limit(ls, "dlm_receive_rcom ignore msg %d "
+ "from %d %llu %llu recover seq %llu sts %x gen %u",
+ rc->rc_type,
+ nodeid,
+ (unsigned long long)rc->rc_seq,
+ (unsigned long long)rc->rc_seq_reply,
+ (unsigned long long)seq,
+ status, ls->ls_generation);
goto out;
}
- if (is_old_reply(ls, rc))
- goto out;
-
switch (rc->rc_type) {
case DLM_RCOM_STATUS:
receive_rcom_status(ls, rc);
diff --git a/fs/dlm/recover.c b/fs/dlm/recover.c
index 34d5adf1fce..7554e4dac6b 100644
--- a/fs/dlm/recover.c
+++ b/fs/dlm/recover.c
@@ -339,9 +339,12 @@ static void set_lock_master(struct list_head *queue, int nodeid)
{
struct dlm_lkb *lkb;
- list_for_each_entry(lkb, queue, lkb_statequeue)
- if (!(lkb->lkb_flags & DLM_IFL_MSTCPY))
+ list_for_each_entry(lkb, queue, lkb_statequeue) {
+ if (!(lkb->lkb_flags & DLM_IFL_MSTCPY)) {
lkb->lkb_nodeid = nodeid;
+ lkb->lkb_remid = 0;
+ }
+ }
}
static void set_master_lkbs(struct dlm_rsb *r)
@@ -354,18 +357,16 @@ static void set_master_lkbs(struct dlm_rsb *r)
/*
* Propagate the new master nodeid to locks
* The NEW_MASTER flag tells dlm_recover_locks() which rsb's to consider.
- * The NEW_MASTER2 flag tells recover_lvb() and set_locks_purged() which
+ * The NEW_MASTER2 flag tells recover_lvb() and recover_grant() which
* rsb's to consider.
*/
static void set_new_master(struct dlm_rsb *r, int nodeid)
{
- lock_rsb(r);
r->res_nodeid = nodeid;
set_master_lkbs(r);
rsb_set_flag(r, RSB_NEW_MASTER);
rsb_set_flag(r, RSB_NEW_MASTER2);
- unlock_rsb(r);
}
/*
@@ -376,9 +377,9 @@ static void set_new_master(struct dlm_rsb *r, int nodeid)
static int recover_master(struct dlm_rsb *r)
{
struct dlm_ls *ls = r->res_ls;
- int error, dir_nodeid, ret_nodeid, our_nodeid = dlm_our_nodeid();
-
- dir_nodeid = dlm_dir_nodeid(r);
+ int error, ret_nodeid;
+ int our_nodeid = dlm_our_nodeid();
+ int dir_nodeid = dlm_dir_nodeid(r);
if (dir_nodeid == our_nodeid) {
error = dlm_dir_lookup(ls, our_nodeid, r->res_name,
@@ -388,7 +389,9 @@ static int recover_master(struct dlm_rsb *r)
if (ret_nodeid == our_nodeid)
ret_nodeid = 0;
+ lock_rsb(r);
set_new_master(r, ret_nodeid);
+ unlock_rsb(r);
} else {
recover_list_add(r);
error = dlm_send_rcom_lookup(r, dir_nodeid);
@@ -398,24 +401,33 @@ static int recover_master(struct dlm_rsb *r)
}
/*
- * When not using a directory, most resource names will hash to a new static
- * master nodeid and the resource will need to be remastered.
+ * All MSTCPY locks are purged and rebuilt, even if the master stayed the same.
+ * This is necessary because recovery can be started, aborted and restarted,
+ * causing the master nodeid to briefly change during the aborted recovery, and
+ * change back to the original value in the second recovery. The MSTCPY locks
+ * may or may not have been purged during the aborted recovery. Another node
+ * with an outstanding request in waiters list and a request reply saved in the
+ * requestqueue, cannot know whether it should ignore the reply and resend the
+ * request, or accept the reply and complete the request. It must do the
+ * former if the remote node purged MSTCPY locks, and it must do the latter if
+ * the remote node did not. This is solved by always purging MSTCPY locks, in
+ * which case, the request reply would always be ignored and the request
+ * resent.
*/
static int recover_master_static(struct dlm_rsb *r)
{
- int master = dlm_dir_nodeid(r);
+ int dir_nodeid = dlm_dir_nodeid(r);
+ int new_master = dir_nodeid;
- if (master == dlm_our_nodeid())
- master = 0;
+ if (dir_nodeid == dlm_our_nodeid())
+ new_master = 0;
- if (r->res_nodeid != master) {
- if (is_master(r))
- dlm_purge_mstcpy_locks(r);
- set_new_master(r, master);
- return 1;
- }
- return 0;
+ lock_rsb(r);
+ dlm_purge_mstcpy_locks(r);
+ set_new_master(r, new_master);
+ unlock_rsb(r);
+ return 1;
}
/*
@@ -481,7 +493,9 @@ int dlm_recover_master_reply(struct dlm_ls *ls, struct dlm_rcom *rc)
if (nodeid == dlm_our_nodeid())
nodeid = 0;
+ lock_rsb(r);
set_new_master(r, nodeid);
+ unlock_rsb(r);
recover_list_del(r);
if (recover_list_empty(ls))
@@ -556,8 +570,6 @@ int dlm_recover_locks(struct dlm_ls *ls)
struct dlm_rsb *r;
int error, count = 0;
- log_debug(ls, "dlm_recover_locks");
-
down_read(&ls->ls_root_sem);
list_for_each_entry(r, &ls->ls_root_list, res_root_list) {
if (is_master(r)) {
@@ -584,7 +596,7 @@ int dlm_recover_locks(struct dlm_ls *ls)
}
up_read(&ls->ls_root_sem);
- log_debug(ls, "dlm_recover_locks %d locks", count);
+ log_debug(ls, "dlm_recover_locks %d out", count);
error = dlm_wait_function(ls, &recover_list_empty);
out:
@@ -721,21 +733,19 @@ static void recover_conversion(struct dlm_rsb *r)
}
/* We've become the new master for this rsb and waiting/converting locks may
- need to be granted in dlm_grant_after_purge() due to locks that may have
+ need to be granted in dlm_recover_grant() due to locks that may have
existed from a removed node. */
-static void set_locks_purged(struct dlm_rsb *r)
+static void recover_grant(struct dlm_rsb *r)
{
if (!list_empty(&r->res_waitqueue) || !list_empty(&r->res_convertqueue))
- rsb_set_flag(r, RSB_LOCKS_PURGED);
+ rsb_set_flag(r, RSB_RECOVER_GRANT);
}
void dlm_recover_rsbs(struct dlm_ls *ls)
{
struct dlm_rsb *r;
- int count = 0;
-
- log_debug(ls, "dlm_recover_rsbs");
+ unsigned int count = 0;
down_read(&ls->ls_root_sem);
list_for_each_entry(r, &ls->ls_root_list, res_root_list) {
@@ -744,7 +754,7 @@ void dlm_recover_rsbs(struct dlm_ls *ls)
if (rsb_flag(r, RSB_RECOVER_CONVERT))
recover_conversion(r);
if (rsb_flag(r, RSB_NEW_MASTER2))
- set_locks_purged(r);
+ recover_grant(r);
recover_lvb(r);
count++;
}
@@ -754,7 +764,8 @@ void dlm_recover_rsbs(struct dlm_ls *ls)
}
up_read(&ls->ls_root_sem);
- log_debug(ls, "dlm_recover_rsbs %d rsbs", count);
+ if (count)
+ log_debug(ls, "dlm_recover_rsbs %d done", count);
}
/* Create a single list of all root rsb's to be used during recovery */
diff --git a/fs/dlm/recoverd.c b/fs/dlm/recoverd.c
index 3780caf7ae0..f1a9073c083 100644
--- a/fs/dlm/recoverd.c
+++ b/fs/dlm/recoverd.c
@@ -54,7 +54,7 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv)
unsigned long start;
int error, neg = 0;
- log_debug(ls, "dlm_recover %llx", (unsigned long long)rv->seq);
+ log_debug(ls, "dlm_recover %llu", (unsigned long long)rv->seq);
mutex_lock(&ls->ls_recoverd_active);
@@ -84,6 +84,8 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv)
goto fail;
}
+ ls->ls_recover_locks_in = 0;
+
dlm_set_recover_status(ls, DLM_RS_NODES);
error = dlm_recover_members_wait(ls);
@@ -130,7 +132,7 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv)
* Clear lkb's for departed nodes.
*/
- dlm_purge_locks(ls);
+ dlm_recover_purge(ls);
/*
* Get new master nodeid's for rsb's that were mastered on
@@ -161,6 +163,9 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv)
goto fail;
}
+ log_debug(ls, "dlm_recover_locks %u in",
+ ls->ls_recover_locks_in);
+
/*
* Finalize state in master rsb's now that all locks can be
* checked. This includes conversion resolution and lvb
@@ -225,9 +230,9 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv)
goto fail;
}
- dlm_grant_after_purge(ls);
+ dlm_recover_grant(ls);
- log_debug(ls, "dlm_recover %llx generation %u done: %u ms",
+ log_debug(ls, "dlm_recover %llu generation %u done: %u ms",
(unsigned long long)rv->seq, ls->ls_generation,
jiffies_to_msecs(jiffies - start));
mutex_unlock(&ls->ls_recoverd_active);
@@ -237,7 +242,7 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv)
fail:
dlm_release_root_list(ls);
- log_debug(ls, "dlm_recover %llx error %d",
+ log_debug(ls, "dlm_recover %llu error %d",
(unsigned long long)rv->seq, error);
mutex_unlock(&ls->ls_recoverd_active);
return error;
diff --git a/fs/dlm/requestqueue.c b/fs/dlm/requestqueue.c
index a44fa22890e..1695f1b0dd4 100644
--- a/fs/dlm/requestqueue.c
+++ b/fs/dlm/requestqueue.c
@@ -19,6 +19,7 @@
struct rq_entry {
struct list_head list;
+ uint32_t recover_seq;
int nodeid;
struct dlm_message request;
};
@@ -41,6 +42,7 @@ void dlm_add_requestqueue(struct dlm_ls *ls, int nodeid, struct dlm_message *ms)
return;
}
+ e->recover_seq = ls->ls_recover_seq & 0xFFFFFFFF;
e->nodeid = nodeid;
memcpy(&e->request, ms, ms->m_header.h_length);
@@ -63,6 +65,7 @@ void dlm_add_requestqueue(struct dlm_ls *ls, int nodeid, struct dlm_message *ms)
int dlm_process_requestqueue(struct dlm_ls *ls)
{
struct rq_entry *e;
+ struct dlm_message *ms;
int error = 0;
mutex_lock(&ls->ls_requestqueue_mutex);
@@ -76,7 +79,15 @@ int dlm_process_requestqueue(struct dlm_ls *ls)
e = list_entry(ls->ls_requestqueue.next, struct rq_entry, list);
mutex_unlock(&ls->ls_requestqueue_mutex);
- dlm_receive_message_saved(ls, &e->request);
+ ms = &e->request;
+
+ log_limit(ls, "dlm_process_requestqueue msg %d from %d "
+ "lkid %x remid %x result %d seq %u",
+ ms->m_type, ms->m_header.h_nodeid,
+ ms->m_lkid, ms->m_remid, ms->m_result,
+ e->recover_seq);
+
+ dlm_receive_message_saved(ls, &e->request, e->recover_seq);
mutex_lock(&ls->ls_requestqueue_mutex);
list_del(&e->list);
@@ -138,35 +149,7 @@ static int purge_request(struct dlm_ls *ls, struct dlm_message *ms, int nodeid)
if (!dlm_no_directory(ls))
return 0;
- /* with no directory, the master is likely to change as a part of
- recovery; requests to/from the defunct master need to be purged */
-
- switch (type) {
- case DLM_MSG_REQUEST:
- case DLM_MSG_CONVERT:
- case DLM_MSG_UNLOCK:
- case DLM_MSG_CANCEL:
- /* we're no longer the master of this resource, the sender
- will resend to the new master (see waiter_needs_recovery) */
-
- if (dlm_hash2nodeid(ls, ms->m_hash) != dlm_our_nodeid())
- return 1;
- break;
-
- case DLM_MSG_REQUEST_REPLY:
- case DLM_MSG_CONVERT_REPLY:
- case DLM_MSG_UNLOCK_REPLY:
- case DLM_MSG_CANCEL_REPLY:
- case DLM_MSG_GRANT:
- /* this reply is from the former master of the resource,
- we'll resend to the new master if needed */
-
- if (dlm_hash2nodeid(ls, ms->m_hash) != nodeid)
- return 1;
- break;
- }
-
- return 0;
+ return 1;
}
void dlm_purge_requestqueue(struct dlm_ls *ls)
diff --git a/fs/ecryptfs/messaging.c b/fs/ecryptfs/messaging.c
index ab224809051..a750f957b14 100644
--- a/fs/ecryptfs/messaging.c
+++ b/fs/ecryptfs/messaging.c
@@ -303,7 +303,7 @@ int ecryptfs_process_response(struct ecryptfs_message *msg, uid_t euid,
mutex_unlock(&ecryptfs_daemon_hash_mux);
goto wake_up;
}
- tsk_user_ns = __task_cred(msg_ctx->task)->user->user_ns;
+ tsk_user_ns = __task_cred(msg_ctx->task)->user_ns;
ctx_euid = task_euid(msg_ctx->task);
rc = ecryptfs_find_daemon_by_euid(&daemon, ctx_euid, tsk_user_ns);
rcu_read_unlock();
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index c0b3c70ee87..079d1be65ba 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -33,6 +33,7 @@
#include <linux/bitops.h>
#include <linux/mutex.h>
#include <linux/anon_inodes.h>
+#include <linux/device.h>
#include <asm/uaccess.h>
#include <asm/io.h>
#include <asm/mman.h>
@@ -87,7 +88,7 @@
*/
/* Epoll private bits inside the event mask */
-#define EP_PRIVATE_BITS (EPOLLONESHOT | EPOLLET)
+#define EP_PRIVATE_BITS (EPOLLWAKEUP | EPOLLONESHOT | EPOLLET)
/* Maximum number of nesting allowed inside epoll sets */
#define EP_MAX_NESTS 4
@@ -154,6 +155,9 @@ struct epitem {
/* List header used to link this item to the "struct file" items list */
struct list_head fllink;
+ /* wakeup_source used when EPOLLWAKEUP is set */
+ struct wakeup_source *ws;
+
/* The structure that describe the interested events and the source fd */
struct epoll_event event;
};
@@ -194,6 +198,9 @@ struct eventpoll {
*/
struct epitem *ovflist;
+ /* wakeup_source used when ep_scan_ready_list is running */
+ struct wakeup_source *ws;
+
/* The user that created the eventpoll descriptor */
struct user_struct *user;
@@ -588,8 +595,10 @@ static int ep_scan_ready_list(struct eventpoll *ep,
* queued into ->ovflist but the "txlist" might already
* contain them, and the list_splice() below takes care of them.
*/
- if (!ep_is_linked(&epi->rdllink))
+ if (!ep_is_linked(&epi->rdllink)) {
list_add_tail(&epi->rdllink, &ep->rdllist);
+ __pm_stay_awake(epi->ws);
+ }
}
/*
* We need to set back ep->ovflist to EP_UNACTIVE_PTR, so that after
@@ -602,6 +611,7 @@ static int ep_scan_ready_list(struct eventpoll *ep,
* Quickly re-inject items left on "txlist".
*/
list_splice(&txlist, &ep->rdllist);
+ __pm_relax(ep->ws);
if (!list_empty(&ep->rdllist)) {
/*
@@ -656,6 +666,8 @@ static int ep_remove(struct eventpoll *ep, struct epitem *epi)
list_del_init(&epi->rdllink);
spin_unlock_irqrestore(&ep->lock, flags);
+ wakeup_source_unregister(epi->ws);
+
/* At this point it is safe to free the eventpoll item */
kmem_cache_free(epi_cache, epi);
@@ -706,6 +718,7 @@ static void ep_free(struct eventpoll *ep)
mutex_unlock(&epmutex);
mutex_destroy(&ep->mtx);
free_uid(ep->user);
+ wakeup_source_unregister(ep->ws);
kfree(ep);
}
@@ -737,6 +750,7 @@ static int ep_read_events_proc(struct eventpoll *ep, struct list_head *head,
* callback, but it's not actually ready, as far as
* caller requested events goes. We can remove it here.
*/
+ __pm_relax(epi->ws);
list_del_init(&epi->rdllink);
}
}
@@ -927,13 +941,23 @@ static int ep_poll_callback(wait_queue_t *wait, unsigned mode, int sync, void *k
if (epi->next == EP_UNACTIVE_PTR) {
epi->next = ep->ovflist;
ep->ovflist = epi;
+ if (epi->ws) {
+ /*
+ * Activate ep->ws since epi->ws may get
+ * deactivated at any time.
+ */
+ __pm_stay_awake(ep->ws);
+ }
+
}
goto out_unlock;
}
/* If this file is already in the ready list we exit soon */
- if (!ep_is_linked(&epi->rdllink))
+ if (!ep_is_linked(&epi->rdllink)) {
list_add_tail(&epi->rdllink, &ep->rdllist);
+ __pm_stay_awake(epi->ws);
+ }
/*
* Wake up ( if active ) both the eventpoll wait list and the ->poll()
@@ -1091,6 +1115,30 @@ static int reverse_path_check(void)
return error;
}
+static int ep_create_wakeup_source(struct epitem *epi)
+{
+ const char *name;
+
+ if (!epi->ep->ws) {
+ epi->ep->ws = wakeup_source_register("eventpoll");
+ if (!epi->ep->ws)
+ return -ENOMEM;
+ }
+
+ name = epi->ffd.file->f_path.dentry->d_name.name;
+ epi->ws = wakeup_source_register(name);
+ if (!epi->ws)
+ return -ENOMEM;
+
+ return 0;
+}
+
+static void ep_destroy_wakeup_source(struct epitem *epi)
+{
+ wakeup_source_unregister(epi->ws);
+ epi->ws = NULL;
+}
+
/*
* Must be called with "mtx" held.
*/
@@ -1118,6 +1166,13 @@ static int ep_insert(struct eventpoll *ep, struct epoll_event *event,
epi->event = *event;
epi->nwait = 0;
epi->next = EP_UNACTIVE_PTR;
+ if (epi->event.events & EPOLLWAKEUP) {
+ error = ep_create_wakeup_source(epi);
+ if (error)
+ goto error_create_wakeup_source;
+ } else {
+ epi->ws = NULL;
+ }
/* Initialize the poll table using the queue callback */
epq.epi = epi;
@@ -1164,6 +1219,7 @@ static int ep_insert(struct eventpoll *ep, struct epoll_event *event,
/* If the file is already "ready" we drop it inside the ready list */
if ((revents & event->events) && !ep_is_linked(&epi->rdllink)) {
list_add_tail(&epi->rdllink, &ep->rdllist);
+ __pm_stay_awake(epi->ws);
/* Notify waiting tasks that events are available */
if (waitqueue_active(&ep->wq))
@@ -1204,6 +1260,9 @@ error_unregister:
list_del_init(&epi->rdllink);
spin_unlock_irqrestore(&ep->lock, flags);
+ wakeup_source_unregister(epi->ws);
+
+error_create_wakeup_source:
kmem_cache_free(epi_cache, epi);
return error;
@@ -1229,6 +1288,12 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi, struct epoll_even
epi->event.events = event->events;
pt._key = event->events;
epi->event.data = event->data; /* protected by mtx */
+ if (epi->event.events & EPOLLWAKEUP) {
+ if (!epi->ws)
+ ep_create_wakeup_source(epi);
+ } else if (epi->ws) {
+ ep_destroy_wakeup_source(epi);
+ }
/*
* Get current event bits. We can safely use the file* here because
@@ -1244,6 +1309,7 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi, struct epoll_even
spin_lock_irq(&ep->lock);
if (!ep_is_linked(&epi->rdllink)) {
list_add_tail(&epi->rdllink, &ep->rdllist);
+ __pm_stay_awake(epi->ws);
/* Notify waiting tasks that events are available */
if (waitqueue_active(&ep->wq))
@@ -1282,6 +1348,18 @@ static int ep_send_events_proc(struct eventpoll *ep, struct list_head *head,
!list_empty(head) && eventcnt < esed->maxevents;) {
epi = list_first_entry(head, struct epitem, rdllink);
+ /*
+ * Activate ep->ws before deactivating epi->ws to prevent
+ * triggering auto-suspend here (in case we reactivate epi->ws
+ * below).
+ *
+ * This could be rearranged to delay the deactivation of epi->ws
+ * instead, but then epi->ws would temporarily be out of sync
+ * with ep_is_linked().
+ */
+ if (epi->ws && epi->ws->active)
+ __pm_stay_awake(ep->ws);
+ __pm_relax(epi->ws);
list_del_init(&epi->rdllink);
pt._key = epi->event.events;
@@ -1298,6 +1376,7 @@ static int ep_send_events_proc(struct eventpoll *ep, struct list_head *head,
if (__put_user(revents, &uevent->events) ||
__put_user(epi->event.data, &uevent->data)) {
list_add(&epi->rdllink, head);
+ __pm_stay_awake(epi->ws);
return eventcnt ? eventcnt : -EFAULT;
}
eventcnt++;
@@ -1317,6 +1396,7 @@ static int ep_send_events_proc(struct eventpoll *ep, struct list_head *head,
* poll callback will queue them in ep->ovflist.
*/
list_add_tail(&epi->rdllink, &ep->rdllist);
+ __pm_stay_awake(epi->ws);
}
}
}
@@ -1629,6 +1709,10 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
if (!tfile->f_op || !tfile->f_op->poll)
goto error_tgt_fput;
+ /* Check if EPOLLWAKEUP is allowed */
+ if ((epds.events & EPOLLWAKEUP) && !capable(CAP_EPOLLWAKEUP))
+ epds.events &= ~EPOLLWAKEUP;
+
/*
* We have to check that the file structure underneath the file descriptor
* the user passed to us _is_ an eventpoll file. And also we do not permit
diff --git a/fs/exec.c b/fs/exec.c
index b1fd2025e59..52c9e2ff6e6 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1139,7 +1139,7 @@ void setup_new_exec(struct linux_binprm * bprm)
/* This is the point of no return */
current->sas_ss_sp = current->sas_ss_size = 0;
- if (current_euid() == current_uid() && current_egid() == current_gid())
+ if (uid_eq(current_euid(), current_uid()) && gid_eq(current_egid(), current_gid()))
set_dumpable(current->mm, 1);
else
set_dumpable(current->mm, suid_dumpable);
@@ -1153,8 +1153,8 @@ void setup_new_exec(struct linux_binprm * bprm)
current->mm->task_size = TASK_SIZE;
/* install the new credentials */
- if (bprm->cred->uid != current_euid() ||
- bprm->cred->gid != current_egid()) {
+ if (!uid_eq(bprm->cred->uid, current_euid()) ||
+ !gid_eq(bprm->cred->gid, current_egid())) {
current->pdeath_signal = 0;
} else {
would_dump(bprm, bprm->file);
@@ -1245,6 +1245,13 @@ static int check_unsafe_exec(struct linux_binprm *bprm)
bprm->unsafe |= LSM_UNSAFE_PTRACE;
}
+ /*
+ * This isn't strictly necessary, but it makes it harder for LSMs to
+ * mess up.
+ */
+ if (current->no_new_privs)
+ bprm->unsafe |= LSM_UNSAFE_NO_NEW_PRIVS;
+
n_fs = 1;
spin_lock(&p->fs->lock);
rcu_read_lock();
@@ -1288,11 +1295,15 @@ int prepare_binprm(struct linux_binprm *bprm)
bprm->cred->euid = current_euid();
bprm->cred->egid = current_egid();
- if (!(bprm->file->f_path.mnt->mnt_flags & MNT_NOSUID)) {
+ if (!(bprm->file->f_path.mnt->mnt_flags & MNT_NOSUID) &&
+ !current->no_new_privs) {
/* Set-uid? */
if (mode & S_ISUID) {
+ if (!kuid_has_mapping(bprm->cred->user_ns, inode->i_uid))
+ return -EPERM;
bprm->per_clear |= PER_CLEAR_ON_SETID;
bprm->cred->euid = inode->i_uid;
+
}
/* Set-gid? */
@@ -1302,6 +1313,8 @@ int prepare_binprm(struct linux_binprm *bprm)
* executable.
*/
if ((mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP)) {
+ if (!kgid_has_mapping(bprm->cred->user_ns, inode->i_gid))
+ return -EPERM;
bprm->per_clear |= PER_CLEAR_ON_SETID;
bprm->cred->egid = inode->i_gid;
}
@@ -1930,8 +1943,21 @@ static int coredump_wait(int exit_code, struct core_state *core_state)
core_waiters = zap_threads(tsk, mm, core_state, exit_code);
up_write(&mm->mmap_sem);
- if (core_waiters > 0)
+ if (core_waiters > 0) {
+ struct core_thread *ptr;
+
wait_for_completion(&core_state->startup);
+ /*
+ * Wait for all the threads to become inactive, so that
+ * all the thread context (extended register state, like
+ * fpu etc) gets copied to the memory.
+ */
+ ptr = core_state->dumper.next;
+ while (ptr != NULL) {
+ wait_task_inactive(ptr->task, 0);
+ ptr = ptr->next;
+ }
+ }
return core_waiters;
}
@@ -2121,7 +2147,7 @@ void do_coredump(long signr, int exit_code, struct pt_regs *regs)
if (__get_dumpable(cprm.mm_flags) == 2) {
/* Setuid core dump mode */
flag = O_EXCL; /* Stop rewrite attacks */
- cred->fsuid = 0; /* Dump root private */
+ cred->fsuid = GLOBAL_ROOT_UID; /* Dump root private */
}
retval = coredump_wait(exit_code, &core_state);
@@ -2222,7 +2248,7 @@ void do_coredump(long signr, int exit_code, struct pt_regs *regs)
* Dont allow local users get cute and trick others to coredump
* into their pre-created files.
*/
- if (inode->i_uid != current_fsuid())
+ if (!uid_eq(inode->i_uid, current_fsuid()))
goto close_fail;
if (!cprm.file->f_op || !cprm.file->f_op->write)
goto close_fail;
diff --git a/fs/ext2/balloc.c b/fs/ext2/balloc.c
index a8cbe1bc6ad..1c361399886 100644
--- a/fs/ext2/balloc.c
+++ b/fs/ext2/balloc.c
@@ -165,7 +165,6 @@ static void release_blocks(struct super_block *sb, int count)
struct ext2_sb_info *sbi = EXT2_SB(sb);
percpu_counter_add(&sbi->s_freeblocks_counter, count);
- sb->s_dirt = 1;
}
}
@@ -180,7 +179,6 @@ static void group_adjust_blocks(struct super_block *sb, int group_no,
free_blocks = le16_to_cpu(desc->bg_free_blocks_count);
desc->bg_free_blocks_count = cpu_to_le16(free_blocks + count);
spin_unlock(sb_bgl_lock(sbi, group_no));
- sb->s_dirt = 1;
mark_buffer_dirty(bh);
}
}
@@ -479,7 +477,7 @@ void ext2_discard_reservation(struct inode *inode)
}
/**
- * ext2_free_blocks_sb() -- Free given blocks and update quota and i_blocks
+ * ext2_free_blocks() -- Free given blocks and update quota and i_blocks
* @inode: inode
* @block: start physcial block to free
* @count: number of blocks to free
@@ -1193,8 +1191,9 @@ static int ext2_has_free_blocks(struct ext2_sb_info *sbi)
free_blocks = percpu_counter_read_positive(&sbi->s_freeblocks_counter);
root_blocks = le32_to_cpu(sbi->s_es->s_r_blocks_count);
if (free_blocks < root_blocks + 1 && !capable(CAP_SYS_RESOURCE) &&
- sbi->s_resuid != current_fsuid() &&
- (sbi->s_resgid == 0 || !in_group_p (sbi->s_resgid))) {
+ !uid_eq(sbi->s_resuid, current_fsuid()) &&
+ (gid_eq(sbi->s_resgid, GLOBAL_ROOT_GID) ||
+ !in_group_p (sbi->s_resgid))) {
return 0;
}
return 1;
diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h
index 0b2b4db5bdc..d9a17d0b124 100644
--- a/fs/ext2/ext2.h
+++ b/fs/ext2/ext2.h
@@ -82,8 +82,8 @@ struct ext2_sb_info {
struct buffer_head ** s_group_desc;
unsigned long s_mount_opt;
unsigned long s_sb_block;
- uid_t s_resuid;
- gid_t s_resgid;
+ kuid_t s_resuid;
+ kgid_t s_resgid;
unsigned short s_mount_state;
unsigned short s_pad;
int s_addr_per_block_bits;
@@ -637,8 +637,8 @@ static inline void verify_offsets(void)
*/
struct ext2_mount_options {
unsigned long s_mount_opt;
- uid_t s_resuid;
- gid_t s_resgid;
+ kuid_t s_resuid;
+ kgid_t s_resgid;
};
/*
diff --git a/fs/ext2/ialloc.c b/fs/ext2/ialloc.c
index 8b15cf8cef3..c13eb7b91a1 100644
--- a/fs/ext2/ialloc.c
+++ b/fs/ext2/ialloc.c
@@ -81,7 +81,6 @@ static void ext2_release_inode(struct super_block *sb, int group, int dir)
spin_unlock(sb_bgl_lock(EXT2_SB(sb), group));
if (dir)
percpu_counter_dec(&EXT2_SB(sb)->s_dirs_counter);
- sb->s_dirt = 1;
mark_buffer_dirty(bh);
}
@@ -543,7 +542,6 @@ got:
}
spin_unlock(sb_bgl_lock(sbi, group));
- sb->s_dirt = 1;
mark_buffer_dirty(bh2);
if (test_opt(sb, GRPID)) {
inode->i_mode = mode;
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index 740cad8dcd8..f9fa95f8443 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -1293,6 +1293,8 @@ struct inode *ext2_iget (struct super_block *sb, unsigned long ino)
struct inode *inode;
long ret = -EIO;
int n;
+ uid_t i_uid;
+ gid_t i_gid;
inode = iget_locked(sb, ino);
if (!inode)
@@ -1310,12 +1312,14 @@ struct inode *ext2_iget (struct super_block *sb, unsigned long ino)
}
inode->i_mode = le16_to_cpu(raw_inode->i_mode);
- inode->i_uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low);
- inode->i_gid = (gid_t)le16_to_cpu(raw_inode->i_gid_low);
+ i_uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low);
+ i_gid = (gid_t)le16_to_cpu(raw_inode->i_gid_low);
if (!(test_opt (inode->i_sb, NO_UID32))) {
- inode->i_uid |= le16_to_cpu(raw_inode->i_uid_high) << 16;
- inode->i_gid |= le16_to_cpu(raw_inode->i_gid_high) << 16;
+ i_uid |= le16_to_cpu(raw_inode->i_uid_high) << 16;
+ i_gid |= le16_to_cpu(raw_inode->i_gid_high) << 16;
}
+ i_uid_write(inode, i_uid);
+ i_gid_write(inode, i_gid);
set_nlink(inode, le16_to_cpu(raw_inode->i_links_count));
inode->i_size = le32_to_cpu(raw_inode->i_size);
inode->i_atime.tv_sec = (signed)le32_to_cpu(raw_inode->i_atime);
@@ -1413,8 +1417,8 @@ static int __ext2_write_inode(struct inode *inode, int do_sync)
struct ext2_inode_info *ei = EXT2_I(inode);
struct super_block *sb = inode->i_sb;
ino_t ino = inode->i_ino;
- uid_t uid = inode->i_uid;
- gid_t gid = inode->i_gid;
+ uid_t uid = i_uid_read(inode);
+ gid_t gid = i_gid_read(inode);
struct buffer_head * bh;
struct ext2_inode * raw_inode = ext2_get_inode(sb, ino, &bh);
int n;
@@ -1529,8 +1533,8 @@ int ext2_setattr(struct dentry *dentry, struct iattr *iattr)
if (is_quota_modification(inode, iattr))
dquot_initialize(inode);
- if ((iattr->ia_valid & ATTR_UID && iattr->ia_uid != inode->i_uid) ||
- (iattr->ia_valid & ATTR_GID && iattr->ia_gid != inode->i_gid)) {
+ if ((iattr->ia_valid & ATTR_UID && !uid_eq(iattr->ia_uid, inode->i_uid)) ||
+ (iattr->ia_valid & ATTR_GID && !gid_eq(iattr->ia_gid, inode->i_gid))) {
error = dquot_transfer(inode, iattr);
if (error)
return error;
diff --git a/fs/ext2/namei.c b/fs/ext2/namei.c
index dffb8653628..f663a67d7bf 100644
--- a/fs/ext2/namei.c
+++ b/fs/ext2/namei.c
@@ -79,7 +79,7 @@ static struct dentry *ext2_lookup(struct inode * dir, struct dentry *dentry, str
struct dentry *ext2_get_parent(struct dentry *child)
{
- struct qstr dotdot = {.name = "..", .len = 2};
+ struct qstr dotdot = QSTR_INIT("..", 2);
unsigned long ino = ext2_inode_by_name(child->d_inode, &dotdot);
if (!ino)
return ERR_PTR(-ENOENT);
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index e1025c7a437..b3621cb7ea3 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -130,9 +130,6 @@ static void ext2_put_super (struct super_block * sb)
dquot_disable(sb, -1, DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED);
- if (sb->s_dirt)
- ext2_write_super(sb);
-
ext2_xattr_put_super(sb);
if (!(sb->s_flags & MS_RDONLY)) {
struct ext2_super_block *es = sbi->s_es;
@@ -228,13 +225,15 @@ static int ext2_show_options(struct seq_file *seq, struct dentry *root)
seq_puts(seq, ",grpid");
if (!test_opt(sb, GRPID) && (def_mount_opts & EXT2_DEFM_BSDGROUPS))
seq_puts(seq, ",nogrpid");
- if (sbi->s_resuid != EXT2_DEF_RESUID ||
+ if (!uid_eq(sbi->s_resuid, make_kuid(&init_user_ns, EXT2_DEF_RESUID)) ||
le16_to_cpu(es->s_def_resuid) != EXT2_DEF_RESUID) {
- seq_printf(seq, ",resuid=%u", sbi->s_resuid);
+ seq_printf(seq, ",resuid=%u",
+ from_kuid_munged(&init_user_ns, sbi->s_resuid));
}
- if (sbi->s_resgid != EXT2_DEF_RESGID ||
+ if (!gid_eq(sbi->s_resgid, make_kgid(&init_user_ns, EXT2_DEF_RESGID)) ||
le16_to_cpu(es->s_def_resgid) != EXT2_DEF_RESGID) {
- seq_printf(seq, ",resgid=%u", sbi->s_resgid);
+ seq_printf(seq, ",resgid=%u",
+ from_kgid_munged(&init_user_ns, sbi->s_resgid));
}
if (test_opt(sb, ERRORS_RO)) {
int def_errors = le16_to_cpu(es->s_errors);
@@ -305,7 +304,6 @@ static const struct super_operations ext2_sops = {
.write_inode = ext2_write_inode,
.evict_inode = ext2_evict_inode,
.put_super = ext2_put_super,
- .write_super = ext2_write_super,
.sync_fs = ext2_sync_fs,
.statfs = ext2_statfs,
.remount_fs = ext2_remount,
@@ -356,11 +354,6 @@ static struct dentry *ext2_fh_to_parent(struct super_block *sb, struct fid *fid,
ext2_nfs_get_inode);
}
-/* Yes, most of these are left as NULL!!
- * A NULL value implies the default, which works with ext2-like file
- * systems, but can be improved upon.
- * Currently only get_parent is required.
- */
static const struct export_operations ext2_export_ops = {
.fh_to_dentry = ext2_fh_to_dentry,
.fh_to_parent = ext2_fh_to_parent,
@@ -436,6 +429,8 @@ static int parse_options(char *options, struct super_block *sb)
struct ext2_sb_info *sbi = EXT2_SB(sb);
substring_t args[MAX_OPT_ARGS];
int option;
+ kuid_t uid;
+ kgid_t gid;
if (!options)
return 1;
@@ -462,12 +457,23 @@ static int parse_options(char *options, struct super_block *sb)
case Opt_resuid:
if (match_int(&args[0], &option))
return 0;
- sbi->s_resuid = option;
+ uid = make_kuid(current_user_ns(), option);
+ if (!uid_valid(uid)) {
+ ext2_msg(sb, KERN_ERR, "Invalid uid value %d", option);
+ return -1;
+
+ }
+ sbi->s_resuid = uid;
break;
case Opt_resgid:
if (match_int(&args[0], &option))
return 0;
- sbi->s_resgid = option;
+ gid = make_kgid(current_user_ns(), option);
+ if (!gid_valid(gid)) {
+ ext2_msg(sb, KERN_ERR, "Invalid gid value %d", option);
+ return -1;
+ }
+ sbi->s_resgid = gid;
break;
case Opt_sb:
/* handled by get_sb_block() instead of here */
@@ -841,8 +847,8 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
else
set_opt(sbi->s_mount_opt, ERRORS_RO);
- sbi->s_resuid = le16_to_cpu(es->s_def_resuid);
- sbi->s_resgid = le16_to_cpu(es->s_def_resgid);
+ sbi->s_resuid = make_kuid(&init_user_ns, le16_to_cpu(es->s_def_resuid));
+ sbi->s_resgid = make_kgid(&init_user_ns, le16_to_cpu(es->s_def_resgid));
set_opt(sbi->s_mount_opt, RESERVATION);
@@ -1161,7 +1167,6 @@ static void ext2_sync_super(struct super_block *sb, struct ext2_super_block *es,
mark_buffer_dirty(EXT2_SB(sb)->s_sbh);
if (wait)
sync_dirty_buffer(EXT2_SB(sb)->s_sbh);
- sb->s_dirt = 0;
}
/*
@@ -1194,8 +1199,6 @@ void ext2_write_super(struct super_block *sb)
{
if (!(sb->s_flags & MS_RDONLY))
ext2_sync_fs(sb, 1);
- else
- sb->s_dirt = 0;
}
static int ext2_remount (struct super_block * sb, int * flags, char * data)
@@ -1441,7 +1444,6 @@ static ssize_t ext2_quota_write(struct super_block *sb, int type,
struct buffer_head tmp_bh;
struct buffer_head *bh;
- mutex_lock_nested(&inode->i_mutex, I_MUTEX_QUOTA);
while (towrite > 0) {
tocopy = sb->s_blocksize - offset < towrite ?
sb->s_blocksize - offset : towrite;
@@ -1471,16 +1473,13 @@ static ssize_t ext2_quota_write(struct super_block *sb, int type,
blk++;
}
out:
- if (len == towrite) {
- mutex_unlock(&inode->i_mutex);
+ if (len == towrite)
return err;
- }
if (inode->i_size < off+len-towrite)
i_size_write(inode, off+len-towrite);
inode->i_version++;
inode->i_mtime = inode->i_ctime = CURRENT_TIME;
mark_inode_dirty(inode);
- mutex_unlock(&inode->i_mutex);
return len - towrite;
}
diff --git a/fs/ext2/xattr.c b/fs/ext2/xattr.c
index 6dcafc7efdf..b6754dbbce3 100644
--- a/fs/ext2/xattr.c
+++ b/fs/ext2/xattr.c
@@ -339,7 +339,6 @@ static void ext2_xattr_update_super_block(struct super_block *sb)
spin_lock(&EXT2_SB(sb)->s_lock);
EXT2_SET_COMPAT_FEATURE(sb, EXT2_FEATURE_COMPAT_EXT_ATTR);
spin_unlock(&EXT2_SB(sb)->s_lock);
- sb->s_dirt = 1;
mark_buffer_dirty(EXT2_SB(sb)->s_sbh);
}
diff --git a/fs/ext3/balloc.c b/fs/ext3/balloc.c
index baac1b129fb..25cd6089211 100644
--- a/fs/ext3/balloc.c
+++ b/fs/ext3/balloc.c
@@ -1439,8 +1439,9 @@ static int ext3_has_free_blocks(struct ext3_sb_info *sbi, int use_reservation)
free_blocks = percpu_counter_read_positive(&sbi->s_freeblocks_counter);
root_blocks = le32_to_cpu(sbi->s_es->s_r_blocks_count);
if (free_blocks < root_blocks + 1 && !capable(CAP_SYS_RESOURCE) &&
- !use_reservation && sbi->s_resuid != current_fsuid() &&
- (sbi->s_resgid == 0 || !in_group_p (sbi->s_resgid))) {
+ !use_reservation && !uid_eq(sbi->s_resuid, current_fsuid()) &&
+ (gid_eq(sbi->s_resgid, GLOBAL_ROOT_GID) ||
+ !in_group_p (sbi->s_resgid))) {
return 0;
}
return 1;
diff --git a/fs/ext3/dir.c b/fs/ext3/dir.c
index cc761ad8fa5..92490e9f85c 100644
--- a/fs/ext3/dir.c
+++ b/fs/ext3/dir.c
@@ -21,30 +21,15 @@
*
*/
+#include <linux/compat.h>
#include "ext3.h"
static unsigned char ext3_filetype_table[] = {
DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK
};
-static int ext3_readdir(struct file *, void *, filldir_t);
static int ext3_dx_readdir(struct file * filp,
void * dirent, filldir_t filldir);
-static int ext3_release_dir (struct inode * inode,
- struct file * filp);
-
-const struct file_operations ext3_dir_operations = {
- .llseek = generic_file_llseek,
- .read = generic_read_dir,
- .readdir = ext3_readdir, /* we take BKL. needed?*/
- .unlocked_ioctl = ext3_ioctl,
-#ifdef CONFIG_COMPAT
- .compat_ioctl = ext3_compat_ioctl,
-#endif
- .fsync = ext3_sync_file, /* BKL held */
- .release = ext3_release_dir,
-};
-
static unsigned char get_dtype(struct super_block *sb, int filetype)
{
@@ -55,6 +40,25 @@ static unsigned char get_dtype(struct super_block *sb, int filetype)
return (ext3_filetype_table[filetype]);
}
+/**
+ * Check if the given dir-inode refers to an htree-indexed directory
+ * (or a directory which could potentially get converted to use htree
+ * indexing).
+ *
+ * Return 1 if it is a dx dir, 0 if not
+ */
+static int is_dx_dir(struct inode *inode)
+{
+ struct super_block *sb = inode->i_sb;
+
+ if (EXT3_HAS_COMPAT_FEATURE(inode->i_sb,
+ EXT3_FEATURE_COMPAT_DIR_INDEX) &&
+ ((EXT3_I(inode)->i_flags & EXT3_INDEX_FL) ||
+ ((inode->i_size >> sb->s_blocksize_bits) == 1)))
+ return 1;
+
+ return 0;
+}
int ext3_check_dir_entry (const char * function, struct inode * dir,
struct ext3_dir_entry_2 * de,
@@ -94,18 +98,13 @@ static int ext3_readdir(struct file * filp,
unsigned long offset;
int i, stored;
struct ext3_dir_entry_2 *de;
- struct super_block *sb;
int err;
struct inode *inode = filp->f_path.dentry->d_inode;
+ struct super_block *sb = inode->i_sb;
int ret = 0;
int dir_has_error = 0;
- sb = inode->i_sb;
-
- if (EXT3_HAS_COMPAT_FEATURE(inode->i_sb,
- EXT3_FEATURE_COMPAT_DIR_INDEX) &&
- ((EXT3_I(inode)->i_flags & EXT3_INDEX_FL) ||
- ((inode->i_size >> sb->s_blocksize_bits) == 1))) {
+ if (is_dx_dir(inode)) {
err = ext3_dx_readdir(filp, dirent, filldir);
if (err != ERR_BAD_DX_DIR) {
ret = err;
@@ -227,22 +226,87 @@ out:
return ret;
}
+static inline int is_32bit_api(void)
+{
+#ifdef CONFIG_COMPAT
+ return is_compat_task();
+#else
+ return (BITS_PER_LONG == 32);
+#endif
+}
+
/*
* These functions convert from the major/minor hash to an f_pos
- * value.
+ * value for dx directories
*
- * Currently we only use major hash numer. This is unfortunate, but
- * on 32-bit machines, the same VFS interface is used for lseek and
- * llseek, so if we use the 64 bit offset, then the 32-bit versions of
- * lseek/telldir/seekdir will blow out spectacularly, and from within
- * the ext2 low-level routine, we don't know if we're being called by
- * a 64-bit version of the system call or the 32-bit version of the
- * system call. Worse yet, NFSv2 only allows for a 32-bit readdir
- * cookie. Sigh.
+ * Upper layers (for example NFS) should specify FMODE_32BITHASH or
+ * FMODE_64BITHASH explicitly. On the other hand, we allow ext3 to be mounted
+ * directly on both 32-bit and 64-bit nodes; in that case, neither
+ * FMODE_32BITHASH nor FMODE_64BITHASH is specified.
*/
-#define hash2pos(major, minor) (major >> 1)
-#define pos2maj_hash(pos) ((pos << 1) & 0xffffffff)
-#define pos2min_hash(pos) (0)
+static inline loff_t hash2pos(struct file *filp, __u32 major, __u32 minor)
+{
+ if ((filp->f_mode & FMODE_32BITHASH) ||
+ (!(filp->f_mode & FMODE_64BITHASH) && is_32bit_api()))
+ return major >> 1;
+ else
+ return ((__u64)(major >> 1) << 32) | (__u64)minor;
+}
+
+static inline __u32 pos2maj_hash(struct file *filp, loff_t pos)
+{
+ if ((filp->f_mode & FMODE_32BITHASH) ||
+ (!(filp->f_mode & FMODE_64BITHASH) && is_32bit_api()))
+ return (pos << 1) & 0xffffffff;
+ else
+ return ((pos >> 32) << 1) & 0xffffffff;
+}
+
+static inline __u32 pos2min_hash(struct file *filp, loff_t pos)
+{
+ if ((filp->f_mode & FMODE_32BITHASH) ||
+ (!(filp->f_mode & FMODE_64BITHASH) && is_32bit_api()))
+ return 0;
+ else
+ return pos & 0xffffffff;
+}
+
+/*
+ * Return 32- or 64-bit end-of-file for dx directories
+ */
+static inline loff_t ext3_get_htree_eof(struct file *filp)
+{
+ if ((filp->f_mode & FMODE_32BITHASH) ||
+ (!(filp->f_mode & FMODE_64BITHASH) && is_32bit_api()))
+ return EXT3_HTREE_EOF_32BIT;
+ else
+ return EXT3_HTREE_EOF_64BIT;
+}
+
+
+/*
+ * ext3_dir_llseek() calls generic_file_llseek[_size]() to handle both
+ * non-htree and htree directories, where the "offset" is in terms
+ * of the filename hash value instead of the byte offset.
+ *
+ * Because we may return a 64-bit hash that is well beyond s_maxbytes,
+ * we need to pass the max hash as the maximum allowable offset in
+ * the htree directory case.
+ *
+ * NOTE: offsets obtained *before* ext3_set_inode_flag(dir, EXT3_INODE_INDEX)
+ * will be invalid once the directory has been converted into a dx directory.
+ */
+loff_t ext3_dir_llseek(struct file *file, loff_t offset, int origin)
+{
+ struct inode *inode = file->f_mapping->host;
+ int dx_dir = is_dx_dir(inode);
+
+ if (likely(dx_dir))
+ return generic_file_llseek_size(file, offset, origin,
+ ext3_get_htree_eof(file));
+ else
+ return generic_file_llseek(file, offset, origin);
+}
/*
* This structure holds the nodes of the red-black tree used to store
@@ -303,15 +367,16 @@ static void free_rb_tree_fname(struct rb_root *root)
}
-static struct dir_private_info *ext3_htree_create_dir_info(loff_t pos)
+static struct dir_private_info *ext3_htree_create_dir_info(struct file *filp,
+ loff_t pos)
{
struct dir_private_info *p;
p = kzalloc(sizeof(struct dir_private_info), GFP_KERNEL);
if (!p)
return NULL;
- p->curr_hash = pos2maj_hash(pos);
- p->curr_minor_hash = pos2min_hash(pos);
+ p->curr_hash = pos2maj_hash(filp, pos);
+ p->curr_minor_hash = pos2min_hash(filp, pos);
return p;
}
@@ -401,7 +466,7 @@ static int call_filldir(struct file * filp, void * dirent,
printk("call_filldir: called with null fname?!?\n");
return 0;
}
- curr_pos = hash2pos(fname->hash, fname->minor_hash);
+ curr_pos = hash2pos(filp, fname->hash, fname->minor_hash);
while (fname) {
error = filldir(dirent, fname->name,
fname->name_len, curr_pos,
@@ -426,13 +491,13 @@ static int ext3_dx_readdir(struct file * filp,
int ret;
if (!info) {
- info = ext3_htree_create_dir_info(filp->f_pos);
+ info = ext3_htree_create_dir_info(filp, filp->f_pos);
if (!info)
return -ENOMEM;
filp->private_data = info;
}
- if (filp->f_pos == EXT3_HTREE_EOF)
+ if (filp->f_pos == ext3_get_htree_eof(filp))
return 0; /* EOF */
/* Some one has messed with f_pos; reset the world */
@@ -440,8 +505,8 @@ static int ext3_dx_readdir(struct file * filp,
free_rb_tree_fname(&info->root);
info->curr_node = NULL;
info->extra_fname = NULL;
- info->curr_hash = pos2maj_hash(filp->f_pos);
- info->curr_minor_hash = pos2min_hash(filp->f_pos);
+ info->curr_hash = pos2maj_hash(filp, filp->f_pos);
+ info->curr_minor_hash = pos2min_hash(filp, filp->f_pos);
}
/*
@@ -473,7 +538,7 @@ static int ext3_dx_readdir(struct file * filp,
if (ret < 0)
return ret;
if (ret == 0) {
- filp->f_pos = EXT3_HTREE_EOF;
+ filp->f_pos = ext3_get_htree_eof(filp);
break;
}
info->curr_node = rb_first(&info->root);
@@ -493,7 +558,7 @@ static int ext3_dx_readdir(struct file * filp,
info->curr_minor_hash = fname->minor_hash;
} else {
if (info->next_hash == ~0) {
- filp->f_pos = EXT3_HTREE_EOF;
+ filp->f_pos = ext3_get_htree_eof(filp);
break;
}
info->curr_hash = info->next_hash;
@@ -512,3 +577,15 @@ static int ext3_release_dir (struct inode * inode, struct file * filp)
return 0;
}
+
+const struct file_operations ext3_dir_operations = {
+ .llseek = ext3_dir_llseek,
+ .read = generic_read_dir,
+ .readdir = ext3_readdir,
+ .unlocked_ioctl = ext3_ioctl,
+#ifdef CONFIG_COMPAT
+ .compat_ioctl = ext3_compat_ioctl,
+#endif
+ .fsync = ext3_sync_file,
+ .release = ext3_release_dir,
+};
diff --git a/fs/ext3/ext3.h b/fs/ext3/ext3.h
index b6515fd7e56..e85ff15a060 100644
--- a/fs/ext3/ext3.h
+++ b/fs/ext3/ext3.h
@@ -243,8 +243,8 @@ struct ext3_new_group_data {
*/
struct ext3_mount_options {
unsigned long s_mount_opt;
- uid_t s_resuid;
- gid_t s_resgid;
+ kuid_t s_resuid;
+ kgid_t s_resgid;
unsigned long s_commit_interval;
#ifdef CONFIG_QUOTA
int s_jquota_fmt;
@@ -637,8 +637,8 @@ struct ext3_sb_info {
struct buffer_head ** s_group_desc;
unsigned long s_mount_opt;
ext3_fsblk_t s_sb_block;
- uid_t s_resuid;
- gid_t s_resgid;
+ kuid_t s_resuid;
+ kgid_t s_resgid;
unsigned short s_mount_state;
unsigned short s_pad;
int s_addr_per_block_bits;
@@ -920,7 +920,11 @@ struct dx_hash_info
u32 *seed;
};
-#define EXT3_HTREE_EOF 0x7fffffff
+
+/* 32 and 64 bit signed EOF for dx directories */
+#define EXT3_HTREE_EOF_32BIT ((1UL << (32 - 1)) - 1)
+#define EXT3_HTREE_EOF_64BIT ((1ULL << (64 - 1)) - 1)
+
/*
* Control parameters used by ext3_htree_next_block
diff --git a/fs/ext3/hash.c b/fs/ext3/hash.c
index d10231ddcf8..ede315cdf12 100644
--- a/fs/ext3/hash.c
+++ b/fs/ext3/hash.c
@@ -198,8 +198,8 @@ int ext3fs_dirhash(const char *name, int len, struct dx_hash_info *hinfo)
return -1;
}
hash = hash & ~1;
- if (hash == (EXT3_HTREE_EOF << 1))
- hash = (EXT3_HTREE_EOF-1) << 1;
+ if (hash == (EXT3_HTREE_EOF_32BIT << 1))
+ hash = (EXT3_HTREE_EOF_32BIT - 1) << 1;
hinfo->hash = hash;
hinfo->minor_hash = minor_hash;
return 0;
diff --git a/fs/ext3/ialloc.c b/fs/ext3/ialloc.c
index e3c39e4cec1..082afd78b10 100644
--- a/fs/ext3/ialloc.c
+++ b/fs/ext3/ialloc.c
@@ -180,8 +180,7 @@ error_return:
* It's OK to put directory into a group unless
* it has too many directories already (max_dirs) or
* it has too few free inodes left (min_inodes) or
- * it has too few free blocks left (min_blocks) or
- * it's already running too large debt (max_debt).
+ * it has too few free blocks left (min_blocks).
* Parent's group is preferred, if it doesn't satisfy these
* conditions we search cyclically through the rest. If none
* of the groups look good we just look for a group with more
@@ -191,21 +190,16 @@ error_return:
* when we allocate an inode, within 0--255.
*/
-#define INODE_COST 64
-#define BLOCK_COST 256
-
static int find_group_orlov(struct super_block *sb, struct inode *parent)
{
int parent_group = EXT3_I(parent)->i_block_group;
struct ext3_sb_info *sbi = EXT3_SB(sb);
- struct ext3_super_block *es = sbi->s_es;
int ngroups = sbi->s_groups_count;
int inodes_per_group = EXT3_INODES_PER_GROUP(sb);
unsigned int freei, avefreei;
ext3_fsblk_t freeb, avefreeb;
- ext3_fsblk_t blocks_per_dir;
unsigned int ndirs;
- int max_debt, max_dirs, min_inodes;
+ int max_dirs, min_inodes;
ext3_grpblk_t min_blocks;
int group = -1, i;
struct ext3_group_desc *desc;
@@ -242,20 +236,10 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent)
goto fallback;
}
- blocks_per_dir = (le32_to_cpu(es->s_blocks_count) - freeb) / ndirs;
-
max_dirs = ndirs / ngroups + inodes_per_group / 16;
min_inodes = avefreei - inodes_per_group / 4;
min_blocks = avefreeb - EXT3_BLOCKS_PER_GROUP(sb) / 4;
- max_debt = EXT3_BLOCKS_PER_GROUP(sb) / max(blocks_per_dir, (ext3_fsblk_t)BLOCK_COST);
- if (max_debt * INODE_COST > inodes_per_group)
- max_debt = inodes_per_group / INODE_COST;
- if (max_debt > 255)
- max_debt = 255;
- if (max_debt == 0)
- max_debt = 1;
-
for (i = 0; i < ngroups; i++) {
group = (parent_group + i) % ngroups;
desc = ext3_get_group_desc (sb, group, NULL);
diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
index 10d7812f602..a09790a412b 100644
--- a/fs/ext3/inode.c
+++ b/fs/ext3/inode.c
@@ -2891,6 +2891,8 @@ struct inode *ext3_iget(struct super_block *sb, unsigned long ino)
transaction_t *transaction;
long ret;
int block;
+ uid_t i_uid;
+ gid_t i_gid;
inode = iget_locked(sb, ino);
if (!inode)
@@ -2907,12 +2909,14 @@ struct inode *ext3_iget(struct super_block *sb, unsigned long ino)
bh = iloc.bh;
raw_inode = ext3_raw_inode(&iloc);
inode->i_mode = le16_to_cpu(raw_inode->i_mode);
- inode->i_uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low);
- inode->i_gid = (gid_t)le16_to_cpu(raw_inode->i_gid_low);
+ i_uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low);
+ i_gid = (gid_t)le16_to_cpu(raw_inode->i_gid_low);
if(!(test_opt (inode->i_sb, NO_UID32))) {
- inode->i_uid |= le16_to_cpu(raw_inode->i_uid_high) << 16;
- inode->i_gid |= le16_to_cpu(raw_inode->i_gid_high) << 16;
+ i_uid |= le16_to_cpu(raw_inode->i_uid_high) << 16;
+ i_gid |= le16_to_cpu(raw_inode->i_gid_high) << 16;
}
+ i_uid_write(inode, i_uid);
+ i_gid_write(inode, i_gid);
set_nlink(inode, le16_to_cpu(raw_inode->i_links_count));
inode->i_size = le32_to_cpu(raw_inode->i_size);
inode->i_atime.tv_sec = (signed)le32_to_cpu(raw_inode->i_atime);
@@ -3068,6 +3072,8 @@ static int ext3_do_update_inode(handle_t *handle,
struct ext3_inode_info *ei = EXT3_I(inode);
struct buffer_head *bh = iloc->bh;
int err = 0, rc, block;
+ uid_t i_uid;
+ gid_t i_gid;
again:
/* we can't allow multiple procs in here at once, its a bit racey */
@@ -3080,27 +3086,29 @@ again:
ext3_get_inode_flags(ei);
raw_inode->i_mode = cpu_to_le16(inode->i_mode);
+ i_uid = i_uid_read(inode);
+ i_gid = i_gid_read(inode);
if(!(test_opt(inode->i_sb, NO_UID32))) {
- raw_inode->i_uid_low = cpu_to_le16(low_16_bits(inode->i_uid));
- raw_inode->i_gid_low = cpu_to_le16(low_16_bits(inode->i_gid));
+ raw_inode->i_uid_low = cpu_to_le16(low_16_bits(i_uid));
+ raw_inode->i_gid_low = cpu_to_le16(low_16_bits(i_gid));
/*
* Fix up interoperability with old kernels. Otherwise, old inodes get
* re-used with the upper 16 bits of the uid/gid intact
*/
if(!ei->i_dtime) {
raw_inode->i_uid_high =
- cpu_to_le16(high_16_bits(inode->i_uid));
+ cpu_to_le16(high_16_bits(i_uid));
raw_inode->i_gid_high =
- cpu_to_le16(high_16_bits(inode->i_gid));
+ cpu_to_le16(high_16_bits(i_gid));
} else {
raw_inode->i_uid_high = 0;
raw_inode->i_gid_high = 0;
}
} else {
raw_inode->i_uid_low =
- cpu_to_le16(fs_high2lowuid(inode->i_uid));
+ cpu_to_le16(fs_high2lowuid(i_uid));
raw_inode->i_gid_low =
- cpu_to_le16(fs_high2lowgid(inode->i_gid));
+ cpu_to_le16(fs_high2lowgid(i_gid));
raw_inode->i_uid_high = 0;
raw_inode->i_gid_high = 0;
}
@@ -3262,8 +3270,8 @@ int ext3_setattr(struct dentry *dentry, struct iattr *attr)
if (is_quota_modification(inode, attr))
dquot_initialize(inode);
- if ((ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) ||
- (ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) {
+ if ((ia_valid & ATTR_UID && !uid_eq(attr->ia_uid, inode->i_uid)) ||
+ (ia_valid & ATTR_GID && !gid_eq(attr->ia_gid, inode->i_gid))) {
handle_t *handle;
/* (user+group)*(old+new) structure, inode write (sb,
diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c
index d7940b24cf6..eeb63dfc5d2 100644
--- a/fs/ext3/namei.c
+++ b/fs/ext3/namei.c
@@ -1045,7 +1045,7 @@ static struct dentry *ext3_lookup(struct inode * dir, struct dentry *dentry, str
struct dentry *ext3_get_parent(struct dentry *child)
{
unsigned long ino;
- struct qstr dotdot = {.name = "..", .len = 2};
+ struct qstr dotdot = QSTR_INIT("..", 2);
struct ext3_dir_entry_2 * de;
struct buffer_head *bh;
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index cf0b5921cf0..8c3a44b7c37 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -617,13 +617,15 @@ static int ext3_show_options(struct seq_file *seq, struct dentry *root)
seq_puts(seq, ",grpid");
if (!test_opt(sb, GRPID) && (def_mount_opts & EXT3_DEFM_BSDGROUPS))
seq_puts(seq, ",nogrpid");
- if (sbi->s_resuid != EXT3_DEF_RESUID ||
+ if (!uid_eq(sbi->s_resuid, make_kuid(&init_user_ns, EXT3_DEF_RESUID)) ||
le16_to_cpu(es->s_def_resuid) != EXT3_DEF_RESUID) {
- seq_printf(seq, ",resuid=%u", sbi->s_resuid);
+ seq_printf(seq, ",resuid=%u",
+ from_kuid_munged(&init_user_ns, sbi->s_resuid));
}
- if (sbi->s_resgid != EXT3_DEF_RESGID ||
+ if (!gid_eq(sbi->s_resgid, make_kgid(&init_user_ns, EXT3_DEF_RESGID)) ||
le16_to_cpu(es->s_def_resgid) != EXT3_DEF_RESGID) {
- seq_printf(seq, ",resgid=%u", sbi->s_resgid);
+ seq_printf(seq, ",resgid=%u",
+ from_kgid_munged(&init_user_ns, sbi->s_resgid));
}
if (test_opt(sb, ERRORS_RO)) {
int def_errors = le16_to_cpu(es->s_errors);
@@ -967,6 +969,8 @@ static int parse_options (char *options, struct super_block *sb,
substring_t args[MAX_OPT_ARGS];
int data_opt = 0;
int option;
+ kuid_t uid;
+ kgid_t gid;
#ifdef CONFIG_QUOTA
int qfmt;
#endif
@@ -1000,12 +1004,23 @@ static int parse_options (char *options, struct super_block *sb,
case Opt_resuid:
if (match_int(&args[0], &option))
return 0;
- sbi->s_resuid = option;
+ uid = make_kuid(current_user_ns(), option);
+ if (!uid_valid(uid)) {
+ ext3_msg(sb, KERN_ERR, "Invalid uid value %d", option);
+ return -1;
+
+ }
+ sbi->s_resuid = uid;
break;
case Opt_resgid:
if (match_int(&args[0], &option))
return 0;
- sbi->s_resgid = option;
+ gid = make_kgid(current_user_ns(), option);
+ if (!gid_valid(gid)) {
+ ext3_msg(sb, KERN_ERR, "Invalid gid value %d", option);
+ return -1;
+ }
+ sbi->s_resgid = gid;
break;
case Opt_sb:
/* handled by get_sb_block() instead of here */
@@ -1651,8 +1666,8 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
}
sb->s_fs_info = sbi;
sbi->s_mount_opt = 0;
- sbi->s_resuid = EXT3_DEF_RESUID;
- sbi->s_resgid = EXT3_DEF_RESGID;
+ sbi->s_resuid = make_kuid(&init_user_ns, EXT3_DEF_RESUID);
+ sbi->s_resgid = make_kgid(&init_user_ns, EXT3_DEF_RESGID);
sbi->s_sb_block = sb_block;
blocksize = sb_min_blocksize(sb, EXT3_MIN_BLOCK_SIZE);
@@ -1716,8 +1731,8 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
else
set_opt(sbi->s_mount_opt, ERRORS_RO);
- sbi->s_resuid = le16_to_cpu(es->s_def_resuid);
- sbi->s_resgid = le16_to_cpu(es->s_def_resgid);
+ sbi->s_resuid = make_kuid(&init_user_ns, le16_to_cpu(es->s_def_resuid));
+ sbi->s_resgid = make_kgid(&init_user_ns, le16_to_cpu(es->s_def_resgid));
/* enable barriers by default */
set_opt(sbi->s_mount_opt, BARRIER);
@@ -3000,7 +3015,6 @@ static ssize_t ext3_quota_write(struct super_block *sb, int type,
(unsigned long long)off, (unsigned long long)len);
return -EIO;
}
- mutex_lock_nested(&inode->i_mutex, I_MUTEX_QUOTA);
bh = ext3_bread(handle, inode, blk, 1, &err);
if (!bh)
goto out;
@@ -3024,10 +3038,8 @@ static ssize_t ext3_quota_write(struct super_block *sb, int type,
}
brelse(bh);
out:
- if (err) {
- mutex_unlock(&inode->i_mutex);
+ if (err)
return err;
- }
if (inode->i_size < off + len) {
i_size_write(inode, off + len);
EXT3_I(inode)->i_disksize = inode->i_size;
@@ -3035,7 +3047,6 @@ out:
inode->i_version++;
inode->i_mtime = inode->i_ctime = CURRENT_TIME;
ext3_mark_inode_dirty(handle, inode);
- mutex_unlock(&inode->i_mutex);
return len;
}
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index 4bbd07a6fa1..c45c41129a3 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -461,8 +461,8 @@ static int ext4_has_free_clusters(struct ext4_sb_info *sbi,
return 1;
/* Hm, nope. Are (enough) root reserved clusters available? */
- if (sbi->s_resuid == current_fsuid() ||
- ((sbi->s_resgid != 0) && in_group_p(sbi->s_resgid)) ||
+ if (uid_eq(sbi->s_resuid, current_fsuid()) ||
+ (!gid_eq(sbi->s_resgid, GLOBAL_ROOT_GID) && in_group_p(sbi->s_resgid)) ||
capable(CAP_SYS_RESOURCE) ||
(flags & EXT4_MB_USE_ROOT_BLOCKS)) {
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 0e01e90add8..c21b1de51af 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1153,8 +1153,8 @@ struct ext4_sb_info {
unsigned int s_mount_flags;
unsigned int s_def_mount_opt;
ext4_fsblk_t s_sb_block;
- uid_t s_resuid;
- gid_t s_resgid;
+ kuid_t s_resuid;
+ kgid_t s_resgid;
unsigned short s_mount_state;
unsigned short s_pad;
int s_addr_per_block_bits;
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 409c2ee7750..9f9acac6c43 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -808,8 +808,8 @@ got:
}
if (owner) {
inode->i_mode = mode;
- inode->i_uid = owner[0];
- inode->i_gid = owner[1];
+ i_uid_write(inode, owner[0]);
+ i_gid_write(inode, owner[1]);
} else if (test_opt(sb, GRPID)) {
inode->i_mode = mode;
inode->i_uid = current_fsuid();
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index c77b0bd2c71..07eaf565fdc 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -3630,6 +3630,8 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
journal_t *journal = EXT4_SB(sb)->s_journal;
long ret;
int block;
+ uid_t i_uid;
+ gid_t i_gid;
inode = iget_locked(sb, ino);
if (!inode)
@@ -3645,12 +3647,14 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
goto bad_inode;
raw_inode = ext4_raw_inode(&iloc);
inode->i_mode = le16_to_cpu(raw_inode->i_mode);
- inode->i_uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low);
- inode->i_gid = (gid_t)le16_to_cpu(raw_inode->i_gid_low);
+ i_uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low);
+ i_gid = (gid_t)le16_to_cpu(raw_inode->i_gid_low);
if (!(test_opt(inode->i_sb, NO_UID32))) {
- inode->i_uid |= le16_to_cpu(raw_inode->i_uid_high) << 16;
- inode->i_gid |= le16_to_cpu(raw_inode->i_gid_high) << 16;
+ i_uid |= le16_to_cpu(raw_inode->i_uid_high) << 16;
+ i_gid |= le16_to_cpu(raw_inode->i_gid_high) << 16;
}
+ i_uid_write(inode, i_uid);
+ i_gid_write(inode, i_gid);
set_nlink(inode, le16_to_cpu(raw_inode->i_links_count));
ext4_clear_state_flags(ei); /* Only relevant on 32-bit archs */
@@ -3870,6 +3874,8 @@ static int ext4_do_update_inode(handle_t *handle,
struct ext4_inode_info *ei = EXT4_I(inode);
struct buffer_head *bh = iloc->bh;
int err = 0, rc, block;
+ uid_t i_uid;
+ gid_t i_gid;
/* For fields not not tracking in the in-memory inode,
* initialise them to zero for new inodes. */
@@ -3878,27 +3884,27 @@ static int ext4_do_update_inode(handle_t *handle,
ext4_get_inode_flags(ei);
raw_inode->i_mode = cpu_to_le16(inode->i_mode);
+ i_uid = i_uid_read(inode);
+ i_gid = i_gid_read(inode);
if (!(test_opt(inode->i_sb, NO_UID32))) {
- raw_inode->i_uid_low = cpu_to_le16(low_16_bits(inode->i_uid));
- raw_inode->i_gid_low = cpu_to_le16(low_16_bits(inode->i_gid));
+ raw_inode->i_uid_low = cpu_to_le16(low_16_bits(i_uid));
+ raw_inode->i_gid_low = cpu_to_le16(low_16_bits(i_gid));
/*
* Fix up interoperability with old kernels. Otherwise, old inodes get
* re-used with the upper 16 bits of the uid/gid intact
*/
if (!ei->i_dtime) {
raw_inode->i_uid_high =
- cpu_to_le16(high_16_bits(inode->i_uid));
+ cpu_to_le16(high_16_bits(i_uid));
raw_inode->i_gid_high =
- cpu_to_le16(high_16_bits(inode->i_gid));
+ cpu_to_le16(high_16_bits(i_gid));
} else {
raw_inode->i_uid_high = 0;
raw_inode->i_gid_high = 0;
}
} else {
- raw_inode->i_uid_low =
- cpu_to_le16(fs_high2lowuid(inode->i_uid));
- raw_inode->i_gid_low =
- cpu_to_le16(fs_high2lowgid(inode->i_gid));
+ raw_inode->i_uid_low = cpu_to_le16(fs_high2lowuid(i_uid));
+ raw_inode->i_gid_low = cpu_to_le16(fs_high2lowgid(i_gid));
raw_inode->i_uid_high = 0;
raw_inode->i_gid_high = 0;
}
@@ -4084,8 +4090,8 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
if (is_quota_modification(inode, attr))
dquot_initialize(inode);
- if ((ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) ||
- (ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) {
+ if ((ia_valid & ATTR_UID && !uid_eq(attr->ia_uid, inode->i_uid)) ||
+ (ia_valid & ATTR_GID && !gid_eq(attr->ia_gid, inode->i_gid))) {
handle_t *handle;
/* (user+group)*(old+new) structure, inode write (sb,
diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c
index f39f80f8f2c..f1bb32ec016 100644
--- a/fs/ext4/migrate.c
+++ b/fs/ext4/migrate.c
@@ -466,8 +466,8 @@ int ext4_ext_migrate(struct inode *inode)
}
goal = (((inode->i_ino - 1) / EXT4_INODES_PER_GROUP(inode->i_sb)) *
EXT4_INODES_PER_GROUP(inode->i_sb)) + 1;
- owner[0] = inode->i_uid;
- owner[1] = inode->i_gid;
+ owner[0] = i_uid_read(inode);
+ owner[1] = i_gid_read(inode);
tmp_inode = ext4_new_inode(handle, inode->i_sb->s_root->d_inode,
S_IFREG, NULL, goal, owner);
if (IS_ERR(tmp_inode)) {
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 349d7b3671c..e2a3f4b0ff7 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -1052,10 +1052,7 @@ static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, stru
struct dentry *ext4_get_parent(struct dentry *child)
{
__u32 ino;
- static const struct qstr dotdot = {
- .name = "..",
- .len = 2,
- };
+ static const struct qstr dotdot = QSTR_INIT("..", 2);
struct ext4_dir_entry_2 * de;
struct buffer_head *bh;
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index e1fb1d5de58..1867a98e0c4 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -1448,6 +1448,8 @@ static int handle_mount_opt(struct super_block *sb, char *opt, int token,
{
struct ext4_sb_info *sbi = EXT4_SB(sb);
const struct mount_opts *m;
+ kuid_t uid;
+ kgid_t gid;
int arg = 0;
#ifdef CONFIG_QUOTA
@@ -1474,10 +1476,20 @@ static int handle_mount_opt(struct super_block *sb, char *opt, int token,
"Ignoring removed %s option", opt);
return 1;
case Opt_resuid:
- sbi->s_resuid = arg;
+ uid = make_kuid(current_user_ns(), arg);
+ if (!uid_valid(uid)) {
+ ext4_msg(sb, KERN_ERR, "Invalid uid value %d", arg);
+ return -1;
+ }
+ sbi->s_resuid = uid;
return 1;
case Opt_resgid:
- sbi->s_resgid = arg;
+ gid = make_kgid(current_user_ns(), arg);
+ if (!gid_valid(gid)) {
+ ext4_msg(sb, KERN_ERR, "Invalid gid value %d", arg);
+ return -1;
+ }
+ sbi->s_resgid = gid;
return 1;
case Opt_abort:
sbi->s_mount_flags |= EXT4_MF_FS_ABORTED;
@@ -1732,12 +1744,14 @@ static int _ext4_show_options(struct seq_file *seq, struct super_block *sb,
SEQ_OPTS_PRINT("%s", token2str(m->token));
}
- if (nodefs || sbi->s_resuid != EXT4_DEF_RESUID ||
+ if (nodefs || !uid_eq(sbi->s_resuid, make_kuid(&init_user_ns, EXT4_DEF_RESUID)) ||
le16_to_cpu(es->s_def_resuid) != EXT4_DEF_RESUID)
- SEQ_OPTS_PRINT("resuid=%u", sbi->s_resuid);
- if (nodefs || sbi->s_resgid != EXT4_DEF_RESGID ||
+ SEQ_OPTS_PRINT("resuid=%u",
+ from_kuid_munged(&init_user_ns, sbi->s_resuid));
+ if (nodefs || !gid_eq(sbi->s_resgid, make_kgid(&init_user_ns, EXT4_DEF_RESGID)) ||
le16_to_cpu(es->s_def_resgid) != EXT4_DEF_RESGID)
- SEQ_OPTS_PRINT("resgid=%u", sbi->s_resgid);
+ SEQ_OPTS_PRINT("resgid=%u",
+ from_kgid_munged(&init_user_ns, sbi->s_resgid));
def_errors = nodefs ? -1 : le16_to_cpu(es->s_errors);
if (test_opt(sb, ERRORS_RO) && def_errors != EXT4_ERRORS_RO)
SEQ_OPTS_PUTS("errors=remount-ro");
@@ -2980,8 +2994,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
}
sb->s_fs_info = sbi;
sbi->s_mount_opt = 0;
- sbi->s_resuid = EXT4_DEF_RESUID;
- sbi->s_resgid = EXT4_DEF_RESGID;
+ sbi->s_resuid = make_kuid(&init_user_ns, EXT4_DEF_RESUID);
+ sbi->s_resgid = make_kgid(&init_user_ns, EXT4_DEF_RESGID);
sbi->s_inode_readahead_blks = EXT4_DEF_INODE_READAHEAD_BLKS;
sbi->s_sb_block = sb_block;
if (sb->s_bdev->bd_part)
@@ -3060,8 +3074,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
if (def_mount_opts & EXT4_DEFM_DISCARD)
set_opt(sb, DISCARD);
- sbi->s_resuid = le16_to_cpu(es->s_def_resuid);
- sbi->s_resgid = le16_to_cpu(es->s_def_resgid);
+ sbi->s_resuid = make_kuid(&init_user_ns, le16_to_cpu(es->s_def_resuid));
+ sbi->s_resgid = make_kgid(&init_user_ns, le16_to_cpu(es->s_def_resgid));
sbi->s_commit_interval = JBD2_DEFAULT_MAX_COMMIT_AGE * HZ;
sbi->s_min_batch_time = EXT4_DEF_MIN_BATCH_TIME;
sbi->s_max_batch_time = EXT4_DEF_MAX_BATCH_TIME;
@@ -4213,8 +4227,8 @@ static int ext4_unfreeze(struct super_block *sb)
struct ext4_mount_options {
unsigned long s_mount_opt;
unsigned long s_mount_opt2;
- uid_t s_resuid;
- gid_t s_resgid;
+ kuid_t s_resuid;
+ kgid_t s_resgid;
unsigned long s_commit_interval;
u32 s_min_batch_time, s_max_batch_time;
#ifdef CONFIG_QUOTA
@@ -4744,7 +4758,6 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type,
return -EIO;
}
- mutex_lock_nested(&inode->i_mutex, I_MUTEX_QUOTA);
bh = ext4_bread(handle, inode, blk, 1, &err);
if (!bh)
goto out;
@@ -4760,16 +4773,13 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type,
err = ext4_handle_dirty_metadata(handle, NULL, bh);
brelse(bh);
out:
- if (err) {
- mutex_unlock(&inode->i_mutex);
+ if (err)
return err;
- }
if (inode->i_size < off + len) {
i_size_write(inode, off + len);
EXT4_I(inode)->i_disksize = inode->i_size;
ext4_mark_inode_dirty(handle, inode);
}
- mutex_unlock(&inode->i_mutex);
return len;
}
diff --git a/fs/fcntl.c b/fs/fcntl.c
index 75e7c1f3a08..d078b75572a 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -532,9 +532,9 @@ static inline int sigio_perm(struct task_struct *p,
rcu_read_lock();
cred = __task_cred(p);
- ret = ((fown->euid == 0 ||
- fown->euid == cred->suid || fown->euid == cred->uid ||
- fown->uid == cred->suid || fown->uid == cred->uid) &&
+ ret = ((uid_eq(fown->euid, GLOBAL_ROOT_UID) ||
+ uid_eq(fown->euid, cred->suid) || uid_eq(fown->euid, cred->uid) ||
+ uid_eq(fown->uid, cred->suid) || uid_eq(fown->uid, cred->uid)) &&
!security_file_send_sigiotask(p, fown, sig));
rcu_read_unlock();
return ret;
diff --git a/fs/gfs2/acl.c b/fs/gfs2/acl.c
index 230eb0f005b..bd4a5892c93 100644
--- a/fs/gfs2/acl.c
+++ b/fs/gfs2/acl.c
@@ -73,12 +73,8 @@ static int gfs2_set_mode(struct inode *inode, umode_t mode)
int error = 0;
if (mode != inode->i_mode) {
- struct iattr iattr;
-
- iattr.ia_valid = ATTR_MODE;
- iattr.ia_mode = mode;
-
- error = gfs2_setattr_simple(inode, &iattr);
+ inode->i_mode = mode;
+ mark_inode_dirty(inode);
}
return error;
@@ -126,9 +122,7 @@ int gfs2_acl_create(struct gfs2_inode *dip, struct inode *inode)
return PTR_ERR(acl);
if (!acl) {
mode &= ~current_umask();
- if (mode != inode->i_mode)
- error = gfs2_set_mode(inode, mode);
- return error;
+ return gfs2_set_mode(inode, mode);
}
if (S_ISDIR(inode->i_mode)) {
diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c
index 9b2ff0e851b..e80a464850c 100644
--- a/fs/gfs2/aops.c
+++ b/fs/gfs2/aops.c
@@ -36,8 +36,8 @@
#include "glops.h"
-void gfs2_page_add_databufs(struct gfs2_inode *ip, struct page *page,
- unsigned int from, unsigned int to)
+static void gfs2_page_add_databufs(struct gfs2_inode *ip, struct page *page,
+ unsigned int from, unsigned int to)
{
struct buffer_head *head = page_buffers(page);
unsigned int bsize = head->b_size;
@@ -517,15 +517,14 @@ out:
/**
* gfs2_internal_read - read an internal file
* @ip: The gfs2 inode
- * @ra_state: The readahead state (or NULL for no readahead)
* @buf: The buffer to fill
* @pos: The file position
* @size: The amount to read
*
*/
-int gfs2_internal_read(struct gfs2_inode *ip, struct file_ra_state *ra_state,
- char *buf, loff_t *pos, unsigned size)
+int gfs2_internal_read(struct gfs2_inode *ip, char *buf, loff_t *pos,
+ unsigned size)
{
struct address_space *mapping = ip->i_inode.i_mapping;
unsigned long index = *pos / PAGE_CACHE_SIZE;
@@ -943,8 +942,8 @@ static void gfs2_discard(struct gfs2_sbd *sdp, struct buffer_head *bh)
clear_buffer_dirty(bh);
bd = bh->b_private;
if (bd) {
- if (!list_empty(&bd->bd_le.le_list) && !buffer_pinned(bh))
- list_del_init(&bd->bd_le.le_list);
+ if (!list_empty(&bd->bd_list) && !buffer_pinned(bh))
+ list_del_init(&bd->bd_list);
else
gfs2_remove_from_journal(bh, current->journal_info, 0);
}
@@ -1084,10 +1083,9 @@ int gfs2_releasepage(struct page *page, gfp_t gfp_mask)
bd = bh->b_private;
if (bd) {
gfs2_assert_warn(sdp, bd->bd_bh == bh);
- gfs2_assert_warn(sdp, list_empty(&bd->bd_list_tr));
- if (!list_empty(&bd->bd_le.le_list)) {
+ if (!list_empty(&bd->bd_list)) {
if (!buffer_pinned(bh))
- list_del_init(&bd->bd_le.le_list);
+ list_del_init(&bd->bd_list);
else
bd = NULL;
}
diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index 03c04febe26..dab54099dd9 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -324,7 +324,7 @@ static int lookup_metapath(struct gfs2_inode *ip, struct metapath *mp)
if (!dblock)
return x + 1;
- ret = gfs2_meta_indirect_buffer(ip, x+1, dblock, 0, &mp->mp_bh[x+1]);
+ ret = gfs2_meta_indirect_buffer(ip, x+1, dblock, &mp->mp_bh[x+1]);
if (ret)
return ret;
}
@@ -882,7 +882,7 @@ static int recursive_scan(struct gfs2_inode *ip, struct buffer_head *dibh,
top = (__be64 *)(bh->b_data + sizeof(struct gfs2_dinode)) + mp->mp_list[0];
bottom = (__be64 *)(bh->b_data + sizeof(struct gfs2_dinode)) + sdp->sd_diptrs;
} else {
- error = gfs2_meta_indirect_buffer(ip, height, block, 0, &bh);
+ error = gfs2_meta_indirect_buffer(ip, height, block, &bh);
if (error)
return error;
@@ -1169,6 +1169,7 @@ static int do_grow(struct inode *inode, u64 size)
struct buffer_head *dibh;
struct gfs2_qadata *qa = NULL;
int error;
+ int unstuff = 0;
if (gfs2_is_stuffed(ip) &&
(size > (sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode)))) {
@@ -1183,13 +1184,14 @@ static int do_grow(struct inode *inode, u64 size)
error = gfs2_inplace_reserve(ip, 1);
if (error)
goto do_grow_qunlock;
+ unstuff = 1;
}
error = gfs2_trans_begin(sdp, RES_DINODE + RES_STATFS + RES_RG_BIT, 0);
if (error)
goto do_grow_release;
- if (qa) {
+ if (unstuff) {
error = gfs2_unstuff_dinode(ip, NULL);
if (error)
goto do_end_trans;
@@ -1208,7 +1210,7 @@ static int do_grow(struct inode *inode, u64 size)
do_end_trans:
gfs2_trans_end(sdp);
do_grow_release:
- if (qa) {
+ if (unstuff) {
gfs2_inplace_release(ip);
do_grow_qunlock:
gfs2_quota_unlock(ip);
diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c
index a836056343f..8aaeb07a07b 100644
--- a/fs/gfs2/dir.c
+++ b/fs/gfs2/dir.c
@@ -821,7 +821,7 @@ static struct gfs2_leaf *new_leaf(struct inode *inode, struct buffer_head **pbh,
struct buffer_head *bh;
struct gfs2_leaf *leaf;
struct gfs2_dirent *dent;
- struct qstr name = { .name = "", .len = 0, .hash = 0 };
+ struct qstr name = { .name = "" };
error = gfs2_alloc_blocks(ip, &bn, &n, 0, NULL);
if (error)
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index a3d2c9ee8d6..31b199f6efc 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -558,14 +558,14 @@ fail:
}
/**
- * gfs2_close - called to close a struct file
+ * gfs2_release - called to close a struct file
* @inode: the inode the struct file belongs to
* @file: the struct file being closed
*
* Returns: errno
*/
-static int gfs2_close(struct inode *inode, struct file *file)
+static int gfs2_release(struct inode *inode, struct file *file)
{
struct gfs2_sbd *sdp = inode->i_sb->s_fs_info;
struct gfs2_file *fp;
@@ -1005,7 +1005,7 @@ const struct file_operations gfs2_file_fops = {
.unlocked_ioctl = gfs2_ioctl,
.mmap = gfs2_mmap,
.open = gfs2_open,
- .release = gfs2_close,
+ .release = gfs2_release,
.fsync = gfs2_fsync,
.lock = gfs2_lock,
.flock = gfs2_flock,
@@ -1019,7 +1019,7 @@ const struct file_operations gfs2_dir_fops = {
.readdir = gfs2_readdir,
.unlocked_ioctl = gfs2_ioctl,
.open = gfs2_open,
- .release = gfs2_close,
+ .release = gfs2_release,
.fsync = gfs2_fsync,
.lock = gfs2_lock,
.flock = gfs2_flock,
@@ -1037,7 +1037,7 @@ const struct file_operations gfs2_file_fops_nolock = {
.unlocked_ioctl = gfs2_ioctl,
.mmap = gfs2_mmap,
.open = gfs2_open,
- .release = gfs2_close,
+ .release = gfs2_release,
.fsync = gfs2_fsync,
.splice_read = generic_file_splice_read,
.splice_write = generic_file_splice_write,
@@ -1049,7 +1049,7 @@ const struct file_operations gfs2_dir_fops_nolock = {
.readdir = gfs2_readdir,
.unlocked_ioctl = gfs2_ioctl,
.open = gfs2_open,
- .release = gfs2_close,
+ .release = gfs2_release,
.fsync = gfs2_fsync,
.llseek = default_llseek,
};
diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c
index 1656df7aacd..4bdcf378418 100644
--- a/fs/gfs2/glops.c
+++ b/fs/gfs2/glops.c
@@ -94,7 +94,6 @@ static void gfs2_ail_empty_gl(struct gfs2_glock *gl)
/* A shortened, inline version of gfs2_trans_begin() */
tr.tr_reserved = 1 + gfs2_struct2blk(sdp, tr.tr_revokes, sizeof(u64));
tr.tr_ip = (unsigned long)__builtin_return_address(0);
- INIT_LIST_HEAD(&tr.tr_list_buf);
gfs2_log_reserve(sdp, tr.tr_reserved);
BUG_ON(current->journal_info);
current->journal_info = &tr;
@@ -379,11 +378,6 @@ int gfs2_inode_refresh(struct gfs2_inode *ip)
if (error)
return error;
- if (gfs2_metatype_check(GFS2_SB(&ip->i_inode), dibh, GFS2_METATYPE_DI)) {
- brelse(dibh);
- return -EIO;
- }
-
error = gfs2_dinode_in(ip, dibh->b_data);
brelse(dibh);
clear_bit(GIF_INVALID, &ip->i_flags);
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index 47d0bda5ac2..67fd6beffec 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -26,7 +26,7 @@
#define DIO_METADATA 0x00000020
struct gfs2_log_operations;
-struct gfs2_log_element;
+struct gfs2_bufdata;
struct gfs2_holder;
struct gfs2_glock;
struct gfs2_quota_data;
@@ -52,7 +52,7 @@ struct gfs2_log_header_host {
*/
struct gfs2_log_operations {
- void (*lo_add) (struct gfs2_sbd *sdp, struct gfs2_log_element *le);
+ void (*lo_add) (struct gfs2_sbd *sdp, struct gfs2_bufdata *bd);
void (*lo_before_commit) (struct gfs2_sbd *sdp);
void (*lo_after_commit) (struct gfs2_sbd *sdp, struct gfs2_ail *ai);
void (*lo_before_scan) (struct gfs2_jdesc *jd,
@@ -64,11 +64,6 @@ struct gfs2_log_operations {
const char *lo_name;
};
-struct gfs2_log_element {
- struct list_head le_list;
- const struct gfs2_log_operations *le_ops;
-};
-
#define GBF_FULL 1
struct gfs2_bitmap {
@@ -118,15 +113,10 @@ TAS_BUFFER_FNS(Zeronew, zeronew)
struct gfs2_bufdata {
struct buffer_head *bd_bh;
struct gfs2_glock *bd_gl;
+ u64 bd_blkno;
- union {
- struct list_head list_tr;
- u64 blkno;
- } u;
-#define bd_list_tr u.list_tr
-#define bd_blkno u.blkno
-
- struct gfs2_log_element bd_le;
+ struct list_head bd_list;
+ const struct gfs2_log_operations *bd_ops;
struct gfs2_ail *bd_ail;
struct list_head bd_ail_st_list;
@@ -411,13 +401,10 @@ struct gfs2_trans {
int tr_touched;
- unsigned int tr_num_buf;
unsigned int tr_num_buf_new;
unsigned int tr_num_databuf_new;
unsigned int tr_num_buf_rm;
unsigned int tr_num_databuf_rm;
- struct list_head tr_list_buf;
-
unsigned int tr_num_revoke;
unsigned int tr_num_revoke_rm;
};
@@ -556,7 +543,6 @@ struct gfs2_sb_host {
struct lm_lockstruct {
int ls_jid;
unsigned int ls_first;
- unsigned int ls_nodir;
const struct lm_lockops *ls_ops;
dlm_lockspace_t *ls_dlm;
@@ -699,7 +685,6 @@ struct gfs2_sbd {
struct list_head sd_log_le_buf;
struct list_head sd_log_le_revoke;
- struct list_head sd_log_le_rg;
struct list_head sd_log_le_databuf;
struct list_head sd_log_le_ordered;
@@ -716,7 +701,9 @@ struct gfs2_sbd {
struct rw_semaphore sd_log_flush_lock;
atomic_t sd_log_in_flight;
+ struct bio *sd_log_bio;
wait_queue_head_t sd_log_flush_wait;
+ int sd_log_error;
unsigned int sd_log_flush_head;
u64 sd_log_flush_wrapped;
diff --git a/fs/gfs2/inode.h b/fs/gfs2/inode.h
index 276e7b52b65..c53c7477f6d 100644
--- a/fs/gfs2/inode.h
+++ b/fs/gfs2/inode.h
@@ -17,10 +17,7 @@
extern int gfs2_releasepage(struct page *page, gfp_t gfp_mask);
extern int gfs2_internal_read(struct gfs2_inode *ip,
- struct file_ra_state *ra_state,
char *buf, loff_t *pos, unsigned size);
-extern void gfs2_page_add_databufs(struct gfs2_inode *ip, struct page *page,
- unsigned int from, unsigned int to);
extern void gfs2_set_aops(struct inode *inode);
static inline int gfs2_is_stuffed(const struct gfs2_inode *ip)
diff --git a/fs/gfs2/lock_dlm.c b/fs/gfs2/lock_dlm.c
index 5f5e70e047d..4a38db739ca 100644
--- a/fs/gfs2/lock_dlm.c
+++ b/fs/gfs2/lock_dlm.c
@@ -1209,8 +1209,6 @@ static int gdlm_mount(struct gfs2_sbd *sdp, const char *table)
fsname++;
flags = DLM_LSFL_FS | DLM_LSFL_NEWEXCL;
- if (ls->ls_nodir)
- flags |= DLM_LSFL_NODIR;
/*
* create/join lockspace
diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c
index 4752eadc7f6..f4beeb9c81c 100644
--- a/fs/gfs2/log.c
+++ b/fs/gfs2/log.c
@@ -32,8 +32,6 @@
#include "dir.h"
#include "trace_gfs2.h"
-#define PULL 1
-
/**
* gfs2_struct2blk - compute stuff
* @sdp: the filesystem
@@ -359,18 +357,6 @@ retry:
return 0;
}
-u64 gfs2_log_bmap(struct gfs2_sbd *sdp, unsigned int lbn)
-{
- struct gfs2_journal_extent *je;
-
- list_for_each_entry(je, &sdp->sd_jdesc->extent_list, extent_list) {
- if (lbn >= je->lblock && lbn < je->lblock + je->blocks)
- return je->dblock + lbn - je->lblock;
- }
-
- return -1;
-}
-
/**
* log_distance - Compute distance between two journal blocks
* @sdp: The GFS2 superblock
@@ -466,17 +452,6 @@ static unsigned int current_tail(struct gfs2_sbd *sdp)
return tail;
}
-void gfs2_log_incr_head(struct gfs2_sbd *sdp)
-{
- BUG_ON((sdp->sd_log_flush_head == sdp->sd_log_tail) &&
- (sdp->sd_log_flush_head != sdp->sd_log_head));
-
- if (++sdp->sd_log_flush_head == sdp->sd_jdesc->jd_blocks) {
- sdp->sd_log_flush_head = 0;
- sdp->sd_log_flush_wrapped = 1;
- }
-}
-
static void log_pull_tail(struct gfs2_sbd *sdp, unsigned int new_tail)
{
unsigned int dist = log_distance(sdp, new_tail, sdp->sd_log_tail);
@@ -511,8 +486,8 @@ static int bd_cmp(void *priv, struct list_head *a, struct list_head *b)
{
struct gfs2_bufdata *bda, *bdb;
- bda = list_entry(a, struct gfs2_bufdata, bd_le.le_list);
- bdb = list_entry(b, struct gfs2_bufdata, bd_le.le_list);
+ bda = list_entry(a, struct gfs2_bufdata, bd_list);
+ bdb = list_entry(b, struct gfs2_bufdata, bd_list);
if (bda->bd_bh->b_blocknr < bdb->bd_bh->b_blocknr)
return -1;
@@ -530,8 +505,8 @@ static void gfs2_ordered_write(struct gfs2_sbd *sdp)
gfs2_log_lock(sdp);
list_sort(NULL, &sdp->sd_log_le_ordered, &bd_cmp);
while (!list_empty(&sdp->sd_log_le_ordered)) {
- bd = list_entry(sdp->sd_log_le_ordered.next, struct gfs2_bufdata, bd_le.le_list);
- list_move(&bd->bd_le.le_list, &written);
+ bd = list_entry(sdp->sd_log_le_ordered.next, struct gfs2_bufdata, bd_list);
+ list_move(&bd->bd_list, &written);
bh = bd->bd_bh;
if (!buffer_dirty(bh))
continue;
@@ -558,7 +533,7 @@ static void gfs2_ordered_wait(struct gfs2_sbd *sdp)
gfs2_log_lock(sdp);
while (!list_empty(&sdp->sd_log_le_ordered)) {
- bd = list_entry(sdp->sd_log_le_ordered.prev, struct gfs2_bufdata, bd_le.le_list);
+ bd = list_entry(sdp->sd_log_le_ordered.prev, struct gfs2_bufdata, bd_list);
bh = bd->bd_bh;
if (buffer_locked(bh)) {
get_bh(bh);
@@ -568,7 +543,7 @@ static void gfs2_ordered_wait(struct gfs2_sbd *sdp)
gfs2_log_lock(sdp);
continue;
}
- list_del_init(&bd->bd_le.le_list);
+ list_del_init(&bd->bd_list);
}
gfs2_log_unlock(sdp);
}
@@ -580,25 +555,19 @@ static void gfs2_ordered_wait(struct gfs2_sbd *sdp)
* Returns: the initialized log buffer descriptor
*/
-static void log_write_header(struct gfs2_sbd *sdp, u32 flags, int pull)
+static void log_write_header(struct gfs2_sbd *sdp, u32 flags)
{
- u64 blkno = gfs2_log_bmap(sdp, sdp->sd_log_flush_head);
- struct buffer_head *bh;
struct gfs2_log_header *lh;
unsigned int tail;
u32 hash;
-
- bh = sb_getblk(sdp->sd_vfs, blkno);
- lock_buffer(bh);
- memset(bh->b_data, 0, bh->b_size);
- set_buffer_uptodate(bh);
- clear_buffer_dirty(bh);
+ int rw = WRITE_FLUSH_FUA | REQ_META;
+ struct page *page = mempool_alloc(gfs2_page_pool, GFP_NOIO);
+ lh = page_address(page);
+ clear_page(lh);
gfs2_ail1_empty(sdp);
tail = current_tail(sdp);
- lh = (struct gfs2_log_header *)bh->b_data;
- memset(lh, 0, sizeof(struct gfs2_log_header));
lh->lh_header.mh_magic = cpu_to_be32(GFS2_MAGIC);
lh->lh_header.mh_type = cpu_to_be32(GFS2_METATYPE_LH);
lh->lh_header.__pad0 = cpu_to_be64(0);
@@ -608,31 +577,22 @@ static void log_write_header(struct gfs2_sbd *sdp, u32 flags, int pull)
lh->lh_flags = cpu_to_be32(flags);
lh->lh_tail = cpu_to_be32(tail);
lh->lh_blkno = cpu_to_be32(sdp->sd_log_flush_head);
- hash = gfs2_disk_hash(bh->b_data, sizeof(struct gfs2_log_header));
+ hash = gfs2_disk_hash(page_address(page), sizeof(struct gfs2_log_header));
lh->lh_hash = cpu_to_be32(hash);
- bh->b_end_io = end_buffer_write_sync;
- get_bh(bh);
if (test_bit(SDF_NOBARRIERS, &sdp->sd_flags)) {
gfs2_ordered_wait(sdp);
log_flush_wait(sdp);
- submit_bh(WRITE_SYNC | REQ_META | REQ_PRIO, bh);
- } else {
- submit_bh(WRITE_FLUSH_FUA | REQ_META, bh);
+ rw = WRITE_SYNC | REQ_META | REQ_PRIO;
}
- wait_on_buffer(bh);
- if (!buffer_uptodate(bh))
- gfs2_io_error_bh(sdp, bh);
- brelse(bh);
+ sdp->sd_log_idle = (tail == sdp->sd_log_flush_head);
+ gfs2_log_write_page(sdp, page);
+ gfs2_log_flush_bio(sdp, rw);
+ log_flush_wait(sdp);
if (sdp->sd_log_tail != tail)
log_pull_tail(sdp, tail);
- else
- gfs2_assert_withdraw(sdp, !pull);
-
- sdp->sd_log_idle = (tail == sdp->sd_log_flush_head);
- gfs2_log_incr_head(sdp);
}
/**
@@ -678,15 +638,14 @@ void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl)
gfs2_ordered_write(sdp);
lops_before_commit(sdp);
+ gfs2_log_flush_bio(sdp, WRITE);
if (sdp->sd_log_head != sdp->sd_log_flush_head) {
- log_write_header(sdp, 0, 0);
+ log_write_header(sdp, 0);
} else if (sdp->sd_log_tail != current_tail(sdp) && !sdp->sd_log_idle){
- gfs2_log_lock(sdp);
atomic_dec(&sdp->sd_log_blks_free); /* Adjust for unreserved buffer */
trace_gfs2_log_blocks(sdp, -1);
- gfs2_log_unlock(sdp);
- log_write_header(sdp, 0, PULL);
+ log_write_header(sdp, 0);
}
lops_after_commit(sdp, ai);
@@ -735,21 +694,6 @@ static void log_refund(struct gfs2_sbd *sdp, struct gfs2_trans *tr)
gfs2_log_unlock(sdp);
}
-static void buf_lo_incore_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr)
-{
- struct list_head *head = &tr->tr_list_buf;
- struct gfs2_bufdata *bd;
-
- gfs2_log_lock(sdp);
- while (!list_empty(head)) {
- bd = list_entry(head->next, struct gfs2_bufdata, bd_list_tr);
- list_del_init(&bd->bd_list_tr);
- tr->tr_num_buf--;
- }
- gfs2_log_unlock(sdp);
- gfs2_assert_warn(sdp, !tr->tr_num_buf);
-}
-
/**
* gfs2_log_commit - Commit a transaction to the log
* @sdp: the filesystem
@@ -768,8 +712,6 @@ static void buf_lo_incore_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr)
void gfs2_log_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr)
{
log_refund(sdp, tr);
- buf_lo_incore_commit(sdp, tr);
-
up_read(&sdp->sd_log_flush_lock);
if (atomic_read(&sdp->sd_log_pinned) > atomic_read(&sdp->sd_log_thresh1) ||
@@ -798,8 +740,7 @@ void gfs2_log_shutdown(struct gfs2_sbd *sdp)
sdp->sd_log_flush_head = sdp->sd_log_head;
sdp->sd_log_flush_wrapped = 0;
- log_write_header(sdp, GFS2_LOG_HEAD_UNMOUNT,
- (sdp->sd_log_tail == current_tail(sdp)) ? 0 : PULL);
+ log_write_header(sdp, GFS2_LOG_HEAD_UNMOUNT);
gfs2_assert_warn(sdp, atomic_read(&sdp->sd_log_blks_free) == sdp->sd_jdesc->jd_blocks);
gfs2_assert_warn(sdp, sdp->sd_log_head == sdp->sd_log_tail);
@@ -854,11 +795,9 @@ int gfs2_logd(void *data)
struct gfs2_sbd *sdp = data;
unsigned long t = 1;
DEFINE_WAIT(wait);
- unsigned preflush;
while (!kthread_should_stop()) {
- preflush = atomic_read(&sdp->sd_log_pinned);
if (gfs2_jrnl_flush_reqd(sdp) || t == 0) {
gfs2_ail1_empty(sdp);
gfs2_log_flush(sdp, NULL);
diff --git a/fs/gfs2/log.h b/fs/gfs2/log.h
index ff07454b582..3fd5215ea25 100644
--- a/fs/gfs2/log.h
+++ b/fs/gfs2/log.h
@@ -52,8 +52,6 @@ extern unsigned int gfs2_struct2blk(struct gfs2_sbd *sdp, unsigned int nstruct,
unsigned int ssize);
extern int gfs2_log_reserve(struct gfs2_sbd *sdp, unsigned int blks);
-extern void gfs2_log_incr_head(struct gfs2_sbd *sdp);
-extern u64 gfs2_log_bmap(struct gfs2_sbd *sdp, unsigned int lbn);
extern void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl);
extern void gfs2_log_commit(struct gfs2_sbd *sdp, struct gfs2_trans *trans);
extern void gfs2_remove_from_ail(struct gfs2_bufdata *bd);
diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c
index 6b1efb594d9..852c1be1dd3 100644
--- a/fs/gfs2/lops.c
+++ b/fs/gfs2/lops.c
@@ -127,146 +127,277 @@ static void gfs2_unpin(struct gfs2_sbd *sdp, struct buffer_head *bh,
atomic_dec(&sdp->sd_log_pinned);
}
-
-static inline struct gfs2_log_descriptor *bh_log_desc(struct buffer_head *bh)
+static void gfs2_log_incr_head(struct gfs2_sbd *sdp)
{
- return (struct gfs2_log_descriptor *)bh->b_data;
+ BUG_ON((sdp->sd_log_flush_head == sdp->sd_log_tail) &&
+ (sdp->sd_log_flush_head != sdp->sd_log_head));
+
+ if (++sdp->sd_log_flush_head == sdp->sd_jdesc->jd_blocks) {
+ sdp->sd_log_flush_head = 0;
+ sdp->sd_log_flush_wrapped = 1;
+ }
}
-static inline __be64 *bh_log_ptr(struct buffer_head *bh)
+static u64 gfs2_log_bmap(struct gfs2_sbd *sdp)
{
- struct gfs2_log_descriptor *ld = bh_log_desc(bh);
- return (__force __be64 *)(ld + 1);
+ unsigned int lbn = sdp->sd_log_flush_head;
+ struct gfs2_journal_extent *je;
+ u64 block;
+
+ list_for_each_entry(je, &sdp->sd_jdesc->extent_list, extent_list) {
+ if (lbn >= je->lblock && lbn < je->lblock + je->blocks) {
+ block = je->dblock + lbn - je->lblock;
+ gfs2_log_incr_head(sdp);
+ return block;
+ }
+ }
+
+ return -1;
}
-static inline __be64 *bh_ptr_end(struct buffer_head *bh)
+/**
+ * gfs2_end_log_write_bh - end log write of pagecache data with buffers
+ * @sdp: The superblock
+ * @bvec: The bio_vec
+ * @error: The i/o status
+ *
+ * This finds the relevant buffers and unlocks them and sets the
+ * error flag according to the status of the i/o request. This is
+ * used when the log is writing data which has an in-place version
+ * that is pinned in the pagecache.
+ */
+
+static void gfs2_end_log_write_bh(struct gfs2_sbd *sdp, struct bio_vec *bvec,
+ int error)
{
- return (__force __be64 *)(bh->b_data + bh->b_size);
+ struct buffer_head *bh, *next;
+ struct page *page = bvec->bv_page;
+ unsigned size;
+
+ bh = page_buffers(page);
+ size = bvec->bv_len;
+ while (bh_offset(bh) < bvec->bv_offset)
+ bh = bh->b_this_page;
+ do {
+ if (error)
+ set_buffer_write_io_error(bh);
+ unlock_buffer(bh);
+ next = bh->b_this_page;
+ size -= bh->b_size;
+ brelse(bh);
+ bh = next;
+ } while(bh && size);
}
/**
- * gfs2_log_write_endio - End of I/O for a log buffer
- * @bh: The buffer head
- * @uptodate: I/O Status
+ * gfs2_end_log_write - end of i/o to the log
+ * @bio: The bio
+ * @error: Status of i/o request
+ *
+ * Each bio_vec contains either data from the pagecache or data
+ * relating to the log itself. Here we iterate over the bio_vec
+ * array, processing both kinds of data.
*
*/
-static void gfs2_log_write_endio(struct buffer_head *bh, int uptodate)
+static void gfs2_end_log_write(struct bio *bio, int error)
{
- struct gfs2_sbd *sdp = bh->b_private;
- bh->b_private = NULL;
+ struct gfs2_sbd *sdp = bio->bi_private;
+ struct bio_vec *bvec;
+ struct page *page;
+ int i;
- end_buffer_write_sync(bh, uptodate);
+ if (error) {
+ sdp->sd_log_error = error;
+ fs_err(sdp, "Error %d writing to log\n", error);
+ }
+
+ bio_for_each_segment(bvec, bio, i) {
+ page = bvec->bv_page;
+ if (page_has_buffers(page))
+ gfs2_end_log_write_bh(sdp, bvec, error);
+ else
+ mempool_free(page, gfs2_page_pool);
+ }
+
+ bio_put(bio);
if (atomic_dec_and_test(&sdp->sd_log_in_flight))
wake_up(&sdp->sd_log_flush_wait);
}
/**
- * gfs2_log_get_buf - Get and initialize a buffer to use for log control data
- * @sdp: The GFS2 superblock
+ * gfs2_log_flush_bio - Submit any pending log bio
+ * @sdp: The superblock
+ * @rw: The rw flags
*
- * tReturns: the buffer_head
+ * Submit any pending part-built or full bio to the block device. If
+ * there is no pending bio, then this is a no-op.
*/
-static struct buffer_head *gfs2_log_get_buf(struct gfs2_sbd *sdp)
+void gfs2_log_flush_bio(struct gfs2_sbd *sdp, int rw)
{
- u64 blkno = gfs2_log_bmap(sdp, sdp->sd_log_flush_head);
- struct buffer_head *bh;
+ if (sdp->sd_log_bio) {
+ atomic_inc(&sdp->sd_log_in_flight);
+ submit_bio(rw, sdp->sd_log_bio);
+ sdp->sd_log_bio = NULL;
+ }
+}
- bh = sb_getblk(sdp->sd_vfs, blkno);
- lock_buffer(bh);
- memset(bh->b_data, 0, bh->b_size);
- set_buffer_uptodate(bh);
- clear_buffer_dirty(bh);
- gfs2_log_incr_head(sdp);
- atomic_inc(&sdp->sd_log_in_flight);
- bh->b_private = sdp;
- bh->b_end_io = gfs2_log_write_endio;
+/**
+ * gfs2_log_alloc_bio - Allocate a new bio for log writing
+ * @sdp: The superblock
+ * @blkno: The next device block number we want to write to
+ *
+ * This should never be called when there is a cached bio in the
+ * super block. When it returns, there will be a cached bio in the
+ * super block which will have as many bio_vecs as the device is
+ * happy to handle.
+ *
+ * Returns: Newly allocated bio
+ */
- return bh;
+static struct bio *gfs2_log_alloc_bio(struct gfs2_sbd *sdp, u64 blkno)
+{
+ struct super_block *sb = sdp->sd_vfs;
+ unsigned nrvecs = bio_get_nr_vecs(sb->s_bdev);
+ struct bio *bio;
+
+ BUG_ON(sdp->sd_log_bio);
+
+ while (1) {
+ bio = bio_alloc(GFP_NOIO, nrvecs);
+ if (likely(bio))
+ break;
+ nrvecs = max(nrvecs/2, 1U);
+ }
+
+ bio->bi_sector = blkno * (sb->s_blocksize >> 9);
+ bio->bi_bdev = sb->s_bdev;
+ bio->bi_end_io = gfs2_end_log_write;
+ bio->bi_private = sdp;
+
+ sdp->sd_log_bio = bio;
+
+ return bio;
}
/**
- * gfs2_fake_write_endio -
- * @bh: The buffer head
- * @uptodate: The I/O Status
+ * gfs2_log_get_bio - Get cached log bio, or allocate a new one
+ * @sdp: The superblock
+ * @blkno: The device block number we want to write to
+ *
+ * If there is a cached bio, then if the next block number is sequential
+ * with the previous one, return it, otherwise flush the bio to the
+ * device. If there is not a cached bio, or we just flushed it, then
+ * allocate a new one.
*
+ * Returns: The bio to use for log writes
*/
-static void gfs2_fake_write_endio(struct buffer_head *bh, int uptodate)
+static struct bio *gfs2_log_get_bio(struct gfs2_sbd *sdp, u64 blkno)
{
- struct buffer_head *real_bh = bh->b_private;
- struct gfs2_bufdata *bd = real_bh->b_private;
- struct gfs2_sbd *sdp = bd->bd_gl->gl_sbd;
+ struct bio *bio = sdp->sd_log_bio;
+ u64 nblk;
+
+ if (bio) {
+ nblk = bio->bi_sector + bio_sectors(bio);
+ nblk >>= sdp->sd_fsb2bb_shift;
+ if (blkno == nblk)
+ return bio;
+ gfs2_log_flush_bio(sdp, WRITE);
+ }
- end_buffer_write_sync(bh, uptodate);
- mempool_free(bh, gfs2_bh_pool);
- unlock_buffer(real_bh);
- brelse(real_bh);
- if (atomic_dec_and_test(&sdp->sd_log_in_flight))
- wake_up(&sdp->sd_log_flush_wait);
+ return gfs2_log_alloc_bio(sdp, blkno);
}
+
/**
- * gfs2_log_fake_buf - Build a fake buffer head to write metadata buffer to log
+ * gfs2_log_write - write to log
* @sdp: the filesystem
- * @data: the data the buffer_head should point to
+ * @page: the page to write
+ * @size: the size of the data to write
+ * @offset: the offset within the page
*
- * Returns: the log buffer descriptor
+ * Try and add the page segment to the current bio. If that fails,
+ * submit the current bio to the device and create a new one, and
+ * then add the page segment to that.
*/
-static struct buffer_head *gfs2_log_fake_buf(struct gfs2_sbd *sdp,
- struct buffer_head *real)
+static void gfs2_log_write(struct gfs2_sbd *sdp, struct page *page,
+ unsigned size, unsigned offset)
{
- u64 blkno = gfs2_log_bmap(sdp, sdp->sd_log_flush_head);
- struct buffer_head *bh;
+ u64 blkno = gfs2_log_bmap(sdp);
+ struct bio *bio;
+ int ret;
+
+ bio = gfs2_log_get_bio(sdp, blkno);
+ ret = bio_add_page(bio, page, size, offset);
+ if (ret == 0) {
+ gfs2_log_flush_bio(sdp, WRITE);
+ bio = gfs2_log_alloc_bio(sdp, blkno);
+ ret = bio_add_page(bio, page, size, offset);
+ WARN_ON(ret == 0);
+ }
+}
+
+/**
+ * gfs2_log_write_bh - write a buffer's content to the log
+ * @sdp: The super block
+ * @bh: The buffer pointing to the in-place location
+ *
+ * This writes the content of the buffer to the next available location
+ * in the log. The buffer will be unlocked once the i/o to the log has
+ * completed.
+ */
- bh = mempool_alloc(gfs2_bh_pool, GFP_NOFS);
- atomic_set(&bh->b_count, 1);
- bh->b_state = (1 << BH_Mapped) | (1 << BH_Uptodate) | (1 << BH_Lock);
- set_bh_page(bh, real->b_page, bh_offset(real));
- bh->b_blocknr = blkno;
- bh->b_size = sdp->sd_sb.sb_bsize;
- bh->b_bdev = sdp->sd_vfs->s_bdev;
- bh->b_private = real;
- bh->b_end_io = gfs2_fake_write_endio;
+static void gfs2_log_write_bh(struct gfs2_sbd *sdp, struct buffer_head *bh)
+{
+ gfs2_log_write(sdp, bh->b_page, bh->b_size, bh_offset(bh));
+}
- gfs2_log_incr_head(sdp);
- atomic_inc(&sdp->sd_log_in_flight);
+/**
+ * gfs2_log_write_page - write one block stored in a page, into the log
+ * @sdp: The superblock
+ * @page: The struct page
+ *
+ * This writes the first block-sized part of the page into the log. Note
+ * that the page must have been allocated from the gfs2_page_pool mempool
+ * and that after this has been called, ownership has been transferred and
+ * the page may be freed at any time.
+ */
- return bh;
+void gfs2_log_write_page(struct gfs2_sbd *sdp, struct page *page)
+{
+ struct super_block *sb = sdp->sd_vfs;
+ gfs2_log_write(sdp, page, sb->s_blocksize, 0);
}
-static struct buffer_head *gfs2_get_log_desc(struct gfs2_sbd *sdp, u32 ld_type)
+static struct page *gfs2_get_log_desc(struct gfs2_sbd *sdp, u32 ld_type,
+ u32 ld_length, u32 ld_data1)
{
- struct buffer_head *bh = gfs2_log_get_buf(sdp);
- struct gfs2_log_descriptor *ld = bh_log_desc(bh);
+ struct page *page = mempool_alloc(gfs2_page_pool, GFP_NOIO);
+ struct gfs2_log_descriptor *ld = page_address(page);
+ clear_page(ld);
ld->ld_header.mh_magic = cpu_to_be32(GFS2_MAGIC);
ld->ld_header.mh_type = cpu_to_be32(GFS2_METATYPE_LD);
ld->ld_header.mh_format = cpu_to_be32(GFS2_FORMAT_LD);
ld->ld_type = cpu_to_be32(ld_type);
- ld->ld_length = 0;
- ld->ld_data1 = 0;
+ ld->ld_length = cpu_to_be32(ld_length);
+ ld->ld_data1 = cpu_to_be32(ld_data1);
ld->ld_data2 = 0;
- memset(ld->ld_reserved, 0, sizeof(ld->ld_reserved));
- return bh;
+ return page;
}
-static void buf_lo_add(struct gfs2_sbd *sdp, struct gfs2_log_element *le)
+static void buf_lo_add(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd)
{
- struct gfs2_bufdata *bd = container_of(le, struct gfs2_bufdata, bd_le);
struct gfs2_meta_header *mh;
struct gfs2_trans *tr;
lock_buffer(bd->bd_bh);
gfs2_log_lock(sdp);
- if (!list_empty(&bd->bd_list_tr))
- goto out;
tr = current->journal_info;
tr->tr_touched = 1;
- tr->tr_num_buf++;
- list_add(&bd->bd_list_tr, &tr->tr_list_buf);
- if (!list_empty(&le->le_list))
+ if (!list_empty(&bd->bd_list))
goto out;
set_bit(GLF_LFLUSH, &bd->bd_gl->gl_flags);
set_bit(GLF_DIRTY, &bd->bd_gl->gl_flags);
@@ -276,62 +407,86 @@ static void buf_lo_add(struct gfs2_sbd *sdp, struct gfs2_log_element *le)
mh->__pad0 = cpu_to_be64(0);
mh->mh_jid = cpu_to_be32(sdp->sd_jdesc->jd_jid);
sdp->sd_log_num_buf++;
- list_add(&le->le_list, &sdp->sd_log_le_buf);
+ list_add(&bd->bd_list, &sdp->sd_log_le_buf);
tr->tr_num_buf_new++;
out:
gfs2_log_unlock(sdp);
unlock_buffer(bd->bd_bh);
}
-static void buf_lo_before_commit(struct gfs2_sbd *sdp)
+static void gfs2_check_magic(struct buffer_head *bh)
+{
+ void *kaddr;
+ __be32 *ptr;
+
+ clear_buffer_escaped(bh);
+ kaddr = kmap_atomic(bh->b_page);
+ ptr = kaddr + bh_offset(bh);
+ if (*ptr == cpu_to_be32(GFS2_MAGIC))
+ set_buffer_escaped(bh);
+ kunmap_atomic(kaddr);
+}
+
+static void gfs2_before_commit(struct gfs2_sbd *sdp, unsigned int limit,
+ unsigned int total, struct list_head *blist,
+ bool is_databuf)
{
- struct buffer_head *bh;
struct gfs2_log_descriptor *ld;
struct gfs2_bufdata *bd1 = NULL, *bd2;
- unsigned int total;
- unsigned int limit;
+ struct page *page;
unsigned int num;
unsigned n;
__be64 *ptr;
- limit = buf_limit(sdp);
- /* for 4k blocks, limit = 503 */
-
gfs2_log_lock(sdp);
- total = sdp->sd_log_num_buf;
- bd1 = bd2 = list_prepare_entry(bd1, &sdp->sd_log_le_buf, bd_le.le_list);
+ bd1 = bd2 = list_prepare_entry(bd1, blist, bd_list);
while(total) {
num = total;
if (total > limit)
num = limit;
gfs2_log_unlock(sdp);
- bh = gfs2_get_log_desc(sdp, GFS2_LOG_DESC_METADATA);
+ page = gfs2_get_log_desc(sdp, GFS2_LOG_DESC_METADATA, num + 1, num);
+ ld = page_address(page);
gfs2_log_lock(sdp);
- ld = bh_log_desc(bh);
- ptr = bh_log_ptr(bh);
- ld->ld_length = cpu_to_be32(num + 1);
- ld->ld_data1 = cpu_to_be32(num);
+ ptr = (__be64 *)(ld + 1);
n = 0;
- list_for_each_entry_continue(bd1, &sdp->sd_log_le_buf,
- bd_le.le_list) {
+ list_for_each_entry_continue(bd1, blist, bd_list) {
*ptr++ = cpu_to_be64(bd1->bd_bh->b_blocknr);
+ if (is_databuf) {
+ gfs2_check_magic(bd1->bd_bh);
+ *ptr++ = cpu_to_be64(buffer_escaped(bd1->bd_bh) ? 1 : 0);
+ }
if (++n >= num)
break;
}
gfs2_log_unlock(sdp);
- submit_bh(WRITE_SYNC, bh);
+ gfs2_log_write_page(sdp, page);
gfs2_log_lock(sdp);
n = 0;
- list_for_each_entry_continue(bd2, &sdp->sd_log_le_buf,
- bd_le.le_list) {
+ list_for_each_entry_continue(bd2, blist, bd_list) {
get_bh(bd2->bd_bh);
gfs2_log_unlock(sdp);
lock_buffer(bd2->bd_bh);
- bh = gfs2_log_fake_buf(sdp, bd2->bd_bh);
- submit_bh(WRITE_SYNC, bh);
+
+ if (buffer_escaped(bd2->bd_bh)) {
+ void *kaddr;
+ page = mempool_alloc(gfs2_page_pool, GFP_NOIO);
+ ptr = page_address(page);
+ kaddr = kmap_atomic(bd2->bd_bh->b_page);
+ memcpy(ptr, kaddr + bh_offset(bd2->bd_bh),
+ bd2->bd_bh->b_size);
+ kunmap_atomic(kaddr);
+ *(__be32 *)ptr = 0;
+ clear_buffer_escaped(bd2->bd_bh);
+ unlock_buffer(bd2->bd_bh);
+ brelse(bd2->bd_bh);
+ gfs2_log_write_page(sdp, page);
+ } else {
+ gfs2_log_write_bh(sdp, bd2->bd_bh);
+ }
gfs2_log_lock(sdp);
if (++n >= num)
break;
@@ -343,14 +498,22 @@ static void buf_lo_before_commit(struct gfs2_sbd *sdp)
gfs2_log_unlock(sdp);
}
+static void buf_lo_before_commit(struct gfs2_sbd *sdp)
+{
+ unsigned int limit = buf_limit(sdp); /* 503 for 4k blocks */
+
+ gfs2_before_commit(sdp, limit, sdp->sd_log_num_buf,
+ &sdp->sd_log_le_buf, 0);
+}
+
static void buf_lo_after_commit(struct gfs2_sbd *sdp, struct gfs2_ail *ai)
{
struct list_head *head = &sdp->sd_log_le_buf;
struct gfs2_bufdata *bd;
while (!list_empty(head)) {
- bd = list_entry(head->next, struct gfs2_bufdata, bd_le.le_list);
- list_del_init(&bd->bd_le.le_list);
+ bd = list_entry(head->next, struct gfs2_bufdata, bd_list);
+ list_del_init(&bd->bd_list);
sdp->sd_log_num_buf--;
gfs2_unpin(sdp, bd->bd_bh, ai);
@@ -437,9 +600,8 @@ static void buf_lo_after_scan(struct gfs2_jdesc *jd, int error, int pass)
jd->jd_jid, sdp->sd_replayed_blocks, sdp->sd_found_blocks);
}
-static void revoke_lo_add(struct gfs2_sbd *sdp, struct gfs2_log_element *le)
+static void revoke_lo_add(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd)
{
- struct gfs2_bufdata *bd = container_of(le, struct gfs2_bufdata, bd_le);
struct gfs2_glock *gl = bd->bd_gl;
struct gfs2_trans *tr;
@@ -449,48 +611,48 @@ static void revoke_lo_add(struct gfs2_sbd *sdp, struct gfs2_log_element *le)
sdp->sd_log_num_revoke++;
atomic_inc(&gl->gl_revokes);
set_bit(GLF_LFLUSH, &gl->gl_flags);
- list_add(&le->le_list, &sdp->sd_log_le_revoke);
+ list_add(&bd->bd_list, &sdp->sd_log_le_revoke);
}
static void revoke_lo_before_commit(struct gfs2_sbd *sdp)
{
struct gfs2_log_descriptor *ld;
struct gfs2_meta_header *mh;
- struct buffer_head *bh;
unsigned int offset;
struct list_head *head = &sdp->sd_log_le_revoke;
struct gfs2_bufdata *bd;
+ struct page *page;
+ unsigned int length;
if (!sdp->sd_log_num_revoke)
return;
- bh = gfs2_get_log_desc(sdp, GFS2_LOG_DESC_REVOKE);
- ld = bh_log_desc(bh);
- ld->ld_length = cpu_to_be32(gfs2_struct2blk(sdp, sdp->sd_log_num_revoke,
- sizeof(u64)));
- ld->ld_data1 = cpu_to_be32(sdp->sd_log_num_revoke);
+ length = gfs2_struct2blk(sdp, sdp->sd_log_num_revoke, sizeof(u64));
+ page = gfs2_get_log_desc(sdp, GFS2_LOG_DESC_REVOKE, length, sdp->sd_log_num_revoke);
+ ld = page_address(page);
offset = sizeof(struct gfs2_log_descriptor);
- list_for_each_entry(bd, head, bd_le.le_list) {
+ list_for_each_entry(bd, head, bd_list) {
sdp->sd_log_num_revoke--;
if (offset + sizeof(u64) > sdp->sd_sb.sb_bsize) {
- submit_bh(WRITE_SYNC, bh);
- bh = gfs2_log_get_buf(sdp);
- mh = (struct gfs2_meta_header *)bh->b_data;
+ gfs2_log_write_page(sdp, page);
+ page = mempool_alloc(gfs2_page_pool, GFP_NOIO);
+ mh = page_address(page);
+ clear_page(mh);
mh->mh_magic = cpu_to_be32(GFS2_MAGIC);
mh->mh_type = cpu_to_be32(GFS2_METATYPE_LB);
mh->mh_format = cpu_to_be32(GFS2_FORMAT_LB);
offset = sizeof(struct gfs2_meta_header);
}
- *(__be64 *)(bh->b_data + offset) = cpu_to_be64(bd->bd_blkno);
+ *(__be64 *)(page_address(page) + offset) = cpu_to_be64(bd->bd_blkno);
offset += sizeof(u64);
}
gfs2_assert_withdraw(sdp, !sdp->sd_log_num_revoke);
- submit_bh(WRITE_SYNC, bh);
+ gfs2_log_write_page(sdp, page);
}
static void revoke_lo_after_commit(struct gfs2_sbd *sdp, struct gfs2_ail *ai)
@@ -500,8 +662,8 @@ static void revoke_lo_after_commit(struct gfs2_sbd *sdp, struct gfs2_ail *ai)
struct gfs2_glock *gl;
while (!list_empty(head)) {
- bd = list_entry(head->next, struct gfs2_bufdata, bd_le.le_list);
- list_del_init(&bd->bd_le.le_list);
+ bd = list_entry(head->next, struct gfs2_bufdata, bd_list);
+ list_del_init(&bd->bd_list);
gl = bd->bd_gl;
atomic_dec(&gl->gl_revokes);
clear_bit(GLF_LFLUSH, &gl->gl_flags);
@@ -604,108 +766,33 @@ static void revoke_lo_after_scan(struct gfs2_jdesc *jd, int error, int pass)
* blocks, which isn't an enormous overhead but twice as much as
* for normal metadata blocks.
*/
-static void databuf_lo_add(struct gfs2_sbd *sdp, struct gfs2_log_element *le)
+static void databuf_lo_add(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd)
{
- struct gfs2_bufdata *bd = container_of(le, struct gfs2_bufdata, bd_le);
struct gfs2_trans *tr = current->journal_info;
struct address_space *mapping = bd->bd_bh->b_page->mapping;
struct gfs2_inode *ip = GFS2_I(mapping->host);
lock_buffer(bd->bd_bh);
gfs2_log_lock(sdp);
- if (tr) {
- if (!list_empty(&bd->bd_list_tr))
- goto out;
+ if (tr)
tr->tr_touched = 1;
- if (gfs2_is_jdata(ip)) {
- tr->tr_num_buf++;
- list_add(&bd->bd_list_tr, &tr->tr_list_buf);
- }
- }
- if (!list_empty(&le->le_list))
+ if (!list_empty(&bd->bd_list))
goto out;
-
set_bit(GLF_LFLUSH, &bd->bd_gl->gl_flags);
set_bit(GLF_DIRTY, &bd->bd_gl->gl_flags);
if (gfs2_is_jdata(ip)) {
gfs2_pin(sdp, bd->bd_bh);
tr->tr_num_databuf_new++;
sdp->sd_log_num_databuf++;
- list_add_tail(&le->le_list, &sdp->sd_log_le_databuf);
+ list_add_tail(&bd->bd_list, &sdp->sd_log_le_databuf);
} else {
- list_add_tail(&le->le_list, &sdp->sd_log_le_ordered);
+ list_add_tail(&bd->bd_list, &sdp->sd_log_le_ordered);
}
out:
gfs2_log_unlock(sdp);
unlock_buffer(bd->bd_bh);
}
-static void gfs2_check_magic(struct buffer_head *bh)
-{
- void *kaddr;
- __be32 *ptr;
-
- clear_buffer_escaped(bh);
- kaddr = kmap_atomic(bh->b_page);
- ptr = kaddr + bh_offset(bh);
- if (*ptr == cpu_to_be32(GFS2_MAGIC))
- set_buffer_escaped(bh);
- kunmap_atomic(kaddr);
-}
-
-static void gfs2_write_blocks(struct gfs2_sbd *sdp, struct buffer_head *bh,
- struct list_head *list, struct list_head *done,
- unsigned int n)
-{
- struct buffer_head *bh1;
- struct gfs2_log_descriptor *ld;
- struct gfs2_bufdata *bd;
- __be64 *ptr;
-
- if (!bh)
- return;
-
- ld = bh_log_desc(bh);
- ld->ld_length = cpu_to_be32(n + 1);
- ld->ld_data1 = cpu_to_be32(n);
-
- ptr = bh_log_ptr(bh);
-
- get_bh(bh);
- submit_bh(WRITE_SYNC, bh);
- gfs2_log_lock(sdp);
- while(!list_empty(list)) {
- bd = list_entry(list->next, struct gfs2_bufdata, bd_le.le_list);
- list_move_tail(&bd->bd_le.le_list, done);
- get_bh(bd->bd_bh);
- while (be64_to_cpu(*ptr) != bd->bd_bh->b_blocknr) {
- gfs2_log_incr_head(sdp);
- ptr += 2;
- }
- gfs2_log_unlock(sdp);
- lock_buffer(bd->bd_bh);
- if (buffer_escaped(bd->bd_bh)) {
- void *kaddr;
- bh1 = gfs2_log_get_buf(sdp);
- kaddr = kmap_atomic(bd->bd_bh->b_page);
- memcpy(bh1->b_data, kaddr + bh_offset(bd->bd_bh),
- bh1->b_size);
- kunmap_atomic(kaddr);
- *(__be32 *)bh1->b_data = 0;
- clear_buffer_escaped(bd->bd_bh);
- unlock_buffer(bd->bd_bh);
- brelse(bd->bd_bh);
- } else {
- bh1 = gfs2_log_fake_buf(sdp, bd->bd_bh);
- }
- submit_bh(WRITE_SYNC, bh1);
- gfs2_log_lock(sdp);
- ptr += 2;
- }
- gfs2_log_unlock(sdp);
- brelse(bh);
-}
-
/**
* databuf_lo_before_commit - Scan the data buffers, writing as we go
*
@@ -713,37 +800,10 @@ static void gfs2_write_blocks(struct gfs2_sbd *sdp, struct buffer_head *bh,
static void databuf_lo_before_commit(struct gfs2_sbd *sdp)
{
- struct gfs2_bufdata *bd = NULL;
- struct buffer_head *bh = NULL;
- unsigned int n = 0;
- __be64 *ptr = NULL, *end = NULL;
- LIST_HEAD(processed);
- LIST_HEAD(in_progress);
+ unsigned int limit = buf_limit(sdp) / 2;
- gfs2_log_lock(sdp);
- while (!list_empty(&sdp->sd_log_le_databuf)) {
- if (ptr == end) {
- gfs2_log_unlock(sdp);
- gfs2_write_blocks(sdp, bh, &in_progress, &processed, n);
- n = 0;
- bh = gfs2_get_log_desc(sdp, GFS2_LOG_DESC_JDATA);
- ptr = bh_log_ptr(bh);
- end = bh_ptr_end(bh) - 1;
- gfs2_log_lock(sdp);
- continue;
- }
- bd = list_entry(sdp->sd_log_le_databuf.next, struct gfs2_bufdata, bd_le.le_list);
- list_move_tail(&bd->bd_le.le_list, &in_progress);
- gfs2_check_magic(bd->bd_bh);
- *ptr++ = cpu_to_be64(bd->bd_bh->b_blocknr);
- *ptr++ = cpu_to_be64(buffer_escaped(bh) ? 1 : 0);
- n++;
- }
- gfs2_log_unlock(sdp);
- gfs2_write_blocks(sdp, bh, &in_progress, &processed, n);
- gfs2_log_lock(sdp);
- list_splice(&processed, &sdp->sd_log_le_databuf);
- gfs2_log_unlock(sdp);
+ gfs2_before_commit(sdp, limit, sdp->sd_log_num_databuf,
+ &sdp->sd_log_le_databuf, 1);
}
static int databuf_lo_scan_elements(struct gfs2_jdesc *jd, unsigned int start,
@@ -822,8 +882,8 @@ static void databuf_lo_after_commit(struct gfs2_sbd *sdp, struct gfs2_ail *ai)
struct gfs2_bufdata *bd;
while (!list_empty(head)) {
- bd = list_entry(head->next, struct gfs2_bufdata, bd_le.le_list);
- list_del_init(&bd->bd_le.le_list);
+ bd = list_entry(head->next, struct gfs2_bufdata, bd_list);
+ list_del_init(&bd->bd_list);
sdp->sd_log_num_databuf--;
gfs2_unpin(sdp, bd->bd_bh, ai);
}
diff --git a/fs/gfs2/lops.h b/fs/gfs2/lops.h
index 3c0b2737658..954a330585f 100644
--- a/fs/gfs2/lops.h
+++ b/fs/gfs2/lops.h
@@ -27,6 +27,8 @@ extern const struct gfs2_log_operations gfs2_rg_lops;
extern const struct gfs2_log_operations gfs2_databuf_lops;
extern const struct gfs2_log_operations *gfs2_log_ops[];
+extern void gfs2_log_write_page(struct gfs2_sbd *sdp, struct page *page);
+extern void gfs2_log_flush_bio(struct gfs2_sbd *sdp, int rw);
static inline unsigned int buf_limit(struct gfs2_sbd *sdp)
{
@@ -44,17 +46,17 @@ static inline unsigned int databuf_limit(struct gfs2_sbd *sdp)
return limit;
}
-static inline void lops_init_le(struct gfs2_log_element *le,
+static inline void lops_init_le(struct gfs2_bufdata *bd,
const struct gfs2_log_operations *lops)
{
- INIT_LIST_HEAD(&le->le_list);
- le->le_ops = lops;
+ INIT_LIST_HEAD(&bd->bd_list);
+ bd->bd_ops = lops;
}
-static inline void lops_add(struct gfs2_sbd *sdp, struct gfs2_log_element *le)
+static inline void lops_add(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd)
{
- if (le->le_ops->lo_add)
- le->le_ops->lo_add(sdp, le);
+ if (bd->bd_ops->lo_add)
+ bd->bd_ops->lo_add(sdp, bd);
}
static inline void lops_before_commit(struct gfs2_sbd *sdp)
diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c
index 754426b1e52..6cdb0f2a1b0 100644
--- a/fs/gfs2/main.c
+++ b/fs/gfs2/main.c
@@ -70,16 +70,6 @@ static void gfs2_init_gl_aspace_once(void *foo)
address_space_init_once(mapping);
}
-static void *gfs2_bh_alloc(gfp_t mask, void *data)
-{
- return alloc_buffer_head(mask);
-}
-
-static void gfs2_bh_free(void *ptr, void *data)
-{
- return free_buffer_head(ptr);
-}
-
/**
* init_gfs2_fs - Register GFS2 as a filesystem
*
@@ -143,6 +133,12 @@ static int __init init_gfs2_fs(void)
if (!gfs2_quotad_cachep)
goto fail;
+ gfs2_rsrv_cachep = kmem_cache_create("gfs2_mblk",
+ sizeof(struct gfs2_blkreserv),
+ 0, 0, NULL);
+ if (!gfs2_rsrv_cachep)
+ goto fail;
+
register_shrinker(&qd_shrinker);
error = register_filesystem(&gfs2_fs_type);
@@ -164,8 +160,8 @@ static int __init init_gfs2_fs(void)
if (!gfs2_control_wq)
goto fail_recovery;
- gfs2_bh_pool = mempool_create(1024, gfs2_bh_alloc, gfs2_bh_free, NULL);
- if (!gfs2_bh_pool)
+ gfs2_page_pool = mempool_create_page_pool(64, 0);
+ if (!gfs2_page_pool)
goto fail_control;
gfs2_register_debugfs();
@@ -186,6 +182,9 @@ fail:
unregister_shrinker(&qd_shrinker);
gfs2_glock_exit();
+ if (gfs2_rsrv_cachep)
+ kmem_cache_destroy(gfs2_rsrv_cachep);
+
if (gfs2_quotad_cachep)
kmem_cache_destroy(gfs2_quotad_cachep);
@@ -225,7 +224,8 @@ static void __exit exit_gfs2_fs(void)
rcu_barrier();
- mempool_destroy(gfs2_bh_pool);
+ mempool_destroy(gfs2_page_pool);
+ kmem_cache_destroy(gfs2_rsrv_cachep);
kmem_cache_destroy(gfs2_quotad_cachep);
kmem_cache_destroy(gfs2_rgrpd_cachep);
kmem_cache_destroy(gfs2_bufdata_cachep);
diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c
index 181586e673f..6c1e5d1c404 100644
--- a/fs/gfs2/meta_io.c
+++ b/fs/gfs2/meta_io.c
@@ -293,11 +293,10 @@ void gfs2_attach_bufdata(struct gfs2_glock *gl, struct buffer_head *bh,
bd->bd_bh = bh;
bd->bd_gl = gl;
- INIT_LIST_HEAD(&bd->bd_list_tr);
if (meta)
- lops_init_le(&bd->bd_le, &gfs2_buf_lops);
+ lops_init_le(bd, &gfs2_buf_lops);
else
- lops_init_le(&bd->bd_le, &gfs2_databuf_lops);
+ lops_init_le(bd, &gfs2_databuf_lops);
bh->b_private = bd;
if (meta)
@@ -313,7 +312,7 @@ void gfs2_remove_from_journal(struct buffer_head *bh, struct gfs2_trans *tr, int
if (test_clear_buffer_pinned(bh)) {
trace_gfs2_pin(bd, 0);
atomic_dec(&sdp->sd_log_pinned);
- list_del_init(&bd->bd_le.le_list);
+ list_del_init(&bd->bd_list);
if (meta) {
gfs2_assert_warn(sdp, sdp->sd_log_num_buf);
sdp->sd_log_num_buf--;
@@ -375,33 +374,24 @@ void gfs2_meta_wipe(struct gfs2_inode *ip, u64 bstart, u32 blen)
* @ip: The GFS2 inode
* @height: The level of this buf in the metadata (indir addr) tree (if any)
* @num: The block number (device relative) of the buffer
- * @new: Non-zero if we may create a new buffer
* @bhp: the buffer is returned here
*
* Returns: errno
*/
int gfs2_meta_indirect_buffer(struct gfs2_inode *ip, int height, u64 num,
- int new, struct buffer_head **bhp)
+ struct buffer_head **bhp)
{
struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
struct gfs2_glock *gl = ip->i_gl;
struct buffer_head *bh;
int ret = 0;
+ u32 mtype = height ? GFS2_METATYPE_IN : GFS2_METATYPE_DI;
- if (new) {
- BUG_ON(height == 0);
- bh = gfs2_meta_new(gl, num);
- gfs2_trans_add_bh(ip->i_gl, bh, 1);
- gfs2_metatype_set(bh, GFS2_METATYPE_IN, GFS2_FORMAT_IN);
- gfs2_buffer_clear_tail(bh, sizeof(struct gfs2_meta_header));
- } else {
- u32 mtype = height ? GFS2_METATYPE_IN : GFS2_METATYPE_DI;
- ret = gfs2_meta_read(gl, num, DIO_WAIT, &bh);
- if (ret == 0 && gfs2_metatype_check(sdp, bh, mtype)) {
- brelse(bh);
- ret = -EIO;
- }
+ ret = gfs2_meta_read(gl, num, DIO_WAIT, &bh);
+ if (ret == 0 && gfs2_metatype_check(sdp, bh, mtype)) {
+ brelse(bh);
+ ret = -EIO;
}
*bhp = bh;
return ret;
diff --git a/fs/gfs2/meta_io.h b/fs/gfs2/meta_io.h
index 22c52659313..c30973b07a7 100644
--- a/fs/gfs2/meta_io.h
+++ b/fs/gfs2/meta_io.h
@@ -65,12 +65,12 @@ void gfs2_remove_from_journal(struct buffer_head *bh, struct gfs2_trans *tr,
void gfs2_meta_wipe(struct gfs2_inode *ip, u64 bstart, u32 blen);
int gfs2_meta_indirect_buffer(struct gfs2_inode *ip, int height, u64 num,
- int new, struct buffer_head **bhp);
+ struct buffer_head **bhp);
static inline int gfs2_meta_inode_buffer(struct gfs2_inode *ip,
struct buffer_head **bhp)
{
- return gfs2_meta_indirect_buffer(ip, 0, ip->i_no_addr, 0, bhp);
+ return gfs2_meta_indirect_buffer(ip, 0, ip->i_no_addr, bhp);
}
struct buffer_head *gfs2_meta_ra(struct gfs2_glock *gl, u64 dblock, u32 extlen);
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index 6f3a18f9e17..b8c250fc492 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -99,7 +99,6 @@ static struct gfs2_sbd *init_sbd(struct super_block *sb)
atomic_set(&sdp->sd_log_pinned, 0);
INIT_LIST_HEAD(&sdp->sd_log_le_buf);
INIT_LIST_HEAD(&sdp->sd_log_le_revoke);
- INIT_LIST_HEAD(&sdp->sd_log_le_rg);
INIT_LIST_HEAD(&sdp->sd_log_le_databuf);
INIT_LIST_HEAD(&sdp->sd_log_le_ordered);
@@ -994,6 +993,7 @@ static int gfs2_lm_mount(struct gfs2_sbd *sdp, int silent)
ls->ls_jid = option;
break;
case Opt_id:
+ case Opt_nodir:
/* Obsolete, but left for backward compat purposes */
break;
case Opt_first:
@@ -1002,12 +1002,6 @@ static int gfs2_lm_mount(struct gfs2_sbd *sdp, int silent)
goto hostdata_error;
ls->ls_first = option;
break;
- case Opt_nodir:
- ret = match_int(&tmp[0], &option);
- if (ret || (option != 0 && option != 1))
- goto hostdata_error;
- ls->ls_nodir = option;
- break;
case Opt_err:
default:
hostdata_error:
diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c
index 6019da3dcae..b97178e7d39 100644
--- a/fs/gfs2/quota.c
+++ b/fs/gfs2/quota.c
@@ -652,7 +652,7 @@ static int gfs2_adjust_quota(struct gfs2_inode *ip, loff_t loc,
}
memset(&q, 0, sizeof(struct gfs2_quota));
- err = gfs2_internal_read(ip, NULL, (char *)&q, &loc, sizeof(q));
+ err = gfs2_internal_read(ip, (char *)&q, &loc, sizeof(q));
if (err < 0)
return err;
@@ -744,7 +744,7 @@ get_a_page:
i_size_write(inode, size);
inode->i_mtime = inode->i_atime = CURRENT_TIME;
mark_inode_dirty(inode);
- return err;
+ return 0;
unlock_out:
unlock_page(page);
@@ -852,7 +852,7 @@ static int update_qd(struct gfs2_sbd *sdp, struct gfs2_quota_data *qd)
memset(&q, 0, sizeof(struct gfs2_quota));
pos = qd2offset(qd);
- error = gfs2_internal_read(ip, NULL, (char *)&q, &pos, sizeof(q));
+ error = gfs2_internal_read(ip, (char *)&q, &pos, sizeof(q));
if (error < 0)
return error;
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index 3df65c9ab73..f74fb9bd197 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -70,15 +70,15 @@ static u32 rgblk_search(struct gfs2_rgrpd *rgd, u32 goal,
/**
* gfs2_setbit - Set a bit in the bitmaps
- * @buffer: the buffer that holds the bitmaps
- * @buflen: the length (in bytes) of the buffer
+ * @rgd: the resource group descriptor
+ * @buf2: the clone buffer that holds the bitmaps
+ * @bi: the bitmap structure
* @block: the block to set
* @new_state: the new state of the block
*
*/
-static inline void gfs2_setbit(struct gfs2_rgrpd *rgd, unsigned char *buf1,
- unsigned char *buf2, unsigned int offset,
+static inline void gfs2_setbit(struct gfs2_rgrpd *rgd, unsigned char *buf2,
struct gfs2_bitmap *bi, u32 block,
unsigned char new_state)
{
@@ -86,8 +86,8 @@ static inline void gfs2_setbit(struct gfs2_rgrpd *rgd, unsigned char *buf1,
unsigned int buflen = bi->bi_len;
const unsigned int bit = (block % GFS2_NBBY) * GFS2_BIT_SIZE;
- byte1 = buf1 + offset + (block / GFS2_NBBY);
- end = buf1 + offset + buflen;
+ byte1 = bi->bi_bh->b_data + bi->bi_offset + (block / GFS2_NBBY);
+ end = bi->bi_bh->b_data + bi->bi_offset + buflen;
BUG_ON(byte1 >= end);
@@ -110,7 +110,7 @@ static inline void gfs2_setbit(struct gfs2_rgrpd *rgd, unsigned char *buf1,
*byte1 ^= (cur_state ^ new_state) << bit;
if (buf2) {
- byte2 = buf2 + offset + (block / GFS2_NBBY);
+ byte2 = buf2 + bi->bi_offset + (block / GFS2_NBBY);
cur_state = (*byte2 >> bit) & GFS2_BIT_MASK;
*byte2 ^= (cur_state ^ new_state) << bit;
}
@@ -118,6 +118,7 @@ static inline void gfs2_setbit(struct gfs2_rgrpd *rgd, unsigned char *buf1,
/**
* gfs2_testbit - test a bit in the bitmaps
+ * @rgd: the resource group descriptor
* @buffer: the buffer that holds the bitmaps
* @buflen: the length (in bytes) of the buffer
* @block: the block to read
@@ -179,7 +180,7 @@ static inline u64 gfs2_bit_search(const __le64 *ptr, u64 mask, u8 state)
/**
* gfs2_bitfit - Search an rgrp's bitmap buffer to find a bit-pair representing
* a block in a given allocation state.
- * @buffer: the buffer that holds the bitmaps
+ * @buf: the buffer that holds the bitmaps
* @len: the length (in bytes) of the buffer
* @goal: start search at this block's bit-pair (within @buffer)
* @state: GFS2_BLKST_XXX the state of the block we're looking for.
@@ -231,6 +232,7 @@ static u32 gfs2_bitfit(const u8 *buf, const unsigned int len,
/**
* gfs2_bitcount - count the number of bits in a certain state
+ * @rgd: the resource group descriptor
* @buffer: the buffer that holds the bitmaps
* @buflen: the length (in bytes) of the buffer
* @state: the state of the block we're looking for
@@ -264,7 +266,6 @@ static u32 gfs2_bitcount(struct gfs2_rgrpd *rgd, const u8 *buffer,
/**
* gfs2_rgrp_verify - Verify that a resource group is consistent
- * @sdp: the filesystem
* @rgd: the rgrp
*
*/
@@ -322,7 +323,8 @@ static inline int rgrp_contains_block(struct gfs2_rgrpd *rgd, u64 block)
/**
* gfs2_blk2rgrpd - Find resource group for a given data/meta block number
* @sdp: The GFS2 superblock
- * @n: The data block number
+ * @blk: The data block number
+ * @exact: True if this needs to be an exact match
*
* Returns: The resource group, or NULL if not found
*/
@@ -380,7 +382,7 @@ struct gfs2_rgrpd *gfs2_rgrpd_get_first(struct gfs2_sbd *sdp)
/**
* gfs2_rgrpd_get_next - get the next RG
- * @rgd: A RG
+ * @rgd: the resource group descriptor
*
* Returns: The next rgrp
*/
@@ -529,6 +531,7 @@ static int compute_bitstructs(struct gfs2_rgrpd *rgd)
/**
* gfs2_ri_total - Total up the file system space, according to the rindex.
+ * @sdp: the filesystem
*
*/
u64 gfs2_ri_total(struct gfs2_sbd *sdp)
@@ -537,16 +540,14 @@ u64 gfs2_ri_total(struct gfs2_sbd *sdp)
struct inode *inode = sdp->sd_rindex;
struct gfs2_inode *ip = GFS2_I(inode);
char buf[sizeof(struct gfs2_rindex)];
- struct file_ra_state ra_state;
int error, rgrps;
- file_ra_state_init(&ra_state, inode->i_mapping);
for (rgrps = 0;; rgrps++) {
loff_t pos = rgrps * sizeof(struct gfs2_rindex);
if (pos + sizeof(struct gfs2_rindex) > i_size_read(inode))
break;
- error = gfs2_internal_read(ip, &ra_state, buf, &pos,
+ error = gfs2_internal_read(ip, buf, &pos,
sizeof(struct gfs2_rindex));
if (error != sizeof(struct gfs2_rindex))
break;
@@ -582,13 +583,12 @@ static int rgd_insert(struct gfs2_rgrpd *rgd)
/**
* read_rindex_entry - Pull in a new resource index entry from the disk
- * @gl: The glock covering the rindex inode
+ * @ip: Pointer to the rindex inode
*
* Returns: 0 on success, > 0 on EOF, error code otherwise
*/
-static int read_rindex_entry(struct gfs2_inode *ip,
- struct file_ra_state *ra_state)
+static int read_rindex_entry(struct gfs2_inode *ip)
{
struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
loff_t pos = sdp->sd_rgrps * sizeof(struct gfs2_rindex);
@@ -599,7 +599,7 @@ static int read_rindex_entry(struct gfs2_inode *ip,
if (pos >= i_size_read(&ip->i_inode))
return 1;
- error = gfs2_internal_read(ip, ra_state, (char *)&buf, &pos,
+ error = gfs2_internal_read(ip, (char *)&buf, &pos,
sizeof(struct gfs2_rindex));
if (error != sizeof(struct gfs2_rindex))
@@ -655,13 +655,10 @@ fail:
static int gfs2_ri_update(struct gfs2_inode *ip)
{
struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
- struct inode *inode = &ip->i_inode;
- struct file_ra_state ra_state;
int error;
- file_ra_state_init(&ra_state, inode->i_mapping);
do {
- error = read_rindex_entry(ip, &ra_state);
+ error = read_rindex_entry(ip);
} while (error == 0);
if (error < 0)
@@ -741,7 +738,7 @@ static void gfs2_rgrp_out(struct gfs2_rgrpd *rgd, void *buf)
/**
* gfs2_rgrp_go_lock - Read in a RG's header and bitmaps
- * @rgd: the struct gfs2_rgrpd describing the RG to read in
+ * @gh: The glock holder for the resource group
*
* Read in all of a Resource Group's header and bitmap blocks.
* Caller must eventually call gfs2_rgrp_relse() to free the bitmaps.
@@ -801,7 +798,7 @@ fail:
/**
* gfs2_rgrp_go_unlock - Release RG bitmaps read in with gfs2_rgrp_bh_get()
- * @rgd: the struct gfs2_rgrpd describing the RG to read in
+ * @gh: The glock holder for the resource group
*
*/
@@ -1002,11 +999,13 @@ struct gfs2_qadata *gfs2_qadata_get(struct gfs2_inode *ip)
* Returns: the struct gfs2_qadata
*/
-static struct gfs2_blkreserv *gfs2_blkrsv_get(struct gfs2_inode *ip)
+static int gfs2_blkrsv_get(struct gfs2_inode *ip)
{
BUG_ON(ip->i_res != NULL);
- ip->i_res = kzalloc(sizeof(struct gfs2_blkreserv), GFP_NOFS);
- return ip->i_res;
+ ip->i_res = kmem_cache_zalloc(gfs2_rsrv_cachep, GFP_NOFS);
+ if (!ip->i_res)
+ return -ENOMEM;
+ return 0;
}
/**
@@ -1038,6 +1037,8 @@ static inline u32 gfs2_bi2rgd_blk(struct gfs2_bitmap *bi, u32 blk)
/**
* try_rgrp_unlink - Look for any unlinked, allocated, but unused inodes
* @rgd: The rgrp
+ * @last_unlinked: block address of the last dinode we unlinked
+ * @skip: block address we should explicitly not unlink
*
* Returns: 0 if no error
* The inode, if one has been found, in inode.
@@ -1102,7 +1103,7 @@ static void try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked, u64 skip
/**
* get_local_rgrp - Choose and lock a rgrp for allocation
* @ip: the inode to reserve space for
- * @rgp: the chosen and locked rgrp
+ * @last_unlinked: the last unlinked block
*
* Try to acquire rgrp in way which avoids contending with others.
*
@@ -1164,13 +1165,14 @@ static int get_local_rgrp(struct gfs2_inode *ip, u64 *last_unlinked)
static void gfs2_blkrsv_put(struct gfs2_inode *ip)
{
BUG_ON(ip->i_res == NULL);
- kfree(ip->i_res);
+ kmem_cache_free(gfs2_rsrv_cachep, ip->i_res);
ip->i_res = NULL;
}
/**
* gfs2_inplace_reserve - Reserve space in the filesystem
* @ip: the inode to reserve space for
+ * @requested: the number of blocks to be reserved
*
* Returns: errno
*/
@@ -1179,14 +1181,15 @@ int gfs2_inplace_reserve(struct gfs2_inode *ip, u32 requested)
{
struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
struct gfs2_blkreserv *rs;
- int error = 0;
+ int error;
u64 last_unlinked = NO_BLOCK;
int tries = 0;
- rs = gfs2_blkrsv_get(ip);
- if (!rs)
- return -ENOMEM;
+ error = gfs2_blkrsv_get(ip);
+ if (error)
+ return error;
+ rs = ip->i_res;
rs->rs_requested = requested;
if (gfs2_assert_warn(sdp, requested)) {
error = -EINVAL;
@@ -1268,7 +1271,6 @@ static unsigned char gfs2_get_block_type(struct gfs2_rgrpd *rgd, u64 block)
* @rgd: the resource group descriptor
* @goal: the goal block within the RG (start here to search for avail block)
* @state: GFS2_BLKST_XXX the before-allocation state to find
- * @dinode: TRUE if the first block we allocate is for a dinode
* @rbi: address of the pointer to the bitmap containing the block found
*
* Walk rgrp's bitmap to find bits that represent a block in @state.
@@ -1282,13 +1284,12 @@ static unsigned char gfs2_get_block_type(struct gfs2_rgrpd *rgd, u64 block)
* Returns: the block number found relative to the bitmap rbi
*/
-static u32 rgblk_search(struct gfs2_rgrpd *rgd, u32 goal,
- unsigned char state,
+static u32 rgblk_search(struct gfs2_rgrpd *rgd, u32 goal, unsigned char state,
struct gfs2_bitmap **rbi)
{
struct gfs2_bitmap *bi = NULL;
const u32 length = rgd->rd_length;
- u32 blk = BFITNOENT;
+ u32 biblk = BFITNOENT;
unsigned int buf, x;
const u8 *buffer = NULL;
@@ -1325,8 +1326,8 @@ do_search:
if (state != GFS2_BLKST_UNLINKED && bi->bi_clone)
buffer = bi->bi_clone + bi->bi_offset;
- blk = gfs2_bitfit(buffer, bi->bi_len, goal, state);
- if (blk != BFITNOENT)
+ biblk = gfs2_bitfit(buffer, bi->bi_len, goal, state);
+ if (biblk != BFITNOENT)
break;
if ((goal == 0) && (state == GFS2_BLKST_FREE))
@@ -1339,10 +1340,10 @@ skip:
goal = 0;
}
- if (blk != BFITNOENT)
+ if (biblk != BFITNOENT)
*rbi = bi;
- return blk;
+ return biblk;
}
/**
@@ -1367,8 +1368,8 @@ static u64 gfs2_alloc_extent(struct gfs2_rgrpd *rgd, struct gfs2_bitmap *bi,
*n = 0;
buffer = bi->bi_bh->b_data + bi->bi_offset;
gfs2_trans_add_bh(rgd->rd_gl, bi->bi_bh, 1);
- gfs2_setbit(rgd, bi->bi_bh->b_data, bi->bi_clone, bi->bi_offset,
- bi, blk, dinode ? GFS2_BLKST_DINODE : GFS2_BLKST_USED);
+ gfs2_setbit(rgd, bi->bi_clone, bi, blk,
+ dinode ? GFS2_BLKST_DINODE : GFS2_BLKST_USED);
(*n)++;
goal = blk;
while (*n < elen) {
@@ -1378,8 +1379,7 @@ static u64 gfs2_alloc_extent(struct gfs2_rgrpd *rgd, struct gfs2_bitmap *bi,
if (gfs2_testbit(rgd, buffer, bi->bi_len, goal) !=
GFS2_BLKST_FREE)
break;
- gfs2_setbit(rgd, bi->bi_bh->b_data, bi->bi_clone, bi->bi_offset,
- bi, goal, GFS2_BLKST_USED);
+ gfs2_setbit(rgd, bi->bi_clone, bi, goal, GFS2_BLKST_USED);
(*n)++;
}
blk = gfs2_bi2rgd_blk(bi, blk);
@@ -1436,8 +1436,7 @@ static struct gfs2_rgrpd *rgblk_free(struct gfs2_sbd *sdp, u64 bstart,
bi->bi_len);
}
gfs2_trans_add_bh(rgd->rd_gl, bi->bi_bh, 1);
- gfs2_setbit(rgd, bi->bi_bh->b_data, NULL, bi->bi_offset,
- bi, buf_blk, new_state);
+ gfs2_setbit(rgd, NULL, bi, buf_blk, new_state);
}
return rgd;
@@ -1557,7 +1556,7 @@ int gfs2_alloc_blocks(struct gfs2_inode *ip, u64 *bn, unsigned int *nblocks,
ip->i_inode.i_gid);
rgd->rd_free_clone -= *nblocks;
- trace_gfs2_block_alloc(ip, block, *nblocks,
+ trace_gfs2_block_alloc(ip, rgd, block, *nblocks,
dinode ? GFS2_BLKST_DINODE : GFS2_BLKST_USED);
*bn = block;
return 0;
@@ -1584,7 +1583,7 @@ void __gfs2_free_blocks(struct gfs2_inode *ip, u64 bstart, u32 blen, int meta)
rgd = rgblk_free(sdp, bstart, blen, GFS2_BLKST_FREE);
if (!rgd)
return;
- trace_gfs2_block_alloc(ip, bstart, blen, GFS2_BLKST_FREE);
+ trace_gfs2_block_alloc(ip, rgd, bstart, blen, GFS2_BLKST_FREE);
rgd->rd_free += blen;
rgd->rd_flags &= ~GFS2_RGF_TRIMMED;
gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1);
@@ -1622,7 +1621,7 @@ void gfs2_unlink_di(struct inode *inode)
rgd = rgblk_free(sdp, blkno, 1, GFS2_BLKST_UNLINKED);
if (!rgd)
return;
- trace_gfs2_block_alloc(ip, blkno, 1, GFS2_BLKST_UNLINKED);
+ trace_gfs2_block_alloc(ip, rgd, blkno, 1, GFS2_BLKST_UNLINKED);
gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1);
gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data);
}
@@ -1652,7 +1651,7 @@ static void gfs2_free_uninit_di(struct gfs2_rgrpd *rgd, u64 blkno)
void gfs2_free_di(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip)
{
gfs2_free_uninit_di(rgd, ip->i_no_addr);
- trace_gfs2_block_alloc(ip, ip->i_no_addr, 1, GFS2_BLKST_FREE);
+ trace_gfs2_block_alloc(ip, rgd, ip->i_no_addr, 1, GFS2_BLKST_FREE);
gfs2_quota_change(ip, -1, ip->i_inode.i_uid, ip->i_inode.i_gid);
gfs2_meta_wipe(ip, ip->i_no_addr, 1);
}
@@ -1752,7 +1751,6 @@ void gfs2_rlist_add(struct gfs2_inode *ip, struct gfs2_rgrp_list *rlist,
* and initialize an array of glock holders for them
* @rlist: the list of resource groups
* @state: the lock state to acquire the RG lock in
- * @flags: the modifier flags for the holder structures
*
* FIXME: Don't use NOFAIL
*
diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c
index d33172c291b..9c2592b1d5f 100644
--- a/fs/gfs2/sys.c
+++ b/fs/gfs2/sys.c
@@ -368,10 +368,7 @@ int gfs2_recover_set(struct gfs2_sbd *sdp, unsigned jid)
struct gfs2_jdesc *jd;
int rv;
- rv = -ESHUTDOWN;
spin_lock(&sdp->sd_jindex_spin);
- if (test_bit(SDF_NORECOVERY, &sdp->sd_flags))
- goto out;
rv = -EBUSY;
if (sdp->sd_jdesc->jd_jid == jid)
goto out;
@@ -396,8 +393,13 @@ static ssize_t recover_store(struct gfs2_sbd *sdp, const char *buf, size_t len)
if (rv != 1)
return -EINVAL;
- rv = gfs2_recover_set(sdp, jid);
+ if (test_bit(SDF_NORECOVERY, &sdp->sd_flags)) {
+ rv = -ESHUTDOWN;
+ goto out;
+ }
+ rv = gfs2_recover_set(sdp, jid);
+out:
return rv ? rv : len;
}
diff --git a/fs/gfs2/trace_gfs2.h b/fs/gfs2/trace_gfs2.h
index dfa89cd7553..1b8b8158819 100644
--- a/fs/gfs2/trace_gfs2.h
+++ b/fs/gfs2/trace_gfs2.h
@@ -457,10 +457,10 @@ TRACE_EVENT(gfs2_bmap,
/* Keep track of blocks as they are allocated/freed */
TRACE_EVENT(gfs2_block_alloc,
- TP_PROTO(const struct gfs2_inode *ip, u64 block, unsigned len,
- u8 block_state),
+ TP_PROTO(const struct gfs2_inode *ip, struct gfs2_rgrpd *rgd,
+ u64 block, unsigned len, u8 block_state),
- TP_ARGS(ip, block, len, block_state),
+ TP_ARGS(ip, rgd, block, len, block_state),
TP_STRUCT__entry(
__field( dev_t, dev )
@@ -468,6 +468,8 @@ TRACE_EVENT(gfs2_block_alloc,
__field( u64, inum )
__field( u32, len )
__field( u8, block_state )
+ __field( u64, rd_addr )
+ __field( u32, rd_free_clone )
),
TP_fast_assign(
@@ -476,14 +478,18 @@ TRACE_EVENT(gfs2_block_alloc,
__entry->inum = ip->i_no_addr;
__entry->len = len;
__entry->block_state = block_state;
+ __entry->rd_addr = rgd->rd_addr;
+ __entry->rd_free_clone = rgd->rd_free_clone;
),
- TP_printk("%u,%u bmap %llu alloc %llu/%lu %s",
+ TP_printk("%u,%u bmap %llu alloc %llu/%lu %s rg:%llu rf:%u",
MAJOR(__entry->dev), MINOR(__entry->dev),
(unsigned long long)__entry->inum,
(unsigned long long)__entry->start,
(unsigned long)__entry->len,
- block_state_name(__entry->block_state))
+ block_state_name(__entry->block_state),
+ (unsigned long long)__entry->rd_addr,
+ __entry->rd_free_clone)
);
#endif /* _TRACE_GFS2_H */
diff --git a/fs/gfs2/trans.c b/fs/gfs2/trans.c
index 86ac75d99d3..ad3e2fb763d 100644
--- a/fs/gfs2/trans.c
+++ b/fs/gfs2/trans.c
@@ -50,8 +50,6 @@ int gfs2_trans_begin(struct gfs2_sbd *sdp, unsigned int blocks,
if (revokes)
tr->tr_reserved += gfs2_struct2blk(sdp, revokes,
sizeof(u64));
- INIT_LIST_HEAD(&tr->tr_list_buf);
-
gfs2_holder_init(sdp->sd_trans_gl, LM_ST_SHARED, 0, &tr->tr_t_gh);
error = gfs2_glock_nq(&tr->tr_t_gh);
@@ -93,10 +91,21 @@ static void gfs2_log_release(struct gfs2_sbd *sdp, unsigned int blks)
up_read(&sdp->sd_log_flush_lock);
}
+static void gfs2_print_trans(const struct gfs2_trans *tr)
+{
+ print_symbol(KERN_WARNING "GFS2: Transaction created at: %s\n", tr->tr_ip);
+ printk(KERN_WARNING "GFS2: blocks=%u revokes=%u reserved=%u touched=%d\n",
+ tr->tr_blocks, tr->tr_revokes, tr->tr_reserved, tr->tr_touched);
+ printk(KERN_WARNING "GFS2: Buf %u/%u Databuf %u/%u Revoke %u/%u\n",
+ tr->tr_num_buf_new, tr->tr_num_buf_rm,
+ tr->tr_num_databuf_new, tr->tr_num_databuf_rm,
+ tr->tr_num_revoke, tr->tr_num_revoke_rm);
+}
+
void gfs2_trans_end(struct gfs2_sbd *sdp)
{
struct gfs2_trans *tr = current->journal_info;
-
+ s64 nbuf;
BUG_ON(!tr);
current->journal_info = NULL;
@@ -110,16 +119,13 @@ void gfs2_trans_end(struct gfs2_sbd *sdp)
return;
}
- if (gfs2_assert_withdraw(sdp, tr->tr_num_buf <= tr->tr_blocks)) {
- fs_err(sdp, "tr_num_buf = %u, tr_blocks = %u ",
- tr->tr_num_buf, tr->tr_blocks);
- print_symbol(KERN_WARNING "GFS2: Transaction created at: %s\n", tr->tr_ip);
- }
- if (gfs2_assert_withdraw(sdp, tr->tr_num_revoke <= tr->tr_revokes)) {
- fs_err(sdp, "tr_num_revoke = %u, tr_revokes = %u ",
- tr->tr_num_revoke, tr->tr_revokes);
- print_symbol(KERN_WARNING "GFS2: Transaction created at: %s\n", tr->tr_ip);
- }
+ nbuf = tr->tr_num_buf_new + tr->tr_num_databuf_new;
+ nbuf -= tr->tr_num_buf_rm;
+ nbuf -= tr->tr_num_databuf_rm;
+
+ if (gfs2_assert_withdraw(sdp, (nbuf <= tr->tr_blocks) &&
+ (tr->tr_num_revoke <= tr->tr_revokes)))
+ gfs2_print_trans(tr);
gfs2_log_commit(sdp, tr);
if (tr->tr_t_gh.gh_gl) {
@@ -152,16 +158,16 @@ void gfs2_trans_add_bh(struct gfs2_glock *gl, struct buffer_head *bh, int meta)
gfs2_attach_bufdata(gl, bh, meta);
bd = bh->b_private;
}
- lops_add(sdp, &bd->bd_le);
+ lops_add(sdp, bd);
}
void gfs2_trans_add_revoke(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd)
{
- BUG_ON(!list_empty(&bd->bd_le.le_list));
+ BUG_ON(!list_empty(&bd->bd_list));
BUG_ON(!list_empty(&bd->bd_ail_st_list));
BUG_ON(!list_empty(&bd->bd_ail_gl_list));
- lops_init_le(&bd->bd_le, &gfs2_revoke_lops);
- lops_add(sdp, &bd->bd_le);
+ lops_init_le(bd, &gfs2_revoke_lops);
+ lops_add(sdp, bd);
}
void gfs2_trans_add_unrevoke(struct gfs2_sbd *sdp, u64 blkno, unsigned int len)
@@ -171,9 +177,9 @@ void gfs2_trans_add_unrevoke(struct gfs2_sbd *sdp, u64 blkno, unsigned int len)
unsigned int n = len;
gfs2_log_lock(sdp);
- list_for_each_entry_safe(bd, tmp, &sdp->sd_log_le_revoke, bd_le.le_list) {
+ list_for_each_entry_safe(bd, tmp, &sdp->sd_log_le_revoke, bd_list) {
if ((bd->bd_blkno >= blkno) && (bd->bd_blkno < (blkno + len))) {
- list_del_init(&bd->bd_le.le_list);
+ list_del_init(&bd->bd_list);
gfs2_assert_withdraw(sdp, sdp->sd_log_num_revoke);
sdp->sd_log_num_revoke--;
kmem_cache_free(gfs2_bufdata_cachep, bd);
diff --git a/fs/gfs2/util.c b/fs/gfs2/util.c
index 9e7765e8e7b..f00d7c5744f 100644
--- a/fs/gfs2/util.c
+++ b/fs/gfs2/util.c
@@ -25,7 +25,8 @@ struct kmem_cache *gfs2_inode_cachep __read_mostly;
struct kmem_cache *gfs2_bufdata_cachep __read_mostly;
struct kmem_cache *gfs2_rgrpd_cachep __read_mostly;
struct kmem_cache *gfs2_quotad_cachep __read_mostly;
-mempool_t *gfs2_bh_pool __read_mostly;
+struct kmem_cache *gfs2_rsrv_cachep __read_mostly;
+mempool_t *gfs2_page_pool __read_mostly;
void gfs2_assert_i(struct gfs2_sbd *sdp)
{
diff --git a/fs/gfs2/util.h b/fs/gfs2/util.h
index a4ce76c67db..3586b0dd6aa 100644
--- a/fs/gfs2/util.h
+++ b/fs/gfs2/util.h
@@ -152,7 +152,8 @@ extern struct kmem_cache *gfs2_inode_cachep;
extern struct kmem_cache *gfs2_bufdata_cachep;
extern struct kmem_cache *gfs2_rgrpd_cachep;
extern struct kmem_cache *gfs2_quotad_cachep;
-extern mempool_t *gfs2_bh_pool;
+extern struct kmem_cache *gfs2_rsrv_cachep;
+extern mempool_t *gfs2_page_pool;
static inline unsigned int gfs2_tune_get_i(struct gfs2_tune *gt,
unsigned int *p)
diff --git a/fs/inode.c b/fs/inode.c
index 9f4f5fecc09..da93f7d160d 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -135,8 +135,8 @@ int inode_init_always(struct super_block *sb, struct inode *inode)
inode->i_fop = &empty_fops;
inode->__i_nlink = 1;
inode->i_opflags = 0;
- inode->i_uid = 0;
- inode->i_gid = 0;
+ i_uid_write(inode, 0);
+ i_gid_write(inode, 0);
atomic_set(&inode->i_writecount, 0);
inode->i_size = 0;
inode->i_blocks = 0;
@@ -1647,6 +1647,7 @@ void __init inode_init_early(void)
HASH_EARLY,
&i_hash_shift,
&i_hash_mask,
+ 0,
0);
for (loop = 0; loop < (1U << i_hash_shift); loop++)
@@ -1677,6 +1678,7 @@ void __init inode_init(void)
0,
&i_hash_shift,
&i_hash_mask,
+ 0,
0);
for (loop = 0; loop < (1U << i_hash_shift); loop++)
@@ -1732,11 +1734,9 @@ EXPORT_SYMBOL(inode_init_owner);
*/
bool inode_owner_or_capable(const struct inode *inode)
{
- struct user_namespace *ns = inode_userns(inode);
-
- if (current_user_ns() == ns && current_fsuid() == inode->i_uid)
+ if (uid_eq(current_fsuid(), inode->i_uid))
return true;
- if (ns_capable(ns, CAP_FOWNER))
+ if (inode_capable(inode, CAP_FOWNER))
return true;
return false;
}
diff --git a/fs/ioprio.c b/fs/ioprio.c
index 0f1b9515213..5e6dbe8958f 100644
--- a/fs/ioprio.c
+++ b/fs/ioprio.c
@@ -37,8 +37,8 @@ int set_task_ioprio(struct task_struct *task, int ioprio)
rcu_read_lock();
tcred = __task_cred(task);
- if (tcred->uid != cred->euid &&
- tcred->uid != cred->uid && !capable(CAP_SYS_NICE)) {
+ if (!uid_eq(tcred->uid, cred->euid) &&
+ !uid_eq(tcred->uid, cred->uid) && !capable(CAP_SYS_NICE)) {
rcu_read_unlock();
return -EPERM;
}
@@ -65,6 +65,7 @@ SYSCALL_DEFINE3(ioprio_set, int, which, int, who, int, ioprio)
struct task_struct *p, *g;
struct user_struct *user;
struct pid *pgrp;
+ kuid_t uid;
int ret;
switch (class) {
@@ -110,16 +111,19 @@ SYSCALL_DEFINE3(ioprio_set, int, which, int, who, int, ioprio)
} while_each_pid_thread(pgrp, PIDTYPE_PGID, p);
break;
case IOPRIO_WHO_USER:
+ uid = make_kuid(current_user_ns(), who);
+ if (!uid_valid(uid))
+ break;
if (!who)
user = current_user();
else
- user = find_user(who);
+ user = find_user(uid);
if (!user)
break;
do_each_thread(g, p) {
- if (__task_cred(p)->uid != who)
+ if (!uid_eq(task_uid(p), uid))
continue;
ret = set_task_ioprio(p, ioprio);
if (ret)
@@ -174,6 +178,7 @@ SYSCALL_DEFINE2(ioprio_get, int, which, int, who)
struct task_struct *g, *p;
struct user_struct *user;
struct pid *pgrp;
+ kuid_t uid;
int ret = -ESRCH;
int tmpio;
@@ -203,16 +208,17 @@ SYSCALL_DEFINE2(ioprio_get, int, which, int, who)
} while_each_pid_thread(pgrp, PIDTYPE_PGID, p);
break;
case IOPRIO_WHO_USER:
+ uid = make_kuid(current_user_ns(), who);
if (!who)
user = current_user();
else
- user = find_user(who);
+ user = find_user(uid);
if (!user)
break;
do_each_thread(g, p) {
- if (__task_cred(p)->uid != user->uid)
+ if (!uid_eq(task_uid(p), user->uid))
continue;
tmpio = get_task_ioprio(p);
if (tmpio < 0)
diff --git a/fs/jbd/checkpoint.c b/fs/jbd/checkpoint.c
index 05f0754f2b4..08c03044abd 100644
--- a/fs/jbd/checkpoint.c
+++ b/fs/jbd/checkpoint.c
@@ -508,20 +508,19 @@ int cleanup_journal_tail(journal_t *journal)
/*
* We need to make sure that any blocks that were recently written out
* --- perhaps by log_do_checkpoint() --- are flushed out before we
- * drop the transactions from the journal. It's unlikely this will be
- * necessary, especially with an appropriately sized journal, but we
- * need this to guarantee correctness. Fortunately
- * cleanup_journal_tail() doesn't get called all that often.
+ * drop the transactions from the journal. Similarly we need to be sure
+ * superblock makes it to disk before next transaction starts reusing
+ * freed space (otherwise we could replay some blocks of the new
+ * transaction thinking they belong to the old one). So we use
+ * WRITE_FLUSH_FUA. It's unlikely this will be necessary, especially
+ * with an appropriately sized journal, but we need this to guarantee
+ * correctness. Fortunately cleanup_journal_tail() doesn't get called
+ * all that often.
*/
- if (journal->j_flags & JFS_BARRIER)
- blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL, NULL);
+ journal_update_sb_log_tail(journal, first_tid, blocknr,
+ WRITE_FLUSH_FUA);
spin_lock(&journal->j_state_lock);
- if (!tid_gt(first_tid, journal->j_tail_sequence)) {
- spin_unlock(&journal->j_state_lock);
- /* Someone else cleaned up journal so return 0 */
- return 0;
- }
/* OK, update the superblock to recover the freed space.
* Physical blocks come first: have we wrapped beyond the end of
* the log? */
@@ -539,8 +538,6 @@ int cleanup_journal_tail(journal_t *journal)
journal->j_tail_sequence = first_tid;
journal->j_tail = blocknr;
spin_unlock(&journal->j_state_lock);
- if (!(journal->j_flags & JFS_ABORT))
- journal_update_superblock(journal, 1);
return 0;
}
diff --git a/fs/jbd/commit.c b/fs/jbd/commit.c
index f2b9a571f4c..52c15c77602 100644
--- a/fs/jbd/commit.c
+++ b/fs/jbd/commit.c
@@ -298,6 +298,7 @@ void journal_commit_transaction(journal_t *journal)
int tag_flag;
int i;
struct blk_plug plug;
+ int write_op = WRITE;
/*
* First job: lock down the current transaction and wait for
@@ -307,7 +308,16 @@ void journal_commit_transaction(journal_t *journal)
/* Do we need to erase the effects of a prior journal_flush? */
if (journal->j_flags & JFS_FLUSHED) {
jbd_debug(3, "super block updated\n");
- journal_update_superblock(journal, 1);
+ mutex_lock(&journal->j_checkpoint_mutex);
+ /*
+ * We hold j_checkpoint_mutex so tail cannot change under us.
+ * We don't need any special data guarantees for writing sb
+ * since journal is empty and it is ok for write to be
+ * flushed only with transaction commit.
+ */
+ journal_update_sb_log_tail(journal, journal->j_tail_sequence,
+ journal->j_tail, WRITE_SYNC);
+ mutex_unlock(&journal->j_checkpoint_mutex);
} else {
jbd_debug(3, "superblock not updated\n");
}
@@ -413,13 +423,16 @@ void journal_commit_transaction(journal_t *journal)
jbd_debug (3, "JBD: commit phase 2\n");
+ if (tid_geq(journal->j_commit_waited, commit_transaction->t_tid))
+ write_op = WRITE_SYNC;
+
/*
* Now start flushing things to disk, in the order they appear
* on the transaction lists. Data blocks go first.
*/
blk_start_plug(&plug);
err = journal_submit_data_buffers(journal, commit_transaction,
- WRITE_SYNC);
+ write_op);
blk_finish_plug(&plug);
/*
@@ -478,7 +491,7 @@ void journal_commit_transaction(journal_t *journal)
blk_start_plug(&plug);
- journal_write_revoke_records(journal, commit_transaction, WRITE_SYNC);
+ journal_write_revoke_records(journal, commit_transaction, write_op);
/*
* If we found any dirty or locked buffers, then we should have
@@ -649,7 +662,7 @@ start_journal_io:
clear_buffer_dirty(bh);
set_buffer_uptodate(bh);
bh->b_end_io = journal_end_buffer_io_sync;
- submit_bh(WRITE_SYNC, bh);
+ submit_bh(write_op, bh);
}
cond_resched();
diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c
index 0971e921780..425c2f2cf17 100644
--- a/fs/jbd/journal.c
+++ b/fs/jbd/journal.c
@@ -563,6 +563,8 @@ int log_wait_commit(journal_t *journal, tid_t tid)
spin_unlock(&journal->j_state_lock);
#endif
spin_lock(&journal->j_state_lock);
+ if (!tid_geq(journal->j_commit_waited, tid))
+ journal->j_commit_waited = tid;
while (tid_gt(tid, journal->j_commit_sequence)) {
jbd_debug(1, "JBD: want %d, j_commit_sequence=%d\n",
tid, journal->j_commit_sequence);
@@ -921,8 +923,33 @@ static int journal_reset(journal_t *journal)
journal->j_max_transaction_buffers = journal->j_maxlen / 4;
- /* Add the dynamic fields and write it to disk. */
- journal_update_superblock(journal, 1);
+ /*
+ * As a special case, if the on-disk copy is already marked as needing
+ * no recovery (s_start == 0), then we can safely defer the superblock
+ * update until the next commit by setting JFS_FLUSHED. This avoids
+ * attempting a write to a potential-readonly device.
+ */
+ if (sb->s_start == 0) {
+ jbd_debug(1,"JBD: Skipping superblock update on recovered sb "
+ "(start %u, seq %d, errno %d)\n",
+ journal->j_tail, journal->j_tail_sequence,
+ journal->j_errno);
+ journal->j_flags |= JFS_FLUSHED;
+ } else {
+ /* Lock here to make assertions happy... */
+ mutex_lock(&journal->j_checkpoint_mutex);
+ /*
+ * Update log tail information. We use WRITE_FUA since new
+ * transaction will start reusing journal space and so we
+ * must make sure information about current log tail is on
+ * disk before that.
+ */
+ journal_update_sb_log_tail(journal,
+ journal->j_tail_sequence,
+ journal->j_tail,
+ WRITE_FUA);
+ mutex_unlock(&journal->j_checkpoint_mutex);
+ }
return journal_start_thread(journal);
}
@@ -999,35 +1026,15 @@ int journal_create(journal_t *journal)
return journal_reset(journal);
}
-/**
- * void journal_update_superblock() - Update journal sb on disk.
- * @journal: The journal to update.
- * @wait: Set to '0' if you don't want to wait for IO completion.
- *
- * Update a journal's dynamic superblock fields and write it to disk,
- * optionally waiting for the IO to complete.
- */
-void journal_update_superblock(journal_t *journal, int wait)
+static void journal_write_superblock(journal_t *journal, int write_op)
{
- journal_superblock_t *sb = journal->j_superblock;
struct buffer_head *bh = journal->j_sb_buffer;
+ int ret;
- /*
- * As a special case, if the on-disk copy is already marked as needing
- * no recovery (s_start == 0) and there are no outstanding transactions
- * in the filesystem, then we can safely defer the superblock update
- * until the next commit by setting JFS_FLUSHED. This avoids
- * attempting a write to a potential-readonly device.
- */
- if (sb->s_start == 0 && journal->j_tail_sequence ==
- journal->j_transaction_sequence) {
- jbd_debug(1,"JBD: Skipping superblock update on recovered sb "
- "(start %u, seq %d, errno %d)\n",
- journal->j_tail, journal->j_tail_sequence,
- journal->j_errno);
- goto out;
- }
-
+ trace_journal_write_superblock(journal, write_op);
+ if (!(journal->j_flags & JFS_BARRIER))
+ write_op &= ~(REQ_FUA | REQ_FLUSH);
+ lock_buffer(bh);
if (buffer_write_io_error(bh)) {
char b[BDEVNAME_SIZE];
/*
@@ -1045,42 +1052,100 @@ void journal_update_superblock(journal_t *journal, int wait)
set_buffer_uptodate(bh);
}
+ get_bh(bh);
+ bh->b_end_io = end_buffer_write_sync;
+ ret = submit_bh(write_op, bh);
+ wait_on_buffer(bh);
+ if (buffer_write_io_error(bh)) {
+ clear_buffer_write_io_error(bh);
+ set_buffer_uptodate(bh);
+ ret = -EIO;
+ }
+ if (ret) {
+ char b[BDEVNAME_SIZE];
+ printk(KERN_ERR "JBD: Error %d detected "
+ "when updating journal superblock for %s.\n",
+ ret, journal_dev_name(journal, b));
+ }
+}
+
+/**
+ * journal_update_sb_log_tail() - Update log tail in journal sb on disk.
+ * @journal: The journal to update.
+ * @tail_tid: TID of the new transaction at the tail of the log
+ * @tail_block: The first block of the transaction at the tail of the log
+ * @write_op: With which operation should we write the journal sb
+ *
+ * Update a journal's superblock information about log tail and write it to
+ * disk, waiting for the IO to complete.
+ */
+void journal_update_sb_log_tail(journal_t *journal, tid_t tail_tid,
+ unsigned int tail_block, int write_op)
+{
+ journal_superblock_t *sb = journal->j_superblock;
+
+ BUG_ON(!mutex_is_locked(&journal->j_checkpoint_mutex));
+ jbd_debug(1,"JBD: updating superblock (start %u, seq %u)\n",
+ tail_block, tail_tid);
+
+ sb->s_sequence = cpu_to_be32(tail_tid);
+ sb->s_start = cpu_to_be32(tail_block);
+
+ journal_write_superblock(journal, write_op);
+
+ /* Log is no longer empty */
+ spin_lock(&journal->j_state_lock);
+ WARN_ON(!sb->s_sequence);
+ journal->j_flags &= ~JFS_FLUSHED;
+ spin_unlock(&journal->j_state_lock);
+}
+
+/**
+ * mark_journal_empty() - Mark on disk journal as empty.
+ * @journal: The journal to update.
+ *
+ * Update a journal's dynamic superblock fields to show that journal is empty.
+ * Write updated superblock to disk waiting for IO to complete.
+ */
+static void mark_journal_empty(journal_t *journal)
+{
+ journal_superblock_t *sb = journal->j_superblock;
+
+ BUG_ON(!mutex_is_locked(&journal->j_checkpoint_mutex));
spin_lock(&journal->j_state_lock);
- jbd_debug(1,"JBD: updating superblock (start %u, seq %d, errno %d)\n",
- journal->j_tail, journal->j_tail_sequence, journal->j_errno);
+ jbd_debug(1, "JBD: Marking journal as empty (seq %d)\n",
+ journal->j_tail_sequence);
sb->s_sequence = cpu_to_be32(journal->j_tail_sequence);
- sb->s_start = cpu_to_be32(journal->j_tail);
- sb->s_errno = cpu_to_be32(journal->j_errno);
+ sb->s_start = cpu_to_be32(0);
spin_unlock(&journal->j_state_lock);
- BUFFER_TRACE(bh, "marking dirty");
- mark_buffer_dirty(bh);
- if (wait) {
- sync_dirty_buffer(bh);
- if (buffer_write_io_error(bh)) {
- char b[BDEVNAME_SIZE];
- printk(KERN_ERR "JBD: I/O error detected "
- "when updating journal superblock for %s.\n",
- journal_dev_name(journal, b));
- clear_buffer_write_io_error(bh);
- set_buffer_uptodate(bh);
- }
- } else
- write_dirty_buffer(bh, WRITE);
+ journal_write_superblock(journal, WRITE_FUA);
- trace_jbd_update_superblock_end(journal, wait);
-out:
- /* If we have just flushed the log (by marking s_start==0), then
- * any future commit will have to be careful to update the
- * superblock again to re-record the true start of the log. */
+ spin_lock(&journal->j_state_lock);
+ /* Log is empty */
+ journal->j_flags |= JFS_FLUSHED;
+ spin_unlock(&journal->j_state_lock);
+}
+
+/**
+ * journal_update_sb_errno() - Update error in the journal.
+ * @journal: The journal to update.
+ *
+ * Update a journal's errno. Write updated superblock to disk waiting for IO
+ * to complete.
+ */
+static void journal_update_sb_errno(journal_t *journal)
+{
+ journal_superblock_t *sb = journal->j_superblock;
spin_lock(&journal->j_state_lock);
- if (sb->s_start)
- journal->j_flags &= ~JFS_FLUSHED;
- else
- journal->j_flags |= JFS_FLUSHED;
+ jbd_debug(1, "JBD: updating superblock error (errno %d)\n",
+ journal->j_errno);
+ sb->s_errno = cpu_to_be32(journal->j_errno);
spin_unlock(&journal->j_state_lock);
+
+ journal_write_superblock(journal, WRITE_SYNC);
}
/*
@@ -1251,6 +1316,8 @@ int journal_destroy(journal_t *journal)
/* Force any old transactions to disk */
+ /* We cannot race with anybody but must keep assertions happy */
+ mutex_lock(&journal->j_checkpoint_mutex);
/* Totally anal locking here... */
spin_lock(&journal->j_list_lock);
while (journal->j_checkpoint_transactions != NULL) {
@@ -1266,16 +1333,14 @@ int journal_destroy(journal_t *journal)
if (journal->j_sb_buffer) {
if (!is_journal_aborted(journal)) {
- /* We can now mark the journal as empty. */
- journal->j_tail = 0;
journal->j_tail_sequence =
++journal->j_transaction_sequence;
- journal_update_superblock(journal, 1);
- } else {
+ mark_journal_empty(journal);
+ } else
err = -EIO;
- }
brelse(journal->j_sb_buffer);
}
+ mutex_unlock(&journal->j_checkpoint_mutex);
if (journal->j_inode)
iput(journal->j_inode);
@@ -1455,7 +1520,6 @@ int journal_flush(journal_t *journal)
{
int err = 0;
transaction_t *transaction = NULL;
- unsigned int old_tail;
spin_lock(&journal->j_state_lock);
@@ -1490,6 +1554,7 @@ int journal_flush(journal_t *journal)
if (is_journal_aborted(journal))
return -EIO;
+ mutex_lock(&journal->j_checkpoint_mutex);
cleanup_journal_tail(journal);
/* Finally, mark the journal as really needing no recovery.
@@ -1497,14 +1562,9 @@ int journal_flush(journal_t *journal)
* the magic code for a fully-recovered superblock. Any future
* commits of data to the journal will restore the current
* s_start value. */
+ mark_journal_empty(journal);
+ mutex_unlock(&journal->j_checkpoint_mutex);
spin_lock(&journal->j_state_lock);
- old_tail = journal->j_tail;
- journal->j_tail = 0;
- spin_unlock(&journal->j_state_lock);
- journal_update_superblock(journal, 1);
- spin_lock(&journal->j_state_lock);
- journal->j_tail = old_tail;
-
J_ASSERT(!journal->j_running_transaction);
J_ASSERT(!journal->j_committing_transaction);
J_ASSERT(!journal->j_checkpoint_transactions);
@@ -1544,8 +1604,12 @@ int journal_wipe(journal_t *journal, int write)
write ? "Clearing" : "Ignoring");
err = journal_skip_recovery(journal);
- if (write)
- journal_update_superblock(journal, 1);
+ if (write) {
+ /* Lock to make assertions happy... */
+ mutex_lock(&journal->j_checkpoint_mutex);
+ mark_journal_empty(journal);
+ mutex_unlock(&journal->j_checkpoint_mutex);
+ }
no_recovery:
return err;
@@ -1613,7 +1677,7 @@ static void __journal_abort_soft (journal_t *journal, int errno)
__journal_abort_hard(journal);
if (errno)
- journal_update_superblock(journal, 1);
+ journal_update_sb_errno(journal);
}
/**
diff --git a/fs/jbd/transaction.c b/fs/jbd/transaction.c
index b2a7e5244e3..febc10db5ce 100644
--- a/fs/jbd/transaction.c
+++ b/fs/jbd/transaction.c
@@ -1433,8 +1433,6 @@ int journal_stop(handle_t *handle)
}
}
- if (handle->h_sync)
- transaction->t_synchronous_commit = 1;
current->journal_info = NULL;
spin_lock(&journal->j_state_lock);
spin_lock(&transaction->t_handle_lock);
diff --git a/fs/libfs.c b/fs/libfs.c
index 18d08f5db53..f86ec27a423 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -68,7 +68,7 @@ struct dentry *simple_lookup(struct inode *dir, struct dentry *dentry, struct na
int dcache_dir_open(struct inode *inode, struct file *file)
{
- static struct qstr cursor_name = {.len = 1, .name = "."};
+ static struct qstr cursor_name = QSTR_INIT(".", 1);
file->private_data = d_alloc(file->f_path.dentry, &cursor_name);
@@ -225,7 +225,7 @@ struct dentry *mount_pseudo(struct file_system_type *fs_type, char *name,
struct super_block *s = sget(fs_type, NULL, set_anon_super, NULL);
struct dentry *dentry;
struct inode *root;
- struct qstr d_name = {.name = name, .len = strlen(name)};
+ struct qstr d_name = QSTR_INIT(name, strlen(name));
if (IS_ERR(s))
return ERR_CAST(s);
diff --git a/fs/locks.c b/fs/locks.c
index 0d68f1f8179..4f441e46cef 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -1446,7 +1446,7 @@ int generic_setlease(struct file *filp, long arg, struct file_lock **flp)
struct inode *inode = dentry->d_inode;
int error;
- if ((current_fsuid() != inode->i_uid) && !capable(CAP_LEASE))
+ if ((!uid_eq(current_fsuid(), inode->i_uid)) && !capable(CAP_LEASE))
return -EACCES;
if (!S_ISREG(inode->i_mode))
return -EINVAL;
diff --git a/fs/namei.c b/fs/namei.c
index c42791914f8..c651f02c9fe 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -16,6 +16,7 @@
#include <linux/init.h>
#include <linux/export.h>
+#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/fs.h>
#include <linux/namei.h>
@@ -116,47 +117,37 @@
* POSIX.1 2.4: an empty pathname is invalid (ENOENT).
* PATH_MAX includes the nul terminator --RR.
*/
-static int do_getname(const char __user *filename, char *page)
-{
- int retval;
- unsigned long len = PATH_MAX;
-
- if (!segment_eq(get_fs(), KERNEL_DS)) {
- if ((unsigned long) filename >= TASK_SIZE)
- return -EFAULT;
- if (TASK_SIZE - (unsigned long) filename < PATH_MAX)
- len = TASK_SIZE - (unsigned long) filename;
- }
-
- retval = strncpy_from_user(page, filename, len);
- if (retval > 0) {
- if (retval < len)
- return 0;
- return -ENAMETOOLONG;
- } else if (!retval)
- retval = -ENOENT;
- return retval;
-}
-
static char *getname_flags(const char __user *filename, int flags, int *empty)
{
- char *result = __getname();
- int retval;
+ char *result = __getname(), *err;
+ int len;
- if (!result)
+ if (unlikely(!result))
return ERR_PTR(-ENOMEM);
- retval = do_getname(filename, result);
- if (retval < 0) {
- if (retval == -ENOENT && empty)
+ len = strncpy_from_user(result, filename, PATH_MAX);
+ err = ERR_PTR(len);
+ if (unlikely(len < 0))
+ goto error;
+
+ /* The empty path is special. */
+ if (unlikely(!len)) {
+ if (empty)
*empty = 1;
- if (retval != -ENOENT || !(flags & LOOKUP_EMPTY)) {
- __putname(result);
- return ERR_PTR(retval);
- }
+ err = ERR_PTR(-ENOENT);
+ if (!(flags & LOOKUP_EMPTY))
+ goto error;
+ }
+
+ err = ERR_PTR(-ENAMETOOLONG);
+ if (likely(len < PATH_MAX)) {
+ audit_getname(result);
+ return result;
}
- audit_getname(result);
- return result;
+
+error:
+ __putname(result);
+ return err;
}
char *getname(const char __user * filename)
@@ -228,10 +219,7 @@ static int acl_permission_check(struct inode *inode, int mask)
{
unsigned int mode = inode->i_mode;
- if (current_user_ns() != inode_userns(inode))
- goto other_perms;
-
- if (likely(current_fsuid() == inode->i_uid))
+ if (likely(uid_eq(current_fsuid(), inode->i_uid)))
mode >>= 6;
else {
if (IS_POSIXACL(inode) && (mode & S_IRWXG)) {
@@ -244,7 +232,6 @@ static int acl_permission_check(struct inode *inode, int mask)
mode >>= 3;
}
-other_perms:
/*
* If the DACs are ok we don't need any capability check.
*/
@@ -280,10 +267,10 @@ int generic_permission(struct inode *inode, int mask)
if (S_ISDIR(inode->i_mode)) {
/* DACs are overridable for directories */
- if (ns_capable(inode_userns(inode), CAP_DAC_OVERRIDE))
+ if (inode_capable(inode, CAP_DAC_OVERRIDE))
return 0;
if (!(mask & MAY_WRITE))
- if (ns_capable(inode_userns(inode), CAP_DAC_READ_SEARCH))
+ if (inode_capable(inode, CAP_DAC_READ_SEARCH))
return 0;
return -EACCES;
}
@@ -293,7 +280,7 @@ int generic_permission(struct inode *inode, int mask)
* at least one exec bit set.
*/
if (!(mask & MAY_EXEC) || (inode->i_mode & S_IXUGO))
- if (ns_capable(inode_userns(inode), CAP_DAC_OVERRIDE))
+ if (inode_capable(inode, CAP_DAC_OVERRIDE))
return 0;
/*
@@ -301,7 +288,7 @@ int generic_permission(struct inode *inode, int mask)
*/
mask &= MAY_READ | MAY_WRITE | MAY_EXEC;
if (mask == MAY_READ)
- if (ns_capable(inode_userns(inode), CAP_DAC_READ_SEARCH))
+ if (inode_capable(inode, CAP_DAC_READ_SEARCH))
return 0;
return -EACCES;
@@ -1154,12 +1141,25 @@ static int do_lookup(struct nameidata *nd, struct qstr *name,
*/
if (nd->flags & LOOKUP_RCU) {
unsigned seq;
- *inode = nd->inode;
- dentry = __d_lookup_rcu(parent, name, &seq, inode);
+ dentry = __d_lookup_rcu(parent, name, &seq, nd->inode);
if (!dentry)
goto unlazy;
- /* Memory barrier in read_seqcount_begin of child is enough */
+ /*
+ * This sequence count validates that the inode matches
+ * the dentry name information from lookup.
+ */
+ *inode = dentry->d_inode;
+ if (read_seqcount_retry(&dentry->d_seq, seq))
+ return -ECHILD;
+
+ /*
+ * This sequence count validates that the parent had no
+ * changes while we did the lookup of the dentry above.
+ *
+ * The memory barrier in read_seqcount_begin of child is
+ * enough, we can use __read_seqcount_retry here.
+ */
if (__read_seqcount_retry(&parent->d_seq, nd->seq))
return -ECHILD;
nd->seq = seq;
@@ -1452,7 +1452,8 @@ EXPORT_SYMBOL(full_name_hash);
*/
static inline unsigned long hash_name(const char *name, unsigned int *hashp)
{
- unsigned long a, mask, hash, len;
+ unsigned long a, b, adata, bdata, mask, hash, len;
+ const struct word_at_a_time constants = WORD_AT_A_TIME_CONSTANTS;
hash = a = 0;
len = -sizeof(unsigned long);
@@ -1460,17 +1461,18 @@ static inline unsigned long hash_name(const char *name, unsigned int *hashp)
hash = (hash + a) * 9;
len += sizeof(unsigned long);
a = load_unaligned_zeropad(name+len);
- /* Do we have any NUL or '/' bytes in this word? */
- mask = has_zero(a) | has_zero(a ^ REPEAT_BYTE('/'));
- } while (!mask);
-
- /* The mask *below* the first high bit set */
- mask = (mask - 1) & ~mask;
- mask >>= 7;
- hash += a & mask;
+ b = a ^ REPEAT_BYTE('/');
+ } while (!(has_zero(a, &adata, &constants) | has_zero(b, &bdata, &constants)));
+
+ adata = prep_zero_mask(a, adata, &constants);
+ bdata = prep_zero_mask(b, bdata, &constants);
+
+ mask = create_zero_mask(adata | bdata);
+
+ hash += a & zero_bytemask(mask);
*hashp = fold_hash(hash);
- return len + count_masked_bytes(mask);
+ return len + find_zero(mask);
}
#else
@@ -1931,19 +1933,15 @@ static int user_path_parent(int dfd, const char __user *path,
*/
static inline int check_sticky(struct inode *dir, struct inode *inode)
{
- uid_t fsuid = current_fsuid();
+ kuid_t fsuid = current_fsuid();
if (!(dir->i_mode & S_ISVTX))
return 0;
- if (current_user_ns() != inode_userns(inode))
- goto other_userns;
- if (inode->i_uid == fsuid)
+ if (uid_eq(inode->i_uid, fsuid))
return 0;
- if (dir->i_uid == fsuid)
+ if (uid_eq(dir->i_uid, fsuid))
return 0;
-
-other_userns:
- return !ns_capable(inode_userns(inode), CAP_FOWNER);
+ return !inode_capable(inode, CAP_FOWNER);
}
/*
@@ -2531,8 +2529,7 @@ int vfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev)
if (error)
return error;
- if ((S_ISCHR(mode) || S_ISBLK(mode)) &&
- !ns_capable(inode_userns(dir), CAP_MKNOD))
+ if ((S_ISCHR(mode) || S_ISBLK(mode)) && !capable(CAP_MKNOD))
return -EPERM;
if (!dir->i_op->mknod)
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 8789210c690..eedd24d0ad2 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -477,10 +477,7 @@ different:
static
void nfs_prime_dcache(struct dentry *parent, struct nfs_entry *entry)
{
- struct qstr filename = {
- .len = entry->len,
- .name = entry->name,
- };
+ struct qstr filename = QSTR_INIT(entry->name, entry->len);
struct dentry *dentry;
struct dentry *alias;
struct inode *dir = parent->d_inode;
diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c
index 5242eae6711..75c68299358 100644
--- a/fs/nfs/nfs3proc.c
+++ b/fs/nfs/nfs3proc.c
@@ -398,8 +398,7 @@ nfs3_proc_remove(struct inode *dir, struct qstr *name)
{
struct nfs_removeargs arg = {
.fh = NFS_FH(dir),
- .name.len = name->len,
- .name.name = name->name,
+ .name = *name,
};
struct nfs_removeres res;
struct rpc_message msg = {
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 99650aaf893..ab985f6f0da 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -2782,8 +2782,7 @@ static int _nfs4_proc_remove(struct inode *dir, struct qstr *name)
struct nfs_server *server = NFS_SERVER(dir);
struct nfs_removeargs args = {
.fh = NFS_FH(dir),
- .name.len = name->len,
- .name.name = name->name,
+ .name = *name,
.bitmask = server->attr_bitmask,
};
struct nfs_removeres res = {
diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c
index b63b6f4d14f..d6408b6437d 100644
--- a/fs/nfs/proc.c
+++ b/fs/nfs/proc.c
@@ -335,8 +335,7 @@ nfs_proc_remove(struct inode *dir, struct qstr *name)
{
struct nfs_removeargs arg = {
.fh = NFS_FH(dir),
- .name.len = name->len,
- .name.name = name->name,
+ .name = *name,
};
struct rpc_message msg = {
.rpc_proc = &nfs_procedures[NFSPROC_REMOVE],
diff --git a/fs/nfsd/auth.c b/fs/nfsd/auth.c
index 79717a40dab..204438cc914 100644
--- a/fs/nfsd/auth.c
+++ b/fs/nfsd/auth.c
@@ -1,6 +1,7 @@
/* Copyright (C) 1995, 1996 Olaf Kirch <okir@monad.swb.de> */
#include <linux/sched.h>
+#include <linux/user_namespace.h>
#include "nfsd.h"
#include "auth.h"
@@ -56,8 +57,8 @@ int nfsd_setuser(struct svc_rqst *rqstp, struct svc_export *exp)
goto oom;
for (i = 0; i < rqgi->ngroups; i++) {
- if (!GROUP_AT(rqgi, i))
- GROUP_AT(gi, i) = exp->ex_anon_gid;
+ if (gid_eq(GLOBAL_ROOT_GID, GROUP_AT(rqgi, i)))
+ GROUP_AT(gi, i) = make_kgid(&init_user_ns, exp->ex_anon_gid);
else
GROUP_AT(gi, i) = GROUP_AT(rqgi, i);
}
diff --git a/fs/nilfs2/namei.c b/fs/nilfs2/namei.c
index fce2bbee66d..0bb2c2010b9 100644
--- a/fs/nilfs2/namei.c
+++ b/fs/nilfs2/namei.c
@@ -441,7 +441,7 @@ static struct dentry *nilfs_get_parent(struct dentry *child)
{
unsigned long ino;
struct inode *inode;
- struct qstr dotdot = {.name = "..", .len = 2};
+ struct qstr dotdot = QSTR_INIT("..", 2);
struct nilfs_root *root;
ino = nilfs_inode_by_name(child->d_inode, &dotdot);
diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c
index 044e7b58d31..1bfe8802cc1 100644
--- a/fs/ocfs2/cluster/tcp.c
+++ b/fs/ocfs2/cluster/tcp.c
@@ -2005,7 +2005,7 @@ static int o2net_open_listening_sock(__be32 addr, __be16 port)
o2net_listen_sock = sock;
INIT_WORK(&o2net_listen_work, o2net_accept_many);
- sock->sk->sk_reuse = 1;
+ sock->sk->sk_reuse = SK_CAN_REUSE;
ret = sock->ops->bind(sock, (struct sockaddr *)&sin, sizeof(sin));
if (ret < 0) {
printk(KERN_ERR "o2net: Error %d while binding socket at "
diff --git a/fs/open.c b/fs/open.c
index 5720854156d..d54301219d0 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -316,7 +316,8 @@ SYSCALL_DEFINE3(faccessat, int, dfd, const char __user *, filename, int, mode)
if (!issecure(SECURE_NO_SETUID_FIXUP)) {
/* Clear the capabilities if we switch to a non-root user */
- if (override_cred->uid)
+ kuid_t root_uid = make_kuid(override_cred->user_ns, 0);
+ if (!uid_eq(override_cred->uid, root_uid))
cap_clear(override_cred->cap_effective);
else
override_cred->cap_effective =
@@ -505,15 +506,24 @@ static int chown_common(struct path *path, uid_t user, gid_t group)
struct inode *inode = path->dentry->d_inode;
int error;
struct iattr newattrs;
+ kuid_t uid;
+ kgid_t gid;
+
+ uid = make_kuid(current_user_ns(), user);
+ gid = make_kgid(current_user_ns(), group);
newattrs.ia_valid = ATTR_CTIME;
if (user != (uid_t) -1) {
+ if (!uid_valid(uid))
+ return -EINVAL;
newattrs.ia_valid |= ATTR_UID;
- newattrs.ia_uid = user;
+ newattrs.ia_uid = uid;
}
if (group != (gid_t) -1) {
+ if (!gid_valid(gid))
+ return -EINVAL;
newattrs.ia_valid |= ATTR_GID;
- newattrs.ia_gid = group;
+ newattrs.ia_gid = gid;
}
if (!S_ISDIR(inode->i_mode))
newattrs.ia_valid |=
@@ -681,7 +691,7 @@ static struct file *__dentry_open(struct dentry *dentry, struct vfsmount *mnt,
f->f_op = fops_get(inode->i_fop);
- error = security_dentry_open(f, cred);
+ error = security_file_open(f, cred);
if (error)
goto cleanup_all;
diff --git a/fs/proc/array.c b/fs/proc/array.c
index f9bd395b347..dc4c5a7b9ec 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -81,6 +81,7 @@
#include <linux/pid_namespace.h>
#include <linux/ptrace.h>
#include <linux/tracehook.h>
+#include <linux/user_namespace.h>
#include <asm/pgtable.h>
#include <asm/processor.h>
@@ -161,6 +162,7 @@ static inline const char *get_task_state(struct task_struct *tsk)
static inline void task_state(struct seq_file *m, struct pid_namespace *ns,
struct pid *pid, struct task_struct *p)
{
+ struct user_namespace *user_ns = current_user_ns();
struct group_info *group_info;
int g;
struct fdtable *fdt = NULL;
@@ -189,8 +191,14 @@ static inline void task_state(struct seq_file *m, struct pid_namespace *ns,
task_tgid_nr_ns(p, ns),
pid_nr_ns(pid, ns),
ppid, tpid,
- cred->uid, cred->euid, cred->suid, cred->fsuid,
- cred->gid, cred->egid, cred->sgid, cred->fsgid);
+ from_kuid_munged(user_ns, cred->uid),
+ from_kuid_munged(user_ns, cred->euid),
+ from_kuid_munged(user_ns, cred->suid),
+ from_kuid_munged(user_ns, cred->fsuid),
+ from_kgid_munged(user_ns, cred->gid),
+ from_kgid_munged(user_ns, cred->egid),
+ from_kgid_munged(user_ns, cred->sgid),
+ from_kgid_munged(user_ns, cred->fsgid));
task_lock(p);
if (p->files)
@@ -205,7 +213,8 @@ static inline void task_state(struct seq_file *m, struct pid_namespace *ns,
task_unlock(p);
for (g = 0; g < min(group_info->ngroups, NGROUPS_SMALL); g++)
- seq_printf(m, "%d ", GROUP_AT(group_info, g));
+ seq_printf(m, "%d ",
+ from_kgid_munged(user_ns, GROUP_AT(group_info, g)));
put_cred(cred);
seq_putc(m, '\n');
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 57b8159f26f..d2d3108a611 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -81,6 +81,7 @@
#include <linux/oom.h>
#include <linux/elf.h>
#include <linux/pid_namespace.h>
+#include <linux/user_namespace.h>
#include <linux/fs_struct.h>
#include <linux/slab.h>
#include <linux/flex_array.h>
@@ -1561,8 +1562,8 @@ int pid_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
generic_fillattr(inode, stat);
rcu_read_lock();
- stat->uid = 0;
- stat->gid = 0;
+ stat->uid = GLOBAL_ROOT_UID;
+ stat->gid = GLOBAL_ROOT_GID;
task = pid_task(proc_pid(inode), PIDTYPE_PID);
if (task) {
if (!has_pid_permissions(pid, task, 2)) {
@@ -1622,8 +1623,8 @@ int pid_revalidate(struct dentry *dentry, struct nameidata *nd)
inode->i_gid = cred->egid;
rcu_read_unlock();
} else {
- inode->i_uid = 0;
- inode->i_gid = 0;
+ inode->i_uid = GLOBAL_ROOT_UID;
+ inode->i_gid = GLOBAL_ROOT_GID;
}
inode->i_mode &= ~(S_ISUID | S_ISGID);
security_task_to_inode(task, inode);
@@ -1815,8 +1816,8 @@ static int tid_fd_revalidate(struct dentry *dentry, struct nameidata *nd)
inode->i_gid = cred->egid;
rcu_read_unlock();
} else {
- inode->i_uid = 0;
- inode->i_gid = 0;
+ inode->i_uid = GLOBAL_ROOT_UID;
+ inode->i_gid = GLOBAL_ROOT_GID;
}
i_mode = S_IFLNK;
@@ -2045,8 +2046,8 @@ static int map_files_d_revalidate(struct dentry *dentry, struct nameidata *nd)
inode->i_gid = cred->egid;
rcu_read_unlock();
} else {
- inode->i_uid = 0;
- inode->i_gid = 0;
+ inode->i_uid = GLOBAL_ROOT_UID;
+ inode->i_gid = GLOBAL_ROOT_GID;
}
security_task_to_inode(task, inode);
status = 1;
@@ -2924,6 +2925,74 @@ static int proc_tgid_io_accounting(struct task_struct *task, char *buffer)
}
#endif /* CONFIG_TASK_IO_ACCOUNTING */
+#ifdef CONFIG_USER_NS
+/*
+ * Shared open() helper for the uid_map and gid_map files.
+ *
+ * Takes a reference on the user namespace found in the target task's
+ * credentials, opens a seq_file with @seq_ops and stores the namespace in
+ * seq->private.  Returns 0 on success, -EINVAL when no task or namespace
+ * could be obtained, or the error returned by seq_open().
+ */
+static int proc_id_map_open(struct inode *inode, struct file *file,
+	struct seq_operations *seq_ops)
+{
+	struct user_namespace *ns;
+	struct task_struct *task;
+	struct seq_file *seq;
+	int ret;
+
+	task = get_proc_task(inode);
+	if (!task)
+		return -EINVAL;
+
+	/* The task reference is only needed long enough to grab its ns. */
+	rcu_read_lock();
+	ns = get_user_ns(task_cred_xxx(task, user_ns));
+	rcu_read_unlock();
+	put_task_struct(task);
+
+	if (!ns)
+		return -EINVAL;
+
+	ret = seq_open(file, seq_ops);
+	if (ret) {
+		/* Drop the namespace reference taken above. */
+		put_user_ns(ns);
+		return ret;
+	}
+
+	seq = file->private_data;
+	seq->private = ns;
+
+	return 0;
+}
+
+/*
+ * release() counterpart of proc_id_map_open(): drops the user namespace
+ * reference stored in seq->private, then releases the seq_file.
+ */
+static int proc_id_map_release(struct inode *inode, struct file *file)
+{
+	struct seq_file *seq = file->private_data;
+
+	put_user_ns(seq->private);
+
+	return seq_release(inode, file);
+}
+
+/* open() handler for uid_map: proc_id_map_open() with the uid seq operations. */
+static int proc_uid_map_open(struct inode *inode, struct file *file)
+{
+ return proc_id_map_open(inode, file, &proc_uid_seq_operations);
+}
+
+/* open() handler for gid_map: proc_id_map_open() with the gid seq operations. */
+static int proc_gid_map_open(struct inode *inode, struct file *file)
+{
+ return proc_id_map_open(inode, file, &proc_gid_seq_operations);
+}
+
+/*
+ * File operations for the uid_map entry: reads go through the seq_file
+ * helpers, writes through proc_uid_map_write(), and release drops the
+ * namespace reference taken at open time.
+ */
+static const struct file_operations proc_uid_map_operations = {
+ .open = proc_uid_map_open,
+ .write = proc_uid_map_write,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = proc_id_map_release,
+};
+
+/*
+ * File operations for the gid_map entry: reads go through the seq_file
+ * helpers, writes through proc_gid_map_write(), and release drops the
+ * namespace reference taken at open time.
+ */
+static const struct file_operations proc_gid_map_operations = {
+ .open = proc_gid_map_open,
+ .write = proc_gid_map_write,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = proc_id_map_release,
+};
+#endif /* CONFIG_USER_NS */
+
static int proc_pid_personality(struct seq_file *m, struct pid_namespace *ns,
struct pid *pid, struct task_struct *task)
{
@@ -3026,6 +3095,10 @@ static const struct pid_entry tgid_base_stuff[] = {
#ifdef CONFIG_HARDWALL
INF("hardwall", S_IRUGO, proc_pid_hardwall),
#endif
+#ifdef CONFIG_USER_NS
+ REG("uid_map", S_IRUGO|S_IWUSR, proc_uid_map_operations),
+ REG("gid_map", S_IRUGO|S_IWUSR, proc_gid_map_operations),
+#endif
};
static int proc_tgid_base_readdir(struct file * filp,
@@ -3381,6 +3454,10 @@ static const struct pid_entry tid_base_stuff[] = {
#ifdef CONFIG_HARDWALL
INF("hardwall", S_IRUGO, proc_pid_hardwall),
#endif
+#ifdef CONFIG_USER_NS
+ REG("uid_map", S_IRUGO|S_IWUSR, proc_uid_map_operations),
+ REG("gid_map", S_IRUGO|S_IWUSR, proc_gid_map_operations),
+#endif
};
static int proc_tid_base_readdir(struct file * filp,
diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index 205c9228083..554ecc54799 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -108,8 +108,8 @@ static int proc_show_options(struct seq_file *seq, struct dentry *root)
struct super_block *sb = root->d_sb;
struct pid_namespace *pid = sb->s_fs_info;
- if (pid->pid_gid)
- seq_printf(seq, ",gid=%lu", (unsigned long)pid->pid_gid);
+ if (!gid_eq(pid->pid_gid, GLOBAL_ROOT_GID))
+ seq_printf(seq, ",gid=%u", from_kgid_munged(&init_user_ns, pid->pid_gid));
if (pid->hide_pid != 0)
seq_printf(seq, ",hidepid=%u", pid->hide_pid);
diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
index 21d836f4029..3476bca8f7a 100644
--- a/fs/proc/proc_sysctl.c
+++ b/fs/proc/proc_sysctl.c
@@ -371,9 +371,9 @@ void register_sysctl_root(struct ctl_table_root *root)
static int test_perm(int mode, int op)
{
- if (!current_euid())
+ if (uid_eq(current_euid(), GLOBAL_ROOT_UID))
mode >>= 6;
- else if (in_egroup_p(0))
+ else if (in_egroup_p(GLOBAL_ROOT_GID))
mode >>= 3;
if ((op & ~mode & (MAY_READ|MAY_WRITE|MAY_EXEC)) == 0)
return 0;
diff --git a/fs/proc/root.c b/fs/proc/root.c
index eed44bfc85d..7c30fce037c 100644
--- a/fs/proc/root.c
+++ b/fs/proc/root.c
@@ -67,7 +67,7 @@ static int proc_parse_options(char *options, struct pid_namespace *pid)
case Opt_gid:
if (match_int(&args[0], &option))
return 0;
- pid->pid_gid = option;
+ pid->pid_gid = make_kgid(current_user_ns(), option);
break;
case Opt_hidepid:
if (match_int(&args[0], &option))
diff --git a/fs/pstore/Kconfig b/fs/pstore/Kconfig
index 8007ae7c0d8..23ade2680a4 100644
--- a/fs/pstore/Kconfig
+++ b/fs/pstore/Kconfig
@@ -11,3 +11,20 @@ config PSTORE
(e.g. ACPI_APEI on X86) which will select this for you.
If you don't have a platform persistent store driver,
say N.
+
+config PSTORE_RAM
+ tristate "Log panic/oops to a RAM buffer"
+ depends on PSTORE
+ depends on HAS_IOMEM
+ depends on HAVE_MEMBLOCK
+ select REED_SOLOMON
+ select REED_SOLOMON_ENC8
+ select REED_SOLOMON_DEC8
+ help
+ This enables panic and oops messages to be logged to a circular
+ buffer in RAM where they can be read back at some later point.
+
+ Note that for historical reasons, the module will be named
+ "ramoops.ko".
+
+ For more information, see Documentation/ramoops.txt.
diff --git a/fs/pstore/Makefile b/fs/pstore/Makefile
index 760f4bce7d1..278a44e0d4e 100644
--- a/fs/pstore/Makefile
+++ b/fs/pstore/Makefile
@@ -5,3 +5,6 @@
obj-y += pstore.o
pstore-objs += inode.o platform.o
+
+ramoops-objs += ram.o ram_core.o
+obj-$(CONFIG_PSTORE_RAM) += ramoops.o
diff --git a/fs/pstore/ram.c b/fs/pstore/ram.c
new file mode 100644
index 00000000000..9123cce28c1
--- /dev/null
+++ b/fs/pstore/ram.c
@@ -0,0 +1,383 @@
+/*
+ * RAM Oops/Panic logger
+ *
+ * Copyright (C) 2010 Marco Stornelli <marco.stornelli@gmail.com>
+ * Copyright (C) 2011 Kees Cook <keescook@chromium.org>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
+ * 02110-1301 USA
+ *
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/kernel.h>
+#include <linux/err.h>
+#include <linux/module.h>
+#include <linux/pstore.h>
+#include <linux/time.h>
+#include <linux/io.h>
+#include <linux/ioport.h>
+#include <linux/platform_device.h>
+#include <linux/slab.h>
+#include <linux/pstore_ram.h>
+
+#define RAMOOPS_KERNMSG_HDR "===="
+#define MIN_MEM_SIZE 4096UL
+
+/*
+ * Module parameters.  ramoops_init() copies them into freshly built
+ * platform data when no platform device is found, and ramoops_probe()
+ * writes the values it actually used back into these variables.
+ */
+static ulong record_size = MIN_MEM_SIZE;
+module_param(record_size, ulong, 0400);
+MODULE_PARM_DESC(record_size,
+ "size of each dump done on oops/panic");
+
+static ulong mem_address;
+module_param(mem_address, ulong, 0400);
+MODULE_PARM_DESC(mem_address,
+ "start of reserved RAM used to store oops/panic logs");
+
+static ulong mem_size;
+module_param(mem_size, ulong, 0400);
+MODULE_PARM_DESC(mem_size,
+ "size of reserved RAM used to store oops/panic logs");
+
+static int dump_oops = 1;
+module_param(dump_oops, int, 0600);
+MODULE_PARM_DESC(dump_oops,
+ "set to 1 to dump oopses, 0 to only dump panics (default 1)");
+
+/*
+ * ECC switch.  The variable is ramoops_ecc but the parameter is exposed to
+ * users as "ecc" (module_param_named), so the description must be attached
+ * to "ecc" as well; otherwise it documents a parameter that does not exist.
+ */
+static int ramoops_ecc;
+module_param_named(ecc, ramoops_ecc, int, 0600);
+MODULE_PARM_DESC(ecc,
+		"set to 1 to enable ECC support");
+
+/* Driver state for the single ramoops area (see oops_cxt). */
+struct ramoops_context {
+ struct persistent_ram_zone **przs; /* array of max_count zones, one per record */
+ phys_addr_t phys_addr; /* start of the area (platform data mem_address) */
+ unsigned long size; /* total size of the area (platform data mem_size) */
+ size_t record_size; /* size of each zone */
+ int dump_oops; /* when zero, oops dumps are rejected; panics still stored */
+ bool ecc; /* passed to persistent_ram_new() for every zone */
+ unsigned int count; /* index of the zone the next write goes to */
+ unsigned int max_count; /* number of zones: size / record_size; 0 = not probed */
+ unsigned int read_count; /* next zone to return from read; reset on open */
+ struct pstore_info pstore; /* descriptor registered with pstore */
+};
+
+static struct platform_device *dummy;
+static struct ramoops_platform_data *dummy_data;
+
+/* pstore open callback: restart record iteration from the first zone. */
+static int ramoops_pstore_open(struct pstore_info *psi)
+{
+ struct ramoops_context *cxt = psi->data;
+
+ cxt->read_count = 0;
+ return 0;
+}
+
+/*
+ * pstore read callback: hand out the saved ("old") contents of the next
+ * zone.  Each call consumes one zone; once all zones have been visited
+ * -EINVAL is returned.  On success *buf is a kmalloc()ed copy of the old
+ * log and the return value is its length; -ENOMEM if the copy cannot be
+ * allocated.
+ */
+static ssize_t ramoops_pstore_read(u64 *id, enum pstore_type_id *type,
+	struct timespec *time,
+	char **buf,
+	struct pstore_info *psi)
+{
+	struct ramoops_context *cxt = psi->data;
+	struct persistent_ram_zone *zone;
+	ssize_t len;
+
+	if (cxt->read_count >= cxt->max_count)
+		return -EINVAL;
+
+	/* The record id is simply the zone index. */
+	*id = cxt->read_count++;
+	zone = cxt->przs[*id];
+
+	/* Only supports dmesg output so far. */
+	*type = PSTORE_TYPE_DMESG;
+
+	/* TODO(kees): Bogus time for the moment. */
+	time->tv_sec = 0;
+	time->tv_nsec = 0;
+
+	len = persistent_ram_old_size(zone);
+	*buf = kmalloc(len, GFP_KERNEL);
+	if (!*buf)
+		return -ENOMEM;
+
+	memcpy(*buf, persistent_ram_old(zone), len);
+
+	return len;
+}
+
+/*
+ * Write the "====<sec>.<usec>\n" record header into @prz.
+ *
+ * Returns the number of header bytes written, or 0 if the header string
+ * could not be allocated (a one-time warning is emitted and nothing is
+ * written in that case).
+ */
+static size_t ramoops_write_kmsg_hdr(struct persistent_ram_zone *prz)
+{
+	struct timeval timestamp;
+	size_t len;
+	char *hdr;
+
+	do_gettimeofday(&timestamp);
+	/* %lu takes unsigned long; cast accordingly to match the format. */
+	hdr = kasprintf(GFP_ATOMIC, RAMOOPS_KERNMSG_HDR "%lu.%lu\n",
+		(unsigned long)timestamp.tv_sec,
+		(unsigned long)timestamp.tv_usec);
+	/* Do not hand a NULL source buffer down to the zone writer. */
+	if (WARN_ON_ONCE(!hdr))
+		return 0;
+
+	len = strlen(hdr);
+	persistent_ram_write(prz, hdr, len);
+	kfree(hdr);
+
+	return len;
+}
+
+/*
+ * pstore write callback: store the first part of an oops/panic dmesg dump
+ * (taken from the pstore buffer) into the current zone, prefixed with a
+ * timestamp header, then advance to the next zone (wrapping around).
+ *
+ * Returns 0 on success, -EINVAL for record types/reasons that are not
+ * stored, and -ENOSPC for any part other than the first.
+ */
+static int ramoops_pstore_write(enum pstore_type_id type,
+	enum kmsg_dump_reason reason,
+	u64 *id,
+	unsigned int part,
+	size_t size, struct pstore_info *psi)
+{
+	struct ramoops_context *cxt = psi->data;
+	struct persistent_ram_zone *zone;
+	size_t hdr_len;
+
+	/* Currently ramoops is designed to only store dmesg dumps. */
+	if (type != PSTORE_TYPE_DMESG)
+		return -EINVAL;
+
+	/* Out of the various dmesg dump types, ramoops is currently designed
+	 * to only store crash logs, rather than storing general kernel logs.
+	 */
+	switch (reason) {
+	case KMSG_DUMP_PANIC:
+		break;
+	case KMSG_DUMP_OOPS:
+		/* Skip oopses when configured to do so. */
+		if (!cxt->dump_oops)
+			return -EINVAL;
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	/* Explicitly only take the first part of any new crash.
+	 * If our buffer is larger than kmsg_bytes, this can never happen,
+	 * and if our buffer is smaller than kmsg_bytes, we don't want the
+	 * report split across multiple records.
+	 */
+	if (part != 1)
+		return -ENOSPC;
+
+	zone = cxt->przs[cxt->count];
+
+	/* Header first, then as much of the dump as still fits. */
+	hdr_len = ramoops_write_kmsg_hdr(zone);
+	if (size + hdr_len > zone->buffer_size)
+		size = zone->buffer_size - hdr_len;
+	persistent_ram_write(zone, cxt->pstore.buf, size);
+
+	cxt->count = (cxt->count + 1) % cxt->max_count;
+
+	return 0;
+}
+
+/*
+ * pstore erase callback: drop the saved old log of zone @id.
+ * Returns -EINVAL when @id is not a valid zone index.
+ */
+static int ramoops_pstore_erase(enum pstore_type_id type, u64 id,
+	struct pstore_info *psi)
+{
+	struct ramoops_context *cxt = psi->data;
+
+	if (id < cxt->max_count) {
+		persistent_ram_free_old(cxt->przs[id]);
+		return 0;
+	}
+
+	return -EINVAL;
+}
+
+/*
+ * The one and only ramoops context.  Only the pstore callbacks are set
+ * statically; everything else is filled in by ramoops_probe().
+ */
+static struct ramoops_context oops_cxt = {
+ .pstore = {
+ .owner = THIS_MODULE,
+ .name = "ramoops",
+ .open = ramoops_pstore_open,
+ .read = ramoops_pstore_read,
+ .write = ramoops_pstore_write,
+ .erase = ramoops_pstore_erase,
+ },
+};
+
+/*
+ * Probe the single ramoops area described by the device's platform data.
+ *
+ * Validates and rounds the sizes, creates one persistent RAM zone per
+ * record, allocates the pstore staging buffer and registers with pstore.
+ * On any failure every zone created so far is released and the context is
+ * reset so that a later probe can start from scratch.
+ *
+ * Returns 0 on success or a negative errno.
+ */
+static int __init ramoops_probe(struct platform_device *pdev)
+{
+	struct device *dev = &pdev->dev;
+	struct ramoops_platform_data *pdata = pdev->dev.platform_data;
+	struct ramoops_context *cxt = &oops_cxt;
+	unsigned int nr_przs = 0;	/* zones successfully created so far */
+	unsigned int i;
+	int err = -EINVAL;
+
+	/* Only a single ramoops area allowed at a time, so fail extra
+	 * probes.
+	 */
+	if (cxt->max_count)
+		goto fail_out;
+
+	if (!pdata) {
+		pr_err("no platform data supplied\n");
+		goto fail_out;
+	}
+
+	if (!pdata->mem_size || !pdata->record_size) {
+		pr_err("The memory size and the record size must be "
+			"non-zero\n");
+		goto fail_out;
+	}
+
+	pdata->mem_size = rounddown_pow_of_two(pdata->mem_size);
+	pdata->record_size = rounddown_pow_of_two(pdata->record_size);
+
+	/* Check for the minimum memory size */
+	/* NOTE(review): this only rejects the setup when BOTH sizes are below
+	 * the minimum; confirm whether "||" was intended before changing it.
+	 */
+	if (pdata->mem_size < MIN_MEM_SIZE &&
+			pdata->record_size < MIN_MEM_SIZE) {
+		pr_err("memory size too small, minimum is %lu\n",
+			MIN_MEM_SIZE);
+		goto fail_out;
+	}
+
+	if (pdata->mem_size < pdata->record_size) {
+		pr_err("The memory size must be larger than the "
+			"records size\n");
+		goto fail_out;
+	}
+
+	cxt->max_count = pdata->mem_size / pdata->record_size;
+	cxt->count = 0;
+	cxt->size = pdata->mem_size;
+	cxt->phys_addr = pdata->mem_address;
+	cxt->record_size = pdata->record_size;
+	cxt->dump_oops = pdata->dump_oops;
+	cxt->ecc = pdata->ecc;
+
+	/* kcalloc() checks the count * size multiplication for overflow. */
+	cxt->przs = kcalloc(cxt->max_count, sizeof(*cxt->przs), GFP_KERNEL);
+	if (!cxt->przs) {
+		err = -ENOMEM;
+		dev_err(dev, "failed to initialize a prz array\n");
+		goto fail_count;
+	}
+
+	for (i = 0; i < cxt->max_count; i++) {
+		size_t sz = cxt->record_size;
+		phys_addr_t start = cxt->phys_addr + sz * i;
+		struct persistent_ram_zone *prz;
+
+		prz = persistent_ram_new(start, sz, cxt->ecc);
+		if (IS_ERR(prz)) {
+			err = PTR_ERR(prz);
+			dev_err(dev, "failed to request mem region (0x%zx@0x%llx): %d\n",
+				sz, (unsigned long long)start, err);
+			goto fail_przs;
+		}
+		/* Only valid zones are ever stored in the array. */
+		cxt->przs[nr_przs++] = prz;
+	}
+
+	cxt->pstore.data = cxt;
+	cxt->pstore.bufsize = cxt->przs[0]->buffer_size;
+	cxt->pstore.buf = kmalloc(cxt->pstore.bufsize, GFP_KERNEL);
+	spin_lock_init(&cxt->pstore.buf_lock);
+	if (!cxt->pstore.buf) {
+		pr_err("cannot allocate pstore buffer\n");
+		err = -ENOMEM;
+		goto fail_clear;
+	}
+
+	err = pstore_register(&cxt->pstore);
+	if (err) {
+		pr_err("registering with pstore failed\n");
+		goto fail_buf;
+	}
+
+	/*
+	 * Update the module parameter variables as well so they are visible
+	 * through /sys/module/ramoops/parameters/
+	 */
+	mem_size = pdata->mem_size;
+	mem_address = pdata->mem_address;
+	record_size = pdata->record_size;
+	dump_oops = pdata->dump_oops;
+	ramoops_ecc = pdata->ecc;
+
+	/* Report the setting actually in use, not the module parameter. */
+	pr_info("attached 0x%lx@0x%llx (%ux0x%zx), ecc: %s\n",
+		cxt->size, (unsigned long long)cxt->phys_addr,
+		cxt->max_count, cxt->record_size,
+		cxt->ecc ? "on" : "off");
+
+	return 0;
+
+fail_buf:
+	kfree(cxt->pstore.buf);
+	cxt->pstore.buf = NULL;
+fail_clear:
+	cxt->pstore.bufsize = 0;
+fail_przs:
+	/*
+	 * Free exactly the zones that were created.  The array has no NULL
+	 * terminator when fully populated, and a failed slot never holds an
+	 * ERR_PTR, so the walk is bounded by the counter.
+	 */
+	for (i = 0; i < nr_przs; i++)
+		persistent_ram_free(cxt->przs[i]);
+	kfree(cxt->przs);
+	cxt->przs = NULL;
+fail_count:
+	/* Allow a later probe attempt: max_count != 0 means "already set up". */
+	cxt->max_count = 0;
+fail_out:
+	return err;
+}
+
+/*
+ * Platform remove callback.  Always returns -EBUSY; the teardown code is
+ * compiled out with #if 0 (see the TODOs below).
+ * NOTE(review): the disabled code references cxt->virt_addr, which is not a
+ * member of struct ramoops_context as defined in this file.
+ */
+static int __exit ramoops_remove(struct platform_device *pdev)
+{
+#if 0
+ /* TODO(kees): We cannot unload ramoops since pstore doesn't support
+ * unregistering yet.
+ */
+ struct ramoops_context *cxt = &oops_cxt;
+
+ iounmap(cxt->virt_addr);
+ release_mem_region(cxt->phys_addr, cxt->size);
+ cxt->max_count = 0;
+
+ /* TODO(kees): When pstore supports unregistering, call it here. */
+ kfree(cxt->pstore.buf);
+ cxt->pstore.bufsize = 0;
+
+ return 0;
+#endif
+ return -EBUSY;
+}
+
+/*
+ * Platform driver descriptor.  No .probe member is set here; the probe
+ * function is passed explicitly to platform_driver_probe() and
+ * platform_create_bundle() in ramoops_init().
+ */
+static struct platform_driver ramoops_driver = {
+ .remove = __exit_p(ramoops_remove),
+ .driver = {
+ .name = "ramoops",
+ .owner = THIS_MODULE,
+ },
+};
+
+/*
+ * Module init: probe an existing "ramoops" platform device, or, when none
+ * exists (-ENODEV), build platform data from the module parameters and
+ * create a device/driver bundle from it.
+ *
+ * Returns 0 on success or a negative errno.
+ */
+static int __init ramoops_init(void)
+{
+	int ret;
+
+	ret = platform_driver_probe(&ramoops_driver, ramoops_probe);
+	if (ret != -ENODEV)
+		return ret;
+
+	/*
+	 * If we didn't find a platform device, we use module parameters
+	 * building platform data on the fly.
+	 */
+	pr_info("platform device not found, using module parameters\n");
+	dummy_data = kzalloc(sizeof(*dummy_data), GFP_KERNEL);
+	if (!dummy_data)
+		return -ENOMEM;
+
+	dummy_data->mem_size = mem_size;
+	dummy_data->mem_address = mem_address;
+	dummy_data->record_size = record_size;
+	dummy_data->dump_oops = dump_oops;
+	dummy_data->ecc = ramoops_ecc;
+
+	dummy = platform_create_bundle(&ramoops_driver, ramoops_probe,
+		NULL, 0, dummy_data,
+		sizeof(*dummy_data));
+	if (IS_ERR(dummy)) {
+		ret = PTR_ERR(dummy);
+		dummy = NULL;
+		/*
+		 * The exit handler never runs after a failed init, so the
+		 * platform data has to be released here or it is leaked.
+		 */
+		kfree(dummy_data);
+		dummy_data = NULL;
+		return ret;
+	}
+
+	return 0;
+}
+
+/*
+ * Module exit: unregister the platform driver and free the platform data
+ * that ramoops_init() may have allocated (dummy_data is NULL otherwise).
+ * The dummy platform device itself is not unregistered here.
+ */
+static void __exit ramoops_exit(void)
+{
+ platform_driver_unregister(&ramoops_driver);
+ kfree(dummy_data);
+}
+
+module_init(ramoops_init);
+module_exit(ramoops_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Marco Stornelli <marco.stornelli@gmail.com>");
+MODULE_DESCRIPTION("RAM Oops/Panic logger/driver");
diff --git a/fs/pstore/ram_core.c b/fs/pstore/ram_core.c
new file mode 100644
index 00000000000..31f8d184f3a
--- /dev/null
+++ b/fs/pstore/ram_core.c
@@ -0,0 +1,532 @@
+/*
+ * Copyright (C) 2012 Google, Inc.
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#include <linux/device.h>
+#include <linux/err.h>
+#include <linux/errno.h>
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/io.h>
+#include <linux/list.h>
+#include <linux/memblock.h>
+#include <linux/rslib.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+#include <linux/pstore_ram.h>
+#include <asm/page.h>
+
+/* Header placed at the start of every mapped zone, followed by the data. */
+struct persistent_ram_buffer {
+ uint32_t sig; /* PERSISTENT_RAM_SIG once the zone has been initialised */
+ atomic_t start; /* offset in data[] where the next write begins */
+ atomic_t size; /* bytes of valid data, capped at the zone's buffer_size */
+ uint8_t data[0]; /* circular data area */
+};
+
+#define PERSISTENT_RAM_SIG (0x43474244) /* DBGC */
+
+static __initdata LIST_HEAD(persistent_ram_list);
+
+/* Number of valid data bytes recorded in the zone header. */
+static inline size_t buffer_size(struct persistent_ram_zone *prz)
+{
+ return atomic_read(&prz->buffer->size);
+}
+
+/* Current write offset recorded in the zone header. */
+static inline size_t buffer_start(struct persistent_ram_zone *prz)
+{
+ return atomic_read(&prz->buffer->start);
+}
+
+/*
+ * Advance the start offset by @a, wrapping at prz->buffer_size, and return
+ * the old value (the offset at which the caller should write).  The update
+ * is retried with atomic_cmpxchg() until it applies to an unchanged value.
+ */
+static inline size_t buffer_start_add(struct persistent_ram_zone *prz, size_t a)
+{
+ int old;
+ int new;
+
+ do {
+ old = atomic_read(&prz->buffer->start);
+ new = old + a;
+ /* wraps only when strictly greater, so start may equal buffer_size */
+ while (unlikely(new > prz->buffer_size))
+ new -= prz->buffer_size;
+ } while (atomic_cmpxchg(&prz->buffer->start, old, new) != old);
+
+ return old;
+}
+
+/*
+ * Increase the valid-size counter by @a, saturating at prz->buffer_size.
+ * Returns immediately when the counter is already at the maximum; otherwise
+ * retries with atomic_cmpxchg() until the update applies.
+ */
+static inline void buffer_size_add(struct persistent_ram_zone *prz, size_t a)
+{
+ size_t old;
+ size_t new;
+
+ if (atomic_read(&prz->buffer->size) == prz->buffer_size)
+ return;
+
+ do {
+ old = atomic_read(&prz->buffer->size);
+ new = old + a;
+ if (new > prz->buffer_size)
+ new = prz->buffer_size;
+ } while (atomic_cmpxchg(&prz->buffer->size, old, new) != old);
+}
+
+/*
+ * Compute parity for @len bytes at @data with encode_rs8() and store the
+ * prz->ecc_size parity symbols into @ecc, one byte each.
+ */
+static void notrace persistent_ram_encode_rs8(struct persistent_ram_zone *prz,
+ uint8_t *data, size_t len, uint8_t *ecc)
+{
+ int i;
+ /* encode_rs8() works on 16-bit parity symbols; narrowed to bytes below */
+ uint16_t par[prz->ecc_size];
+
+ /* Initialize the parity buffer */
+ memset(par, 0, sizeof(par));
+ encode_rs8(prz->rs_decoder, data, len, par, 0);
+ for (i = 0; i < prz->ecc_size; i++)
+ ecc[i] = par[i];
+}
+
+/*
+ * Run decode_rs8() over @len bytes at @data using the byte-wide parity in
+ * @ecc and return its result.  Callers in this file treat a positive return
+ * as a count of corrected errors and a negative one as uncorrectable.
+ */
+static int persistent_ram_decode_rs8(struct persistent_ram_zone *prz,
+ void *data, size_t len, uint8_t *ecc)
+{
+ int i;
+ uint16_t par[prz->ecc_size];
+
+ /* widen the stored parity bytes to the 16-bit symbols decode_rs8() takes */
+ for (i = 0; i < prz->ecc_size; i++)
+ par[i] = ecc[i];
+ return decode_rs8(prz->rs_decoder, data, par, len,
+ NULL, 0, NULL, 0, NULL);
+}
+
+/*
+ * Recompute the parity of every ECC block touched by the data range
+ * [@start, @start + @count).  Does nothing when ECC is disabled.
+ */
+static void notrace persistent_ram_update_ecc(struct persistent_ram_zone *prz,
+ unsigned int start, unsigned int count)
+{
+ struct persistent_ram_buffer *buffer = prz->buffer;
+ uint8_t *buffer_end = buffer->data + prz->buffer_size;
+ uint8_t *block;
+ uint8_t *par;
+ int ecc_block_size = prz->ecc_block_size;
+ int ecc_size = prz->ecc_size;
+ int size = prz->ecc_block_size;
+
+ if (!prz->ecc)
+ return;
+
+ /* round start down to its block; the mask relies on a power-of-two block size */
+ block = buffer->data + (start & ~(ecc_block_size - 1));
+ /* parity slot that belongs to that block */
+ par = prz->par_buffer + (start / ecc_block_size) * prz->ecc_size;
+
+ do {
+ /* the last block of the data area may be shorter than a full block */
+ if (block + ecc_block_size > buffer_end)
+ size = buffer_end - block;
+ persistent_ram_encode_rs8(prz, block, size, par);
+ block += ecc_block_size;
+ par += ecc_size;
+ } while (block < buffer->data + start + count);
+}
+
+/*
+ * Recompute the parity protecting the zone header (sig/start/size) into
+ * prz->par_header.  Does nothing when ECC is disabled.
+ */
+static void persistent_ram_update_header_ecc(struct persistent_ram_zone *prz)
+{
+ struct persistent_ram_buffer *buffer = prz->buffer;
+
+ if (!prz->ecc)
+ return;
+
+ persistent_ram_encode_rs8(prz, (uint8_t *)buffer, sizeof(*buffer),
+ prz->par_header);
+}
+
+/*
+ * Run the ECC decoder over every block of the data currently in the zone
+ * (up to the recorded size), accumulating the results in
+ * prz->corrected_bytes and prz->bad_blocks.  Does nothing when ECC is
+ * disabled.
+ */
+static void persistent_ram_ecc_old(struct persistent_ram_zone *prz)
+{
+ struct persistent_ram_buffer *buffer = prz->buffer;
+ uint8_t *block;
+ uint8_t *par;
+
+ if (!prz->ecc)
+ return;
+
+ block = buffer->data;
+ par = prz->par_buffer;
+ while (block < buffer->data + buffer_size(prz)) {
+ int numerr;
+ int size = prz->ecc_block_size;
+ /* clamp the final block to the end of the data area */
+ if (block + size > buffer->data + prz->buffer_size)
+ size = buffer->data + prz->buffer_size - block;
+ numerr = persistent_ram_decode_rs8(prz, block, size, par);
+ if (numerr > 0) {
+ pr_devel("persistent_ram: error in block %p, %d\n",
+ block, numerr);
+ prz->corrected_bytes += numerr;
+ } else if (numerr < 0) {
+ pr_devel("persistent_ram: uncorrectable error in block %p\n",
+ block);
+ prz->bad_blocks++;
+ }
+ block += prz->ecc_block_size;
+ par += prz->ecc_size;
+ }
+}
+
+/*
+ * Set up ECC for a zone: fix the code parameters, carve the parity area out
+ * of the end of the data area, create the Reed-Solomon codec and run the
+ * decoder once over the existing header.
+ *
+ * @buffer_size is the data-area size before the parity area is carved out.
+ * Returns 0 on success or when ECC is disabled, -EINVAL if the zone is too
+ * small or init_rs() fails.
+ */
+static int persistent_ram_init_ecc(struct persistent_ram_zone *prz,
+ size_t buffer_size)
+{
+ int numerr;
+ struct persistent_ram_buffer *buffer = prz->buffer;
+ int ecc_blocks;
+
+ if (!prz->ecc)
+ return 0;
+
+ prz->ecc_block_size = 128;
+ prz->ecc_size = 16;
+ prz->ecc_symsize = 8;
+ prz->ecc_poly = 0x11d;
+
+ /* one parity slot per data block plus one for the header */
+ ecc_blocks = DIV_ROUND_UP(prz->buffer_size, prz->ecc_block_size);
+ prz->buffer_size -= (ecc_blocks + 1) * prz->ecc_size;
+
+ /* presumably catches the subtraction above wrapping around - TODO confirm */
+ if (prz->buffer_size > buffer_size) {
+ pr_err("persistent_ram: invalid size %zu, non-ecc datasize %zu\n",
+ buffer_size, prz->buffer_size);
+ return -EINVAL;
+ }
+
+ /* layout: data | per-block parity | header parity */
+ prz->par_buffer = buffer->data + prz->buffer_size;
+ prz->par_header = prz->par_buffer + ecc_blocks * prz->ecc_size;
+
+ /*
+ * first consecutive root is 0
+ * primitive element to generate roots = 1
+ */
+ prz->rs_decoder = init_rs(prz->ecc_symsize, prz->ecc_poly, 0, 1,
+ prz->ecc_size);
+ if (prz->rs_decoder == NULL) {
+ pr_info("persistent_ram: init_rs failed\n");
+ return -EINVAL;
+ }
+
+ prz->corrected_bytes = 0;
+ prz->bad_blocks = 0;
+
+ /* check (and possibly correct) the header left in the zone */
+ numerr = persistent_ram_decode_rs8(prz, buffer, sizeof(*buffer),
+ prz->par_header);
+ if (numerr > 0) {
+ pr_info("persistent_ram: error in header, %d\n", numerr);
+ prz->corrected_bytes += numerr;
+ } else if (numerr < 0) {
+ pr_info("persistent_ram: uncorrectable error in header\n");
+ prz->bad_blocks++;
+ }
+
+ return 0;
+}
+
+/*
+ * Format a one-line ECC summary for @prz into @str (at most @len bytes,
+ * snprintf() semantics) and return snprintf()'s result.
+ */
+ssize_t persistent_ram_ecc_string(struct persistent_ram_zone *prz,
+	char *str, size_t len)
+{
+	if (!prz->corrected_bytes && !prz->bad_blocks)
+		return snprintf(str, len, "\nNo errors detected\n");
+
+	return snprintf(str, len,
+		"\n%d Corrected bytes, %d unrecoverable blocks\n",
+		prz->corrected_bytes, prz->bad_blocks);
+}
+
+/*
+ * Copy @count bytes from @s into the data area at offset @start and refresh
+ * the parity of the blocks that were touched.  No wrapping is done here.
+ */
+static void notrace persistent_ram_update(struct persistent_ram_zone *prz,
+ const void *s, unsigned int start, unsigned int count)
+{
+ struct persistent_ram_buffer *buffer = prz->buffer;
+ memcpy(buffer->data + start, s, count);
+ persistent_ram_update_ecc(prz, start, count);
+}
+
+/*
+ * Copy the data currently in the zone into a freshly allocated buffer
+ * (prz->old_log / prz->old_log_size), after running the ECC check over it.
+ * The circular data is linearised: the part from start to size comes
+ * first, followed by the part from 0 to start.  On allocation failure an
+ * error is logged and no old log is recorded.
+ */
+static void __init
+persistent_ram_save_old(struct persistent_ram_zone *prz)
+{
+ struct persistent_ram_buffer *buffer = prz->buffer;
+ size_t size = buffer_size(prz);
+ size_t start = buffer_start(prz);
+ char *dest;
+
+ persistent_ram_ecc_old(prz);
+
+ dest = kmalloc(size, GFP_KERNEL);
+ if (dest == NULL) {
+ pr_err("persistent_ram: failed to allocate buffer\n");
+ return;
+ }
+
+ prz->old_log = dest;
+ prz->old_log_size = size;
+ /* the caller has already checked that start <= size */
+ memcpy(prz->old_log, &buffer->data[start], size - start);
+ memcpy(prz->old_log + size - start, &buffer->data[0], start);
+}
+
+/*
+ * Append @count bytes from @s to the zone's circular buffer.
+ *
+ * If the input is larger than the whole data area only its tail is kept.
+ * The write is split in two when it crosses the end of the data area.
+ * The header parity is refreshed afterwards.  Always returns @count.
+ */
+int notrace persistent_ram_write(struct persistent_ram_zone *prz,
+ const void *s, unsigned int count)
+{
+ int rem;
+ int c = count;
+ size_t start;
+
+ /* keep only the last buffer_size bytes of an oversized write */
+ if (unlikely(c > prz->buffer_size)) {
+ s += c - prz->buffer_size;
+ c = prz->buffer_size;
+ }
+
+ buffer_size_add(prz, c);
+
+ /* reserves the range and returns where it begins */
+ start = buffer_start_add(prz, c);
+
+ /* space left before the end of the data area */
+ rem = prz->buffer_size - start;
+ if (unlikely(rem < c)) {
+ persistent_ram_update(prz, s, start, rem);
+ s += rem;
+ c -= rem;
+ start = 0;
+ }
+ persistent_ram_update(prz, s, start, c);
+
+ persistent_ram_update_header_ecc(prz);
+
+ return count;
+}
+
+/* Size in bytes of the saved old log (0 when none is held). */
+size_t persistent_ram_old_size(struct persistent_ram_zone *prz)
+{
+ return prz->old_log_size;
+}
+
+/* Pointer to the saved old log; still owned by the zone. */
+void *persistent_ram_old(struct persistent_ram_zone *prz)
+{
+ return prz->old_log;
+}
+
+/* Free the saved old log and reset the pointer and size. */
+void persistent_ram_free_old(struct persistent_ram_zone *prz)
+{
+ kfree(prz->old_log);
+ prz->old_log = NULL;
+ prz->old_log_size = 0;
+}
+
+/*
+ * Map the physical range [@start, @start + @size) with vmap() using a
+ * non-cached protection.  The mapping is page aligned: it begins at the page
+ * containing @start, so callers add offset_in_page(start) themselves.
+ * Returns the mapped address, or NULL on failure.
+ */
+static void *persistent_ram_vmap(phys_addr_t start, size_t size)
+{
+ struct page **pages;
+ phys_addr_t page_start;
+ unsigned int page_count;
+ pgprot_t prot;
+ unsigned int i;
+ void *vaddr;
+
+ page_start = start - offset_in_page(start);
+ page_count = DIV_ROUND_UP(size + offset_in_page(start), PAGE_SIZE);
+
+ prot = pgprot_noncached(PAGE_KERNEL);
+
+ /* temporary page array, only needed for the vmap() call */
+ pages = kmalloc(sizeof(struct page *) * page_count, GFP_KERNEL);
+ if (!pages) {
+ pr_err("%s: Failed to allocate array for %u pages\n", __func__,
+ page_count);
+ return NULL;
+ }
+
+ for (i = 0; i < page_count; i++) {
+ phys_addr_t addr = page_start + i * PAGE_SIZE;
+ pages[i] = pfn_to_page(addr >> PAGE_SHIFT);
+ }
+ vaddr = vmap(pages, page_count, VM_MAP, prot);
+ kfree(pages);
+
+ return vaddr;
+}
+
+/*
+ * Reserve the physical range [@start, @start + @size) as a memory region
+ * and map it with ioremap().
+ *
+ * Returns the mapped address, or NULL if the region cannot be reserved or
+ * mapped.  On failure no region reservation is left behind.
+ */
+static void *persistent_ram_iomap(phys_addr_t start, size_t size)
+{
+	void *va;
+
+	if (!request_mem_region(start, size, "persistent_ram")) {
+		pr_err("request mem region (0x%llx@0x%llx) failed\n",
+			(unsigned long long)size, (unsigned long long)start);
+		return NULL;
+	}
+
+	va = ioremap(start, size);
+	/* Give the region back if it could not be mapped. */
+	if (!va)
+		release_mem_region(start, size);
+
+	return va;
+}
+
+/*
+ * Map the physical range described by @start/@size for @prz and set up
+ * prz->buffer and prz->buffer_size.  Ranges with a valid pfn are mapped
+ * with vmap(), everything else with request_mem_region() + ioremap().
+ *
+ * Returns 0 on success, -ENOMEM if the range cannot be mapped.
+ */
+static int persistent_ram_buffer_map(phys_addr_t start, phys_addr_t size,
+	struct persistent_ram_zone *prz)
+{
+	void *va;
+
+	prz->paddr = start;
+	prz->size = size;
+
+	va = pfn_valid(start >> PAGE_SHIFT) ?
+		persistent_ram_vmap(start, size) :
+		persistent_ram_iomap(start, size);
+	prz->vaddr = va;
+
+	if (!va) {
+		pr_err("%s: Failed to map 0x%llx pages at 0x%llx\n", __func__,
+			(unsigned long long)size, (unsigned long long)start);
+		return -ENOMEM;
+	}
+
+	/* The header sits at the start of the range, the data area follows. */
+	prz->buffer = va + offset_in_page(start);
+	prz->buffer_size = size - sizeof(struct persistent_ram_buffer);
+
+	return 0;
+}
+
+/*
+ * Finish setting up a mapped zone: initialise ECC, save any valid data
+ * left in the zone as the old log, then reset the header so the zone
+ * starts out empty.
+ *
+ * Returns 0 on success or the error from persistent_ram_init_ecc().
+ */
+static int __init persistent_ram_post_init(struct persistent_ram_zone *prz, bool ecc)
+{
+	int ret;
+
+	prz->ecc = ecc;
+
+	ret = persistent_ram_init_ecc(prz, prz->buffer_size);
+	if (ret)
+		return ret;
+
+	if (prz->buffer->sig != PERSISTENT_RAM_SIG) {
+		pr_info("persistent_ram: no valid data in buffer"
+			" (sig = 0x%08x)\n", prz->buffer->sig);
+	} else if (buffer_size(prz) > prz->buffer_size ||
+		   buffer_start(prz) > buffer_size(prz)) {
+		/* Signature matches but the counters make no sense. */
+		pr_info("persistent_ram: found existing invalid buffer,"
+			" size %zu, start %zu\n",
+			buffer_size(prz), buffer_start(prz));
+	} else {
+		pr_info("persistent_ram: found existing buffer,"
+			" size %zu, start %zu\n",
+			buffer_size(prz), buffer_start(prz));
+		persistent_ram_save_old(prz);
+	}
+
+	/* Start this session with an empty, correctly signed zone. */
+	prz->buffer->sig = PERSISTENT_RAM_SIG;
+	atomic_set(&prz->buffer->start, 0);
+	atomic_set(&prz->buffer->size, 0);
+
+	return 0;
+}
+
+/*
+ * Tear down a zone: undo the mapping (using the same pfn_valid() test that
+ * chose the mapping method in persistent_ram_buffer_map()), free the saved
+ * old log and free the zone structure itself.
+ */
+void persistent_ram_free(struct persistent_ram_zone *prz)
+{
+ if (pfn_valid(prz->paddr >> PAGE_SHIFT)) {
+ vunmap(prz->vaddr);
+ } else {
+ iounmap(prz->vaddr);
+ release_mem_region(prz->paddr, prz->size);
+ }
+ persistent_ram_free_old(prz);
+ kfree(prz);
+}
+
+/*
+ * Allocate a zone, map the physical range [@start, @start + @size) for it
+ * and initialise it (optionally with ECC).
+ *
+ * Returns the new zone, or an ERR_PTR() on failure.  A zone whose
+ * initialisation fails is unmapped and freed rather than handed back in a
+ * half-initialised state.
+ */
+struct persistent_ram_zone * __init persistent_ram_new(phys_addr_t start,
+	size_t size,
+	bool ecc)
+{
+	struct persistent_ram_zone *prz;
+	int ret = -ENOMEM;
+
+	prz = kzalloc(sizeof(struct persistent_ram_zone), GFP_KERNEL);
+	if (!prz) {
+		pr_err("persistent_ram: failed to allocate persistent ram zone\n");
+		goto err;
+	}
+
+	ret = persistent_ram_buffer_map(start, size, prz);
+	if (ret)
+		goto err;
+
+	/*
+	 * ECC setup can fail; the header parity must not be computed on a
+	 * zone whose ECC initialisation did not complete.
+	 */
+	ret = persistent_ram_post_init(prz, ecc);
+	if (ret)
+		goto err_unmap;
+
+	persistent_ram_update_header_ecc(prz);
+
+	return prz;
+
+err_unmap:
+	/* Unmaps the range and frees prz. */
+	persistent_ram_free(prz);
+	return ERR_PTR(ret);
+err:
+	kfree(prz);
+	return ERR_PTR(ret);
+}
+
+#ifndef MODULE
+/*
+ * Find the descriptor called @name among the areas registered with
+ * persistent_ram_early_init() and map it for @prz.  Descriptors are laid
+ * out back to back from the start of their area, so the physical address is
+ * found by summing the sizes of the preceding descriptors.
+ *
+ * Returns the result of persistent_ram_buffer_map(), or -EINVAL when no
+ * descriptor matches.
+ */
+static int __init persistent_ram_buffer_init(const char *name,
+ struct persistent_ram_zone *prz)
+{
+ int i;
+ struct persistent_ram *ram;
+ struct persistent_ram_descriptor *desc;
+ phys_addr_t start;
+
+ list_for_each_entry(ram, &persistent_ram_list, node) {
+ start = ram->start;
+ for (i = 0; i < ram->num_descs; i++) {
+ desc = &ram->descs[i];
+ if (!strcmp(desc->name, name))
+ return persistent_ram_buffer_map(start,
+ desc->size, prz);
+ start += desc->size;
+ }
+ }
+
+ return -EINVAL;
+}
+
+/*
+ * Allocate a zone for @dev, map the early-reserved descriptor named after
+ * the device and initialise it (optionally with ECC).
+ *
+ * Returns the new zone, or an ERR_PTR() on failure.  A zone whose
+ * initialisation fails is unmapped and freed rather than handed back in a
+ * half-initialised state.
+ */
+static __init
+struct persistent_ram_zone *__persistent_ram_init(struct device *dev, bool ecc)
+{
+	struct persistent_ram_zone *prz;
+	int ret = -ENOMEM;
+
+	prz = kzalloc(sizeof(struct persistent_ram_zone), GFP_KERNEL);
+	if (!prz) {
+		pr_err("persistent_ram: failed to allocate persistent ram zone\n");
+		goto err;
+	}
+
+	ret = persistent_ram_buffer_init(dev_name(dev), prz);
+	if (ret) {
+		pr_err("persistent_ram: failed to initialize buffer\n");
+		goto err;
+	}
+
+	/* ECC setup can fail; do not return a zone in that state. */
+	ret = persistent_ram_post_init(prz, ecc);
+	if (ret)
+		goto err_unmap;
+
+	return prz;
+
+err_unmap:
+	/* Unmaps the range and frees prz. */
+	persistent_ram_free(prz);
+	return ERR_PTR(ret);
+err:
+	kfree(prz);
+	return ERR_PTR(ret);
+}
+
+/* Public wrapper around __persistent_ram_init(); returns a zone or ERR_PTR(). */
+struct persistent_ram_zone * __init
+persistent_ram_init_ringbuffer(struct device *dev, bool ecc)
+{
+ return __persistent_ram_init(dev, ecc);
+}
+
+/*
+ * Reserve the physical range described by @ram with memblock and add it to
+ * the list searched by persistent_ram_buffer_init().
+ * Returns 0 on success or the error from memblock_reserve().
+ */
+int __init persistent_ram_early_init(struct persistent_ram *ram)
+{
+ int ret;
+
+ ret = memblock_reserve(ram->start, ram->size);
+ if (ret) {
+ pr_err("Failed to reserve persistent memory from %08lx-%08lx\n",
+ (long)ram->start, (long)(ram->start + ram->size - 1));
+ return ret;
+ }
+
+ list_add_tail(&ram->node, &persistent_ram_list);
+
+ pr_info("Initialized persistent memory from %08lx-%08lx\n",
+ (long)ram->start, (long)(ram->start + ram->size - 1));
+
+ return 0;
+}
+#endif
diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
index d69a1d1d7e1..10cbe841cb7 100644
--- a/fs/quota/dquot.c
+++ b/fs/quota/dquot.c
@@ -116,15 +116,15 @@
* spinlock to internal buffers before writing.
*
* Lock ordering (including related VFS locks) is the following:
- * i_mutex > dqonoff_sem > journal_lock > dqptr_sem > dquot->dq_lock >
+ * dqonoff_mutex > i_mutex > journal_lock > dqptr_sem > dquot->dq_lock >
* dqio_mutex
+ * dqonoff_mutex > i_mutex comes from dquot_quota_sync, dquot_enable, etc.
* The lock ordering of dqptr_sem imposed by quota code is only dqonoff_sem >
* dqptr_sem. But filesystem has to count with the fact that functions such as
* dquot_alloc_space() acquire dqptr_sem and they usually have to be called
* from inside a transaction to keep filesystem consistency after a crash. Also
* filesystems usually want to do some IO on dquot from ->mark_dirty which is
* called with dqptr_sem held.
- * i_mutex on quota files is special (it's below dqio_mutex)
*/
static __cacheline_aligned_in_smp DEFINE_SPINLOCK(dq_list_lock);
@@ -638,7 +638,7 @@ int dquot_quota_sync(struct super_block *sb, int type, int wait)
dqstats_inc(DQST_SYNCS);
mutex_unlock(&dqopt->dqonoff_mutex);
- if (!wait || (sb_dqopt(sb)->flags & DQUOT_QUOTA_SYS_FILE))
+ if (!wait || (dqopt->flags & DQUOT_QUOTA_SYS_FILE))
return 0;
/* This is not very clever (and fast) but currently I don't know about
@@ -652,18 +652,17 @@ int dquot_quota_sync(struct super_block *sb, int type, int wait)
* Now when everything is written we can discard the pagecache so
* that userspace sees the changes.
*/
- mutex_lock(&sb_dqopt(sb)->dqonoff_mutex);
+ mutex_lock(&dqopt->dqonoff_mutex);
for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
if (type != -1 && cnt != type)
continue;
if (!sb_has_quota_active(sb, cnt))
continue;
- mutex_lock_nested(&sb_dqopt(sb)->files[cnt]->i_mutex,
- I_MUTEX_QUOTA);
- truncate_inode_pages(&sb_dqopt(sb)->files[cnt]->i_data, 0);
- mutex_unlock(&sb_dqopt(sb)->files[cnt]->i_mutex);
+ mutex_lock(&dqopt->files[cnt]->i_mutex);
+ truncate_inode_pages(&dqopt->files[cnt]->i_data, 0);
+ mutex_unlock(&dqopt->files[cnt]->i_mutex);
}
- mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex);
+ mutex_unlock(&dqopt->dqonoff_mutex);
return 0;
}
@@ -907,14 +906,14 @@ static void add_dquot_ref(struct super_block *sb, int type)
spin_unlock(&inode->i_lock);
continue;
}
-#ifdef CONFIG_QUOTA_DEBUG
- if (unlikely(inode_get_rsv_space(inode) > 0))
- reserved = 1;
-#endif
__iget(inode);
spin_unlock(&inode->i_lock);
spin_unlock(&inode_sb_list_lock);
+#ifdef CONFIG_QUOTA_DEBUG
+ if (unlikely(inode_get_rsv_space(inode) > 0))
+ reserved = 1;
+#endif
iput(old_inode);
__dquot_initialize(inode, type);
@@ -2037,8 +2036,7 @@ int dquot_disable(struct super_block *sb, int type, unsigned int flags)
/* If quota was reenabled in the meantime, we have
* nothing to do */
if (!sb_has_quota_loaded(sb, cnt)) {
- mutex_lock_nested(&toputinode[cnt]->i_mutex,
- I_MUTEX_QUOTA);
+ mutex_lock(&toputinode[cnt]->i_mutex);
toputinode[cnt]->i_flags &= ~(S_IMMUTABLE |
S_NOATIME | S_NOQUOTA);
truncate_inode_pages(&toputinode[cnt]->i_data,
@@ -2133,7 +2131,7 @@ static int vfs_load_quota_inode(struct inode *inode, int type, int format_id,
/* We don't want quota and atime on quota files (deadlocks
* possible) Also nobody should write to the file - we use
* special IO operations which ignore the immutable bit. */
- mutex_lock_nested(&inode->i_mutex, I_MUTEX_QUOTA);
+ mutex_lock(&inode->i_mutex);
oldflags = inode->i_flags & (S_NOATIME | S_IMMUTABLE |
S_NOQUOTA);
inode->i_flags |= S_NOQUOTA | S_NOATIME | S_IMMUTABLE;
@@ -2180,7 +2178,7 @@ out_file_init:
iput(inode);
out_lock:
if (oldflags != -1) {
- mutex_lock_nested(&inode->i_mutex, I_MUTEX_QUOTA);
+ mutex_lock(&inode->i_mutex);
/* Set the flags back (in the case of accidental quotaon()
* on a wrong file we don't want to mess up the flags) */
inode->i_flags &= ~(S_NOATIME | S_NOQUOTA | S_IMMUTABLE);
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
index 8b7616ef06d..c07b7d70944 100644
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -2270,7 +2270,6 @@ static ssize_t reiserfs_quota_write(struct super_block *sb, int type,
(unsigned long long)off, (unsigned long long)len);
return -EIO;
}
- mutex_lock_nested(&inode->i_mutex, I_MUTEX_QUOTA);
while (towrite > 0) {
tocopy = sb->s_blocksize - offset < towrite ?
sb->s_blocksize - offset : towrite;
@@ -2302,16 +2301,13 @@ static ssize_t reiserfs_quota_write(struct super_block *sb, int type,
blk++;
}
out:
- if (len == towrite) {
- mutex_unlock(&inode->i_mutex);
+ if (len == towrite)
return err;
- }
if (inode->i_size < off + len - towrite)
i_size_write(inode, off + len - towrite);
inode->i_version++;
inode->i_mtime = inode->i_ctime = CURRENT_TIME;
mark_inode_dirty(inode);
- mutex_unlock(&inode->i_mutex);
return len - towrite;
}
diff --git a/fs/stat.c b/fs/stat.c
index c733dc5753a..b6ff11825fc 100644
--- a/fs/stat.c
+++ b/fs/stat.c
@@ -57,12 +57,13 @@ EXPORT_SYMBOL(vfs_getattr);
int vfs_fstat(unsigned int fd, struct kstat *stat)
{
- struct file *f = fget(fd);
+ int fput_needed;
+ struct file *f = fget_light(fd, &fput_needed);
int error = -EBADF;
if (f) {
error = vfs_getattr(f->f_path.mnt, f->f_path.dentry, stat);
- fput(f);
+ fput_light(f, fput_needed);
}
return error;
}
@@ -137,8 +138,8 @@ static int cp_old_stat(struct kstat *stat, struct __old_kernel_stat __user * sta
tmp.st_nlink = stat->nlink;
if (tmp.st_nlink != stat->nlink)
return -EOVERFLOW;
- SET_UID(tmp.st_uid, stat->uid);
- SET_GID(tmp.st_gid, stat->gid);
+ SET_UID(tmp.st_uid, from_kuid_munged(current_user_ns(), stat->uid));
+ SET_GID(tmp.st_gid, from_kgid_munged(current_user_ns(), stat->gid));
tmp.st_rdev = old_encode_dev(stat->rdev);
#if BITS_PER_LONG == 32
if (stat->size > MAX_NON_LFS)
@@ -190,24 +191,32 @@ SYSCALL_DEFINE2(fstat, unsigned int, fd, struct __old_kernel_stat __user *, stat
#endif /* __ARCH_WANT_OLD_STAT */
+/*
+ * choose_32_64(a,b) expands to its first argument when BITS_PER_LONG is 32
+ * and to its second argument otherwise.
+ */
+#if BITS_PER_LONG == 32
+# define choose_32_64(a,b) a
+#else
+# define choose_32_64(a,b) b
+#endif
+
+/*
+ * Device-number helpers: the old_* variants are used when BITS_PER_LONG is
+ * 32, the new_* variants otherwise.
+ */
+#define valid_dev(x) choose_32_64(old_valid_dev,new_valid_dev)(x)
+#define encode_dev(x) choose_32_64(old_encode_dev,new_encode_dev)(x)
+
+/*
+ * Default initialiser for the on-stack struct stat: zero the whole object.
+ * Only defined if nothing earlier has defined it (presumably an
+ * architecture header may provide its own version -- TODO confirm).
+ */
+#ifndef INIT_STRUCT_STAT_PADDING
+# define INIT_STRUCT_STAT_PADDING(st) memset(&st, 0, sizeof(st))
+#endif
+
+
static int cp_new_stat(struct kstat *stat, struct stat __user *statbuf)
{
struct stat tmp;
-#if BITS_PER_LONG == 32
- if (!old_valid_dev(stat->dev) || !old_valid_dev(stat->rdev))
+ if (!valid_dev(stat->dev) || !valid_dev(stat->rdev))
return -EOVERFLOW;
-#else
- if (!new_valid_dev(stat->dev) || !new_valid_dev(stat->rdev))
+#if BITS_PER_LONG == 32
+ if (stat->size > MAX_NON_LFS)
return -EOVERFLOW;
#endif
- memset(&tmp, 0, sizeof(tmp));
-#if BITS_PER_LONG == 32
- tmp.st_dev = old_encode_dev(stat->dev);
-#else
- tmp.st_dev = new_encode_dev(stat->dev);
-#endif
+ INIT_STRUCT_STAT_PADDING(tmp);
+ tmp.st_dev = encode_dev(stat->dev);
tmp.st_ino = stat->ino;
if (sizeof(tmp.st_ino) < sizeof(stat->ino) && tmp.st_ino != stat->ino)
return -EOVERFLOW;
@@ -215,17 +224,9 @@ static int cp_new_stat(struct kstat *stat, struct stat __user *statbuf)
tmp.st_nlink = stat->nlink;
if (tmp.st_nlink != stat->nlink)
return -EOVERFLOW;
- SET_UID(tmp.st_uid, stat->uid);
- SET_GID(tmp.st_gid, stat->gid);
-#if BITS_PER_LONG == 32
- tmp.st_rdev = old_encode_dev(stat->rdev);
-#else
- tmp.st_rdev = new_encode_dev(stat->rdev);
-#endif
-#if BITS_PER_LONG == 32
- if (stat->size > MAX_NON_LFS)
- return -EOVERFLOW;
-#endif
+ SET_UID(tmp.st_uid, from_kuid_munged(current_user_ns(), stat->uid));
+ SET_GID(tmp.st_gid, from_kgid_munged(current_user_ns(), stat->gid));
+ tmp.st_rdev = encode_dev(stat->rdev);
tmp.st_size = stat->size;
tmp.st_atime = stat->atime.tv_sec;
tmp.st_mtime = stat->mtime.tv_sec;
@@ -327,11 +328,15 @@ SYSCALL_DEFINE3(readlink, const char __user *, path, char __user *, buf,
/* ---------- LFS-64 ----------- */
#ifdef __ARCH_WANT_STAT64
+/*
+ * Default initialiser for the on-stack struct stat64: zero the whole object.
+ * Only defined if nothing earlier has defined it (presumably an
+ * architecture header may provide its own version -- TODO confirm).
+ */
+#ifndef INIT_STRUCT_STAT64_PADDING
+# define INIT_STRUCT_STAT64_PADDING(st) memset(&st, 0, sizeof(st))
+#endif
+
static long cp_new_stat64(struct kstat *stat, struct stat64 __user *statbuf)
{
struct stat64 tmp;
- memset(&tmp, 0, sizeof(struct stat64));
+ INIT_STRUCT_STAT64_PADDING(tmp);
#ifdef CONFIG_MIPS
/* mips has weird padding, so we don't get 64 bits there */
if (!new_valid_dev(stat->dev) || !new_valid_dev(stat->rdev))
@@ -350,8 +355,8 @@ static long cp_new_stat64(struct kstat *stat, struct stat64 __user *statbuf)
#endif
tmp.st_mode = stat->mode;
tmp.st_nlink = stat->nlink;
- tmp.st_uid = stat->uid;
- tmp.st_gid = stat->gid;
+ tmp.st_uid = from_kuid_munged(current_user_ns(), stat->uid);
+ tmp.st_gid = from_kgid_munged(current_user_ns(), stat->gid);
tmp.st_atime = stat->atime.tv_sec;
tmp.st_atime_nsec = stat->atime.tv_nsec;
tmp.st_mtime = stat->mtime.tv_sec;
diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c
index 35a36d39fa2..e6bb9b2a4cb 100644
--- a/fs/sysfs/dir.c
+++ b/fs/sysfs/dir.c
@@ -132,6 +132,24 @@ static void sysfs_unlink_sibling(struct sysfs_dirent *sd)
rb_erase(&sd->s_rb, &sd->s_parent->s_dir.children);
}
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+
+/*
+ * Tell whether lockdep read-lock tracking should be skipped for @sd.
+ * Only kobject attribute entries can opt out, and they do so through the
+ * ignore_lockdep flag of their attribute.
+ */
+static bool ignore_lockdep(struct sysfs_dirent *sd)
+{
+	if (sysfs_type(sd) != SYSFS_KOBJ_ATTR)
+		return false;
+
+	return sd->s_attr.attr->ignore_lockdep;
+}
+
+#else
+
+/* Without CONFIG_DEBUG_LOCK_ALLOC every entry is reported as ignored. */
+static inline bool ignore_lockdep(struct sysfs_dirent *sd)
+{
+	return true;
+}
+
+#endif
+
+
/**
* sysfs_get_active - get an active reference to sysfs_dirent
* @sd: sysfs_dirent to get an active reference to
@@ -155,15 +173,17 @@ struct sysfs_dirent *sysfs_get_active(struct sysfs_dirent *sd)
return NULL;
t = atomic_cmpxchg(&sd->s_active, v, v + 1);
- if (likely(t == v)) {
- rwsem_acquire_read(&sd->dep_map, 0, 1, _RET_IP_);
- return sd;
- }
+ if (likely(t == v))
+ break;
if (t < 0)
return NULL;
cpu_relax();
}
+
+ if (likely(!ignore_lockdep(sd)))
+ rwsem_acquire_read(&sd->dep_map, 0, 1, _RET_IP_);
+ return sd;
}
/**
@@ -180,7 +200,8 @@ void sysfs_put_active(struct sysfs_dirent *sd)
if (unlikely(!sd))
return;
- rwsem_release(&sd->dep_map, 1, _RET_IP_);
+ if (likely(!ignore_lockdep(sd)))
+ rwsem_release(&sd->dep_map, 1, _RET_IP_);
v = atomic_dec_return(&sd->s_active);
if (likely(v != SD_DEACTIVATED_BIAS))
return;
@@ -858,7 +879,6 @@ int sysfs_rename(struct sysfs_dirent *sd,
struct sysfs_dirent *new_parent_sd, const void *new_ns,
const char *new_name)
{
- const char *dup_name = NULL;
int error;
mutex_lock(&sysfs_mutex);
@@ -875,11 +895,11 @@ int sysfs_rename(struct sysfs_dirent *sd,
/* rename sysfs_dirent */
if (strcmp(sd->s_name, new_name) != 0) {
error = -ENOMEM;
- new_name = dup_name = kstrdup(new_name, GFP_KERNEL);
+ new_name = kstrdup(new_name, GFP_KERNEL);
if (!new_name)
goto out;
- dup_name = sd->s_name;
+ kfree(sd->s_name);
sd->s_name = new_name;
}
@@ -895,7 +915,6 @@ int sysfs_rename(struct sysfs_dirent *sd,
error = 0;
out:
mutex_unlock(&sysfs_mutex);
- kfree(dup_name);
return error;
}
diff --git a/fs/sysfs/inode.c b/fs/sysfs/inode.c
index feb2d69396c..907c2b3af75 100644
--- a/fs/sysfs/inode.c
+++ b/fs/sysfs/inode.c
@@ -62,8 +62,8 @@ static struct sysfs_inode_attrs *sysfs_init_inode_attrs(struct sysfs_dirent *sd)
/* assign default attributes */
iattrs->ia_mode = sd->s_mode;
- iattrs->ia_uid = 0;
- iattrs->ia_gid = 0;
+ iattrs->ia_uid = GLOBAL_ROOT_UID;
+ iattrs->ia_gid = GLOBAL_ROOT_GID;
iattrs->ia_atime = iattrs->ia_mtime = iattrs->ia_ctime = CURRENT_TIME;
return attrs;
diff --git a/fs/ubifs/Kconfig b/fs/ubifs/Kconfig
index f8b0160da2d..ba66d508006 100644
--- a/fs/ubifs/Kconfig
+++ b/fs/ubifs/Kconfig
@@ -11,12 +11,6 @@ config UBIFS_FS
help
UBIFS is a file system for flash devices which works on top of UBI.
-config UBIFS_FS_XATTR
- bool "Extended attributes support"
- depends on UBIFS_FS
- help
- This option enables support of extended attributes.
-
config UBIFS_FS_ADVANCED_COMPR
bool "Advanced compression options"
depends on UBIFS_FS
@@ -41,20 +35,3 @@ config UBIFS_FS_ZLIB
default y
help
Zlib compresses better than LZO but it is slower. Say 'Y' if unsure.
-
-# Debugging-related stuff
-config UBIFS_FS_DEBUG
- bool "Enable debugging support"
- depends on UBIFS_FS
- select DEBUG_FS
- select KALLSYMS
- help
- This option enables UBIFS debugging support. It makes sure various
- assertions, self-checks, debugging messages and test modes are compiled
- in (this all is compiled out otherwise). Assertions are light-weight
- and this option also enables them. Self-checks, debugging messages and
- test modes are switched off by default. Thus, it is safe and actually
- recommended to have debugging support enabled, and it should not slow
- down UBIFS. You can then further enable / disable individual debugging
- features using UBIFS module parameters and the corresponding sysfs
- interfaces.
diff --git a/fs/ubifs/Makefile b/fs/ubifs/Makefile
index 80e93c35e49..2c6f0cb816b 100644
--- a/fs/ubifs/Makefile
+++ b/fs/ubifs/Makefile
@@ -3,7 +3,4 @@ obj-$(CONFIG_UBIFS_FS) += ubifs.o
ubifs-y += shrinker.o journal.o file.o dir.o super.o sb.o io.o
ubifs-y += tnc.o master.o scan.o replay.o log.o commit.o gc.o orphan.o
ubifs-y += budget.o find.o tnc_commit.o compress.o lpt.o lprops.o
-ubifs-y += recovery.o ioctl.o lpt_commit.o tnc_misc.o
-
-ubifs-$(CONFIG_UBIFS_FS_DEBUG) += debug.o
-ubifs-$(CONFIG_UBIFS_FS_XATTR) += xattr.o
+ubifs-y += recovery.o ioctl.o lpt_commit.o tnc_misc.o xattr.o debug.o
diff --git a/fs/ubifs/commit.c b/fs/ubifs/commit.c
index fb3b5c813a3..8eda717cb99 100644
--- a/fs/ubifs/commit.c
+++ b/fs/ubifs/commit.c
@@ -496,7 +496,9 @@ int ubifs_gc_should_commit(struct ubifs_info *c)
return ret;
}
-#ifdef CONFIG_UBIFS_FS_DEBUG
+/*
+ * Everything below is related to debugging.
+ */
/**
* struct idx_node - hold index nodes during index tree traversal.
@@ -714,14 +716,14 @@ out:
return 0;
out_dump:
- dbg_err("dumping index node (iip=%d)", i->iip);
- dbg_dump_node(c, idx);
+ ubifs_err("dumping index node (iip=%d)", i->iip);
+ ubifs_dump_node(c, idx);
list_del(&i->list);
kfree(i);
if (!list_empty(&list)) {
i = list_entry(list.prev, struct idx_node, list);
- dbg_err("dumping parent index node");
- dbg_dump_node(c, &i->idx);
+ ubifs_err("dumping parent index node");
+ ubifs_dump_node(c, &i->idx);
}
out_free:
while (!list_empty(&list)) {
@@ -734,5 +736,3 @@ out_free:
err = -EINVAL;
return err;
}
-
-#endif /* CONFIG_UBIFS_FS_DEBUG */
diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c
index 1934084e208..685a83756b2 100644
--- a/fs/ubifs/debug.c
+++ b/fs/ubifs/debug.c
@@ -34,8 +34,6 @@
#include <linux/random.h>
#include "ubifs.h"
-#ifdef CONFIG_UBIFS_FS_DEBUG
-
static DEFINE_SPINLOCK(dbg_lock);
static const char *get_key_fmt(int fmt)
@@ -232,7 +230,7 @@ static void dump_ch(const struct ubifs_ch *ch)
printk(KERN_ERR "\tlen %u\n", le32_to_cpu(ch->len));
}
-void dbg_dump_inode(struct ubifs_info *c, const struct inode *inode)
+void ubifs_dump_inode(struct ubifs_info *c, const struct inode *inode)
{
const struct ubifs_inode *ui = ubifs_inode(inode);
struct qstr nm = { .name = NULL };
@@ -300,7 +298,7 @@ void dbg_dump_inode(struct ubifs_info *c, const struct inode *inode)
kfree(pdent);
}
-void dbg_dump_node(const struct ubifs_info *c, const void *node)
+void ubifs_dump_node(const struct ubifs_info *c, const void *node)
{
int i, n;
union ubifs_key key;
@@ -603,7 +601,7 @@ void dbg_dump_node(const struct ubifs_info *c, const void *node)
spin_unlock(&dbg_lock);
}
-void dbg_dump_budget_req(const struct ubifs_budget_req *req)
+void ubifs_dump_budget_req(const struct ubifs_budget_req *req)
{
spin_lock(&dbg_lock);
printk(KERN_ERR "Budgeting request: new_ino %d, dirtied_ino %d\n",
@@ -620,7 +618,7 @@ void dbg_dump_budget_req(const struct ubifs_budget_req *req)
spin_unlock(&dbg_lock);
}
-void dbg_dump_lstats(const struct ubifs_lp_stats *lst)
+void ubifs_dump_lstats(const struct ubifs_lp_stats *lst)
{
spin_lock(&dbg_lock);
printk(KERN_ERR "(pid %d) Lprops statistics: empty_lebs %d, "
@@ -634,7 +632,7 @@ void dbg_dump_lstats(const struct ubifs_lp_stats *lst)
spin_unlock(&dbg_lock);
}
-void dbg_dump_budg(struct ubifs_info *c, const struct ubifs_budg_info *bi)
+void ubifs_dump_budg(struct ubifs_info *c, const struct ubifs_budg_info *bi)
{
int i;
struct rb_node *rb;
@@ -707,7 +705,7 @@ out_unlock:
spin_unlock(&c->space_lock);
}
-void dbg_dump_lprop(const struct ubifs_info *c, const struct ubifs_lprops *lp)
+void ubifs_dump_lprop(const struct ubifs_info *c, const struct ubifs_lprops *lp)
{
int i, spc, dark = 0, dead = 0;
struct rb_node *rb;
@@ -801,7 +799,7 @@ void dbg_dump_lprop(const struct ubifs_info *c, const struct ubifs_lprops *lp)
printk(KERN_CONT ")\n");
}
-void dbg_dump_lprops(struct ubifs_info *c)
+void ubifs_dump_lprops(struct ubifs_info *c)
{
int lnum, err;
struct ubifs_lprops lp;
@@ -810,20 +808,20 @@ void dbg_dump_lprops(struct ubifs_info *c)
printk(KERN_ERR "(pid %d) start dumping LEB properties\n",
current->pid);
ubifs_get_lp_stats(c, &lst);
- dbg_dump_lstats(&lst);
+ ubifs_dump_lstats(&lst);
for (lnum = c->main_first; lnum < c->leb_cnt; lnum++) {
err = ubifs_read_one_lp(c, lnum, &lp);
if (err)
ubifs_err("cannot read lprops for LEB %d", lnum);
- dbg_dump_lprop(c, &lp);
+ ubifs_dump_lprop(c, &lp);
}
printk(KERN_ERR "(pid %d) finish dumping LEB properties\n",
current->pid);
}
-void dbg_dump_lpt_info(struct ubifs_info *c)
+void ubifs_dump_lpt_info(struct ubifs_info *c)
{
int i;
@@ -862,8 +860,8 @@ void dbg_dump_lpt_info(struct ubifs_info *c)
spin_unlock(&dbg_lock);
}
-void dbg_dump_sleb(const struct ubifs_info *c,
- const struct ubifs_scan_leb *sleb, int offs)
+void ubifs_dump_sleb(const struct ubifs_info *c,
+ const struct ubifs_scan_leb *sleb, int offs)
{
struct ubifs_scan_node *snod;
@@ -874,11 +872,11 @@ void dbg_dump_sleb(const struct ubifs_info *c,
cond_resched();
printk(KERN_ERR "Dumping node at LEB %d:%d len %d\n", sleb->lnum,
snod->offs, snod->len);
- dbg_dump_node(c, snod->node);
+ ubifs_dump_node(c, snod->node);
}
}
-void dbg_dump_leb(const struct ubifs_info *c, int lnum)
+void ubifs_dump_leb(const struct ubifs_info *c, int lnum)
{
struct ubifs_scan_leb *sleb;
struct ubifs_scan_node *snod;
@@ -909,7 +907,7 @@ void dbg_dump_leb(const struct ubifs_info *c, int lnum)
cond_resched();
printk(KERN_ERR "Dumping node at LEB %d:%d len %d\n", lnum,
snod->offs, snod->len);
- dbg_dump_node(c, snod->node);
+ ubifs_dump_node(c, snod->node);
}
printk(KERN_ERR "(pid %d) finish dumping LEB %d\n",
@@ -921,8 +919,8 @@ out:
return;
}
-void dbg_dump_znode(const struct ubifs_info *c,
- const struct ubifs_znode *znode)
+void ubifs_dump_znode(const struct ubifs_info *c,
+ const struct ubifs_znode *znode)
{
int n;
const struct ubifs_zbranch *zbr;
@@ -965,7 +963,7 @@ void dbg_dump_znode(const struct ubifs_info *c,
spin_unlock(&dbg_lock);
}
-void dbg_dump_heap(struct ubifs_info *c, struct ubifs_lpt_heap *heap, int cat)
+void ubifs_dump_heap(struct ubifs_info *c, struct ubifs_lpt_heap *heap, int cat)
{
int i;
@@ -981,8 +979,8 @@ void dbg_dump_heap(struct ubifs_info *c, struct ubifs_lpt_heap *heap, int cat)
printk(KERN_ERR "(pid %d) finish dumping heap\n", current->pid);
}
-void dbg_dump_pnode(struct ubifs_info *c, struct ubifs_pnode *pnode,
- struct ubifs_nnode *parent, int iip)
+void ubifs_dump_pnode(struct ubifs_info *c, struct ubifs_pnode *pnode,
+ struct ubifs_nnode *parent, int iip)
{
int i;
@@ -999,7 +997,7 @@ void dbg_dump_pnode(struct ubifs_info *c, struct ubifs_pnode *pnode,
}
}
-void dbg_dump_tnc(struct ubifs_info *c)
+void ubifs_dump_tnc(struct ubifs_info *c)
{
struct ubifs_znode *znode;
int level;
@@ -1014,7 +1012,7 @@ void dbg_dump_tnc(struct ubifs_info *c)
level = znode->level;
printk(KERN_ERR "== Level %d ==\n", level);
}
- dbg_dump_znode(c, znode);
+ ubifs_dump_znode(c, znode);
znode = ubifs_tnc_levelorder_next(c->zroot.znode, znode);
}
printk(KERN_ERR "(pid %d) finish dumping TNC tree\n", current->pid);
@@ -1023,18 +1021,18 @@ void dbg_dump_tnc(struct ubifs_info *c)
static int dump_znode(struct ubifs_info *c, struct ubifs_znode *znode,
void *priv)
{
- dbg_dump_znode(c, znode);
+ ubifs_dump_znode(c, znode);
return 0;
}
/**
- * dbg_dump_index - dump the on-flash index.
+ * ubifs_dump_index - dump the on-flash index.
* @c: UBIFS file-system description object
*
- * This function dumps whole UBIFS indexing B-tree, unlike 'dbg_dump_tnc()'
+ * This function dumps whole UBIFS indexing B-tree, unlike 'ubifs_dump_tnc()'
* which dumps only in-memory znodes and does not read znodes which from flash.
*/
-void dbg_dump_index(struct ubifs_info *c)
+void ubifs_dump_index(struct ubifs_info *c)
{
dbg_walk_index(c, NULL, dump_znode, NULL);
}
@@ -1120,15 +1118,15 @@ int dbg_check_space_info(struct ubifs_info *c)
out:
ubifs_msg("saved lprops statistics dump");
- dbg_dump_lstats(&d->saved_lst);
+ ubifs_dump_lstats(&d->saved_lst);
ubifs_msg("saved budgeting info dump");
- dbg_dump_budg(c, &d->saved_bi);
+ ubifs_dump_budg(c, &d->saved_bi);
ubifs_msg("saved idx_gc_cnt %d", d->saved_idx_gc_cnt);
ubifs_msg("current lprops statistics dump");
ubifs_get_lp_stats(c, &lst);
- dbg_dump_lstats(&lst);
+ ubifs_dump_lstats(&lst);
ubifs_msg("current budgeting info dump");
- dbg_dump_budg(c, &c->bi);
+ ubifs_dump_budg(c, &c->bi);
dump_stack();
return -EINVAL;
}
@@ -1160,7 +1158,7 @@ int dbg_check_synced_i_size(const struct ubifs_info *c, struct inode *inode)
"is clean", ui->ui_size, ui->synced_i_size);
ubifs_err("i_ino %lu, i_mode %#x, i_size %lld", inode->i_ino,
inode->i_mode, i_size_read(inode));
- dbg_dump_stack();
+ dump_stack();
err = -EINVAL;
}
spin_unlock(&ui->ui_lock);
@@ -1223,14 +1221,14 @@ int dbg_check_dir(struct ubifs_info *c, const struct inode *dir)
"but calculated size is %llu", dir->i_ino,
(unsigned long long)i_size_read(dir),
(unsigned long long)size);
- dbg_dump_inode(c, dir);
+ ubifs_dump_inode(c, dir);
dump_stack();
return -EINVAL;
}
if (dir->i_nlink != nlink) {
ubifs_err("directory inode %lu has nlink %u, but calculated "
"nlink is %u", dir->i_ino, dir->i_nlink, nlink);
- dbg_dump_inode(c, dir);
+ ubifs_dump_inode(c, dir);
dump_stack();
return -EINVAL;
}
@@ -1287,25 +1285,25 @@ static int dbg_check_key_order(struct ubifs_info *c, struct ubifs_zbranch *zbr1,
err = 1;
key_read(c, &dent1->key, &key);
if (keys_cmp(c, &zbr1->key, &key)) {
- dbg_err("1st entry at %d:%d has key %s", zbr1->lnum,
- zbr1->offs, dbg_snprintf_key(c, &key, key_buf,
- DBG_KEY_BUF_LEN));
- dbg_err("but it should have key %s according to tnc",
- dbg_snprintf_key(c, &zbr1->key, key_buf,
- DBG_KEY_BUF_LEN));
- dbg_dump_node(c, dent1);
+ ubifs_err("1st entry at %d:%d has key %s", zbr1->lnum,
+ zbr1->offs, dbg_snprintf_key(c, &key, key_buf,
+ DBG_KEY_BUF_LEN));
+ ubifs_err("but it should have key %s according to tnc",
+ dbg_snprintf_key(c, &zbr1->key, key_buf,
+ DBG_KEY_BUF_LEN));
+ ubifs_dump_node(c, dent1);
goto out_free;
}
key_read(c, &dent2->key, &key);
if (keys_cmp(c, &zbr2->key, &key)) {
- dbg_err("2nd entry at %d:%d has key %s", zbr1->lnum,
- zbr1->offs, dbg_snprintf_key(c, &key, key_buf,
- DBG_KEY_BUF_LEN));
- dbg_err("but it should have key %s according to tnc",
- dbg_snprintf_key(c, &zbr2->key, key_buf,
- DBG_KEY_BUF_LEN));
- dbg_dump_node(c, dent2);
+ ubifs_err("2nd entry at %d:%d has key %s", zbr1->lnum,
+ zbr1->offs, dbg_snprintf_key(c, &key, key_buf,
+ DBG_KEY_BUF_LEN));
+ ubifs_err("but it should have key %s according to tnc",
+ dbg_snprintf_key(c, &zbr2->key, key_buf,
+ DBG_KEY_BUF_LEN));
+ ubifs_dump_node(c, dent2);
goto out_free;
}
@@ -1318,15 +1316,15 @@ static int dbg_check_key_order(struct ubifs_info *c, struct ubifs_zbranch *zbr1,
goto out_free;
}
if (cmp == 0 && nlen1 == nlen2)
- dbg_err("2 xent/dent nodes with the same name");
+ ubifs_err("2 xent/dent nodes with the same name");
else
- dbg_err("bad order of colliding key %s",
- dbg_snprintf_key(c, &key, key_buf, DBG_KEY_BUF_LEN));
+ ubifs_err("bad order of colliding key %s",
+ dbg_snprintf_key(c, &key, key_buf, DBG_KEY_BUF_LEN));
ubifs_msg("first node at %d:%d\n", zbr1->lnum, zbr1->offs);
- dbg_dump_node(c, dent1);
+ ubifs_dump_node(c, dent1);
ubifs_msg("second node at %d:%d\n", zbr2->lnum, zbr2->offs);
- dbg_dump_node(c, dent2);
+ ubifs_dump_node(c, dent2);
out_free:
kfree(dent2);
@@ -1529,10 +1527,10 @@ static int dbg_check_znode(struct ubifs_info *c, struct ubifs_zbranch *zbr)
out:
ubifs_err("failed, error %d", err);
ubifs_msg("dump of the znode");
- dbg_dump_znode(c, znode);
+ ubifs_dump_znode(c, znode);
if (zp) {
ubifs_msg("dump of the parent znode");
- dbg_dump_znode(c, zp);
+ ubifs_dump_znode(c, zp);
}
dump_stack();
return -EINVAL;
@@ -1599,9 +1597,9 @@ int dbg_check_tnc(struct ubifs_info *c, int extra)
return err;
if (err) {
ubifs_msg("first znode");
- dbg_dump_znode(c, prev);
+ ubifs_dump_znode(c, prev);
ubifs_msg("second znode");
- dbg_dump_znode(c, znode);
+ ubifs_dump_znode(c, znode);
return -EINVAL;
}
}
@@ -1690,7 +1688,7 @@ int dbg_walk_index(struct ubifs_info *c, dbg_leaf_callback leaf_cb,
if (err) {
ubifs_err("znode checking function returned "
"error %d", err);
- dbg_dump_znode(c, znode);
+ ubifs_dump_znode(c, znode);
goto out_dump;
}
}
@@ -1758,7 +1756,7 @@ out_dump:
else
zbr = &c->zroot;
ubifs_msg("dump of znode at LEB %d:%d", zbr->lnum, zbr->offs);
- dbg_dump_znode(c, znode);
+ ubifs_dump_znode(c, znode);
out_unlock:
mutex_unlock(&c->tnc_mutex);
return err;
@@ -2194,7 +2192,7 @@ out:
out_dump:
ubifs_msg("dump of node at LEB %d:%d", zbr->lnum, zbr->offs);
- dbg_dump_node(c, node);
+ ubifs_dump_node(c, node);
out_free:
kfree(node);
return err;
@@ -2352,7 +2350,7 @@ out_dump:
ubifs_msg("dump of the inode %lu sitting in LEB %d:%d",
(unsigned long)fscki->inum, zbr->lnum, zbr->offs);
- dbg_dump_node(c, ino);
+ ubifs_dump_node(c, ino);
kfree(ino);
return -EINVAL;
}
@@ -2423,12 +2421,12 @@ int dbg_check_data_nodes_order(struct ubifs_info *c, struct list_head *head)
if (sa->type != UBIFS_DATA_NODE) {
ubifs_err("bad node type %d", sa->type);
- dbg_dump_node(c, sa->node);
+ ubifs_dump_node(c, sa->node);
return -EINVAL;
}
if (sb->type != UBIFS_DATA_NODE) {
ubifs_err("bad node type %d", sb->type);
- dbg_dump_node(c, sb->node);
+ ubifs_dump_node(c, sb->node);
return -EINVAL;
}
@@ -2459,8 +2457,8 @@ int dbg_check_data_nodes_order(struct ubifs_info *c, struct list_head *head)
return 0;
error_dump:
- dbg_dump_node(c, sa->node);
- dbg_dump_node(c, sb->node);
+ ubifs_dump_node(c, sa->node);
+ ubifs_dump_node(c, sb->node);
return -EINVAL;
}
@@ -2491,13 +2489,13 @@ int dbg_check_nondata_nodes_order(struct ubifs_info *c, struct list_head *head)
if (sa->type != UBIFS_INO_NODE && sa->type != UBIFS_DENT_NODE &&
sa->type != UBIFS_XENT_NODE) {
ubifs_err("bad node type %d", sa->type);
- dbg_dump_node(c, sa->node);
+ ubifs_dump_node(c, sa->node);
return -EINVAL;
}
if (sa->type != UBIFS_INO_NODE && sa->type != UBIFS_DENT_NODE &&
sa->type != UBIFS_XENT_NODE) {
ubifs_err("bad node type %d", sb->type);
- dbg_dump_node(c, sb->node);
+ ubifs_dump_node(c, sb->node);
return -EINVAL;
}
@@ -2547,9 +2545,9 @@ int dbg_check_nondata_nodes_order(struct ubifs_info *c, struct list_head *head)
error_dump:
ubifs_msg("dumping first node");
- dbg_dump_node(c, sa->node);
+ ubifs_dump_node(c, sa->node);
ubifs_msg("dumping second node");
- dbg_dump_node(c, sb->node);
+ ubifs_dump_node(c, sb->node);
return -EINVAL;
return 0;
}
@@ -2678,7 +2676,7 @@ static void cut_data(const void *buf, unsigned int len)
}
int dbg_leb_write(struct ubifs_info *c, int lnum, const void *buf,
- int offs, int len, int dtype)
+ int offs, int len)
{
int err, failing;
@@ -2688,7 +2686,7 @@ int dbg_leb_write(struct ubifs_info *c, int lnum, const void *buf,
failing = power_cut_emulated(c, lnum, 1);
if (failing)
cut_data(buf, len);
- err = ubi_leb_write(c->ubi, lnum, buf, offs, len, dtype);
+ err = ubi_leb_write(c->ubi, lnum, buf, offs, len);
if (err)
return err;
if (failing)
@@ -2697,7 +2695,7 @@ int dbg_leb_write(struct ubifs_info *c, int lnum, const void *buf,
}
int dbg_leb_change(struct ubifs_info *c, int lnum, const void *buf,
- int len, int dtype)
+ int len)
{
int err;
@@ -2705,7 +2703,7 @@ int dbg_leb_change(struct ubifs_info *c, int lnum, const void *buf,
return -EROFS;
if (power_cut_emulated(c, lnum, 1))
return -EROFS;
- err = ubi_leb_change(c->ubi, lnum, buf, len, dtype);
+ err = ubi_leb_change(c->ubi, lnum, buf, len);
if (err)
return err;
if (power_cut_emulated(c, lnum, 1))
@@ -2729,7 +2727,7 @@ int dbg_leb_unmap(struct ubifs_info *c, int lnum)
return 0;
}
-int dbg_leb_map(struct ubifs_info *c, int lnum, int dtype)
+int dbg_leb_map(struct ubifs_info *c, int lnum)
{
int err;
@@ -2737,7 +2735,7 @@ int dbg_leb_map(struct ubifs_info *c, int lnum, int dtype)
return -EROFS;
if (power_cut_emulated(c, lnum, 0))
return -EROFS;
- err = ubi_leb_map(c->ubi, lnum, dtype);
+ err = ubi_leb_map(c->ubi, lnum);
if (err)
return err;
if (power_cut_emulated(c, lnum, 0))
@@ -2857,16 +2855,16 @@ static ssize_t dfs_file_write(struct file *file, const char __user *u,
* 'ubifs-debug' file-system instead.
*/
if (file->f_path.dentry == d->dfs_dump_lprops) {
- dbg_dump_lprops(c);
+ ubifs_dump_lprops(c);
return count;
}
if (file->f_path.dentry == d->dfs_dump_budg) {
- dbg_dump_budg(c, &c->bi);
+ ubifs_dump_budg(c, &c->bi);
return count;
}
if (file->f_path.dentry == d->dfs_dump_tnc) {
mutex_lock(&c->tnc_mutex);
- dbg_dump_tnc(c);
+ ubifs_dump_tnc(c);
mutex_unlock(&c->tnc_mutex);
return count;
}
@@ -3189,5 +3187,3 @@ void ubifs_debugging_exit(struct ubifs_info *c)
{
kfree(c->dbg);
}
-
-#endif /* CONFIG_UBIFS_FS_DEBUG */
diff --git a/fs/ubifs/debug.h b/fs/ubifs/debug.h
index 9f717655df1..486a8e024fb 100644
--- a/fs/ubifs/debug.h
+++ b/fs/ubifs/debug.h
@@ -29,8 +29,6 @@ typedef int (*dbg_leaf_callback)(struct ubifs_info *c,
typedef int (*dbg_znode_callback)(struct ubifs_info *c,
struct ubifs_znode *znode, void *priv);
-#ifdef CONFIG_UBIFS_FS_DEBUG
-
/*
* The UBIFS debugfs directory name pattern and maximum name length (3 for "ubi"
* + 1 for "_" and plus 2x2 for 2 UBI numbers and 1 for the trailing zero byte.
@@ -149,7 +147,7 @@ struct ubifs_global_debug_info {
if (unlikely(!(expr))) { \
printk(KERN_CRIT "UBIFS assert failed in %s at %u (pid %d)\n", \
__func__, __LINE__, current->pid); \
- dbg_dump_stack(); \
+ dump_stack(); \
} \
} while (0)
@@ -161,12 +159,6 @@ struct ubifs_global_debug_info {
} \
} while (0)
-#define dbg_dump_stack() dump_stack()
-
-#define dbg_err(fmt, ...) do { \
- ubifs_err(fmt, ##__VA_ARGS__); \
-} while (0)
-
#define ubifs_dbg_msg(type, fmt, ...) \
pr_debug("UBIFS DBG " type ": " fmt "\n", ##__VA_ARGS__)
@@ -257,27 +249,27 @@ const char *dbg_get_key_dump(const struct ubifs_info *c,
const union ubifs_key *key);
const char *dbg_snprintf_key(const struct ubifs_info *c,
const union ubifs_key *key, char *buffer, int len);
-void dbg_dump_inode(struct ubifs_info *c, const struct inode *inode);
-void dbg_dump_node(const struct ubifs_info *c, const void *node);
-void dbg_dump_lpt_node(const struct ubifs_info *c, void *node, int lnum,
- int offs);
-void dbg_dump_budget_req(const struct ubifs_budget_req *req);
-void dbg_dump_lstats(const struct ubifs_lp_stats *lst);
-void dbg_dump_budg(struct ubifs_info *c, const struct ubifs_budg_info *bi);
-void dbg_dump_lprop(const struct ubifs_info *c, const struct ubifs_lprops *lp);
-void dbg_dump_lprops(struct ubifs_info *c);
-void dbg_dump_lpt_info(struct ubifs_info *c);
-void dbg_dump_leb(const struct ubifs_info *c, int lnum);
-void dbg_dump_sleb(const struct ubifs_info *c,
- const struct ubifs_scan_leb *sleb, int offs);
-void dbg_dump_znode(const struct ubifs_info *c,
- const struct ubifs_znode *znode);
-void dbg_dump_heap(struct ubifs_info *c, struct ubifs_lpt_heap *heap, int cat);
-void dbg_dump_pnode(struct ubifs_info *c, struct ubifs_pnode *pnode,
- struct ubifs_nnode *parent, int iip);
-void dbg_dump_tnc(struct ubifs_info *c);
-void dbg_dump_index(struct ubifs_info *c);
-void dbg_dump_lpt_lebs(const struct ubifs_info *c);
+void ubifs_dump_inode(struct ubifs_info *c, const struct inode *inode);
+void ubifs_dump_node(const struct ubifs_info *c, const void *node);
+void ubifs_dump_budget_req(const struct ubifs_budget_req *req);
+void ubifs_dump_lstats(const struct ubifs_lp_stats *lst);
+void ubifs_dump_budg(struct ubifs_info *c, const struct ubifs_budg_info *bi);
+void ubifs_dump_lprop(const struct ubifs_info *c,
+ const struct ubifs_lprops *lp);
+void ubifs_dump_lprops(struct ubifs_info *c);
+void ubifs_dump_lpt_info(struct ubifs_info *c);
+void ubifs_dump_leb(const struct ubifs_info *c, int lnum);
+void ubifs_dump_sleb(const struct ubifs_info *c,
+ const struct ubifs_scan_leb *sleb, int offs);
+void ubifs_dump_znode(const struct ubifs_info *c,
+ const struct ubifs_znode *znode);
+void ubifs_dump_heap(struct ubifs_info *c, struct ubifs_lpt_heap *heap,
+ int cat);
+void ubifs_dump_pnode(struct ubifs_info *c, struct ubifs_pnode *pnode,
+ struct ubifs_nnode *parent, int iip);
+void ubifs_dump_tnc(struct ubifs_info *c);
+void ubifs_dump_index(struct ubifs_info *c);
+void ubifs_dump_lpt_lebs(const struct ubifs_info *c);
int dbg_walk_index(struct ubifs_info *c, dbg_leaf_callback leaf_cb,
dbg_znode_callback znode_cb, void *priv);
@@ -307,11 +299,10 @@ int dbg_check_data_nodes_order(struct ubifs_info *c, struct list_head *head);
int dbg_check_nondata_nodes_order(struct ubifs_info *c, struct list_head *head);
int dbg_leb_write(struct ubifs_info *c, int lnum, const void *buf, int offs,
- int len, int dtype);
-int dbg_leb_change(struct ubifs_info *c, int lnum, const void *buf, int len,
- int dtype);
+ int len);
+int dbg_leb_change(struct ubifs_info *c, int lnum, const void *buf, int len);
int dbg_leb_unmap(struct ubifs_info *c, int lnum);
-int dbg_leb_map(struct ubifs_info *c, int lnum, int dtype);
+int dbg_leb_map(struct ubifs_info *c, int lnum);
/* Debugfs-related stuff */
int dbg_debugfs_init(void);
@@ -319,162 +310,4 @@ void dbg_debugfs_exit(void);
int dbg_debugfs_init_fs(struct ubifs_info *c);
void dbg_debugfs_exit_fs(struct ubifs_info *c);
-#else /* !CONFIG_UBIFS_FS_DEBUG */
-
-/* Use "if (0)" to make compiler check arguments even if debugging is off */
-#define ubifs_assert(expr) do { \
- if (0) \
- printk(KERN_CRIT "UBIFS assert failed in %s at %u (pid %d)\n", \
- __func__, __LINE__, current->pid); \
-} while (0)
-
-#define dbg_err(fmt, ...) do { \
- if (0) \
- ubifs_err(fmt, ##__VA_ARGS__); \
-} while (0)
-
-#define DBGKEY(key) ((char *)(key))
-#define DBGKEY1(key) ((char *)(key))
-
-#define ubifs_dbg_msg(fmt, ...) do { \
- if (0) \
- printk(KERN_DEBUG fmt "\n", ##__VA_ARGS__); \
-} while (0)
-
-#define dbg_dump_stack()
-#define ubifs_assert_cmt_locked(c)
-
-#define dbg_msg(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__)
-#define dbg_gen(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__)
-#define dbg_jnl(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__)
-#define dbg_jnlk(key, fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__)
-#define dbg_tnc(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__)
-#define dbg_tnck(key, fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__)
-#define dbg_lp(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__)
-#define dbg_find(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__)
-#define dbg_mnt(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__)
-#define dbg_mntk(key, fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__)
-#define dbg_io(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__)
-#define dbg_cmt(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__)
-#define dbg_budg(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__)
-#define dbg_log(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__)
-#define dbg_gc(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__)
-#define dbg_scan(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__)
-#define dbg_rcvry(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__)
-
-static inline int ubifs_debugging_init(struct ubifs_info *c) { return 0; }
-static inline void ubifs_debugging_exit(struct ubifs_info *c) { return; }
-static inline const char *dbg_ntype(int type) { return ""; }
-static inline const char *dbg_cstate(int cmt_state) { return ""; }
-static inline const char *dbg_jhead(int jhead) { return ""; }
-static inline const char *
-dbg_get_key_dump(const struct ubifs_info *c,
- const union ubifs_key *key) { return ""; }
-static inline const char *
-dbg_snprintf_key(const struct ubifs_info *c,
- const union ubifs_key *key, char *buffer,
- int len) { return ""; }
-static inline void dbg_dump_inode(struct ubifs_info *c,
- const struct inode *inode) { return; }
-static inline void dbg_dump_node(const struct ubifs_info *c,
- const void *node) { return; }
-static inline void dbg_dump_lpt_node(const struct ubifs_info *c,
- void *node, int lnum,
- int offs) { return; }
-static inline void
-dbg_dump_budget_req(const struct ubifs_budget_req *req) { return; }
-static inline void
-dbg_dump_lstats(const struct ubifs_lp_stats *lst) { return; }
-static inline void
-dbg_dump_budg(struct ubifs_info *c,
- const struct ubifs_budg_info *bi) { return; }
-static inline void dbg_dump_lprop(const struct ubifs_info *c,
- const struct ubifs_lprops *lp) { return; }
-static inline void dbg_dump_lprops(struct ubifs_info *c) { return; }
-static inline void dbg_dump_lpt_info(struct ubifs_info *c) { return; }
-static inline void dbg_dump_leb(const struct ubifs_info *c,
- int lnum) { return; }
-static inline void
-dbg_dump_sleb(const struct ubifs_info *c,
- const struct ubifs_scan_leb *sleb, int offs) { return; }
-static inline void
-dbg_dump_znode(const struct ubifs_info *c,
- const struct ubifs_znode *znode) { return; }
-static inline void dbg_dump_heap(struct ubifs_info *c,
- struct ubifs_lpt_heap *heap,
- int cat) { return; }
-static inline void dbg_dump_pnode(struct ubifs_info *c,
- struct ubifs_pnode *pnode,
- struct ubifs_nnode *parent,
- int iip) { return; }
-static inline void dbg_dump_tnc(struct ubifs_info *c) { return; }
-static inline void dbg_dump_index(struct ubifs_info *c) { return; }
-static inline void dbg_dump_lpt_lebs(const struct ubifs_info *c) { return; }
-
-static inline int dbg_walk_index(struct ubifs_info *c,
- dbg_leaf_callback leaf_cb,
- dbg_znode_callback znode_cb,
- void *priv) { return 0; }
-static inline void dbg_save_space_info(struct ubifs_info *c) { return; }
-static inline int dbg_check_space_info(struct ubifs_info *c) { return 0; }
-static inline int dbg_check_lprops(struct ubifs_info *c) { return 0; }
-static inline int
-dbg_old_index_check_init(struct ubifs_info *c,
- struct ubifs_zbranch *zroot) { return 0; }
-static inline int
-dbg_check_old_index(struct ubifs_info *c,
- struct ubifs_zbranch *zroot) { return 0; }
-static inline int dbg_check_cats(struct ubifs_info *c) { return 0; }
-static inline int dbg_check_ltab(struct ubifs_info *c) { return 0; }
-static inline int dbg_chk_lpt_free_spc(struct ubifs_info *c) { return 0; }
-static inline int dbg_chk_lpt_sz(struct ubifs_info *c,
- int action, int len) { return 0; }
-static inline int
-dbg_check_synced_i_size(const struct ubifs_info *c,
- struct inode *inode) { return 0; }
-static inline int dbg_check_dir(struct ubifs_info *c,
- const struct inode *dir) { return 0; }
-static inline int dbg_check_tnc(struct ubifs_info *c, int extra) { return 0; }
-static inline int dbg_check_idx_size(struct ubifs_info *c,
- long long idx_size) { return 0; }
-static inline int dbg_check_filesystem(struct ubifs_info *c) { return 0; }
-static inline void dbg_check_heap(struct ubifs_info *c,
- struct ubifs_lpt_heap *heap,
- int cat, int add_pos) { return; }
-static inline int dbg_check_lpt_nodes(struct ubifs_info *c,
- struct ubifs_cnode *cnode, int row, int col) { return 0; }
-static inline int dbg_check_inode_size(struct ubifs_info *c,
- const struct inode *inode,
- loff_t size) { return 0; }
-static inline int
-dbg_check_data_nodes_order(struct ubifs_info *c,
- struct list_head *head) { return 0; }
-static inline int
-dbg_check_nondata_nodes_order(struct ubifs_info *c,
- struct list_head *head) { return 0; }
-
-static inline int dbg_leb_write(struct ubifs_info *c, int lnum,
- const void *buf, int offset,
- int len, int dtype) { return 0; }
-static inline int dbg_leb_change(struct ubifs_info *c, int lnum,
- const void *buf, int len,
- int dtype) { return 0; }
-static inline int dbg_leb_unmap(struct ubifs_info *c, int lnum) { return 0; }
-static inline int dbg_leb_map(struct ubifs_info *c, int lnum,
- int dtype) { return 0; }
-
-static inline int dbg_is_chk_gen(const struct ubifs_info *c) { return 0; }
-static inline int dbg_is_chk_index(const struct ubifs_info *c) { return 0; }
-static inline int dbg_is_chk_orph(const struct ubifs_info *c) { return 0; }
-static inline int dbg_is_chk_lprops(const struct ubifs_info *c) { return 0; }
-static inline int dbg_is_chk_fs(const struct ubifs_info *c) { return 0; }
-static inline int dbg_is_tst_rcvry(const struct ubifs_info *c) { return 0; }
-static inline int dbg_is_power_cut(const struct ubifs_info *c) { return 0; }
-
-static inline int dbg_debugfs_init(void) { return 0; }
-static inline void dbg_debugfs_exit(void) { return; }
-static inline int dbg_debugfs_init_fs(struct ubifs_info *c) { return 0; }
-static inline int dbg_debugfs_exit_fs(struct ubifs_info *c) { return 0; }
-
-#endif /* !CONFIG_UBIFS_FS_DEBUG */
#endif /* !__UBIFS_DEBUG_H__ */
diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c
index ec9f1870ab7..62a2727f4ec 100644
--- a/fs/ubifs/dir.c
+++ b/fs/ubifs/dir.c
@@ -170,8 +170,6 @@ struct inode *ubifs_new_inode(struct ubifs_info *c, const struct inode *dir,
return inode;
}
-#ifdef CONFIG_UBIFS_FS_DEBUG
-
static int dbg_check_name(const struct ubifs_info *c,
const struct ubifs_dent_node *dent,
const struct qstr *nm)
@@ -185,12 +183,6 @@ static int dbg_check_name(const struct ubifs_info *c,
return 0;
}
-#else
-
-#define dbg_check_name(c, dent, nm) 0
-
-#endif
-
static struct dentry *ubifs_lookup(struct inode *dir, struct dentry *dentry,
struct nameidata *nd)
{
@@ -1187,12 +1179,10 @@ const struct inode_operations ubifs_dir_inode_operations = {
.rename = ubifs_rename,
.setattr = ubifs_setattr,
.getattr = ubifs_getattr,
-#ifdef CONFIG_UBIFS_FS_XATTR
.setxattr = ubifs_setxattr,
.getxattr = ubifs_getxattr,
.listxattr = ubifs_listxattr,
.removexattr = ubifs_removexattr,
-#endif
};
const struct file_operations ubifs_dir_operations = {
diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c
index 5c8f6dc1d28..35389ca2d26 100644
--- a/fs/ubifs/file.c
+++ b/fs/ubifs/file.c
@@ -97,7 +97,7 @@ static int read_block(struct inode *inode, void *addr, unsigned int block,
dump:
ubifs_err("bad data node (block %u, inode %lu)",
block, inode->i_ino);
- dbg_dump_node(c, dn);
+ ubifs_dump_node(c, dn);
return -EINVAL;
}
@@ -1562,12 +1562,10 @@ const struct address_space_operations ubifs_file_address_operations = {
const struct inode_operations ubifs_file_inode_operations = {
.setattr = ubifs_setattr,
.getattr = ubifs_getattr,
-#ifdef CONFIG_UBIFS_FS_XATTR
.setxattr = ubifs_setxattr,
.getxattr = ubifs_getxattr,
.listxattr = ubifs_listxattr,
.removexattr = ubifs_removexattr,
-#endif
};
const struct inode_operations ubifs_symlink_inode_operations = {
diff --git a/fs/ubifs/gc.c b/fs/ubifs/gc.c
index ded29f6224c..04dd6f47635 100644
--- a/fs/ubifs/gc.c
+++ b/fs/ubifs/gc.c
@@ -109,7 +109,7 @@ static int switch_gc_head(struct ubifs_info *c)
return err;
c->gc_lnum = -1;
- err = ubifs_wbuf_seek_nolock(wbuf, gc_lnum, 0, UBI_LONGTERM);
+ err = ubifs_wbuf_seek_nolock(wbuf, gc_lnum, 0);
return err;
}
diff --git a/fs/ubifs/io.c b/fs/ubifs/io.c
index 9228950a658..e18b9889a51 100644
--- a/fs/ubifs/io.c
+++ b/fs/ubifs/io.c
@@ -109,13 +109,13 @@ int ubifs_leb_read(const struct ubifs_info *c, int lnum, void *buf, int offs,
if (err && (err != -EBADMSG || even_ebadmsg)) {
ubifs_err("reading %d bytes from LEB %d:%d failed, error %d",
len, lnum, offs, err);
- dbg_dump_stack();
+ dump_stack();
}
return err;
}
int ubifs_leb_write(struct ubifs_info *c, int lnum, const void *buf, int offs,
- int len, int dtype)
+ int len)
{
int err;
@@ -123,20 +123,19 @@ int ubifs_leb_write(struct ubifs_info *c, int lnum, const void *buf, int offs,
if (c->ro_error)
return -EROFS;
if (!dbg_is_tst_rcvry(c))
- err = ubi_leb_write(c->ubi, lnum, buf, offs, len, dtype);
+ err = ubi_leb_write(c->ubi, lnum, buf, offs, len);
else
- err = dbg_leb_write(c, lnum, buf, offs, len, dtype);
+ err = dbg_leb_write(c, lnum, buf, offs, len);
if (err) {
ubifs_err("writing %d bytes to LEB %d:%d failed, error %d",
len, lnum, offs, err);
ubifs_ro_mode(c, err);
- dbg_dump_stack();
+ dump_stack();
}
return err;
}
-int ubifs_leb_change(struct ubifs_info *c, int lnum, const void *buf, int len,
- int dtype)
+int ubifs_leb_change(struct ubifs_info *c, int lnum, const void *buf, int len)
{
int err;
@@ -144,14 +143,14 @@ int ubifs_leb_change(struct ubifs_info *c, int lnum, const void *buf, int len,
if (c->ro_error)
return -EROFS;
if (!dbg_is_tst_rcvry(c))
- err = ubi_leb_change(c->ubi, lnum, buf, len, dtype);
+ err = ubi_leb_change(c->ubi, lnum, buf, len);
else
- err = dbg_leb_change(c, lnum, buf, len, dtype);
+ err = dbg_leb_change(c, lnum, buf, len);
if (err) {
ubifs_err("changing %d bytes in LEB %d failed, error %d",
len, lnum, err);
ubifs_ro_mode(c, err);
- dbg_dump_stack();
+ dump_stack();
}
return err;
}
@@ -170,12 +169,12 @@ int ubifs_leb_unmap(struct ubifs_info *c, int lnum)
if (err) {
ubifs_err("unmap LEB %d failed, error %d", lnum, err);
ubifs_ro_mode(c, err);
- dbg_dump_stack();
+ dump_stack();
}
return err;
}
-int ubifs_leb_map(struct ubifs_info *c, int lnum, int dtype)
+int ubifs_leb_map(struct ubifs_info *c, int lnum)
{
int err;
@@ -183,13 +182,13 @@ int ubifs_leb_map(struct ubifs_info *c, int lnum, int dtype)
if (c->ro_error)
return -EROFS;
if (!dbg_is_tst_rcvry(c))
- err = ubi_leb_map(c->ubi, lnum, dtype);
+ err = ubi_leb_map(c->ubi, lnum);
else
- err = dbg_leb_map(c, lnum, dtype);
+ err = dbg_leb_map(c, lnum);
if (err) {
ubifs_err("mapping LEB %d failed, error %d", lnum, err);
ubifs_ro_mode(c, err);
- dbg_dump_stack();
+ dump_stack();
}
return err;
}
@@ -202,7 +201,7 @@ int ubifs_is_mapped(const struct ubifs_info *c, int lnum)
if (err < 0) {
ubifs_err("ubi_is_mapped failed for LEB %d, error %d",
lnum, err);
- dbg_dump_stack();
+ dump_stack();
}
return err;
}
@@ -294,8 +293,8 @@ out_len:
out:
if (!quiet) {
ubifs_err("bad node at LEB %d:%d", lnum, offs);
- dbg_dump_node(c, buf);
- dbg_dump_stack();
+ ubifs_dump_node(c, buf);
+ dump_stack();
}
return err;
}
@@ -523,8 +522,7 @@ int ubifs_wbuf_sync_nolock(struct ubifs_wbuf *wbuf)
dirt = sync_len - wbuf->used;
if (dirt)
ubifs_pad(c, wbuf->buf + wbuf->used, dirt);
- err = ubifs_leb_write(c, wbuf->lnum, wbuf->buf, wbuf->offs, sync_len,
- wbuf->dtype);
+ err = ubifs_leb_write(c, wbuf->lnum, wbuf->buf, wbuf->offs, sync_len);
if (err)
return err;
@@ -562,14 +560,12 @@ int ubifs_wbuf_sync_nolock(struct ubifs_wbuf *wbuf)
* @wbuf: write-buffer
* @lnum: logical eraseblock number to seek to
* @offs: logical eraseblock offset to seek to
- * @dtype: data type
*
* This function targets the write-buffer to logical eraseblock @lnum:@offs.
* The write-buffer has to be empty. Returns zero in case of success and a
* negative error code in case of failure.
*/
-int ubifs_wbuf_seek_nolock(struct ubifs_wbuf *wbuf, int lnum, int offs,
- int dtype)
+int ubifs_wbuf_seek_nolock(struct ubifs_wbuf *wbuf, int lnum, int offs)
{
const struct ubifs_info *c = wbuf->c;
@@ -592,7 +588,6 @@ int ubifs_wbuf_seek_nolock(struct ubifs_wbuf *wbuf, int lnum, int offs,
wbuf->avail = wbuf->size;
wbuf->used = 0;
spin_unlock(&wbuf->lock);
- wbuf->dtype = dtype;
return 0;
}
@@ -719,8 +714,7 @@ int ubifs_wbuf_write_nolock(struct ubifs_wbuf *wbuf, void *buf, int len)
dbg_io("flush jhead %s wbuf to LEB %d:%d",
dbg_jhead(wbuf->jhead), wbuf->lnum, wbuf->offs);
err = ubifs_leb_write(c, wbuf->lnum, wbuf->buf,
- wbuf->offs, wbuf->size,
- wbuf->dtype);
+ wbuf->offs, wbuf->size);
if (err)
goto out;
@@ -756,7 +750,7 @@ int ubifs_wbuf_write_nolock(struct ubifs_wbuf *wbuf, void *buf, int len)
dbg_jhead(wbuf->jhead), wbuf->lnum, wbuf->offs);
memcpy(wbuf->buf + wbuf->used, buf, wbuf->avail);
err = ubifs_leb_write(c, wbuf->lnum, wbuf->buf, wbuf->offs,
- wbuf->size, wbuf->dtype);
+ wbuf->size);
if (err)
goto out;
@@ -775,7 +769,7 @@ int ubifs_wbuf_write_nolock(struct ubifs_wbuf *wbuf, void *buf, int len)
dbg_io("write %d bytes to LEB %d:%d",
wbuf->size, wbuf->lnum, wbuf->offs);
err = ubifs_leb_write(c, wbuf->lnum, buf, wbuf->offs,
- wbuf->size, wbuf->dtype);
+ wbuf->size);
if (err)
goto out;
@@ -797,7 +791,7 @@ int ubifs_wbuf_write_nolock(struct ubifs_wbuf *wbuf, void *buf, int len)
dbg_io("write %d bytes to LEB %d:%d", n, wbuf->lnum,
wbuf->offs);
err = ubifs_leb_write(c, wbuf->lnum, buf + written,
- wbuf->offs, n, wbuf->dtype);
+ wbuf->offs, n);
if (err)
goto out;
wbuf->offs += n;
@@ -841,9 +835,9 @@ exit:
out:
ubifs_err("cannot write %d bytes to LEB %d:%d, error %d",
len, wbuf->lnum, wbuf->offs, err);
- dbg_dump_node(c, buf);
- dbg_dump_stack();
- dbg_dump_leb(c, wbuf->lnum);
+ ubifs_dump_node(c, buf);
+ dump_stack();
+ ubifs_dump_leb(c, wbuf->lnum);
return err;
}
@@ -854,7 +848,6 @@ out:
* @len: node length
* @lnum: logical eraseblock number
* @offs: offset within the logical eraseblock
- * @dtype: node life-time hint (%UBI_LONGTERM, %UBI_SHORTTERM, %UBI_UNKNOWN)
*
* This function automatically fills node magic number, assigns sequence
* number, and calculates node CRC checksum. The length of the @buf buffer has
@@ -863,7 +856,7 @@ out:
* success and a negative error code in case of failure.
*/
int ubifs_write_node(struct ubifs_info *c, void *buf, int len, int lnum,
- int offs, int dtype)
+ int offs)
{
int err, buf_len = ALIGN(len, c->min_io_size);
@@ -879,9 +872,9 @@ int ubifs_write_node(struct ubifs_info *c, void *buf, int len, int lnum,
return -EROFS;
ubifs_prepare_node(c, buf, len, 1);
- err = ubifs_leb_write(c, lnum, buf, offs, buf_len, dtype);
+ err = ubifs_leb_write(c, lnum, buf, offs, buf_len);
if (err)
- dbg_dump_node(c, buf);
+ ubifs_dump_node(c, buf);
return err;
}
@@ -960,8 +953,8 @@ int ubifs_read_node_wbuf(struct ubifs_wbuf *wbuf, void *buf, int type, int len,
out:
ubifs_err("bad node at LEB %d:%d", lnum, offs);
- dbg_dump_node(c, buf);
- dbg_dump_stack();
+ ubifs_dump_node(c, buf);
+ dump_stack();
return -EINVAL;
}
@@ -1017,8 +1010,8 @@ int ubifs_read_node(const struct ubifs_info *c, void *buf, int type, int len,
out:
ubifs_err("bad node at LEB %d:%d, LEB mapping status %d", lnum, offs,
ubi_is_mapped(c->ubi, lnum));
- dbg_dump_node(c, buf);
- dbg_dump_stack();
+ ubifs_dump_node(c, buf);
+ dump_stack();
return -EINVAL;
}
@@ -1056,7 +1049,6 @@ int ubifs_wbuf_init(struct ubifs_info *c, struct ubifs_wbuf *wbuf)
*/
size = c->max_write_size - (c->leb_start % c->max_write_size);
wbuf->avail = wbuf->size = size;
- wbuf->dtype = UBI_UNKNOWN;
wbuf->sync_callback = NULL;
mutex_init(&wbuf->io_mutex);
spin_lock_init(&wbuf->lock);
diff --git a/fs/ubifs/journal.c b/fs/ubifs/journal.c
index 2f438ab2e7a..12c0f154ca8 100644
--- a/fs/ubifs/journal.c
+++ b/fs/ubifs/journal.c
@@ -214,7 +214,7 @@ out:
err = ubifs_add_bud_to_log(c, jhead, lnum, offs);
if (err)
goto out_return;
- err = ubifs_wbuf_seek_nolock(wbuf, lnum, offs, wbuf->dtype);
+ err = ubifs_wbuf_seek_nolock(wbuf, lnum, offs);
if (err)
goto out_unlock;
@@ -385,9 +385,9 @@ out:
if (err == -ENOSPC) {
/* This are some budgeting problems, print useful information */
down_write(&c->commit_sem);
- dbg_dump_stack();
- dbg_dump_budg(c, &c->bi);
- dbg_dump_lprops(c);
+ dump_stack();
+ ubifs_dump_budg(c, &c->bi);
+ ubifs_dump_lprops(c);
cmt_retries = dbg_check_lprops(c);
up_write(&c->commit_sem);
}
@@ -1267,7 +1267,6 @@ out_free:
return err;
}
-#ifdef CONFIG_UBIFS_FS_XATTR
/**
* ubifs_jnl_delete_xattr - delete an extended attribute.
@@ -1462,4 +1461,3 @@ out_free:
return err;
}
-#endif /* CONFIG_UBIFS_FS_XATTR */
diff --git a/fs/ubifs/log.c b/fs/ubifs/log.c
index f9fd068d1ae..c80b15d6c8d 100644
--- a/fs/ubifs/log.c
+++ b/fs/ubifs/log.c
@@ -29,11 +29,7 @@
#include "ubifs.h"
-#ifdef CONFIG_UBIFS_FS_DEBUG
static int dbg_check_bud_bytes(struct ubifs_info *c);
-#else
-#define dbg_check_bud_bytes(c) 0
-#endif
/**
* ubifs_search_bud - search bud LEB.
@@ -262,7 +258,7 @@ int ubifs_add_bud_to_log(struct ubifs_info *c, int jhead, int lnum, int offs)
* an unclean reboot, because the target LEB might have been
* unmapped, but not yet physically erased.
*/
- err = ubifs_leb_map(c, bud->lnum, UBI_SHORTTERM);
+ err = ubifs_leb_map(c, bud->lnum);
if (err)
goto out_unlock;
}
@@ -270,7 +266,7 @@ int ubifs_add_bud_to_log(struct ubifs_info *c, int jhead, int lnum, int offs)
dbg_log("write ref LEB %d:%d",
c->lhead_lnum, c->lhead_offs);
err = ubifs_write_node(c, ref, UBIFS_REF_NODE_SZ, c->lhead_lnum,
- c->lhead_offs, UBI_SHORTTERM);
+ c->lhead_offs);
if (err)
goto out_unlock;
@@ -422,7 +418,7 @@ int ubifs_log_start_commit(struct ubifs_info *c, int *ltail_lnum)
len = ALIGN(len, c->min_io_size);
dbg_log("writing commit start at LEB %d:0, len %d", c->lhead_lnum, len);
- err = ubifs_leb_write(c, c->lhead_lnum, cs, 0, len, UBI_SHORTTERM);
+ err = ubifs_leb_write(c, c->lhead_lnum, cs, 0, len);
if (err)
goto out;
@@ -623,7 +619,7 @@ static int add_node(struct ubifs_info *c, void *buf, int *lnum, int *offs,
int sz = ALIGN(*offs, c->min_io_size), err;
ubifs_pad(c, buf + *offs, sz - *offs);
- err = ubifs_leb_change(c, *lnum, buf, sz, UBI_SHORTTERM);
+ err = ubifs_leb_change(c, *lnum, buf, sz);
if (err)
return err;
*lnum = ubifs_next_log_lnum(c, *lnum);
@@ -702,7 +698,7 @@ int ubifs_consolidate_log(struct ubifs_info *c)
int sz = ALIGN(offs, c->min_io_size);
ubifs_pad(c, buf + offs, sz - offs);
- err = ubifs_leb_change(c, write_lnum, buf, sz, UBI_SHORTTERM);
+ err = ubifs_leb_change(c, write_lnum, buf, sz);
if (err)
goto out_free;
offs = ALIGN(offs, c->min_io_size);
@@ -734,8 +730,6 @@ out_free:
return err;
}
-#ifdef CONFIG_UBIFS_FS_DEBUG
-
/**
* dbg_check_bud_bytes - make sure bud bytes calculation are all right.
* @c: UBIFS file-system description object
@@ -767,5 +761,3 @@ static int dbg_check_bud_bytes(struct ubifs_info *c)
return err;
}
-
-#endif /* CONFIG_UBIFS_FS_DEBUG */
diff --git a/fs/ubifs/lprops.c b/fs/ubifs/lprops.c
index f8a181e647c..86eb8e53324 100644
--- a/fs/ubifs/lprops.c
+++ b/fs/ubifs/lprops.c
@@ -447,7 +447,7 @@ static void change_category(struct ubifs_info *c, struct ubifs_lprops *lprops)
int new_cat = ubifs_categorize_lprops(c, lprops);
if (old_cat == new_cat) {
- struct ubifs_lpt_heap *heap = &c->lpt_heap[new_cat - 1];
+ struct ubifs_lpt_heap *heap;
/* lprops on a heap now must be moved up or down */
if (new_cat < 1 || new_cat > LPROPS_HEAP_CNT)
@@ -846,7 +846,9 @@ const struct ubifs_lprops *ubifs_fast_find_frdi_idx(struct ubifs_info *c)
return lprops;
}
-#ifdef CONFIG_UBIFS_FS_DEBUG
+/*
+ * Everything below is related to debugging.
+ */
/**
* dbg_check_cats - check category heaps and lists.
@@ -1001,8 +1003,8 @@ void dbg_check_heap(struct ubifs_info *c, struct ubifs_lpt_heap *heap, int cat,
out:
if (err) {
dbg_msg("failed cat %d hpos %d err %d", cat, i, err);
- dbg_dump_stack();
- dbg_dump_heap(c, heap, cat);
+ dump_stack();
+ ubifs_dump_heap(c, heap, cat);
}
}
@@ -1109,8 +1111,8 @@ static int scan_check_cb(struct ubifs_info *c,
if (IS_ERR(sleb)) {
ret = PTR_ERR(sleb);
if (ret == -EUCLEAN) {
- dbg_dump_lprops(c);
- dbg_dump_budg(c, &c->bi);
+ ubifs_dump_lprops(c);
+ ubifs_dump_budg(c, &c->bi);
}
goto out;
}
@@ -1237,7 +1239,7 @@ out_print:
ubifs_err("bad accounting of LEB %d: free %d, dirty %d flags %#x, "
"should be free %d, dirty %d",
lnum, lp->free, lp->dirty, lp->flags, free, dirty);
- dbg_dump_leb(c, lnum);
+ ubifs_dump_leb(c, lnum);
out_destroy:
ubifs_scan_destroy(sleb);
ret = -EINVAL;
@@ -1315,5 +1317,3 @@ int dbg_check_lprops(struct ubifs_info *c)
out:
return err;
}
-
-#endif /* CONFIG_UBIFS_FS_DEBUG */
diff --git a/fs/ubifs/lpt.c b/fs/ubifs/lpt.c
index 66d59d0a140..ce33b2beb15 100644
--- a/fs/ubifs/lpt.c
+++ b/fs/ubifs/lpt.c
@@ -701,8 +701,7 @@ int ubifs_create_dflt_lpt(struct ubifs_info *c, int *main_lebs, int lpt_first,
alen = ALIGN(len, c->min_io_size);
set_ltab(c, lnum, c->leb_size - alen, alen - len);
memset(p, 0xff, alen - len);
- err = ubifs_leb_change(c, lnum++, buf, alen,
- UBI_SHORTTERM);
+ err = ubifs_leb_change(c, lnum++, buf, alen);
if (err)
goto out;
p = buf;
@@ -732,8 +731,7 @@ int ubifs_create_dflt_lpt(struct ubifs_info *c, int *main_lebs, int lpt_first,
set_ltab(c, lnum, c->leb_size - alen,
alen - len);
memset(p, 0xff, alen - len);
- err = ubifs_leb_change(c, lnum++, buf, alen,
- UBI_SHORTTERM);
+ err = ubifs_leb_change(c, lnum++, buf, alen);
if (err)
goto out;
p = buf;
@@ -780,8 +778,7 @@ int ubifs_create_dflt_lpt(struct ubifs_info *c, int *main_lebs, int lpt_first,
alen = ALIGN(len, c->min_io_size);
set_ltab(c, lnum, c->leb_size - alen, alen - len);
memset(p, 0xff, alen - len);
- err = ubifs_leb_change(c, lnum++, buf, alen,
- UBI_SHORTTERM);
+ err = ubifs_leb_change(c, lnum++, buf, alen);
if (err)
goto out;
p = buf;
@@ -806,7 +803,7 @@ int ubifs_create_dflt_lpt(struct ubifs_info *c, int *main_lebs, int lpt_first,
alen = ALIGN(len, c->min_io_size);
set_ltab(c, lnum, c->leb_size - alen, alen - len);
memset(p, 0xff, alen - len);
- err = ubifs_leb_change(c, lnum++, buf, alen, UBI_SHORTTERM);
+ err = ubifs_leb_change(c, lnum++, buf, alen);
if (err)
goto out;
p = buf;
@@ -826,7 +823,7 @@ int ubifs_create_dflt_lpt(struct ubifs_info *c, int *main_lebs, int lpt_first,
/* Write remaining buffer */
memset(p, 0xff, alen - len);
- err = ubifs_leb_change(c, lnum, buf, alen, UBI_SHORTTERM);
+ err = ubifs_leb_change(c, lnum, buf, alen);
if (err)
goto out;
@@ -926,7 +923,7 @@ static int check_lpt_crc(void *buf, int len)
if (crc != calc_crc) {
ubifs_err("invalid crc in LPT node: crc %hx calc %hx", crc,
calc_crc);
- dbg_dump_stack();
+ dump_stack();
return -EINVAL;
}
return 0;
@@ -949,7 +946,7 @@ static int check_lpt_type(uint8_t **addr, int *pos, int type)
if (node_type != type) {
ubifs_err("invalid type (%d) in LPT node type %d", node_type,
type);
- dbg_dump_stack();
+ dump_stack();
return -EINVAL;
}
return 0;
@@ -1247,7 +1244,7 @@ int ubifs_read_nnode(struct ubifs_info *c, struct ubifs_nnode *parent, int iip)
out:
ubifs_err("error %d reading nnode at %d:%d", err, lnum, offs);
- dbg_dump_stack();
+ dump_stack();
kfree(nnode);
return err;
}
@@ -1312,8 +1309,8 @@ static int read_pnode(struct ubifs_info *c, struct ubifs_nnode *parent, int iip)
out:
ubifs_err("error %d reading pnode at %d:%d", err, lnum, offs);
- dbg_dump_pnode(c, pnode, parent, iip);
- dbg_dump_stack();
+ ubifs_dump_pnode(c, pnode, parent, iip);
+ dump_stack();
dbg_msg("calc num: %d", calc_pnode_num_from_parent(c, parent, iip));
kfree(pnode);
return err;
@@ -1740,16 +1737,20 @@ int ubifs_lpt_init(struct ubifs_info *c, int rd, int wr)
if (rd) {
err = lpt_init_rd(c);
if (err)
- return err;
+ goto out_err;
}
if (wr) {
err = lpt_init_wr(c);
if (err)
- return err;
+ goto out_err;
}
return 0;
+
+out_err:
+ ubifs_lpt_free(c, 0);
+ return err;
}
/**
@@ -2080,8 +2081,6 @@ out:
return err;
}
-#ifdef CONFIG_UBIFS_FS_DEBUG
-
/**
* dbg_chk_pnode - check a pnode.
* @c: the UBIFS file-system description object
@@ -2096,8 +2095,8 @@ static int dbg_chk_pnode(struct ubifs_info *c, struct ubifs_pnode *pnode,
int i;
if (pnode->num != col) {
- dbg_err("pnode num %d expected %d parent num %d iip %d",
- pnode->num, col, pnode->parent->num, pnode->iip);
+ ubifs_err("pnode num %d expected %d parent num %d iip %d",
+ pnode->num, col, pnode->parent->num, pnode->iip);
return -EINVAL;
}
for (i = 0; i < UBIFS_LPT_FANOUT; i++) {
@@ -2111,14 +2110,14 @@ static int dbg_chk_pnode(struct ubifs_info *c, struct ubifs_pnode *pnode,
if (lnum >= c->leb_cnt)
continue;
if (lprops->lnum != lnum) {
- dbg_err("bad LEB number %d expected %d",
- lprops->lnum, lnum);
+ ubifs_err("bad LEB number %d expected %d",
+ lprops->lnum, lnum);
return -EINVAL;
}
if (lprops->flags & LPROPS_TAKEN) {
if (cat != LPROPS_UNCAT) {
- dbg_err("LEB %d taken but not uncat %d",
- lprops->lnum, cat);
+ ubifs_err("LEB %d taken but not uncat %d",
+ lprops->lnum, cat);
return -EINVAL;
}
continue;
@@ -2130,8 +2129,8 @@ static int dbg_chk_pnode(struct ubifs_info *c, struct ubifs_pnode *pnode,
case LPROPS_FRDI_IDX:
break;
default:
- dbg_err("LEB %d index but cat %d",
- lprops->lnum, cat);
+ ubifs_err("LEB %d index but cat %d",
+ lprops->lnum, cat);
return -EINVAL;
}
} else {
@@ -2143,8 +2142,8 @@ static int dbg_chk_pnode(struct ubifs_info *c, struct ubifs_pnode *pnode,
case LPROPS_FREEABLE:
break;
default:
- dbg_err("LEB %d not index but cat %d",
- lprops->lnum, cat);
+ ubifs_err("LEB %d not index but cat %d",
+ lprops->lnum, cat);
return -EINVAL;
}
}
@@ -2184,24 +2183,24 @@ static int dbg_chk_pnode(struct ubifs_info *c, struct ubifs_pnode *pnode,
break;
}
if (!found) {
- dbg_err("LEB %d cat %d not found in cat heap/list",
- lprops->lnum, cat);
+ ubifs_err("LEB %d cat %d not found in cat heap/list",
+ lprops->lnum, cat);
return -EINVAL;
}
switch (cat) {
case LPROPS_EMPTY:
if (lprops->free != c->leb_size) {
- dbg_err("LEB %d cat %d free %d dirty %d",
- lprops->lnum, cat, lprops->free,
- lprops->dirty);
+ ubifs_err("LEB %d cat %d free %d dirty %d",
+ lprops->lnum, cat, lprops->free,
+ lprops->dirty);
return -EINVAL;
}
case LPROPS_FREEABLE:
case LPROPS_FRDI_IDX:
if (lprops->free + lprops->dirty != c->leb_size) {
- dbg_err("LEB %d cat %d free %d dirty %d",
- lprops->lnum, cat, lprops->free,
- lprops->dirty);
+ ubifs_err("LEB %d cat %d free %d dirty %d",
+ lprops->lnum, cat, lprops->free,
+ lprops->dirty);
return -EINVAL;
}
}
@@ -2235,9 +2234,10 @@ int dbg_check_lpt_nodes(struct ubifs_info *c, struct ubifs_cnode *cnode,
/* cnode is a nnode */
num = calc_nnode_num(row, col);
if (cnode->num != num) {
- dbg_err("nnode num %d expected %d "
- "parent num %d iip %d", cnode->num, num,
- (nnode ? nnode->num : 0), cnode->iip);
+ ubifs_err("nnode num %d expected %d "
+ "parent num %d iip %d",
+ cnode->num, num,
+ (nnode ? nnode->num : 0), cnode->iip);
return -EINVAL;
}
nn = (struct ubifs_nnode *)cnode;
@@ -2274,5 +2274,3 @@ int dbg_check_lpt_nodes(struct ubifs_info *c, struct ubifs_cnode *cnode,
}
return 0;
}
-
-#endif /* CONFIG_UBIFS_FS_DEBUG */
diff --git a/fs/ubifs/lpt_commit.c b/fs/ubifs/lpt_commit.c
index cddd6bd214f..4fa70734e6e 100644
--- a/fs/ubifs/lpt_commit.c
+++ b/fs/ubifs/lpt_commit.c
@@ -30,11 +30,7 @@
#include <linux/random.h>
#include "ubifs.h"
-#ifdef CONFIG_UBIFS_FS_DEBUG
static int dbg_populate_lsave(struct ubifs_info *c);
-#else
-#define dbg_populate_lsave(c) 0
-#endif
/**
* first_dirty_cnode - find first dirty cnode.
@@ -324,11 +320,10 @@ static int layout_cnodes(struct ubifs_info *c)
return 0;
no_space:
- ubifs_err("LPT out of space");
- dbg_err("LPT out of space at LEB %d:%d needing %d, done_ltab %d, "
- "done_lsave %d", lnum, offs, len, done_ltab, done_lsave);
- dbg_dump_lpt_info(c);
- dbg_dump_lpt_lebs(c);
+ ubifs_err("LPT out of space at LEB %d:%d needing %d, done_ltab %d, "
+ "done_lsave %d", lnum, offs, len, done_ltab, done_lsave);
+ ubifs_dump_lpt_info(c);
+ ubifs_dump_lpt_lebs(c);
dump_stack();
return err;
}
@@ -421,7 +416,7 @@ static int write_cnodes(struct ubifs_info *c)
alen = ALIGN(wlen, c->min_io_size);
memset(buf + offs, 0xff, alen - wlen);
err = ubifs_leb_write(c, lnum, buf + from, from,
- alen, UBI_SHORTTERM);
+ alen);
if (err)
return err;
}
@@ -479,8 +474,7 @@ static int write_cnodes(struct ubifs_info *c)
wlen = offs - from;
alen = ALIGN(wlen, c->min_io_size);
memset(buf + offs, 0xff, alen - wlen);
- err = ubifs_leb_write(c, lnum, buf + from, from, alen,
- UBI_SHORTTERM);
+ err = ubifs_leb_write(c, lnum, buf + from, from, alen);
if (err)
return err;
dbg_chk_lpt_sz(c, 2, c->leb_size - offs);
@@ -506,8 +500,7 @@ static int write_cnodes(struct ubifs_info *c)
wlen = offs - from;
alen = ALIGN(wlen, c->min_io_size);
memset(buf + offs, 0xff, alen - wlen);
- err = ubifs_leb_write(c, lnum, buf + from, from, alen,
- UBI_SHORTTERM);
+ err = ubifs_leb_write(c, lnum, buf + from, from, alen);
if (err)
return err;
dbg_chk_lpt_sz(c, 2, c->leb_size - offs);
@@ -531,7 +524,7 @@ static int write_cnodes(struct ubifs_info *c)
wlen = offs - from;
alen = ALIGN(wlen, c->min_io_size);
memset(buf + offs, 0xff, alen - wlen);
- err = ubifs_leb_write(c, lnum, buf + from, from, alen, UBI_SHORTTERM);
+ err = ubifs_leb_write(c, lnum, buf + from, from, alen);
if (err)
return err;
@@ -552,11 +545,10 @@ static int write_cnodes(struct ubifs_info *c)
return 0;
no_space:
- ubifs_err("LPT out of space mismatch");
- dbg_err("LPT out of space mismatch at LEB %d:%d needing %d, done_ltab "
- "%d, done_lsave %d", lnum, offs, len, done_ltab, done_lsave);
- dbg_dump_lpt_info(c);
- dbg_dump_lpt_lebs(c);
+ ubifs_err("LPT out of space mismatch at LEB %d:%d needing %d, done_ltab "
+ "%d, done_lsave %d", lnum, offs, len, done_ltab, done_lsave);
+ ubifs_dump_lpt_info(c);
+ ubifs_dump_lpt_lebs(c);
dump_stack();
return err;
}
@@ -1497,7 +1489,9 @@ void ubifs_lpt_free(struct ubifs_info *c, int wr_only)
kfree(c->lpt_nod_buf);
}
-#ifdef CONFIG_UBIFS_FS_DEBUG
+/*
+ * Everything below is related to debugging.
+ */
/**
* dbg_is_all_ff - determine if a buffer contains only 0xFF bytes.
@@ -1735,7 +1729,7 @@ int dbg_check_ltab(struct ubifs_info *c)
for (lnum = c->lpt_first; lnum <= c->lpt_last; lnum++) {
err = dbg_check_ltab_lnum(c, lnum);
if (err) {
- dbg_err("failed at LEB %d", lnum);
+ ubifs_err("failed at LEB %d", lnum);
return err;
}
}
@@ -1767,10 +1761,10 @@ int dbg_chk_lpt_free_spc(struct ubifs_info *c)
free += c->leb_size;
}
if (free < c->lpt_sz) {
- dbg_err("LPT space error: free %lld lpt_sz %lld",
- free, c->lpt_sz);
- dbg_dump_lpt_info(c);
- dbg_dump_lpt_lebs(c);
+ ubifs_err("LPT space error: free %lld lpt_sz %lld",
+ free, c->lpt_sz);
+ ubifs_dump_lpt_info(c);
+ ubifs_dump_lpt_lebs(c);
dump_stack();
return -EINVAL;
}
@@ -1807,13 +1801,13 @@ int dbg_chk_lpt_sz(struct ubifs_info *c, int action, int len)
d->chk_lpt_lebs = 0;
d->chk_lpt_wastage = 0;
if (c->dirty_pn_cnt > c->pnode_cnt) {
- dbg_err("dirty pnodes %d exceed max %d",
- c->dirty_pn_cnt, c->pnode_cnt);
+ ubifs_err("dirty pnodes %d exceed max %d",
+ c->dirty_pn_cnt, c->pnode_cnt);
err = -EINVAL;
}
if (c->dirty_nn_cnt > c->nnode_cnt) {
- dbg_err("dirty nnodes %d exceed max %d",
- c->dirty_nn_cnt, c->nnode_cnt);
+ ubifs_err("dirty nnodes %d exceed max %d",
+ c->dirty_nn_cnt, c->nnode_cnt);
err = -EINVAL;
}
return err;
@@ -1830,23 +1824,23 @@ int dbg_chk_lpt_sz(struct ubifs_info *c, int action, int len)
chk_lpt_sz *= d->chk_lpt_lebs;
chk_lpt_sz += len - c->nhead_offs;
if (d->chk_lpt_sz != chk_lpt_sz) {
- dbg_err("LPT wrote %lld but space used was %lld",
- d->chk_lpt_sz, chk_lpt_sz);
+ ubifs_err("LPT wrote %lld but space used was %lld",
+ d->chk_lpt_sz, chk_lpt_sz);
err = -EINVAL;
}
if (d->chk_lpt_sz > c->lpt_sz) {
- dbg_err("LPT wrote %lld but lpt_sz is %lld",
- d->chk_lpt_sz, c->lpt_sz);
+ ubifs_err("LPT wrote %lld but lpt_sz is %lld",
+ d->chk_lpt_sz, c->lpt_sz);
err = -EINVAL;
}
if (d->chk_lpt_sz2 && d->chk_lpt_sz != d->chk_lpt_sz2) {
- dbg_err("LPT layout size %lld but wrote %lld",
- d->chk_lpt_sz, d->chk_lpt_sz2);
+ ubifs_err("LPT layout size %lld but wrote %lld",
+ d->chk_lpt_sz, d->chk_lpt_sz2);
err = -EINVAL;
}
if (d->chk_lpt_sz2 && d->new_nhead_offs != len) {
- dbg_err("LPT new nhead offs: expected %d was %d",
- d->new_nhead_offs, len);
+ ubifs_err("LPT new nhead offs: expected %d was %d",
+ d->new_nhead_offs, len);
err = -EINVAL;
}
lpt_sz = (long long)c->pnode_cnt * c->pnode_sz;
@@ -1855,13 +1849,13 @@ int dbg_chk_lpt_sz(struct ubifs_info *c, int action, int len)
if (c->big_lpt)
lpt_sz += c->lsave_sz;
if (d->chk_lpt_sz - d->chk_lpt_wastage > lpt_sz) {
- dbg_err("LPT chk_lpt_sz %lld + waste %lld exceeds %lld",
- d->chk_lpt_sz, d->chk_lpt_wastage, lpt_sz);
+ ubifs_err("LPT chk_lpt_sz %lld + waste %lld exceeds %lld",
+ d->chk_lpt_sz, d->chk_lpt_wastage, lpt_sz);
err = -EINVAL;
}
if (err) {
- dbg_dump_lpt_info(c);
- dbg_dump_lpt_lebs(c);
+ ubifs_dump_lpt_info(c);
+ ubifs_dump_lpt_lebs(c);
dump_stack();
}
d->chk_lpt_sz2 = d->chk_lpt_sz;
@@ -1880,7 +1874,7 @@ int dbg_chk_lpt_sz(struct ubifs_info *c, int action, int len)
}
/**
- * dbg_dump_lpt_leb - dump an LPT LEB.
+ * ubifs_dump_lpt_leb - dump an LPT LEB.
* @c: UBIFS file-system description object
* @lnum: LEB number to dump
*
@@ -1986,13 +1980,13 @@ out:
}
/**
- * dbg_dump_lpt_lebs - dump LPT lebs.
+ * ubifs_dump_lpt_lebs - dump LPT lebs.
* @c: UBIFS file-system description object
*
* This function dumps all LPT LEBs. The caller has to make sure the LPT is
* locked.
*/
-void dbg_dump_lpt_lebs(const struct ubifs_info *c)
+void ubifs_dump_lpt_lebs(const struct ubifs_info *c)
{
int i;
@@ -2046,5 +2040,3 @@ static int dbg_populate_lsave(struct ubifs_info *c)
return 1;
}
-
-#endif /* CONFIG_UBIFS_FS_DEBUG */
diff --git a/fs/ubifs/master.c b/fs/ubifs/master.c
index 278c2382e8c..ab83ace9910 100644
--- a/fs/ubifs/master.c
+++ b/fs/ubifs/master.c
@@ -241,7 +241,7 @@ static int validate_master(const struct ubifs_info *c)
out:
ubifs_err("bad master node at offset %d error %d", c->mst_offs, err);
- dbg_dump_node(c, c->mst_node);
+ ubifs_dump_node(c, c->mst_node);
return -EINVAL;
}
@@ -317,7 +317,7 @@ int ubifs_read_master(struct ubifs_info *c)
if (c->leb_cnt < old_leb_cnt ||
c->leb_cnt < UBIFS_MIN_LEB_CNT) {
ubifs_err("bad leb_cnt on master node");
- dbg_dump_node(c, c->mst_node);
+ ubifs_dump_node(c, c->mst_node);
return -EINVAL;
}
@@ -379,7 +379,7 @@ int ubifs_write_master(struct ubifs_info *c)
c->mst_offs = offs;
c->mst_node->highest_inum = cpu_to_le64(c->highest_inum);
- err = ubifs_write_node(c, c->mst_node, len, lnum, offs, UBI_SHORTTERM);
+ err = ubifs_write_node(c, c->mst_node, len, lnum, offs);
if (err)
return err;
@@ -390,7 +390,7 @@ int ubifs_write_master(struct ubifs_info *c)
if (err)
return err;
}
- err = ubifs_write_node(c, c->mst_node, len, lnum, offs, UBI_SHORTTERM);
+ err = ubifs_write_node(c, c->mst_node, len, lnum, offs);
return err;
}
diff --git a/fs/ubifs/orphan.c b/fs/ubifs/orphan.c
index c542c73cfa3..b02734db187 100644
--- a/fs/ubifs/orphan.c
+++ b/fs/ubifs/orphan.c
@@ -52,11 +52,7 @@
* than the maximum number of orphans allowed.
*/
-#ifdef CONFIG_UBIFS_FS_DEBUG
static int dbg_check_orphans(struct ubifs_info *c);
-#else
-#define dbg_check_orphans(c) 0
-#endif
/**
* ubifs_add_orphan - add an orphan.
@@ -92,7 +88,7 @@ int ubifs_add_orphan(struct ubifs_info *c, ino_t inum)
else if (inum > o->inum)
p = &(*p)->rb_right;
else {
- dbg_err("orphaned twice");
+ ubifs_err("orphaned twice");
spin_unlock(&c->orphan_lock);
kfree(orphan);
return 0;
@@ -158,8 +154,8 @@ void ubifs_delete_orphan(struct ubifs_info *c, ino_t inum)
}
}
spin_unlock(&c->orphan_lock);
- dbg_err("missing orphan ino %lu", (unsigned long)inum);
- dbg_dump_stack();
+ ubifs_err("missing orphan ino %lu", (unsigned long)inum);
+ dump_stack();
}
/**
@@ -248,8 +244,7 @@ static int do_write_orph_node(struct ubifs_info *c, int len, int atomic)
ubifs_assert(c->ohead_offs == 0);
ubifs_prepare_node(c, c->orph_buf, len, 1);
len = ALIGN(len, c->min_io_size);
- err = ubifs_leb_change(c, c->ohead_lnum, c->orph_buf, len,
- UBI_SHORTTERM);
+ err = ubifs_leb_change(c, c->ohead_lnum, c->orph_buf, len);
} else {
if (c->ohead_offs == 0) {
/* Ensure LEB has been unmapped */
@@ -258,7 +253,7 @@ static int do_write_orph_node(struct ubifs_info *c, int len, int atomic)
return err;
}
err = ubifs_write_node(c, c->orph_buf, len, c->ohead_lnum,
- c->ohead_offs, UBI_SHORTTERM);
+ c->ohead_offs);
}
return err;
}
@@ -569,7 +564,7 @@ static int do_kill_orphans(struct ubifs_info *c, struct ubifs_scan_leb *sleb,
if (snod->type != UBIFS_ORPH_NODE) {
ubifs_err("invalid node type %d in orphan area at "
"%d:%d", snod->type, sleb->lnum, snod->offs);
- dbg_dump_node(c, snod->node);
+ ubifs_dump_node(c, snod->node);
return -EINVAL;
}
@@ -597,7 +592,7 @@ static int do_kill_orphans(struct ubifs_info *c, struct ubifs_scan_leb *sleb,
ubifs_err("out of order commit number %llu in "
"orphan node at %d:%d",
cmt_no, sleb->lnum, snod->offs);
- dbg_dump_node(c, snod->node);
+ ubifs_dump_node(c, snod->node);
return -EINVAL;
}
dbg_rcvry("out of date LEB %d", sleb->lnum);
@@ -725,7 +720,9 @@ int ubifs_mount_orphans(struct ubifs_info *c, int unclean, int read_only)
return err;
}
-#ifdef CONFIG_UBIFS_FS_DEBUG
+/*
+ * Everything below is related to debugging.
+ */
struct check_orphan {
struct rb_node rb;
@@ -968,5 +965,3 @@ out:
kfree(ci.node);
return err;
}
-
-#endif /* CONFIG_UBIFS_FS_DEBUG */
diff --git a/fs/ubifs/recovery.c b/fs/ubifs/recovery.c
index 2a935b31723..c30d976b4be 100644
--- a/fs/ubifs/recovery.c
+++ b/fs/ubifs/recovery.c
@@ -213,10 +213,10 @@ static int write_rcvrd_mst_node(struct ubifs_info *c,
mst->flags |= cpu_to_le32(UBIFS_MST_RCVRY);
ubifs_prepare_node(c, mst, UBIFS_MST_NODE_SZ, 1);
- err = ubifs_leb_change(c, lnum, mst, sz, UBI_SHORTTERM);
+ err = ubifs_leb_change(c, lnum, mst, sz);
if (err)
goto out;
- err = ubifs_leb_change(c, lnum + 1, mst, sz, UBI_SHORTTERM);
+ err = ubifs_leb_change(c, lnum + 1, mst, sz);
if (err)
goto out;
out:
@@ -362,12 +362,12 @@ out_err:
out_free:
ubifs_err("failed to recover master node");
if (mst1) {
- dbg_err("dumping first master node");
- dbg_dump_node(c, mst1);
+ ubifs_err("dumping first master node");
+ ubifs_dump_node(c, mst1);
}
if (mst2) {
- dbg_err("dumping second master node");
- dbg_dump_node(c, mst2);
+ ubifs_err("dumping second master node");
+ ubifs_dump_node(c, mst2);
}
vfree(buf2);
vfree(buf1);
@@ -555,8 +555,7 @@ static int fix_unclean_leb(struct ubifs_info *c, struct ubifs_scan_leb *sleb,
ubifs_pad(c, buf, pad_len);
}
}
- err = ubifs_leb_change(c, lnum, sleb->buf, len,
- UBI_UNKNOWN);
+ err = ubifs_leb_change(c, lnum, sleb->buf, len);
if (err)
return err;
}
@@ -683,7 +682,7 @@ struct ubifs_scan_leb *ubifs_recover_leb(struct ubifs_info *c, int lnum,
ret, lnum, offs);
break;
} else {
- dbg_err("unexpected return value %d", ret);
+ ubifs_err("unexpected return value %d", ret);
err = -EINVAL;
goto error;
}
@@ -789,7 +788,7 @@ struct ubifs_scan_leb *ubifs_recover_leb(struct ubifs_info *c, int lnum,
corrupted_rescan:
/* Re-scan the corrupted data with verbose messages */
- dbg_err("corruptio %d", ret);
+ ubifs_err("corruptio %d", ret);
ubifs_scan_a_node(c, buf, len, lnum, offs, 1);
corrupted:
ubifs_scanned_corruption(c, lnum, offs, buf);
@@ -827,17 +826,17 @@ static int get_cs_sqnum(struct ubifs_info *c, int lnum, int offs,
goto out_free;
ret = ubifs_scan_a_node(c, cs_node, UBIFS_CS_NODE_SZ, lnum, offs, 0);
if (ret != SCANNED_A_NODE) {
- dbg_err("Not a valid node");
+ ubifs_err("Not a valid node");
goto out_err;
}
if (cs_node->ch.node_type != UBIFS_CS_NODE) {
- dbg_err("Node a CS node, type is %d", cs_node->ch.node_type);
+ ubifs_err("Node a CS node, type is %d", cs_node->ch.node_type);
goto out_err;
}
if (le64_to_cpu(cs_node->cmt_no) != c->cmt_no) {
- dbg_err("CS node cmt_no %llu != current cmt_no %llu",
- (unsigned long long)le64_to_cpu(cs_node->cmt_no),
- c->cmt_no);
+ ubifs_err("CS node cmt_no %llu != current cmt_no %llu",
+ (unsigned long long)le64_to_cpu(cs_node->cmt_no),
+ c->cmt_no);
goto out_err;
}
*cs_sqnum = le64_to_cpu(cs_node->ch.sqnum);
@@ -941,7 +940,7 @@ static int recover_head(struct ubifs_info *c, int lnum, int offs, void *sbuf)
err = ubifs_leb_read(c, lnum, sbuf, 0, offs, 1);
if (err)
return err;
- return ubifs_leb_change(c, lnum, sbuf, offs, UBI_UNKNOWN);
+ return ubifs_leb_change(c, lnum, sbuf, offs);
}
return 0;
@@ -1071,7 +1070,7 @@ static int clean_an_unclean_leb(struct ubifs_info *c,
}
/* Write back the LEB atomically */
- err = ubifs_leb_change(c, lnum, sbuf, len, UBI_UNKNOWN);
+ err = ubifs_leb_change(c, lnum, sbuf, len);
if (err)
return err;
@@ -1138,9 +1137,9 @@ static int grab_empty_leb(struct ubifs_info *c)
*/
lnum = ubifs_find_free_leb_for_idx(c);
if (lnum < 0) {
- dbg_err("could not find an empty LEB");
- dbg_dump_lprops(c);
- dbg_dump_budg(c, &c->bi);
+ ubifs_err("could not find an empty LEB");
+ ubifs_dump_lprops(c);
+ ubifs_dump_budg(c, &c->bi);
return lnum;
}
@@ -1218,7 +1217,7 @@ int ubifs_rcvry_gc_commit(struct ubifs_info *c)
}
mutex_unlock(&wbuf->io_mutex);
if (err < 0) {
- dbg_err("GC failed, error %d", err);
+ ubifs_err("GC failed, error %d", err);
if (err == -EAGAIN)
err = -EINVAL;
return err;
@@ -1472,7 +1471,7 @@ static int fix_size_in_place(struct ubifs_info *c, struct size_entry *e)
len -= 1;
len = ALIGN(len + 1, c->min_io_size);
/* Atomically write the fixed LEB back again */
- err = ubifs_leb_change(c, lnum, c->sbuf, len, UBI_UNKNOWN);
+ err = ubifs_leb_change(c, lnum, c->sbuf, len);
if (err)
goto out;
dbg_rcvry("inode %lu at %d:%d size %lld -> %lld",
diff --git a/fs/ubifs/replay.c b/fs/ubifs/replay.c
index b007637f040..3a2da7e476e 100644
--- a/fs/ubifs/replay.c
+++ b/fs/ubifs/replay.c
@@ -154,8 +154,7 @@ static int set_bud_lprops(struct ubifs_info *c, struct bud_entry *b)
/* Make sure the journal head points to the latest bud */
err = ubifs_wbuf_seek_nolock(&c->jheads[b->bud->jhead].wbuf,
- b->bud->lnum, c->leb_size - b->free,
- UBI_SHORTTERM);
+ b->bud->lnum, c->leb_size - b->free);
out:
ubifs_release_lprops(c);
@@ -686,7 +685,7 @@ out:
out_dump:
ubifs_err("bad node is at LEB %d:%d", lnum, snod->offs);
- dbg_dump_node(c, snod->node);
+ ubifs_dump_node(c, snod->node);
ubifs_scan_destroy(sleb);
return -EINVAL;
}
@@ -861,16 +860,16 @@ static int replay_log_leb(struct ubifs_info *c, int lnum, int offs, void *sbuf)
* numbers.
*/
if (snod->type != UBIFS_CS_NODE) {
- dbg_err("first log node at LEB %d:%d is not CS node",
- lnum, offs);
+ ubifs_err("first log node at LEB %d:%d is not CS node",
+ lnum, offs);
goto out_dump;
}
if (le64_to_cpu(node->cmt_no) != c->cmt_no) {
- dbg_err("first CS node at LEB %d:%d has wrong "
- "commit number %llu expected %llu",
- lnum, offs,
- (unsigned long long)le64_to_cpu(node->cmt_no),
- c->cmt_no);
+ ubifs_err("first CS node at LEB %d:%d has wrong "
+ "commit number %llu expected %llu",
+ lnum, offs,
+ (unsigned long long)le64_to_cpu(node->cmt_no),
+ c->cmt_no);
goto out_dump;
}
@@ -892,7 +891,7 @@ static int replay_log_leb(struct ubifs_info *c, int lnum, int offs, void *sbuf)
/* Make sure the first node sits at offset zero of the LEB */
if (snod->offs != 0) {
- dbg_err("first node is not at zero offset");
+ ubifs_err("first node is not at zero offset");
goto out_dump;
}
@@ -905,8 +904,8 @@ static int replay_log_leb(struct ubifs_info *c, int lnum, int offs, void *sbuf)
}
if (snod->sqnum < c->cs_sqnum) {
- dbg_err("bad sqnum %llu, commit sqnum %llu",
- snod->sqnum, c->cs_sqnum);
+ ubifs_err("bad sqnum %llu, commit sqnum %llu",
+ snod->sqnum, c->cs_sqnum);
goto out_dump;
}
@@ -958,7 +957,7 @@ out:
out_dump:
ubifs_err("log error detected while replaying the log at LEB %d:%d",
lnum, offs + snod->offs);
- dbg_dump_node(c, snod->node);
+ ubifs_dump_node(c, snod->node);
ubifs_scan_destroy(sleb);
return -EINVAL;
}
diff --git a/fs/ubifs/sb.c b/fs/ubifs/sb.c
index 771f7fb6ce9..ef3d1ba6d99 100644
--- a/fs/ubifs/sb.c
+++ b/fs/ubifs/sb.c
@@ -130,7 +130,6 @@ static int create_default_filesystem(struct ubifs_info *c)
* orphan node.
*/
orph_lebs = UBIFS_MIN_ORPH_LEBS;
-#ifdef CONFIG_UBIFS_FS_DEBUG
if (c->leb_cnt - min_leb_cnt > 1)
/*
* For debugging purposes it is better to have at least 2
@@ -138,7 +137,6 @@ static int create_default_filesystem(struct ubifs_info *c)
* consolidations and would be stressed more.
*/
orph_lebs += 1;
-#endif
main_lebs = c->leb_cnt - UBIFS_SB_LEBS - UBIFS_MST_LEBS - log_lebs;
main_lebs -= orph_lebs;
@@ -196,7 +194,7 @@ static int create_default_filesystem(struct ubifs_info *c)
sup->rp_size = cpu_to_le64(tmp64);
sup->ro_compat_version = cpu_to_le32(UBIFS_RO_COMPAT_VERSION);
- err = ubifs_write_node(c, sup, UBIFS_SB_NODE_SZ, 0, 0, UBI_LONGTERM);
+ err = ubifs_write_node(c, sup, UBIFS_SB_NODE_SZ, 0, 0);
kfree(sup);
if (err)
return err;
@@ -252,14 +250,13 @@ static int create_default_filesystem(struct ubifs_info *c)
mst->total_used = cpu_to_le64(UBIFS_INO_NODE_SZ);
- err = ubifs_write_node(c, mst, UBIFS_MST_NODE_SZ, UBIFS_MST_LNUM, 0,
- UBI_UNKNOWN);
+ err = ubifs_write_node(c, mst, UBIFS_MST_NODE_SZ, UBIFS_MST_LNUM, 0);
if (err) {
kfree(mst);
return err;
}
- err = ubifs_write_node(c, mst, UBIFS_MST_NODE_SZ, UBIFS_MST_LNUM + 1, 0,
- UBI_UNKNOWN);
+ err = ubifs_write_node(c, mst, UBIFS_MST_NODE_SZ, UBIFS_MST_LNUM + 1,
+ 0);
kfree(mst);
if (err)
return err;
@@ -282,8 +279,7 @@ static int create_default_filesystem(struct ubifs_info *c)
key_write_idx(c, &key, &br->key);
br->lnum = cpu_to_le32(main_first + DEFAULT_DATA_LEB);
br->len = cpu_to_le32(UBIFS_INO_NODE_SZ);
- err = ubifs_write_node(c, idx, tmp, main_first + DEFAULT_IDX_LEB, 0,
- UBI_UNKNOWN);
+ err = ubifs_write_node(c, idx, tmp, main_first + DEFAULT_IDX_LEB, 0);
kfree(idx);
if (err)
return err;
@@ -315,8 +311,7 @@ static int create_default_filesystem(struct ubifs_info *c)
ino->flags = cpu_to_le32(UBIFS_COMPR_FL);
err = ubifs_write_node(c, ino, UBIFS_INO_NODE_SZ,
- main_first + DEFAULT_DATA_LEB, 0,
- UBI_UNKNOWN);
+ main_first + DEFAULT_DATA_LEB, 0);
kfree(ino);
if (err)
return err;
@@ -335,8 +330,7 @@ static int create_default_filesystem(struct ubifs_info *c)
return -ENOMEM;
cs->ch.node_type = UBIFS_CS_NODE;
- err = ubifs_write_node(c, cs, UBIFS_CS_NODE_SZ, UBIFS_LOG_LNUM,
- 0, UBI_UNKNOWN);
+ err = ubifs_write_node(c, cs, UBIFS_CS_NODE_SZ, UBIFS_LOG_LNUM, 0);
kfree(cs);
ubifs_msg("default file-system created");
@@ -475,7 +469,7 @@ static int validate_sb(struct ubifs_info *c, struct ubifs_sb_node *sup)
failed:
ubifs_err("bad superblock, error %d", err);
- dbg_dump_node(c, sup);
+ ubifs_dump_node(c, sup);
return -EINVAL;
}
@@ -518,7 +512,7 @@ int ubifs_write_sb_node(struct ubifs_info *c, struct ubifs_sb_node *sup)
int len = ALIGN(UBIFS_SB_NODE_SZ, c->min_io_size);
ubifs_prepare_node(c, sup, UBIFS_SB_NODE_SZ, 1);
- return ubifs_leb_change(c, UBIFS_SB_LNUM, sup, len, UBI_LONGTERM);
+ return ubifs_leb_change(c, UBIFS_SB_LNUM, sup, len);
}
/**
@@ -691,7 +685,7 @@ static int fixup_leb(struct ubifs_info *c, int lnum, int len)
if (err)
return err;
- return ubifs_leb_change(c, lnum, c->sbuf, len, UBI_UNKNOWN);
+ return ubifs_leb_change(c, lnum, c->sbuf, len);
}
/**
diff --git a/fs/ubifs/scan.c b/fs/ubifs/scan.c
index 37383e8011b..7c40e6025fd 100644
--- a/fs/ubifs/scan.c
+++ b/fs/ubifs/scan.c
@@ -101,7 +101,7 @@ int ubifs_scan_a_node(const struct ubifs_info *c, void *buf, int len, int lnum,
if (!quiet) {
ubifs_err("bad pad node at LEB %d:%d",
lnum, offs);
- dbg_dump_node(c, pad);
+ ubifs_dump_node(c, pad);
}
return SCANNED_A_BAD_PAD_NODE;
}
@@ -109,8 +109,8 @@ int ubifs_scan_a_node(const struct ubifs_info *c, void *buf, int len, int lnum,
/* Make the node pads to 8-byte boundary */
if ((node_len + pad_len) & 7) {
if (!quiet)
- dbg_err("bad padding length %d - %d",
- offs, offs + node_len + pad_len);
+ ubifs_err("bad padding length %d - %d",
+ offs, offs + node_len + pad_len);
return SCANNED_A_BAD_PAD_NODE;
}
@@ -245,7 +245,7 @@ void ubifs_scanned_corruption(const struct ubifs_info *c, int lnum, int offs,
len = c->leb_size - offs;
if (len > 8192)
len = 8192;
- dbg_err("first %d bytes from LEB %d:%d", len, lnum, offs);
+ ubifs_err("first %d bytes from LEB %d:%d", len, lnum, offs);
print_hex_dump(KERN_DEBUG, "", DUMP_PREFIX_OFFSET, 32, 4, buf, len, 1);
}
@@ -300,16 +300,16 @@ struct ubifs_scan_leb *ubifs_scan(const struct ubifs_info *c, int lnum,
switch (ret) {
case SCANNED_GARBAGE:
- dbg_err("garbage");
+ ubifs_err("garbage");
goto corrupted;
case SCANNED_A_NODE:
break;
case SCANNED_A_CORRUPT_NODE:
case SCANNED_A_BAD_PAD_NODE:
- dbg_err("bad node");
+ ubifs_err("bad node");
goto corrupted;
default:
- dbg_err("unknown");
+ ubifs_err("unknown");
err = -EINVAL;
goto error;
}
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index 76e4e0566ad..001acccac0d 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -246,8 +246,8 @@ struct inode *ubifs_iget(struct super_block *sb, unsigned long inum)
out_invalid:
ubifs_err("inode %lu validation failed, error %d", inode->i_ino, err);
- dbg_dump_node(c, ino);
- dbg_dump_inode(c, inode);
+ ubifs_dump_node(c, ino);
+ ubifs_dump_inode(c, inode);
err = -EINVAL;
out_ino:
kfree(ino);
@@ -668,8 +668,8 @@ static int init_constants_sb(struct ubifs_info *c)
tmp = UBIFS_CS_NODE_SZ + UBIFS_REF_NODE_SZ * c->jhead_cnt;
tmp = ALIGN(tmp, c->min_io_size);
if (tmp > c->leb_size) {
- dbg_err("too small LEB size %d, at least %d needed",
- c->leb_size, tmp);
+ ubifs_err("too small LEB size %d, at least %d needed",
+ c->leb_size, tmp);
return -EINVAL;
}
@@ -683,8 +683,8 @@ static int init_constants_sb(struct ubifs_info *c)
tmp /= c->leb_size;
tmp += 1;
if (c->log_lebs < tmp) {
- dbg_err("too small log %d LEBs, required min. %d LEBs",
- c->log_lebs, tmp);
+ ubifs_err("too small log %d LEBs, required min. %d LEBs",
+ c->log_lebs, tmp);
return -EINVAL;
}
@@ -813,13 +813,10 @@ static int alloc_wbufs(struct ubifs_info *c)
c->jheads[i].grouped = 1;
}
- c->jheads[BASEHD].wbuf.dtype = UBI_SHORTTERM;
/*
- * Garbage Collector head likely contains long-term data and
- * does not need to be synchronized by timer. Also GC head nodes are
- * not grouped.
+ * Garbage Collector head does not need to be synchronized by timer.
+ * Also GC head nodes are not grouped.
*/
- c->jheads[GCHD].wbuf.dtype = UBI_LONGTERM;
c->jheads[GCHD].wbuf.no_timer = 1;
c->jheads[GCHD].grouped = 0;
@@ -863,7 +860,7 @@ static void free_orphans(struct ubifs_info *c)
orph = list_entry(c->orph_list.next, struct ubifs_orphan, list);
list_del(&orph->list);
kfree(orph);
- dbg_err("orphan list not empty at unmount");
+ ubifs_err("orphan list not empty at unmount");
}
vfree(c->orph_buf);
@@ -1147,8 +1144,8 @@ static int check_free_space(struct ubifs_info *c)
ubifs_assert(c->dark_wm > 0);
if (c->lst.total_free + c->lst.total_dirty < c->dark_wm) {
ubifs_err("insufficient free space to mount in R/W mode");
- dbg_dump_budg(c, &c->bi);
- dbg_dump_lprops(c);
+ ubifs_dump_budg(c, &c->bi);
+ ubifs_dump_lprops(c);
return -ENOSPC;
}
return 0;
@@ -1301,7 +1298,7 @@ static int mount_ubifs(struct ubifs_info *c)
if (!c->ro_mount && c->space_fixup) {
err = ubifs_fixup_free_space(c);
if (err)
- goto out_master;
+ goto out_lpt;
}
if (!c->ro_mount) {
@@ -2126,8 +2123,8 @@ static struct dentry *ubifs_mount(struct file_system_type *fs_type, int flags,
*/
ubi = open_ubi(name, UBI_READONLY);
if (IS_ERR(ubi)) {
- dbg_err("cannot open \"%s\", error %d",
- name, (int)PTR_ERR(ubi));
+ ubifs_err("cannot open \"%s\", error %d",
+ name, (int)PTR_ERR(ubi));
return ERR_CAST(ubi);
}
diff --git a/fs/ubifs/tnc.c b/fs/ubifs/tnc.c
index 16ad84d8402..349f31a30f4 100644
--- a/fs/ubifs/tnc.c
+++ b/fs/ubifs/tnc.c
@@ -339,8 +339,8 @@ static int lnc_add(struct ubifs_info *c, struct ubifs_zbranch *zbr,
err = ubifs_validate_entry(c, dent);
if (err) {
- dbg_dump_stack();
- dbg_dump_node(c, dent);
+ dump_stack();
+ ubifs_dump_node(c, dent);
return err;
}
@@ -372,8 +372,8 @@ static int lnc_add_directly(struct ubifs_info *c, struct ubifs_zbranch *zbr,
err = ubifs_validate_entry(c, node);
if (err) {
- dbg_dump_stack();
- dbg_dump_node(c, node);
+ dump_stack();
+ ubifs_dump_node(c, node);
return err;
}
@@ -1733,8 +1733,8 @@ out_err:
err = -EINVAL;
out:
ubifs_err("bad node at LEB %d:%d", zbr->lnum, zbr->offs);
- dbg_dump_node(c, buf);
- dbg_dump_stack();
+ ubifs_dump_node(c, buf);
+ dump_stack();
return err;
}
@@ -1775,7 +1775,7 @@ int ubifs_tnc_bulk_read(struct ubifs_info *c, struct bu_info *bu)
if (err && err != -EBADMSG) {
ubifs_err("failed to read from LEB %d:%d, error %d",
lnum, offs, err);
- dbg_dump_stack();
+ dump_stack();
dbg_tnck(&bu->key, "key ");
return err;
}
@@ -2361,7 +2361,7 @@ int ubifs_tnc_add_nm(struct ubifs_info *c, const union ubifs_key *key,
* by passing 'ubifs_tnc_remove_nm()' the same key but
* an unmatchable name.
*/
- struct qstr noname = { .len = 0, .name = "" };
+ struct qstr noname = { .name = "" };
err = dbg_check_tnc(c, 0);
mutex_unlock(&c->tnc_mutex);
@@ -2403,7 +2403,7 @@ static int tnc_delete(struct ubifs_info *c, struct ubifs_znode *znode, int n)
err = ubifs_add_dirt(c, zbr->lnum, zbr->len);
if (err) {
- dbg_dump_znode(c, znode);
+ ubifs_dump_znode(c, znode);
return err;
}
@@ -2649,7 +2649,7 @@ int ubifs_tnc_remove_range(struct ubifs_info *c, union ubifs_key *from_key,
err = ubifs_add_dirt(c, znode->zbranch[i].lnum,
znode->zbranch[i].len);
if (err) {
- dbg_dump_znode(c, znode);
+ ubifs_dump_znode(c, znode);
goto out_unlock;
}
dbg_tnck(key, "removing key ");
@@ -3275,8 +3275,6 @@ out_unlock:
return err;
}
-#ifdef CONFIG_UBIFS_FS_DEBUG
-
/**
* dbg_check_inode_size - check if inode size is correct.
* @c: UBIFS file-system description object
@@ -3335,13 +3333,11 @@ out_dump:
(unsigned long)inode->i_ino, size,
((loff_t)block) << UBIFS_BLOCK_SHIFT);
mutex_unlock(&c->tnc_mutex);
- dbg_dump_inode(c, inode);
- dbg_dump_stack();
+ ubifs_dump_inode(c, inode);
+ dump_stack();
return -EINVAL;
out_unlock:
mutex_unlock(&c->tnc_mutex);
return err;
}
-
-#endif /* CONFIG_UBIFS_FS_DEBUG */
diff --git a/fs/ubifs/tnc_commit.c b/fs/ubifs/tnc_commit.c
index 4c15f07a8bb..523bbad69c0 100644
--- a/fs/ubifs/tnc_commit.c
+++ b/fs/ubifs/tnc_commit.c
@@ -54,18 +54,16 @@ static int make_idx_node(struct ubifs_info *c, struct ubifs_idx_node *idx,
br->len = cpu_to_le32(zbr->len);
if (!zbr->lnum || !zbr->len) {
ubifs_err("bad ref in znode");
- dbg_dump_znode(c, znode);
+ ubifs_dump_znode(c, znode);
if (zbr->znode)
- dbg_dump_znode(c, zbr->znode);
+ ubifs_dump_znode(c, zbr->znode);
}
}
ubifs_prepare_node(c, idx, len, 0);
-#ifdef CONFIG_UBIFS_FS_DEBUG
znode->lnum = lnum;
znode->offs = offs;
znode->len = len;
-#endif
err = insert_old_idx_znode(c, znode);
@@ -322,8 +320,7 @@ static int layout_leb_in_gaps(struct ubifs_info *c, int *p)
0, 0, 0);
if (err)
return err;
- err = ubifs_leb_change(c, lnum, c->ileb_buf, c->ileb_len,
- UBI_SHORTTERM);
+ err = ubifs_leb_change(c, lnum, c->ileb_buf, c->ileb_len);
if (err)
return err;
dbg_gc("LEB %d wrote %d index nodes", lnum, tot_written);
@@ -388,8 +385,8 @@ static int layout_in_gaps(struct ubifs_info *c, int cnt)
* option which forces in-the-gaps is enabled.
*/
ubifs_warn("out of space");
- dbg_dump_budg(c, &c->bi);
- dbg_dump_lprops(c);
+ ubifs_dump_budg(c, &c->bi);
+ ubifs_dump_lprops(c);
}
/* Try to commit anyway */
err = 0;
@@ -456,11 +453,9 @@ static int layout_in_empty_space(struct ubifs_info *c)
offs = buf_offs + used;
-#ifdef CONFIG_UBIFS_FS_DEBUG
znode->lnum = lnum;
znode->offs = offs;
znode->len = len;
-#endif
/* Update the parent */
zp = znode->parent;
@@ -536,10 +531,8 @@ static int layout_in_empty_space(struct ubifs_info *c)
break;
}
-#ifdef CONFIG_UBIFS_FS_DEBUG
c->dbg->new_ihead_lnum = lnum;
c->dbg->new_ihead_offs = buf_offs;
-#endif
return 0;
}
@@ -864,9 +857,9 @@ static int write_index(struct ubifs_info *c)
br->len = cpu_to_le32(zbr->len);
if (!zbr->lnum || !zbr->len) {
ubifs_err("bad ref in znode");
- dbg_dump_znode(c, znode);
+ ubifs_dump_znode(c, znode);
if (zbr->znode)
- dbg_dump_znode(c, zbr->znode);
+ ubifs_dump_znode(c, zbr->znode);
}
}
len = ubifs_idx_node_sz(c, znode->child_cnt);
@@ -881,13 +874,11 @@ static int write_index(struct ubifs_info *c)
}
offs = buf_offs + used;
-#ifdef CONFIG_UBIFS_FS_DEBUG
if (lnum != znode->lnum || offs != znode->offs ||
len != znode->len) {
ubifs_err("inconsistent znode posn");
return -EINVAL;
}
-#endif
/* Grab some stuff from znode while we still can */
cnext = znode->cnext;
@@ -959,8 +950,7 @@ static int write_index(struct ubifs_info *c)
}
/* The buffer is full or there are no more znodes to do */
- err = ubifs_leb_write(c, lnum, c->cbuf, buf_offs, blen,
- UBI_SHORTTERM);
+ err = ubifs_leb_write(c, lnum, c->cbuf, buf_offs, blen);
if (err)
return err;
buf_offs += blen;
@@ -982,13 +972,11 @@ static int write_index(struct ubifs_info *c)
break;
}
-#ifdef CONFIG_UBIFS_FS_DEBUG
if (lnum != c->dbg->new_ihead_lnum ||
buf_offs != c->dbg->new_ihead_offs) {
ubifs_err("inconsistent ihead");
return -EINVAL;
}
-#endif
c->ihead_lnum = lnum;
c->ihead_offs = buf_offs;
diff --git a/fs/ubifs/tnc_misc.c b/fs/ubifs/tnc_misc.c
index dc28fe6ec07..d38ac7f9654 100644
--- a/fs/ubifs/tnc_misc.c
+++ b/fs/ubifs/tnc_misc.c
@@ -293,10 +293,10 @@ static int read_znode(struct ubifs_info *c, int lnum, int offs, int len,
lnum, offs, znode->level, znode->child_cnt);
if (znode->child_cnt > c->fanout || znode->level > UBIFS_MAX_LEVELS) {
- dbg_err("current fanout %d, branch count %d",
- c->fanout, znode->child_cnt);
- dbg_err("max levels %d, znode level %d",
- UBIFS_MAX_LEVELS, znode->level);
+ ubifs_err("current fanout %d, branch count %d",
+ c->fanout, znode->child_cnt);
+ ubifs_err("max levels %d, znode level %d",
+ UBIFS_MAX_LEVELS, znode->level);
err = 1;
goto out_dump;
}
@@ -316,7 +316,7 @@ static int read_znode(struct ubifs_info *c, int lnum, int offs, int len,
if (zbr->lnum < c->main_first ||
zbr->lnum >= c->leb_cnt || zbr->offs < 0 ||
zbr->offs + zbr->len > c->leb_size || zbr->offs & 7) {
- dbg_err("bad branch %d", i);
+ ubifs_err("bad branch %d", i);
err = 2;
goto out_dump;
}
@@ -340,19 +340,19 @@ static int read_znode(struct ubifs_info *c, int lnum, int offs, int len,
type = key_type(c, &zbr->key);
if (c->ranges[type].max_len == 0) {
if (zbr->len != c->ranges[type].len) {
- dbg_err("bad target node (type %d) length (%d)",
- type, zbr->len);
- dbg_err("have to be %d", c->ranges[type].len);
+ ubifs_err("bad target node (type %d) length (%d)",
+ type, zbr->len);
+ ubifs_err("have to be %d", c->ranges[type].len);
err = 4;
goto out_dump;
}
} else if (zbr->len < c->ranges[type].min_len ||
zbr->len > c->ranges[type].max_len) {
- dbg_err("bad target node (type %d) length (%d)",
- type, zbr->len);
- dbg_err("have to be in range of %d-%d",
- c->ranges[type].min_len,
- c->ranges[type].max_len);
+ ubifs_err("bad target node (type %d) length (%d)",
+ type, zbr->len);
+ ubifs_err("have to be in range of %d-%d",
+ c->ranges[type].min_len,
+ c->ranges[type].max_len);
err = 5;
goto out_dump;
}
@@ -370,13 +370,13 @@ static int read_znode(struct ubifs_info *c, int lnum, int offs, int len,
cmp = keys_cmp(c, key1, key2);
if (cmp > 0) {
- dbg_err("bad key order (keys %d and %d)", i, i + 1);
+ ubifs_err("bad key order (keys %d and %d)", i, i + 1);
err = 6;
goto out_dump;
} else if (cmp == 0 && !is_hash_key(c, key1)) {
/* These can only be keys with colliding hash */
- dbg_err("keys %d and %d are not hashed but equivalent",
- i, i + 1);
+ ubifs_err("keys %d and %d are not hashed but equivalent",
+ i, i + 1);
err = 7;
goto out_dump;
}
@@ -387,7 +387,7 @@ static int read_znode(struct ubifs_info *c, int lnum, int offs, int len,
out_dump:
ubifs_err("bad indexing node at LEB %d:%d, error %d", lnum, offs, err);
- dbg_dump_node(c, idx);
+ ubifs_dump_node(c, idx);
kfree(idx);
return -EINVAL;
}
@@ -486,7 +486,7 @@ int ubifs_tnc_read_node(struct ubifs_info *c, struct ubifs_zbranch *zbr,
zbr->lnum, zbr->offs);
dbg_tnck(key, "looked for key ");
dbg_tnck(&key1, "but found node's key ");
- dbg_dump_node(c, node);
+ ubifs_dump_node(c, node);
return -EINVAL;
}
diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h
index 93d59aceaae..1e5a08623d1 100644
--- a/fs/ubifs/ubifs.h
+++ b/fs/ubifs/ubifs.h
@@ -650,8 +650,6 @@ typedef int (*ubifs_lpt_scan_callback)(struct ubifs_info *c,
* @avail: number of bytes available in the write-buffer
* @used: number of used bytes in the write-buffer
* @size: write-buffer size (in [@c->min_io_size, @c->max_write_size] range)
- * @dtype: type of data stored in this LEB (%UBI_LONGTERM, %UBI_SHORTTERM,
- * %UBI_UNKNOWN)
* @jhead: journal head the mutex belongs to (note, needed only to shut lockdep
* up by 'mutex_lock_nested()).
* @sync_callback: write-buffer synchronization callback
@@ -685,7 +683,6 @@ struct ubifs_wbuf {
int avail;
int used;
int size;
- int dtype;
int jhead;
int (*sync_callback)(struct ubifs_info *c, int lnum, int free, int pad);
struct mutex io_mutex;
@@ -762,6 +759,9 @@ struct ubifs_zbranch {
* @offs: offset of the corresponding indexing node
* @len: length of the corresponding indexing node
* @zbranch: array of znode branches (@c->fanout elements)
+ *
+ * Note! The @lnum, @offs, and @len fields are not really needed - we have them
+ * only for internal consistency check. They could be removed to save some RAM.
*/
struct ubifs_znode {
struct ubifs_znode *parent;
@@ -772,9 +772,9 @@ struct ubifs_znode {
int child_cnt;
int iip;
int alt;
-#ifdef CONFIG_UBIFS_FS_DEBUG
- int lnum, offs, len;
-#endif
+ int lnum;
+ int offs;
+ int len;
struct ubifs_zbranch zbranch[];
};
@@ -1444,9 +1444,7 @@ struct ubifs_info {
struct rb_root size_tree;
struct ubifs_mount_opts mount_opts;
-#ifdef CONFIG_UBIFS_FS_DEBUG
struct ubifs_debug_info *dbg;
-#endif
};
extern struct list_head ubifs_infos;
@@ -1468,22 +1466,20 @@ void ubifs_ro_mode(struct ubifs_info *c, int err);
int ubifs_leb_read(const struct ubifs_info *c, int lnum, void *buf, int offs,
int len, int even_ebadmsg);
int ubifs_leb_write(struct ubifs_info *c, int lnum, const void *buf, int offs,
- int len, int dtype);
-int ubifs_leb_change(struct ubifs_info *c, int lnum, const void *buf, int len,
- int dtype);
+ int len);
+int ubifs_leb_change(struct ubifs_info *c, int lnum, const void *buf, int len);
int ubifs_leb_unmap(struct ubifs_info *c, int lnum);
-int ubifs_leb_map(struct ubifs_info *c, int lnum, int dtype);
+int ubifs_leb_map(struct ubifs_info *c, int lnum);
int ubifs_is_mapped(const struct ubifs_info *c, int lnum);
int ubifs_wbuf_write_nolock(struct ubifs_wbuf *wbuf, void *buf, int len);
-int ubifs_wbuf_seek_nolock(struct ubifs_wbuf *wbuf, int lnum, int offs,
- int dtype);
+int ubifs_wbuf_seek_nolock(struct ubifs_wbuf *wbuf, int lnum, int offs);
int ubifs_wbuf_init(struct ubifs_info *c, struct ubifs_wbuf *wbuf);
int ubifs_read_node(const struct ubifs_info *c, void *buf, int type, int len,
int lnum, int offs);
int ubifs_read_node_wbuf(struct ubifs_wbuf *wbuf, void *buf, int type, int len,
int lnum, int offs);
int ubifs_write_node(struct ubifs_info *c, void *node, int len, int lnum,
- int offs, int dtype);
+ int offs);
int ubifs_check_node(const struct ubifs_info *c, const void *buf, int lnum,
int offs, int quiet, int must_chk_crc);
void ubifs_prepare_node(struct ubifs_info *c, void *buf, int len, int pad);
diff --git a/fs/ubifs/xattr.c b/fs/ubifs/xattr.c
index 85b27226875..0f7139bdb2c 100644
--- a/fs/ubifs/xattr.c
+++ b/fs/ubifs/xattr.c
@@ -298,7 +298,7 @@ int ubifs_setxattr(struct dentry *dentry, const char *name,
{
struct inode *inode, *host = dentry->d_inode;
struct ubifs_info *c = host->i_sb->s_fs_info;
- struct qstr nm = { .name = name, .len = strlen(name) };
+ struct qstr nm = QSTR_INIT(name, strlen(name));
struct ubifs_dent_node *xent;
union ubifs_key key;
int err, type;
@@ -361,7 +361,7 @@ ssize_t ubifs_getxattr(struct dentry *dentry, const char *name, void *buf,
{
struct inode *inode, *host = dentry->d_inode;
struct ubifs_info *c = host->i_sb->s_fs_info;
- struct qstr nm = { .name = name, .len = strlen(name) };
+ struct qstr nm = QSTR_INIT(name, strlen(name));
struct ubifs_inode *ui;
struct ubifs_dent_node *xent;
union ubifs_key key;
@@ -399,8 +399,8 @@ ssize_t ubifs_getxattr(struct dentry *dentry, const char *name, void *buf,
if (buf) {
/* If @buf is %NULL we are supposed to return the length */
if (ui->data_len > size) {
- dbg_err("buffer size %zd, xattr len %d",
- size, ui->data_len);
+ ubifs_err("buffer size %zd, xattr len %d",
+ size, ui->data_len);
err = -ERANGE;
goto out_iput;
}
@@ -524,7 +524,7 @@ int ubifs_removexattr(struct dentry *dentry, const char *name)
{
struct inode *inode, *host = dentry->d_inode;
struct ubifs_info *c = host->i_sb->s_fs_info;
- struct qstr nm = { .name = name, .len = strlen(name) };
+ struct qstr nm = QSTR_INIT(name, strlen(name));
struct ubifs_dent_node *xent;
union ubifs_key key;
int err;
diff --git a/fs/udf/namei.c b/fs/udf/namei.c
index 38de8f234b9..a165c66e3ee 100644
--- a/fs/udf/namei.c
+++ b/fs/udf/namei.c
@@ -1193,7 +1193,7 @@ static struct dentry *udf_get_parent(struct dentry *child)
{
struct kernel_lb_addr tloc;
struct inode *inode = NULL;
- struct qstr dotdot = {.name = "..", .len = 2};
+ struct qstr dotdot = QSTR_INIT("..", 2);
struct fileIdentDesc cfi;
struct udf_fileident_bh fibh;
diff --git a/fs/ufs/super.c b/fs/ufs/super.c
index ac8e279eccc..302f340d007 100644
--- a/fs/ufs/super.c
+++ b/fs/ufs/super.c
@@ -146,10 +146,7 @@ static struct dentry *ufs_fh_to_parent(struct super_block *sb, struct fid *fid,
static struct dentry *ufs_get_parent(struct dentry *child)
{
- struct qstr dot_dot = {
- .name = "..",
- .len = 2,
- };
+ struct qstr dot_dot = QSTR_INIT("..", 2);
ino_t ino;
ino = ufs_inode_by_name(child->d_inode, &dot_dot);
diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
index 0a9977983f9..d2bf974b1a2 100644
--- a/fs/xfs/Makefile
+++ b/fs/xfs/Makefile
@@ -33,6 +33,7 @@ xfs-y += xfs_aops.o \
xfs_discard.o \
xfs_error.o \
xfs_export.o \
+ xfs_extent_busy.o \
xfs_file.o \
xfs_filestream.o \
xfs_fsops.o \
@@ -49,7 +50,6 @@ xfs-y += xfs_aops.o \
xfs_sync.o \
xfs_xattr.o \
xfs_rename.o \
- xfs_rw.o \
xfs_utils.o \
xfs_vnodeops.o \
kmem.o \
diff --git a/fs/xfs/xfs_ag.h b/fs/xfs/xfs_ag.h
index 4805f009f92..44d65c1533c 100644
--- a/fs/xfs/xfs_ag.h
+++ b/fs/xfs/xfs_ag.h
@@ -175,24 +175,6 @@ typedef struct xfs_agfl {
} xfs_agfl_t;
/*
- * Busy block/extent entry. Indexed by a rbtree in perag to mark blocks that
- * have been freed but whose transactions aren't committed to disk yet.
- *
- * Note that we use the transaction ID to record the transaction, not the
- * transaction structure itself. See xfs_alloc_busy_insert() for details.
- */
-struct xfs_busy_extent {
- struct rb_node rb_node; /* ag by-bno indexed search tree */
- struct list_head list; /* transaction busy extent list */
- xfs_agnumber_t agno;
- xfs_agblock_t bno;
- xfs_extlen_t length;
- unsigned int flags;
-#define XFS_ALLOC_BUSY_DISCARDED 0x01 /* undergoing a discard op. */
-#define XFS_ALLOC_BUSY_SKIP_DISCARD 0x02 /* do not discard */
-};
-
-/*
* Per-ag incore structure, copies of information in agf and agi,
* to improve the performance of allocation group selection.
*/
diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c
index 0f0df2759b0..229641fb8e6 100644
--- a/fs/xfs/xfs_alloc.c
+++ b/fs/xfs/xfs_alloc.c
@@ -20,7 +20,6 @@
#include "xfs_types.h"
#include "xfs_bit.h"
#include "xfs_log.h"
-#include "xfs_inum.h"
#include "xfs_trans.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
@@ -32,6 +31,7 @@
#include "xfs_inode.h"
#include "xfs_btree.h"
#include "xfs_alloc.h"
+#include "xfs_extent_busy.h"
#include "xfs_error.h"
#include "xfs_trace.h"
@@ -47,8 +47,6 @@ STATIC int xfs_alloc_ag_vextent_near(xfs_alloc_arg_t *);
STATIC int xfs_alloc_ag_vextent_size(xfs_alloc_arg_t *);
STATIC int xfs_alloc_ag_vextent_small(xfs_alloc_arg_t *,
xfs_btree_cur_t *, xfs_agblock_t *, xfs_extlen_t *, int *);
-STATIC void xfs_alloc_busy_trim(struct xfs_alloc_arg *,
- xfs_agblock_t, xfs_extlen_t, xfs_agblock_t *, xfs_extlen_t *);
/*
* Lookup the record equal to [bno, len] in the btree given by cur.
@@ -152,7 +150,7 @@ xfs_alloc_compute_aligned(
xfs_extlen_t len;
/* Trim busy sections out of found extent */
- xfs_alloc_busy_trim(args, foundbno, foundlen, &bno, &len);
+ xfs_extent_busy_trim(args, foundbno, foundlen, &bno, &len);
if (args->alignment > 1 && len >= args->minlen) {
xfs_agblock_t aligned_bno = roundup(bno, args->alignment);
@@ -536,7 +534,7 @@ xfs_alloc_ag_vextent(
if (error)
return error;
- ASSERT(!xfs_alloc_busy_search(args->mp, args->agno,
+ ASSERT(!xfs_extent_busy_search(args->mp, args->agno,
args->agbno, args->len));
}
@@ -603,7 +601,7 @@ xfs_alloc_ag_vextent_exact(
/*
* Check for overlapping busy extents.
*/
- xfs_alloc_busy_trim(args, fbno, flen, &tbno, &tlen);
+ xfs_extent_busy_trim(args, fbno, flen, &tbno, &tlen);
/*
* Give up if the start of the extent is busy, or the freespace isn't
@@ -1391,7 +1389,7 @@ xfs_alloc_ag_vextent_small(
if (error)
goto error0;
if (fbno != NULLAGBLOCK) {
- xfs_alloc_busy_reuse(args->mp, args->agno, fbno, 1,
+ xfs_extent_busy_reuse(args->mp, args->agno, fbno, 1,
args->userdata);
if (args->userdata) {
@@ -2496,579 +2494,8 @@ xfs_free_extent(
error = xfs_free_ag_extent(tp, args.agbp, args.agno, args.agbno, len, 0);
if (!error)
- xfs_alloc_busy_insert(tp, args.agno, args.agbno, len, 0);
+ xfs_extent_busy_insert(tp, args.agno, args.agbno, len, 0);
error0:
xfs_perag_put(args.pag);
return error;
}
-
-void
-xfs_alloc_busy_insert(
- struct xfs_trans *tp,
- xfs_agnumber_t agno,
- xfs_agblock_t bno,
- xfs_extlen_t len,
- unsigned int flags)
-{
- struct xfs_busy_extent *new;
- struct xfs_busy_extent *busyp;
- struct xfs_perag *pag;
- struct rb_node **rbp;
- struct rb_node *parent = NULL;
-
- new = kmem_zalloc(sizeof(struct xfs_busy_extent), KM_MAYFAIL);
- if (!new) {
- /*
- * No Memory! Since it is now not possible to track the free
- * block, make this a synchronous transaction to insure that
- * the block is not reused before this transaction commits.
- */
- trace_xfs_alloc_busy_enomem(tp->t_mountp, agno, bno, len);
- xfs_trans_set_sync(tp);
- return;
- }
-
- new->agno = agno;
- new->bno = bno;
- new->length = len;
- INIT_LIST_HEAD(&new->list);
- new->flags = flags;
-
- /* trace before insert to be able to see failed inserts */
- trace_xfs_alloc_busy(tp->t_mountp, agno, bno, len);
-
- pag = xfs_perag_get(tp->t_mountp, new->agno);
- spin_lock(&pag->pagb_lock);
- rbp = &pag->pagb_tree.rb_node;
- while (*rbp) {
- parent = *rbp;
- busyp = rb_entry(parent, struct xfs_busy_extent, rb_node);
-
- if (new->bno < busyp->bno) {
- rbp = &(*rbp)->rb_left;
- ASSERT(new->bno + new->length <= busyp->bno);
- } else if (new->bno > busyp->bno) {
- rbp = &(*rbp)->rb_right;
- ASSERT(bno >= busyp->bno + busyp->length);
- } else {
- ASSERT(0);
- }
- }
-
- rb_link_node(&new->rb_node, parent, rbp);
- rb_insert_color(&new->rb_node, &pag->pagb_tree);
-
- list_add(&new->list, &tp->t_busy);
- spin_unlock(&pag->pagb_lock);
- xfs_perag_put(pag);
-}
-
-/*
- * Search for a busy extent within the range of the extent we are about to
- * allocate. You need to be holding the busy extent tree lock when calling
- * xfs_alloc_busy_search(). This function returns 0 for no overlapping busy
- * extent, -1 for an overlapping but not exact busy extent, and 1 for an exact
- * match. This is done so that a non-zero return indicates an overlap that
- * will require a synchronous transaction, but it can still be
- * used to distinguish between a partial or exact match.
- */
-int
-xfs_alloc_busy_search(
- struct xfs_mount *mp,
- xfs_agnumber_t agno,
- xfs_agblock_t bno,
- xfs_extlen_t len)
-{
- struct xfs_perag *pag;
- struct rb_node *rbp;
- struct xfs_busy_extent *busyp;
- int match = 0;
-
- pag = xfs_perag_get(mp, agno);
- spin_lock(&pag->pagb_lock);
-
- rbp = pag->pagb_tree.rb_node;
-
- /* find closest start bno overlap */
- while (rbp) {
- busyp = rb_entry(rbp, struct xfs_busy_extent, rb_node);
- if (bno < busyp->bno) {
- /* may overlap, but exact start block is lower */
- if (bno + len > busyp->bno)
- match = -1;
- rbp = rbp->rb_left;
- } else if (bno > busyp->bno) {
- /* may overlap, but exact start block is higher */
- if (bno < busyp->bno + busyp->length)
- match = -1;
- rbp = rbp->rb_right;
- } else {
- /* bno matches busyp, length determines exact match */
- match = (busyp->length == len) ? 1 : -1;
- break;
- }
- }
- spin_unlock(&pag->pagb_lock);
- xfs_perag_put(pag);
- return match;
-}
-
-/*
- * The found free extent [fbno, fend] overlaps part or all of the given busy
- * extent. If the overlap covers the beginning, the end, or all of the busy
- * extent, the overlapping portion can be made unbusy and used for the
- * allocation. We can't split a busy extent because we can't modify a
- * transaction/CIL context busy list, but we can update an entries block
- * number or length.
- *
- * Returns true if the extent can safely be reused, or false if the search
- * needs to be restarted.
- */
-STATIC bool
-xfs_alloc_busy_update_extent(
- struct xfs_mount *mp,
- struct xfs_perag *pag,
- struct xfs_busy_extent *busyp,
- xfs_agblock_t fbno,
- xfs_extlen_t flen,
- bool userdata)
-{
- xfs_agblock_t fend = fbno + flen;
- xfs_agblock_t bbno = busyp->bno;
- xfs_agblock_t bend = bbno + busyp->length;
-
- /*
- * This extent is currently being discarded. Give the thread
- * performing the discard a chance to mark the extent unbusy
- * and retry.
- */
- if (busyp->flags & XFS_ALLOC_BUSY_DISCARDED) {
- spin_unlock(&pag->pagb_lock);
- delay(1);
- spin_lock(&pag->pagb_lock);
- return false;
- }
-
- /*
- * If there is a busy extent overlapping a user allocation, we have
- * no choice but to force the log and retry the search.
- *
- * Fortunately this does not happen during normal operation, but
- * only if the filesystem is very low on space and has to dip into
- * the AGFL for normal allocations.
- */
- if (userdata)
- goto out_force_log;
-
- if (bbno < fbno && bend > fend) {
- /*
- * Case 1:
- * bbno bend
- * +BBBBBBBBBBBBBBBBB+
- * +---------+
- * fbno fend
- */
-
- /*
- * We would have to split the busy extent to be able to track
- * it correct, which we cannot do because we would have to
- * modify the list of busy extents attached to the transaction
- * or CIL context, which is immutable.
- *
- * Force out the log to clear the busy extent and retry the
- * search.
- */
- goto out_force_log;
- } else if (bbno >= fbno && bend <= fend) {
- /*
- * Case 2:
- * bbno bend
- * +BBBBBBBBBBBBBBBBB+
- * +-----------------+
- * fbno fend
- *
- * Case 3:
- * bbno bend
- * +BBBBBBBBBBBBBBBBB+
- * +--------------------------+
- * fbno fend
- *
- * Case 4:
- * bbno bend
- * +BBBBBBBBBBBBBBBBB+
- * +--------------------------+
- * fbno fend
- *
- * Case 5:
- * bbno bend
- * +BBBBBBBBBBBBBBBBB+
- * +-----------------------------------+
- * fbno fend
- *
- */
-
- /*
- * The busy extent is fully covered by the extent we are
- * allocating, and can simply be removed from the rbtree.
- * However we cannot remove it from the immutable list
- * tracking busy extents in the transaction or CIL context,
- * so set the length to zero to mark it invalid.
- *
- * We also need to restart the busy extent search from the
- * tree root, because erasing the node can rearrange the
- * tree topology.
- */
- rb_erase(&busyp->rb_node, &pag->pagb_tree);
- busyp->length = 0;
- return false;
- } else if (fend < bend) {
- /*
- * Case 6:
- * bbno bend
- * +BBBBBBBBBBBBBBBBB+
- * +---------+
- * fbno fend
- *
- * Case 7:
- * bbno bend
- * +BBBBBBBBBBBBBBBBB+
- * +------------------+
- * fbno fend
- *
- */
- busyp->bno = fend;
- } else if (bbno < fbno) {
- /*
- * Case 8:
- * bbno bend
- * +BBBBBBBBBBBBBBBBB+
- * +-------------+
- * fbno fend
- *
- * Case 9:
- * bbno bend
- * +BBBBBBBBBBBBBBBBB+
- * +----------------------+
- * fbno fend
- */
- busyp->length = fbno - busyp->bno;
- } else {
- ASSERT(0);
- }
-
- trace_xfs_alloc_busy_reuse(mp, pag->pag_agno, fbno, flen);
- return true;
-
-out_force_log:
- spin_unlock(&pag->pagb_lock);
- xfs_log_force(mp, XFS_LOG_SYNC);
- trace_xfs_alloc_busy_force(mp, pag->pag_agno, fbno, flen);
- spin_lock(&pag->pagb_lock);
- return false;
-}
-
-
-/*
- * For a given extent [fbno, flen], make sure we can reuse it safely.
- */
-void
-xfs_alloc_busy_reuse(
- struct xfs_mount *mp,
- xfs_agnumber_t agno,
- xfs_agblock_t fbno,
- xfs_extlen_t flen,
- bool userdata)
-{
- struct xfs_perag *pag;
- struct rb_node *rbp;
-
- ASSERT(flen > 0);
-
- pag = xfs_perag_get(mp, agno);
- spin_lock(&pag->pagb_lock);
-restart:
- rbp = pag->pagb_tree.rb_node;
- while (rbp) {
- struct xfs_busy_extent *busyp =
- rb_entry(rbp, struct xfs_busy_extent, rb_node);
- xfs_agblock_t bbno = busyp->bno;
- xfs_agblock_t bend = bbno + busyp->length;
-
- if (fbno + flen <= bbno) {
- rbp = rbp->rb_left;
- continue;
- } else if (fbno >= bend) {
- rbp = rbp->rb_right;
- continue;
- }
-
- if (!xfs_alloc_busy_update_extent(mp, pag, busyp, fbno, flen,
- userdata))
- goto restart;
- }
- spin_unlock(&pag->pagb_lock);
- xfs_perag_put(pag);
-}
-
-/*
- * For a given extent [fbno, flen], search the busy extent list to find a
- * subset of the extent that is not busy. If *rlen is smaller than
- * args->minlen no suitable extent could be found, and the higher level
- * code needs to force out the log and retry the allocation.
- */
-STATIC void
-xfs_alloc_busy_trim(
- struct xfs_alloc_arg *args,
- xfs_agblock_t bno,
- xfs_extlen_t len,
- xfs_agblock_t *rbno,
- xfs_extlen_t *rlen)
-{
- xfs_agblock_t fbno;
- xfs_extlen_t flen;
- struct rb_node *rbp;
-
- ASSERT(len > 0);
-
- spin_lock(&args->pag->pagb_lock);
-restart:
- fbno = bno;
- flen = len;
- rbp = args->pag->pagb_tree.rb_node;
- while (rbp && flen >= args->minlen) {
- struct xfs_busy_extent *busyp =
- rb_entry(rbp, struct xfs_busy_extent, rb_node);
- xfs_agblock_t fend = fbno + flen;
- xfs_agblock_t bbno = busyp->bno;
- xfs_agblock_t bend = bbno + busyp->length;
-
- if (fend <= bbno) {
- rbp = rbp->rb_left;
- continue;
- } else if (fbno >= bend) {
- rbp = rbp->rb_right;
- continue;
- }
-
- /*
- * If this is a metadata allocation, try to reuse the busy
- * extent instead of trimming the allocation.
- */
- if (!args->userdata &&
- !(busyp->flags & XFS_ALLOC_BUSY_DISCARDED)) {
- if (!xfs_alloc_busy_update_extent(args->mp, args->pag,
- busyp, fbno, flen,
- false))
- goto restart;
- continue;
- }
-
- if (bbno <= fbno) {
- /* start overlap */
-
- /*
- * Case 1:
- * bbno bend
- * +BBBBBBBBBBBBBBBBB+
- * +---------+
- * fbno fend
- *
- * Case 2:
- * bbno bend
- * +BBBBBBBBBBBBBBBBB+
- * +-------------+
- * fbno fend
- *
- * Case 3:
- * bbno bend
- * +BBBBBBBBBBBBBBBBB+
- * +-------------+
- * fbno fend
- *
- * Case 4:
- * bbno bend
- * +BBBBBBBBBBBBBBBBB+
- * +-----------------+
- * fbno fend
- *
- * No unbusy region in extent, return failure.
- */
- if (fend <= bend)
- goto fail;
-
- /*
- * Case 5:
- * bbno bend
- * +BBBBBBBBBBBBBBBBB+
- * +----------------------+
- * fbno fend
- *
- * Case 6:
- * bbno bend
- * +BBBBBBBBBBBBBBBBB+
- * +--------------------------+
- * fbno fend
- *
- * Needs to be trimmed to:
- * +-------+
- * fbno fend
- */
- fbno = bend;
- } else if (bend >= fend) {
- /* end overlap */
-
- /*
- * Case 7:
- * bbno bend
- * +BBBBBBBBBBBBBBBBB+
- * +------------------+
- * fbno fend
- *
- * Case 8:
- * bbno bend
- * +BBBBBBBBBBBBBBBBB+
- * +--------------------------+
- * fbno fend
- *
- * Needs to be trimmed to:
- * +-------+
- * fbno fend
- */
- fend = bbno;
- } else {
- /* middle overlap */
-
- /*
- * Case 9:
- * bbno bend
- * +BBBBBBBBBBBBBBBBB+
- * +-----------------------------------+
- * fbno fend
- *
- * Can be trimmed to:
- * +-------+ OR +-------+
- * fbno fend fbno fend
- *
- * Backward allocation leads to significant
- * fragmentation of directories, which degrades
- * directory performance, therefore we always want to
- * choose the option that produces forward allocation
- * patterns.
- * Preferring the lower bno extent will make the next
- * request use "fend" as the start of the next
- * allocation; if the segment is no longer busy at
- * that point, we'll get a contiguous allocation, but
- * even if it is still busy, we will get a forward
- * allocation.
- * We try to avoid choosing the segment at "bend",
- * because that can lead to the next allocation
- * taking the segment at "fbno", which would be a
- * backward allocation. We only use the segment at
- * "fbno" if it is much larger than the current
- * requested size, because in that case there's a
- * good chance subsequent allocations will be
- * contiguous.
- */
- if (bbno - fbno >= args->maxlen) {
- /* left candidate fits perfect */
- fend = bbno;
- } else if (fend - bend >= args->maxlen * 4) {
- /* right candidate has enough free space */
- fbno = bend;
- } else if (bbno - fbno >= args->minlen) {
- /* left candidate fits minimum requirement */
- fend = bbno;
- } else {
- goto fail;
- }
- }
-
- flen = fend - fbno;
- }
- spin_unlock(&args->pag->pagb_lock);
-
- if (fbno != bno || flen != len) {
- trace_xfs_alloc_busy_trim(args->mp, args->agno, bno, len,
- fbno, flen);
- }
- *rbno = fbno;
- *rlen = flen;
- return;
-fail:
- /*
- * Return a zero extent length as failure indications. All callers
- * re-check if the trimmed extent satisfies the minlen requirement.
- */
- spin_unlock(&args->pag->pagb_lock);
- trace_xfs_alloc_busy_trim(args->mp, args->agno, bno, len, fbno, 0);
- *rbno = fbno;
- *rlen = 0;
-}
-
-static void
-xfs_alloc_busy_clear_one(
- struct xfs_mount *mp,
- struct xfs_perag *pag,
- struct xfs_busy_extent *busyp)
-{
- if (busyp->length) {
- trace_xfs_alloc_busy_clear(mp, busyp->agno, busyp->bno,
- busyp->length);
- rb_erase(&busyp->rb_node, &pag->pagb_tree);
- }
-
- list_del_init(&busyp->list);
- kmem_free(busyp);
-}
-
-/*
- * Remove all extents on the passed in list from the busy extents tree.
- * If do_discard is set skip extents that need to be discarded, and mark
- * these as undergoing a discard operation instead.
- */
-void
-xfs_alloc_busy_clear(
- struct xfs_mount *mp,
- struct list_head *list,
- bool do_discard)
-{
- struct xfs_busy_extent *busyp, *n;
- struct xfs_perag *pag = NULL;
- xfs_agnumber_t agno = NULLAGNUMBER;
-
- list_for_each_entry_safe(busyp, n, list, list) {
- if (busyp->agno != agno) {
- if (pag) {
- spin_unlock(&pag->pagb_lock);
- xfs_perag_put(pag);
- }
- pag = xfs_perag_get(mp, busyp->agno);
- spin_lock(&pag->pagb_lock);
- agno = busyp->agno;
- }
-
- if (do_discard && busyp->length &&
- !(busyp->flags & XFS_ALLOC_BUSY_SKIP_DISCARD))
- busyp->flags = XFS_ALLOC_BUSY_DISCARDED;
- else
- xfs_alloc_busy_clear_one(mp, pag, busyp);
- }
-
- if (pag) {
- spin_unlock(&pag->pagb_lock);
- xfs_perag_put(pag);
- }
-}
-
-/*
- * Callback for list_sort to sort busy extents by the AG they reside in.
- */
-int
-xfs_busy_extent_ag_cmp(
- void *priv,
- struct list_head *a,
- struct list_head *b)
-{
- return container_of(a, struct xfs_busy_extent, list)->agno -
- container_of(b, struct xfs_busy_extent, list)->agno;
-}
diff --git a/fs/xfs/xfs_alloc.h b/fs/xfs/xfs_alloc.h
index 3a7e7d8f8de..93be4a667ca 100644
--- a/fs/xfs/xfs_alloc.h
+++ b/fs/xfs/xfs_alloc.h
@@ -23,7 +23,6 @@ struct xfs_btree_cur;
struct xfs_mount;
struct xfs_perag;
struct xfs_trans;
-struct xfs_busy_extent;
extern struct workqueue_struct *xfs_alloc_wq;
@@ -139,33 +138,6 @@ xfs_extlen_t
xfs_alloc_longest_free_extent(struct xfs_mount *mp,
struct xfs_perag *pag);
-#ifdef __KERNEL__
-void
-xfs_alloc_busy_insert(struct xfs_trans *tp, xfs_agnumber_t agno,
- xfs_agblock_t bno, xfs_extlen_t len, unsigned int flags);
-
-void
-xfs_alloc_busy_clear(struct xfs_mount *mp, struct list_head *list,
- bool do_discard);
-
-int
-xfs_alloc_busy_search(struct xfs_mount *mp, xfs_agnumber_t agno,
- xfs_agblock_t bno, xfs_extlen_t len);
-
-void
-xfs_alloc_busy_reuse(struct xfs_mount *mp, xfs_agnumber_t agno,
- xfs_agblock_t fbno, xfs_extlen_t flen, bool userdata);
-
-int
-xfs_busy_extent_ag_cmp(void *priv, struct list_head *a, struct list_head *b);
-
-static inline void xfs_alloc_busy_sort(struct list_head *list)
-{
- list_sort(NULL, list, xfs_busy_extent_ag_cmp);
-}
-
-#endif /* __KERNEL__ */
-
/*
* Compute and fill in value of m_ag_maxlevels.
*/
diff --git a/fs/xfs/xfs_alloc_btree.c b/fs/xfs/xfs_alloc_btree.c
index ffb3386e45c..f1647caace8 100644
--- a/fs/xfs/xfs_alloc_btree.c
+++ b/fs/xfs/xfs_alloc_btree.c
@@ -18,9 +18,7 @@
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_types.h"
-#include "xfs_bit.h"
#include "xfs_log.h"
-#include "xfs_inum.h"
#include "xfs_trans.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
@@ -32,6 +30,7 @@
#include "xfs_inode.h"
#include "xfs_btree.h"
#include "xfs_alloc.h"
+#include "xfs_extent_busy.h"
#include "xfs_error.h"
#include "xfs_trace.h"
@@ -94,7 +93,7 @@ xfs_allocbt_alloc_block(
return 0;
}
- xfs_alloc_busy_reuse(cur->bc_mp, cur->bc_private.a.agno, bno, 1, false);
+ xfs_extent_busy_reuse(cur->bc_mp, cur->bc_private.a.agno, bno, 1, false);
xfs_trans_agbtree_delta(cur->bc_tp, 1);
new->s = cpu_to_be32(bno);
@@ -119,8 +118,8 @@ xfs_allocbt_free_block(
if (error)
return error;
- xfs_alloc_busy_insert(cur->bc_tp, be32_to_cpu(agf->agf_seqno), bno, 1,
- XFS_ALLOC_BUSY_SKIP_DISCARD);
+ xfs_extent_busy_insert(cur->bc_tp, be32_to_cpu(agf->agf_seqno), bno, 1,
+ XFS_EXTENT_BUSY_SKIP_DISCARD);
xfs_trans_agbtree_delta(cur->bc_tp, -1);
return 0;
}
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 0dbb9e70fe2..ae31c313a79 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -16,9 +16,7 @@
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "xfs.h"
-#include "xfs_bit.h"
#include "xfs_log.h"
-#include "xfs_inum.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
#include "xfs_trans.h"
@@ -29,7 +27,6 @@
#include "xfs_inode_item.h"
#include "xfs_alloc.h"
#include "xfs_error.h"
-#include "xfs_rw.h"
#include "xfs_iomap.h"
#include "xfs_vnodeops.h"
#include "xfs_trace.h"
@@ -623,7 +620,7 @@ xfs_map_at_offset(
* or delayed allocate extent.
*/
STATIC int
-xfs_is_delayed_page(
+xfs_check_page_type(
struct page *page,
unsigned int type)
{
@@ -637,11 +634,11 @@ xfs_is_delayed_page(
bh = head = page_buffers(page);
do {
if (buffer_unwritten(bh))
- acceptable = (type == IO_UNWRITTEN);
+ acceptable += (type == IO_UNWRITTEN);
else if (buffer_delay(bh))
- acceptable = (type == IO_DELALLOC);
+ acceptable += (type == IO_DELALLOC);
else if (buffer_dirty(bh) && buffer_mapped(bh))
- acceptable = (type == IO_OVERWRITE);
+ acceptable += (type == IO_OVERWRITE);
else
break;
} while ((bh = bh->b_this_page) != head);
@@ -684,7 +681,7 @@ xfs_convert_page(
goto fail_unlock_page;
if (page->mapping != inode->i_mapping)
goto fail_unlock_page;
- if (!xfs_is_delayed_page(page, (*ioendp)->io_type))
+ if (!xfs_check_page_type(page, (*ioendp)->io_type))
goto fail_unlock_page;
/*
@@ -834,7 +831,7 @@ xfs_aops_discard_page(
struct buffer_head *bh, *head;
loff_t offset = page_offset(page);
- if (!xfs_is_delayed_page(page, IO_DELALLOC))
+ if (!xfs_check_page_type(page, IO_DELALLOC))
goto out_invalidate;
if (XFS_FORCED_SHUTDOWN(ip->i_mount))
@@ -1146,7 +1143,14 @@ __xfs_get_blocks(
if (!create && direct && offset >= i_size_read(inode))
return 0;
- if (create) {
+ /*
+ * Direct I/O is usually done on preallocated files, so try getting
+ * a block mapping without an exclusive lock first. For buffered
+ * writes we already have the exclusive iolock anyway, so avoiding
+ * a lock roundtrip here by taking the ilock exclusive from the
+ * beginning is a useful micro optimization.
+ */
+ if (create && !direct) {
lockmode = XFS_ILOCK_EXCL;
xfs_ilock(ip, lockmode);
} else {
@@ -1168,23 +1172,45 @@ __xfs_get_blocks(
(!nimaps ||
(imap.br_startblock == HOLESTARTBLOCK ||
imap.br_startblock == DELAYSTARTBLOCK))) {
- if (direct) {
+ if (direct || xfs_get_extsz_hint(ip)) {
+ /*
+ * Drop the ilock in preparation for starting the block
+ * allocation transaction. It will be retaken
+ * exclusively inside xfs_iomap_write_direct for the
+ * actual allocation.
+ */
+ xfs_iunlock(ip, lockmode);
error = xfs_iomap_write_direct(ip, offset, size,
&imap, nimaps);
+ if (error)
+ return -error;
+ new = 1;
} else {
+ /*
+ * Delalloc reservations do not require a transaction,
+ * we can go on without dropping the lock here. If we
+ * are allocating a new delalloc block, make sure that
+ * we set the new flag so that we mark the buffer new so
+ * that we know that it is newly allocated if the write
+ * fails.
+ */
+ if (nimaps && imap.br_startblock == HOLESTARTBLOCK)
+ new = 1;
error = xfs_iomap_write_delay(ip, offset, size, &imap);
+ if (error)
+ goto out_unlock;
+
+ xfs_iunlock(ip, lockmode);
}
- if (error)
- goto out_unlock;
trace_xfs_get_blocks_alloc(ip, offset, size, 0, &imap);
} else if (nimaps) {
trace_xfs_get_blocks_found(ip, offset, size, 0, &imap);
+ xfs_iunlock(ip, lockmode);
} else {
trace_xfs_get_blocks_notfound(ip, offset, size);
goto out_unlock;
}
- xfs_iunlock(ip, lockmode);
if (imap.br_startblock != HOLESTARTBLOCK &&
imap.br_startblock != DELAYSTARTBLOCK) {
@@ -1386,52 +1412,91 @@ out_destroy_ioend:
return ret;
}
+/*
+ * Punch out the delalloc blocks we have already allocated.
+ *
+ * Don't bother with xfs_setattr given that nothing can have made it to disk yet
+ * as the page is still locked at this point.
+ */
+STATIC void
+xfs_vm_kill_delalloc_range(
+ struct inode *inode,
+ loff_t start,
+ loff_t end)
+{
+ struct xfs_inode *ip = XFS_I(inode);
+ xfs_fileoff_t start_fsb;
+ xfs_fileoff_t end_fsb;
+ int error;
+
+ start_fsb = XFS_B_TO_FSB(ip->i_mount, start);
+ end_fsb = XFS_B_TO_FSB(ip->i_mount, end);
+ if (end_fsb <= start_fsb)
+ return;
+
+ xfs_ilock(ip, XFS_ILOCK_EXCL);
+ error = xfs_bmap_punch_delalloc_range(ip, start_fsb,
+ end_fsb - start_fsb);
+ if (error) {
+ /* something screwed, just bail */
+ if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) {
+ xfs_alert(ip->i_mount,
+ "xfs_vm_write_failed: unable to clean up ino %lld",
+ ip->i_ino);
+ }
+ }
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+}
+
STATIC void
xfs_vm_write_failed(
- struct address_space *mapping,
- loff_t to)
+ struct inode *inode,
+ struct page *page,
+ loff_t pos,
+ unsigned len)
{
- struct inode *inode = mapping->host;
+ loff_t block_offset = pos & PAGE_MASK;
+ loff_t block_start;
+ loff_t block_end;
+ loff_t from = pos & (PAGE_CACHE_SIZE - 1);
+ loff_t to = from + len;
+ struct buffer_head *bh, *head;
- if (to > inode->i_size) {
- /*
- * Punch out the delalloc blocks we have already allocated.
- *
- * Don't bother with xfs_setattr given that nothing can have
- * made it to disk yet as the page is still locked at this
- * point.
- */
- struct xfs_inode *ip = XFS_I(inode);
- xfs_fileoff_t start_fsb;
- xfs_fileoff_t end_fsb;
- int error;
+ ASSERT(block_offset + from == pos);
- truncate_pagecache(inode, to, inode->i_size);
+ head = page_buffers(page);
+ block_start = 0;
+ for (bh = head; bh != head || !block_start;
+ bh = bh->b_this_page, block_start = block_end,
+ block_offset += bh->b_size) {
+ block_end = block_start + bh->b_size;
- /*
- * Check if there are any blocks that are outside of i_size
- * that need to be trimmed back.
- */
- start_fsb = XFS_B_TO_FSB(ip->i_mount, inode->i_size) + 1;
- end_fsb = XFS_B_TO_FSB(ip->i_mount, to);
- if (end_fsb <= start_fsb)
- return;
-
- xfs_ilock(ip, XFS_ILOCK_EXCL);
- error = xfs_bmap_punch_delalloc_range(ip, start_fsb,
- end_fsb - start_fsb);
- if (error) {
- /* something screwed, just bail */
- if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) {
- xfs_alert(ip->i_mount,
- "xfs_vm_write_failed: unable to clean up ino %lld",
- ip->i_ino);
- }
- }
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ /* skip buffers before the write */
+ if (block_end <= from)
+ continue;
+
+ /* if the buffer is after the write, we're done */
+ if (block_start >= to)
+ break;
+
+ if (!buffer_delay(bh))
+ continue;
+
+ if (!buffer_new(bh) && block_offset < i_size_read(inode))
+ continue;
+
+ xfs_vm_kill_delalloc_range(inode, block_offset,
+ block_offset + bh->b_size);
}
+
}
+/*
+ * This used to call block_write_begin(), but it unlocks and releases the page
+ * on error, and we need that page to be able to punch stale delalloc blocks out
+ * on failure. hence we copy-n-waste it here and call xfs_vm_write_failed() at
+ * the appropriate point.
+ */
STATIC int
xfs_vm_write_begin(
struct file *file,
@@ -1442,15 +1507,40 @@ xfs_vm_write_begin(
struct page **pagep,
void **fsdata)
{
- int ret;
+ pgoff_t index = pos >> PAGE_CACHE_SHIFT;
+ struct page *page;
+ int status;
- ret = block_write_begin(mapping, pos, len, flags | AOP_FLAG_NOFS,
- pagep, xfs_get_blocks);
- if (unlikely(ret))
- xfs_vm_write_failed(mapping, pos + len);
- return ret;
+ ASSERT(len <= PAGE_CACHE_SIZE);
+
+ page = grab_cache_page_write_begin(mapping, index,
+ flags | AOP_FLAG_NOFS);
+ if (!page)
+ return -ENOMEM;
+
+ status = __block_write_begin(page, pos, len, xfs_get_blocks);
+ if (unlikely(status)) {
+ struct inode *inode = mapping->host;
+
+ xfs_vm_write_failed(inode, page, pos, len);
+ unlock_page(page);
+
+ if (pos + len > i_size_read(inode))
+ truncate_pagecache(inode, pos + len, i_size_read(inode));
+
+ page_cache_release(page);
+ page = NULL;
+ }
+
+ *pagep = page;
+ return status;
}
+/*
+ * On failure, we only need to kill delalloc blocks beyond EOF because they
+ * will never be written. For blocks within EOF, generic_write_end() zeros them
+ * so they are safe to leave alone and be written with all the other valid data.
+ */
STATIC int
xfs_vm_write_end(
struct file *file,
@@ -1463,9 +1553,19 @@ xfs_vm_write_end(
{
int ret;
+ ASSERT(len <= PAGE_CACHE_SIZE);
+
ret = generic_write_end(file, mapping, pos, len, copied, page, fsdata);
- if (unlikely(ret < len))
- xfs_vm_write_failed(mapping, pos + len);
+ if (unlikely(ret < len)) {
+ struct inode *inode = mapping->host;
+ size_t isize = i_size_read(inode);
+ loff_t to = pos + len;
+
+ if (to > isize) {
+ truncate_pagecache(inode, to, isize);
+ xfs_vm_kill_delalloc_range(inode, isize, to);
+ }
+ }
return ret;
}
diff --git a/fs/xfs/xfs_attr.c b/fs/xfs/xfs_attr.c
index 65d61b948ea..a17ff01b5ad 100644
--- a/fs/xfs/xfs_attr.c
+++ b/fs/xfs/xfs_attr.c
@@ -21,7 +21,6 @@
#include "xfs_types.h"
#include "xfs_bit.h"
#include "xfs_log.h"
-#include "xfs_inum.h"
#include "xfs_trans.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
@@ -39,7 +38,6 @@
#include "xfs_error.h"
#include "xfs_quota.h"
#include "xfs_trans_space.h"
-#include "xfs_rw.h"
#include "xfs_vnodeops.h"
#include "xfs_trace.h"
@@ -1987,14 +1985,12 @@ xfs_attr_rmtval_get(xfs_da_args_t *args)
(map[i].br_startblock != HOLESTARTBLOCK));
dblkno = XFS_FSB_TO_DADDR(mp, map[i].br_startblock);
blkcnt = XFS_FSB_TO_BB(mp, map[i].br_blockcount);
- error = xfs_read_buf(mp, mp->m_ddev_targp, dblkno,
- blkcnt, XBF_LOCK | XBF_DONT_BLOCK,
- &bp);
+ error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp,
+ dblkno, blkcnt, 0, &bp);
if (error)
return(error);
- tmp = (valuelen < XFS_BUF_SIZE(bp))
- ? valuelen : XFS_BUF_SIZE(bp);
+ tmp = min_t(int, valuelen, BBTOB(bp->b_length));
xfs_buf_iomove(bp, 0, tmp, dst, XBRW_READ);
xfs_buf_relse(bp);
dst += tmp;
@@ -2097,6 +2093,8 @@ xfs_attr_rmtval_set(xfs_da_args_t *args)
lblkno = args->rmtblkno;
valuelen = args->valuelen;
while (valuelen > 0) {
+ int buflen;
+
/*
* Try to remember where we decided to put the value.
*/
@@ -2114,15 +2112,16 @@ xfs_attr_rmtval_set(xfs_da_args_t *args)
dblkno = XFS_FSB_TO_DADDR(mp, map.br_startblock),
blkcnt = XFS_FSB_TO_BB(mp, map.br_blockcount);
- bp = xfs_buf_get(mp->m_ddev_targp, dblkno, blkcnt,
- XBF_LOCK | XBF_DONT_BLOCK);
+ bp = xfs_buf_get(mp->m_ddev_targp, dblkno, blkcnt, 0);
if (!bp)
return ENOMEM;
- tmp = (valuelen < XFS_BUF_SIZE(bp)) ? valuelen :
- XFS_BUF_SIZE(bp);
+
+ buflen = BBTOB(bp->b_length);
+ tmp = min_t(int, valuelen, buflen);
xfs_buf_iomove(bp, 0, tmp, src, XBRW_WRITE);
- if (tmp < XFS_BUF_SIZE(bp))
- xfs_buf_zero(bp, tmp, XFS_BUF_SIZE(bp) - tmp);
+ if (tmp < buflen)
+ xfs_buf_zero(bp, tmp, buflen - tmp);
+
error = xfs_bwrite(bp); /* GROT: NOTE: synchronous write */
xfs_buf_relse(bp);
if (error)
diff --git a/fs/xfs/xfs_attr_leaf.c b/fs/xfs/xfs_attr_leaf.c
index 76d93dc953e..7d89d800f51 100644
--- a/fs/xfs/xfs_attr_leaf.c
+++ b/fs/xfs/xfs_attr_leaf.c
@@ -20,7 +20,6 @@
#include "xfs_types.h"
#include "xfs_bit.h"
#include "xfs_log.h"
-#include "xfs_inum.h"
#include "xfs_trans.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
@@ -2983,7 +2982,7 @@ xfs_attr_leaf_freextent(xfs_trans_t **trans, xfs_inode_t *dp,
map.br_blockcount);
bp = xfs_trans_get_buf(*trans,
dp->i_mount->m_ddev_targp,
- dblkno, dblkcnt, XBF_LOCK);
+ dblkno, dblkcnt, 0);
if (!bp)
return ENOMEM;
xfs_trans_binval(*trans, bp);
diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c
index 85e7e327bcd..58b815ec8c9 100644
--- a/fs/xfs/xfs_bmap.c
+++ b/fs/xfs/xfs_bmap.c
@@ -41,7 +41,6 @@
#include "xfs_rtalloc.h"
#include "xfs_error.h"
#include "xfs_attr_leaf.h"
-#include "xfs_rw.h"
#include "xfs_quota.h"
#include "xfs_trans_space.h"
#include "xfs_buf_item.h"
@@ -4527,7 +4526,7 @@ out_unreserve_blocks:
xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS, alen, 0);
out_unreserve_quota:
if (XFS_IS_QUOTA_ON(mp))
- xfs_trans_unreserve_quota_nblks(NULL, ip, alen, 0, rt ?
+ xfs_trans_unreserve_quota_nblks(NULL, ip, (long)alen, 0, rt ?
XFS_QMOPT_RES_RTBLKS : XFS_QMOPT_RES_REGBLKS);
return error;
}
@@ -5621,8 +5620,20 @@ xfs_getbmap(
XFS_FSB_TO_BB(mp, map[i].br_blockcount);
out[cur_ext].bmv_unused1 = 0;
out[cur_ext].bmv_unused2 = 0;
- ASSERT(((iflags & BMV_IF_DELALLOC) != 0) ||
- (map[i].br_startblock != DELAYSTARTBLOCK));
+
+ /*
+ * delayed allocation extents that start beyond EOF can
+ * occur due to speculative EOF allocation when the
+ * delalloc extent is larger than the largest freespace
+ * extent at conversion time. These extents cannot be
+ * converted by data writeback, so can exist here even
+ * if we are not supposed to be finding delalloc
+ * extents.
+ */
+ if (map[i].br_startblock == DELAYSTARTBLOCK &&
+ map[i].br_startoff <= XFS_B_TO_FSB(mp, XFS_ISIZE(ip)))
+ ASSERT((iflags & BMV_IF_DELALLOC) != 0);
+
if (map[i].br_startblock == HOLESTARTBLOCK &&
whichfork == XFS_ATTR_FORK) {
/* came to the end of attribute fork */
@@ -6157,3 +6168,16 @@ next_block:
return error;
}
+
+/*
+ * Convert the given file system block to a disk block. We have to treat it
+ * differently based on whether the file is a real time file or not, because the
+ * bmap code does.
+ */
+xfs_daddr_t
+xfs_fsb_to_db(struct xfs_inode *ip, xfs_fsblock_t fsb)
+{
+ return (XFS_IS_REALTIME_INODE(ip) ? \
+ (xfs_daddr_t)XFS_FSB_TO_BB((ip)->i_mount, (fsb)) : \
+ XFS_FSB_TO_DADDR((ip)->i_mount, (fsb)));
+}
diff --git a/fs/xfs/xfs_bmap.h b/fs/xfs/xfs_bmap.h
index 89ee672d378..803b56d7ce1 100644
--- a/fs/xfs/xfs_bmap.h
+++ b/fs/xfs/xfs_bmap.h
@@ -211,6 +211,9 @@ int xfs_bmap_count_blocks(struct xfs_trans *tp, struct xfs_inode *ip,
int whichfork, int *count);
int xfs_bmap_punch_delalloc_range(struct xfs_inode *ip,
xfs_fileoff_t start_fsb, xfs_fileoff_t length);
+
+xfs_daddr_t xfs_fsb_to_db(struct xfs_inode *ip, xfs_fsblock_t fsb);
+
#endif /* __KERNEL__ */
#endif /* __XFS_BMAP_H__ */
diff --git a/fs/xfs/xfs_bmap_btree.c b/fs/xfs/xfs_bmap_btree.c
index e2f5d59cbea..862084a47a7 100644
--- a/fs/xfs/xfs_bmap_btree.c
+++ b/fs/xfs/xfs_bmap_btree.c
@@ -20,7 +20,6 @@
#include "xfs_types.h"
#include "xfs_bit.h"
#include "xfs_log.h"
-#include "xfs_inum.h"
#include "xfs_trans.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
diff --git a/fs/xfs/xfs_btree.c b/fs/xfs/xfs_btree.c
index 1f19f03af9d..e53e317b158 100644
--- a/fs/xfs/xfs_btree.c
+++ b/fs/xfs/xfs_btree.c
@@ -20,7 +20,6 @@
#include "xfs_types.h"
#include "xfs_bit.h"
#include "xfs_log.h"
-#include "xfs_inum.h"
#include "xfs_trans.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index 6819b5163e3..172d3cc8f8c 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -35,14 +35,12 @@
#include <linux/freezer.h>
#include "xfs_sb.h"
-#include "xfs_inum.h"
#include "xfs_log.h"
#include "xfs_ag.h"
#include "xfs_mount.h"
#include "xfs_trace.h"
static kmem_zone_t *xfs_buf_zone;
-STATIC int xfsbufd(void *);
static struct workqueue_struct *xfslogd_workqueue;
@@ -57,11 +55,7 @@ static struct workqueue_struct *xfslogd_workqueue;
#endif
#define xb_to_gfp(flags) \
- ((((flags) & XBF_READ_AHEAD) ? __GFP_NORETRY : \
- ((flags) & XBF_DONT_BLOCK) ? GFP_NOFS : GFP_KERNEL) | __GFP_NOWARN)
-
-#define xb_to_km(flags) \
- (((flags) & XBF_DONT_BLOCK) ? KM_NOFS : KM_SLEEP)
+ ((((flags) & XBF_READ_AHEAD) ? __GFP_NORETRY : GFP_NOFS) | __GFP_NOWARN)
static inline int
@@ -71,11 +65,11 @@ xfs_buf_is_vmapped(
/*
* Return true if the buffer is vmapped.
*
- * The XBF_MAPPED flag is set if the buffer should be mapped, but the
- * code is clever enough to know it doesn't have to map a single page,
- * so the check has to be both for XBF_MAPPED and bp->b_page_count > 1.
+ * b_addr is null if the buffer is not mapped, but the code is clever
+ * enough to know it doesn't have to map a single page, so the check has
+ * to be both for b_addr and bp->b_page_count > 1.
*/
- return (bp->b_flags & XBF_MAPPED) && bp->b_page_count > 1;
+ return bp->b_addr && bp->b_page_count > 1;
}
static inline int
@@ -144,8 +138,17 @@ void
xfs_buf_stale(
struct xfs_buf *bp)
{
+ ASSERT(xfs_buf_islocked(bp));
+
bp->b_flags |= XBF_STALE;
- xfs_buf_delwri_dequeue(bp);
+
+ /*
+ * Clear the delwri status so that a delwri queue walker will not
+ * flush this buffer to disk now that it is stale. The delwri queue has
+ * a reference to the buffer, so this is safe to do.
+ */
+ bp->b_flags &= ~_XBF_DELWRI_Q;
+
atomic_set(&(bp)->b_lru_ref, 0);
if (!list_empty(&bp->b_lru)) {
struct xfs_buftarg *btp = bp->b_target;
@@ -164,22 +167,22 @@ xfs_buf_stale(
struct xfs_buf *
xfs_buf_alloc(
struct xfs_buftarg *target,
- xfs_off_t range_base,
- size_t range_length,
+ xfs_daddr_t blkno,
+ size_t numblks,
xfs_buf_flags_t flags)
{
struct xfs_buf *bp;
- bp = kmem_zone_alloc(xfs_buf_zone, xb_to_km(flags));
+ bp = kmem_zone_zalloc(xfs_buf_zone, KM_NOFS);
if (unlikely(!bp))
return NULL;
/*
- * We don't want certain flags to appear in b_flags.
+ * We don't want certain flags to appear in b_flags unless they are
+ * specifically set by later operations on the buffer.
*/
- flags &= ~(XBF_LOCK|XBF_MAPPED|XBF_DONT_BLOCK|XBF_READ_AHEAD);
+ flags &= ~(XBF_UNMAPPED | XBF_TRYLOCK | XBF_ASYNC | XBF_READ_AHEAD);
- memset(bp, 0, sizeof(xfs_buf_t));
atomic_set(&bp->b_hold, 1);
atomic_set(&bp->b_lru_ref, 1);
init_completion(&bp->b_iowait);
@@ -189,14 +192,22 @@ xfs_buf_alloc(
sema_init(&bp->b_sema, 0); /* held, no waiters */
XB_SET_OWNER(bp);
bp->b_target = target;
- bp->b_file_offset = range_base;
+
/*
- * Set buffer_length and count_desired to the same value initially.
- * I/O routines should use count_desired, which will be the same in
+ * Set length and io_length to the same value initially.
+ * I/O routines should use io_length, which will be the same in
* most cases but may be reset (e.g. XFS recovery).
*/
- bp->b_buffer_length = bp->b_count_desired = range_length;
+ bp->b_length = numblks;
+ bp->b_io_length = numblks;
bp->b_flags = flags;
+
+ /*
+ * We do not set the block number here in the buffer because we have not
+ * finished initialising the buffer. We insert the buffer into the cache
+ * in this state, so this ensures that we are unable to do IO on a
+ * buffer that hasn't been fully initialised.
+ */
bp->b_bn = XFS_BUF_DADDR_NULL;
atomic_set(&bp->b_pin_count, 0);
init_waitqueue_head(&bp->b_waiters);
@@ -219,13 +230,12 @@ _xfs_buf_get_pages(
{
/* Make sure that we have a page list */
if (bp->b_pages == NULL) {
- bp->b_offset = xfs_buf_poff(bp->b_file_offset);
bp->b_page_count = page_count;
if (page_count <= XB_PAGES) {
bp->b_pages = bp->b_page_array;
} else {
bp->b_pages = kmem_alloc(sizeof(struct page *) *
- page_count, xb_to_km(flags));
+ page_count, KM_NOFS);
if (bp->b_pages == NULL)
return -ENOMEM;
}
@@ -288,11 +298,11 @@ xfs_buf_allocate_memory(
xfs_buf_t *bp,
uint flags)
{
- size_t size = bp->b_count_desired;
+ size_t size;
size_t nbytes, offset;
gfp_t gfp_mask = xb_to_gfp(flags);
unsigned short page_count, i;
- xfs_off_t end;
+ xfs_off_t start, end;
int error;
/*
@@ -300,15 +310,15 @@ xfs_buf_allocate_memory(
* the memory from the heap - there's no need for the complexity of
* page arrays to keep allocation down to order 0.
*/
- if (bp->b_buffer_length < PAGE_SIZE) {
- bp->b_addr = kmem_alloc(bp->b_buffer_length, xb_to_km(flags));
+ size = BBTOB(bp->b_length);
+ if (size < PAGE_SIZE) {
+ bp->b_addr = kmem_alloc(size, KM_NOFS);
if (!bp->b_addr) {
/* low memory - use alloc_page loop instead */
goto use_alloc_page;
}
- if (((unsigned long)(bp->b_addr + bp->b_buffer_length - 1) &
- PAGE_MASK) !=
+ if (((unsigned long)(bp->b_addr + size - 1) & PAGE_MASK) !=
((unsigned long)bp->b_addr & PAGE_MASK)) {
/* b_addr spans two pages - use alloc_page instead */
kmem_free(bp->b_addr);
@@ -319,13 +329,14 @@ xfs_buf_allocate_memory(
bp->b_pages = bp->b_page_array;
bp->b_pages[0] = virt_to_page(bp->b_addr);
bp->b_page_count = 1;
- bp->b_flags |= XBF_MAPPED | _XBF_KMEM;
+ bp->b_flags |= _XBF_KMEM;
return 0;
}
use_alloc_page:
- end = bp->b_file_offset + bp->b_buffer_length;
- page_count = xfs_buf_btoc(end) - xfs_buf_btoct(bp->b_file_offset);
+ start = BBTOB(bp->b_bn) >> PAGE_SHIFT;
+ end = (BBTOB(bp->b_bn + bp->b_length) + PAGE_SIZE - 1) >> PAGE_SHIFT;
+ page_count = end - start;
error = _xfs_buf_get_pages(bp, page_count, flags);
if (unlikely(error))
return error;
@@ -388,8 +399,9 @@ _xfs_buf_map_pages(
if (bp->b_page_count == 1) {
/* A single page buffer is always mappable */
bp->b_addr = page_address(bp->b_pages[0]) + bp->b_offset;
- bp->b_flags |= XBF_MAPPED;
- } else if (flags & XBF_MAPPED) {
+ } else if (flags & XBF_UNMAPPED) {
+ bp->b_addr = NULL;
+ } else {
int retried = 0;
do {
@@ -403,7 +415,6 @@ _xfs_buf_map_pages(
if (!bp->b_addr)
return -ENOMEM;
bp->b_addr += bp->b_offset;
- bp->b_flags |= XBF_MAPPED;
}
return 0;
@@ -420,29 +431,27 @@ _xfs_buf_map_pages(
*/
xfs_buf_t *
_xfs_buf_find(
- xfs_buftarg_t *btp, /* block device target */
- xfs_off_t ioff, /* starting offset of range */
- size_t isize, /* length of range */
+ struct xfs_buftarg *btp,
+ xfs_daddr_t blkno,
+ size_t numblks,
xfs_buf_flags_t flags,
xfs_buf_t *new_bp)
{
- xfs_off_t range_base;
- size_t range_length;
+ size_t numbytes;
struct xfs_perag *pag;
struct rb_node **rbp;
struct rb_node *parent;
xfs_buf_t *bp;
- range_base = (ioff << BBSHIFT);
- range_length = (isize << BBSHIFT);
+ numbytes = BBTOB(numblks);
/* Check for IOs smaller than the sector size / not sector aligned */
- ASSERT(!(range_length < (1 << btp->bt_sshift)));
- ASSERT(!(range_base & (xfs_off_t)btp->bt_smask));
+ ASSERT(!(numbytes < (1 << btp->bt_sshift)));
+ ASSERT(!(BBTOB(blkno) & (xfs_off_t)btp->bt_smask));
/* get tree root */
pag = xfs_perag_get(btp->bt_mount,
- xfs_daddr_to_agno(btp->bt_mount, ioff));
+ xfs_daddr_to_agno(btp->bt_mount, blkno));
/* walk tree */
spin_lock(&pag->pag_buf_lock);
@@ -453,20 +462,20 @@ _xfs_buf_find(
parent = *rbp;
bp = rb_entry(parent, struct xfs_buf, b_rbnode);
- if (range_base < bp->b_file_offset)
+ if (blkno < bp->b_bn)
rbp = &(*rbp)->rb_left;
- else if (range_base > bp->b_file_offset)
+ else if (blkno > bp->b_bn)
rbp = &(*rbp)->rb_right;
else {
/*
- * found a block offset match. If the range doesn't
+ * found a block number match. If the range doesn't
* match, the only way this is allowed is if the buffer
* in the cache is stale and the transaction that made
* it stale has not yet committed. i.e. we are
* reallocating a busy extent. Skip this buffer and
* continue searching to the right for an exact match.
*/
- if (bp->b_buffer_length != range_length) {
+ if (bp->b_length != numblks) {
ASSERT(bp->b_flags & XBF_STALE);
rbp = &(*rbp)->rb_right;
continue;
@@ -511,7 +520,7 @@ found:
*/
if (bp->b_flags & XBF_STALE) {
ASSERT((bp->b_flags & _XBF_DELWRI_Q) == 0);
- bp->b_flags &= XBF_MAPPED | _XBF_KMEM | _XBF_PAGES;
+ bp->b_flags &= _XBF_KMEM | _XBF_PAGES;
}
trace_xfs_buf_find(bp, flags, _RET_IP_);
@@ -526,63 +535,59 @@ found:
*/
struct xfs_buf *
xfs_buf_get(
- xfs_buftarg_t *target,/* target for buffer */
- xfs_off_t ioff, /* starting offset of range */
- size_t isize, /* length of range */
+ xfs_buftarg_t *target,
+ xfs_daddr_t blkno,
+ size_t numblks,
xfs_buf_flags_t flags)
{
struct xfs_buf *bp;
struct xfs_buf *new_bp;
int error = 0;
- bp = _xfs_buf_find(target, ioff, isize, flags, NULL);
+ bp = _xfs_buf_find(target, blkno, numblks, flags, NULL);
if (likely(bp))
goto found;
- new_bp = xfs_buf_alloc(target, ioff << BBSHIFT, isize << BBSHIFT,
- flags);
+ new_bp = xfs_buf_alloc(target, blkno, numblks, flags);
if (unlikely(!new_bp))
return NULL;
- bp = _xfs_buf_find(target, ioff, isize, flags, new_bp);
- if (!bp) {
+ error = xfs_buf_allocate_memory(new_bp, flags);
+ if (error) {
kmem_zone_free(xfs_buf_zone, new_bp);
return NULL;
}
- if (bp == new_bp) {
- error = xfs_buf_allocate_memory(bp, flags);
- if (error)
- goto no_buffer;
- } else
- kmem_zone_free(xfs_buf_zone, new_bp);
+ bp = _xfs_buf_find(target, blkno, numblks, flags, new_bp);
+ if (!bp) {
+ xfs_buf_free(new_bp);
+ return NULL;
+ }
+
+ if (bp != new_bp)
+ xfs_buf_free(new_bp);
/*
* Now we have a workable buffer, fill in the block number so
* that we can do IO on it.
*/
- bp->b_bn = ioff;
- bp->b_count_desired = bp->b_buffer_length;
+ bp->b_bn = blkno;
+ bp->b_io_length = bp->b_length;
found:
- if (!(bp->b_flags & XBF_MAPPED)) {
+ if (!bp->b_addr) {
error = _xfs_buf_map_pages(bp, flags);
if (unlikely(error)) {
xfs_warn(target->bt_mount,
"%s: failed to map pages\n", __func__);
- goto no_buffer;
+ xfs_buf_relse(bp);
+ return NULL;
}
}
XFS_STATS_INC(xb_get);
trace_xfs_buf_get(bp, flags, _RET_IP_);
return bp;
-
-no_buffer:
- if (flags & (XBF_LOCK | XBF_TRYLOCK))
- xfs_buf_unlock(bp);
- xfs_buf_rele(bp);
- return NULL;
}
STATIC int
@@ -590,32 +595,30 @@ _xfs_buf_read(
xfs_buf_t *bp,
xfs_buf_flags_t flags)
{
- int status;
-
- ASSERT(!(flags & (XBF_DELWRI|XBF_WRITE)));
+ ASSERT(!(flags & XBF_WRITE));
ASSERT(bp->b_bn != XFS_BUF_DADDR_NULL);
- bp->b_flags &= ~(XBF_WRITE | XBF_ASYNC | XBF_DELWRI | XBF_READ_AHEAD);
+ bp->b_flags &= ~(XBF_WRITE | XBF_ASYNC | XBF_READ_AHEAD);
bp->b_flags |= flags & (XBF_READ | XBF_ASYNC | XBF_READ_AHEAD);
- status = xfs_buf_iorequest(bp);
- if (status || bp->b_error || (flags & XBF_ASYNC))
- return status;
+ xfs_buf_iorequest(bp);
+ if (flags & XBF_ASYNC)
+ return 0;
return xfs_buf_iowait(bp);
}
xfs_buf_t *
xfs_buf_read(
xfs_buftarg_t *target,
- xfs_off_t ioff,
- size_t isize,
+ xfs_daddr_t blkno,
+ size_t numblks,
xfs_buf_flags_t flags)
{
xfs_buf_t *bp;
flags |= XBF_READ;
- bp = xfs_buf_get(target, ioff, isize, flags);
+ bp = xfs_buf_get(target, blkno, numblks, flags);
if (bp) {
trace_xfs_buf_read(bp, flags, _RET_IP_);
@@ -627,7 +630,8 @@ xfs_buf_read(
* Read ahead call which is already satisfied,
* drop the buffer
*/
- goto no_buffer;
+ xfs_buf_relse(bp);
+ return NULL;
} else {
/* We do not want read in the flags */
bp->b_flags &= ~XBF_READ;
@@ -635,12 +639,6 @@ xfs_buf_read(
}
return bp;
-
- no_buffer:
- if (flags & (XBF_LOCK | XBF_TRYLOCK))
- xfs_buf_unlock(bp);
- xfs_buf_rele(bp);
- return NULL;
}
/*
@@ -650,14 +648,14 @@ xfs_buf_read(
void
xfs_buf_readahead(
xfs_buftarg_t *target,
- xfs_off_t ioff,
- size_t isize)
+ xfs_daddr_t blkno,
+ size_t numblks)
{
if (bdi_read_congested(target->bt_bdi))
return;
- xfs_buf_read(target, ioff, isize,
- XBF_TRYLOCK|XBF_ASYNC|XBF_READ_AHEAD|XBF_DONT_BLOCK);
+ xfs_buf_read(target, blkno, numblks,
+ XBF_TRYLOCK|XBF_ASYNC|XBF_READ_AHEAD);
}
/*
@@ -666,16 +664,15 @@ xfs_buf_readahead(
*/
struct xfs_buf *
xfs_buf_read_uncached(
- struct xfs_mount *mp,
struct xfs_buftarg *target,
xfs_daddr_t daddr,
- size_t length,
+ size_t numblks,
int flags)
{
xfs_buf_t *bp;
int error;
- bp = xfs_buf_get_uncached(target, length, flags);
+ bp = xfs_buf_get_uncached(target, numblks, flags);
if (!bp)
return NULL;
@@ -683,9 +680,9 @@ xfs_buf_read_uncached(
XFS_BUF_SET_ADDR(bp, daddr);
XFS_BUF_READ(bp);
- xfsbdstrat(mp, bp);
+ xfsbdstrat(target->bt_mount, bp);
error = xfs_buf_iowait(bp);
- if (error || bp->b_error) {
+ if (error) {
xfs_buf_relse(bp);
return NULL;
}
@@ -699,7 +696,7 @@ xfs_buf_read_uncached(
void
xfs_buf_set_empty(
struct xfs_buf *bp,
- size_t len)
+ size_t numblks)
{
if (bp->b_pages)
_xfs_buf_free_pages(bp);
@@ -707,10 +704,9 @@ xfs_buf_set_empty(
bp->b_pages = NULL;
bp->b_page_count = 0;
bp->b_addr = NULL;
- bp->b_file_offset = 0;
- bp->b_buffer_length = bp->b_count_desired = len;
+ bp->b_length = numblks;
+ bp->b_io_length = numblks;
bp->b_bn = XFS_BUF_DADDR_NULL;
- bp->b_flags &= ~XBF_MAPPED;
}
static inline struct page *
@@ -749,7 +745,7 @@ xfs_buf_associate_memory(
bp->b_pages = NULL;
bp->b_addr = mem;
- rval = _xfs_buf_get_pages(bp, page_count, XBF_DONT_BLOCK);
+ rval = _xfs_buf_get_pages(bp, page_count, 0);
if (rval)
return rval;
@@ -760,9 +756,8 @@ xfs_buf_associate_memory(
pageaddr += PAGE_SIZE;
}
- bp->b_count_desired = len;
- bp->b_buffer_length = buflen;
- bp->b_flags |= XBF_MAPPED;
+ bp->b_io_length = BTOBB(len);
+ bp->b_length = BTOBB(buflen);
return 0;
}
@@ -770,17 +765,18 @@ xfs_buf_associate_memory(
xfs_buf_t *
xfs_buf_get_uncached(
struct xfs_buftarg *target,
- size_t len,
+ size_t numblks,
int flags)
{
- unsigned long page_count = PAGE_ALIGN(len) >> PAGE_SHIFT;
+ unsigned long page_count;
int error, i;
xfs_buf_t *bp;
- bp = xfs_buf_alloc(target, 0, len, 0);
+ bp = xfs_buf_alloc(target, 0, numblks, 0);
if (unlikely(bp == NULL))
goto fail;
+ page_count = PAGE_ALIGN(numblks << BBSHIFT) >> PAGE_SHIFT;
error = _xfs_buf_get_pages(bp, page_count, 0);
if (error)
goto fail_free_buf;
@@ -792,7 +788,7 @@ xfs_buf_get_uncached(
}
bp->b_flags |= _XBF_PAGES;
- error = _xfs_buf_map_pages(bp, XBF_MAPPED);
+ error = _xfs_buf_map_pages(bp, 0);
if (unlikely(error)) {
xfs_warn(target->bt_mount,
"%s: failed to map pages\n", __func__);
@@ -855,7 +851,7 @@ xfs_buf_rele(
spin_unlock(&pag->pag_buf_lock);
} else {
xfs_buf_lru_del(bp);
- ASSERT(!(bp->b_flags & (XBF_DELWRI|_XBF_DELWRI_Q)));
+ ASSERT(!(bp->b_flags & _XBF_DELWRI_Q));
rb_erase(&bp->b_rbnode, &pag->pag_buf_tree);
spin_unlock(&pag->pag_buf_lock);
xfs_perag_put(pag);
@@ -915,13 +911,6 @@ xfs_buf_lock(
trace_xfs_buf_lock_done(bp, _RET_IP_);
}
-/*
- * Releases the lock on the buffer object.
- * If the buffer is marked delwri but is not queued, do so before we
- * unlock the buffer as we need to set flags correctly. We also need to
- * take a reference for the delwri queue because the unlocker is going to
- * drop their's and they don't know we just queued it.
- */
void
xfs_buf_unlock(
struct xfs_buf *bp)
@@ -1008,9 +997,8 @@ xfs_buf_ioerror_alert(
const char *func)
{
xfs_alert(bp->b_target->bt_mount,
-"metadata I/O error: block 0x%llx (\"%s\") error %d buf count %zd",
- (__uint64_t)XFS_BUF_ADDR(bp), func,
- bp->b_error, XFS_BUF_COUNT(bp));
+"metadata I/O error: block 0x%llx (\"%s\") error %d numblks %d",
+ (__uint64_t)XFS_BUF_ADDR(bp), func, bp->b_error, bp->b_length);
}
int
@@ -1019,10 +1007,11 @@ xfs_bwrite(
{
int error;
+ ASSERT(xfs_buf_islocked(bp));
+
bp->b_flags |= XBF_WRITE;
- bp->b_flags &= ~(XBF_ASYNC | XBF_READ);
+ bp->b_flags &= ~(XBF_ASYNC | XBF_READ | _XBF_DELWRI_Q);
- xfs_buf_delwri_dequeue(bp);
xfs_bdstrat_cb(bp);
error = xfs_buf_iowait(bp);
@@ -1181,7 +1170,7 @@ _xfs_buf_ioapply(
int rw, map_i, total_nr_pages, nr_pages;
struct bio *bio;
int offset = bp->b_offset;
- int size = bp->b_count_desired;
+ int size = BBTOB(bp->b_io_length);
sector_t sector = bp->b_bn;
total_nr_pages = bp->b_page_count;
@@ -1229,7 +1218,7 @@ next_chunk:
break;
offset = 0;
- sector += nbytes >> BBSHIFT;
+ sector += BTOBB(nbytes);
size -= nbytes;
total_nr_pages--;
}
@@ -1248,13 +1237,13 @@ next_chunk:
}
}
-int
+void
xfs_buf_iorequest(
xfs_buf_t *bp)
{
trace_xfs_buf_iorequest(bp, _RET_IP_);
- ASSERT(!(bp->b_flags & XBF_DELWRI));
+ ASSERT(!(bp->b_flags & _XBF_DELWRI_Q));
if (bp->b_flags & XBF_WRITE)
xfs_buf_wait_unpin(bp);
@@ -1269,13 +1258,12 @@ xfs_buf_iorequest(
_xfs_buf_ioend(bp, 0);
xfs_buf_rele(bp);
- return 0;
}
/*
- * Waits for I/O to complete on the buffer supplied.
- * It returns immediately if no I/O is pending.
- * It returns the I/O error code, if any, or 0 if there was no error.
+ * Waits for I/O to complete on the buffer supplied. It returns immediately if
+ * no I/O is pending or there is already a pending error on the buffer. It
+ * returns the I/O error code, if any, or 0 if there was no error.
*/
int
xfs_buf_iowait(
@@ -1283,7 +1271,8 @@ xfs_buf_iowait(
{
trace_xfs_buf_iowait(bp, _RET_IP_);
- wait_for_completion(&bp->b_iowait);
+ if (!bp->b_error)
+ wait_for_completion(&bp->b_iowait);
trace_xfs_buf_iowait_done(bp, _RET_IP_);
return bp->b_error;
@@ -1296,7 +1285,7 @@ xfs_buf_offset(
{
struct page *page;
- if (bp->b_flags & XBF_MAPPED)
+ if (bp->b_addr)
return bp->b_addr + offset;
offset += bp->b_offset;
@@ -1315,27 +1304,30 @@ xfs_buf_iomove(
void *data, /* data address */
xfs_buf_rw_t mode) /* read/write/zero flag */
{
- size_t bend, cpoff, csize;
- struct page *page;
+ size_t bend;
bend = boff + bsize;
while (boff < bend) {
- page = bp->b_pages[xfs_buf_btoct(boff + bp->b_offset)];
- cpoff = xfs_buf_poff(boff + bp->b_offset);
- csize = min_t(size_t,
- PAGE_SIZE-cpoff, bp->b_count_desired-boff);
+ struct page *page;
+ int page_index, page_offset, csize;
+
+ page_index = (boff + bp->b_offset) >> PAGE_SHIFT;
+ page_offset = (boff + bp->b_offset) & ~PAGE_MASK;
+ page = bp->b_pages[page_index];
+ csize = min_t(size_t, PAGE_SIZE - page_offset,
+ BBTOB(bp->b_io_length) - boff);
- ASSERT(((csize + cpoff) <= PAGE_SIZE));
+ ASSERT((csize + page_offset) <= PAGE_SIZE);
switch (mode) {
case XBRW_ZERO:
- memset(page_address(page) + cpoff, 0, csize);
+ memset(page_address(page) + page_offset, 0, csize);
break;
case XBRW_READ:
- memcpy(data, page_address(page) + cpoff, csize);
+ memcpy(data, page_address(page) + page_offset, csize);
break;
case XBRW_WRITE:
- memcpy(page_address(page) + cpoff, data, csize);
+ memcpy(page_address(page) + page_offset, data, csize);
}
boff += csize;
@@ -1435,11 +1427,9 @@ xfs_free_buftarg(
{
unregister_shrinker(&btp->bt_shrinker);
- xfs_flush_buftarg(btp, 1);
if (mp->m_flags & XFS_MOUNT_BARRIER)
xfs_blkdev_issue_flush(btp);
- kthread_stop(btp->bt_task);
kmem_free(btp);
}
@@ -1491,20 +1481,6 @@ xfs_setsize_buftarg(
return xfs_setsize_buftarg_flags(btp, blocksize, sectorsize, 1);
}
-STATIC int
-xfs_alloc_delwri_queue(
- xfs_buftarg_t *btp,
- const char *fsname)
-{
- INIT_LIST_HEAD(&btp->bt_delwri_queue);
- spin_lock_init(&btp->bt_delwri_lock);
- btp->bt_flags = 0;
- btp->bt_task = kthread_run(xfsbufd, btp, "xfsbufd/%s", fsname);
- if (IS_ERR(btp->bt_task))
- return PTR_ERR(btp->bt_task);
- return 0;
-}
-
xfs_buftarg_t *
xfs_alloc_buftarg(
struct xfs_mount *mp,
@@ -1527,8 +1503,6 @@ xfs_alloc_buftarg(
spin_lock_init(&btp->bt_lru_lock);
if (xfs_setsize_buftarg_early(btp, bdev))
goto error;
- if (xfs_alloc_delwri_queue(btp, fsname))
- goto error;
btp->bt_shrinker.shrink = xfs_buftarg_shrink;
btp->bt_shrinker.seeks = DEFAULT_SEEKS;
register_shrinker(&btp->bt_shrinker);
@@ -1539,125 +1513,52 @@ error:
return NULL;
}
-
/*
- * Delayed write buffer handling
+ * Add a buffer to the delayed write list.
+ *
+ * This queues a buffer for writeout if it hasn't already been. Note that
+ * neither this routine nor the buffer list submission functions perform
+ * any internal synchronization. It is expected that the lists are thread-local
+ * to the callers.
+ *
+ * Returns true if we queued up the buffer, or false if it already had
+ * been on the buffer list.
*/
-void
+bool
xfs_buf_delwri_queue(
- xfs_buf_t *bp)
+ struct xfs_buf *bp,
+ struct list_head *list)
{
- struct xfs_buftarg *btp = bp->b_target;
-
- trace_xfs_buf_delwri_queue(bp, _RET_IP_);
-
+ ASSERT(xfs_buf_islocked(bp));
ASSERT(!(bp->b_flags & XBF_READ));
- spin_lock(&btp->bt_delwri_lock);
- if (!list_empty(&bp->b_list)) {
- /* if already in the queue, move it to the tail */
- ASSERT(bp->b_flags & _XBF_DELWRI_Q);
- list_move_tail(&bp->b_list, &btp->bt_delwri_queue);
- } else {
- /* start xfsbufd as it is about to have something to do */
- if (list_empty(&btp->bt_delwri_queue))
- wake_up_process(bp->b_target->bt_task);
-
- atomic_inc(&bp->b_hold);
- bp->b_flags |= XBF_DELWRI | _XBF_DELWRI_Q | XBF_ASYNC;
- list_add_tail(&bp->b_list, &btp->bt_delwri_queue);
- }
- bp->b_queuetime = jiffies;
- spin_unlock(&btp->bt_delwri_lock);
-}
-
-void
-xfs_buf_delwri_dequeue(
- xfs_buf_t *bp)
-{
- int dequeued = 0;
-
- spin_lock(&bp->b_target->bt_delwri_lock);
- if ((bp->b_flags & XBF_DELWRI) && !list_empty(&bp->b_list)) {
- ASSERT(bp->b_flags & _XBF_DELWRI_Q);
- list_del_init(&bp->b_list);
- dequeued = 1;
+ /*
+ * If the buffer is already marked delwri it already is queued up
+ * by someone else for immediate writeout. Just ignore it in that
+ * case.
+ */
+ if (bp->b_flags & _XBF_DELWRI_Q) {
+ trace_xfs_buf_delwri_queued(bp, _RET_IP_);
+ return false;
}
- bp->b_flags &= ~(XBF_DELWRI|_XBF_DELWRI_Q);
- spin_unlock(&bp->b_target->bt_delwri_lock);
-
- if (dequeued)
- xfs_buf_rele(bp);
-
- trace_xfs_buf_delwri_dequeue(bp, _RET_IP_);
-}
-
-/*
- * If a delwri buffer needs to be pushed before it has aged out, then promote
- * it to the head of the delwri queue so that it will be flushed on the next
- * xfsbufd run. We do this by resetting the queuetime of the buffer to be older
- * than the age currently needed to flush the buffer. Hence the next time the
- * xfsbufd sees it is guaranteed to be considered old enough to flush.
- */
-void
-xfs_buf_delwri_promote(
- struct xfs_buf *bp)
-{
- struct xfs_buftarg *btp = bp->b_target;
- long age = xfs_buf_age_centisecs * msecs_to_jiffies(10) + 1;
- ASSERT(bp->b_flags & XBF_DELWRI);
- ASSERT(bp->b_flags & _XBF_DELWRI_Q);
+ trace_xfs_buf_delwri_queue(bp, _RET_IP_);
/*
- * Check the buffer age before locking the delayed write queue as we
- * don't need to promote buffers that are already past the flush age.
+ * If a buffer gets written out synchronously or marked stale while it
+ * is on a delwri list we lazily remove it. To do this, the other party
+ * clears the _XBF_DELWRI_Q flag but otherwise leaves the buffer alone.
+ * It remains referenced and on the list. In a rare corner case it
+ * might get readded to a delwri list after the synchronous writeout, in
+ * which case we just need to re-add the flag here.
*/
- if (bp->b_queuetime < jiffies - age)
- return;
- bp->b_queuetime = jiffies - age;
- spin_lock(&btp->bt_delwri_lock);
- list_move(&bp->b_list, &btp->bt_delwri_queue);
- spin_unlock(&btp->bt_delwri_lock);
-}
-
-/*
- * Move as many buffers as specified to the supplied list
- * idicating if we skipped any buffers to prevent deadlocks.
- */
-STATIC int
-xfs_buf_delwri_split(
- xfs_buftarg_t *target,
- struct list_head *list,
- unsigned long age)
-{
- xfs_buf_t *bp, *n;
- int skipped = 0;
- int force;
-
- force = test_and_clear_bit(XBT_FORCE_FLUSH, &target->bt_flags);
- INIT_LIST_HEAD(list);
- spin_lock(&target->bt_delwri_lock);
- list_for_each_entry_safe(bp, n, &target->bt_delwri_queue, b_list) {
- ASSERT(bp->b_flags & XBF_DELWRI);
-
- if (!xfs_buf_ispinned(bp) && xfs_buf_trylock(bp)) {
- if (!force &&
- time_before(jiffies, bp->b_queuetime + age)) {
- xfs_buf_unlock(bp);
- break;
- }
-
- bp->b_flags &= ~(XBF_DELWRI | _XBF_DELWRI_Q);
- bp->b_flags |= XBF_WRITE;
- list_move_tail(&bp->b_list, list);
- trace_xfs_buf_delwri_split(bp, _RET_IP_);
- } else
- skipped++;
+ bp->b_flags |= _XBF_DELWRI_Q;
+ if (list_empty(&bp->b_list)) {
+ atomic_inc(&bp->b_hold);
+ list_add_tail(&bp->b_list, list);
}
- spin_unlock(&target->bt_delwri_lock);
- return skipped;
+ return true;
}
/*
@@ -1683,99 +1584,109 @@ xfs_buf_cmp(
return 0;
}
-STATIC int
-xfsbufd(
- void *data)
+static int
+__xfs_buf_delwri_submit(
+ struct list_head *buffer_list,
+ struct list_head *io_list,
+ bool wait)
{
- xfs_buftarg_t *target = (xfs_buftarg_t *)data;
-
- current->flags |= PF_MEMALLOC;
-
- set_freezable();
+ struct blk_plug plug;
+ struct xfs_buf *bp, *n;
+ int pinned = 0;
+
+ list_for_each_entry_safe(bp, n, buffer_list, b_list) {
+ if (!wait) {
+ if (xfs_buf_ispinned(bp)) {
+ pinned++;
+ continue;
+ }
+ if (!xfs_buf_trylock(bp))
+ continue;
+ } else {
+ xfs_buf_lock(bp);
+ }
- do {
- long age = xfs_buf_age_centisecs * msecs_to_jiffies(10);
- long tout = xfs_buf_timer_centisecs * msecs_to_jiffies(10);
- struct list_head tmp;
- struct blk_plug plug;
+ /*
+ * Someone else might have written the buffer synchronously or
+ * marked it stale in the meantime. In that case only the
+ * _XBF_DELWRI_Q flag got cleared, and we have to drop the
+ * reference and remove it from the list here.
+ */
+ if (!(bp->b_flags & _XBF_DELWRI_Q)) {
+ list_del_init(&bp->b_list);
+ xfs_buf_relse(bp);
+ continue;
+ }
- if (unlikely(freezing(current)))
- try_to_freeze();
+ list_move_tail(&bp->b_list, io_list);
+ trace_xfs_buf_delwri_split(bp, _RET_IP_);
+ }
- /* sleep for a long time if there is nothing to do. */
- if (list_empty(&target->bt_delwri_queue))
- tout = MAX_SCHEDULE_TIMEOUT;
- schedule_timeout_interruptible(tout);
+ list_sort(NULL, io_list, xfs_buf_cmp);
- xfs_buf_delwri_split(target, &tmp, age);
- list_sort(NULL, &tmp, xfs_buf_cmp);
+ blk_start_plug(&plug);
+ list_for_each_entry_safe(bp, n, io_list, b_list) {
+ bp->b_flags &= ~(_XBF_DELWRI_Q | XBF_ASYNC);
+ bp->b_flags |= XBF_WRITE;
- blk_start_plug(&plug);
- while (!list_empty(&tmp)) {
- struct xfs_buf *bp;
- bp = list_first_entry(&tmp, struct xfs_buf, b_list);
+ if (!wait) {
+ bp->b_flags |= XBF_ASYNC;
list_del_init(&bp->b_list);
- xfs_bdstrat_cb(bp);
}
- blk_finish_plug(&plug);
- } while (!kthread_should_stop());
+ xfs_bdstrat_cb(bp);
+ }
+ blk_finish_plug(&plug);
- return 0;
+ return pinned;
}
/*
- * Go through all incore buffers, and release buffers if they belong to
- * the given device. This is used in filesystem error handling to
- * preserve the consistency of its metadata.
+ * Write out a buffer list asynchronously.
+ *
+ * This will take the @buffer_list, write all non-locked and non-pinned buffers
+ * out and not wait for I/O completion on any of the buffers. This interface
+ * is only safely useable for callers that can track I/O completion by higher
+ * level means, e.g. AIL pushing as the @buffer_list is consumed in this
+ * function.
*/
int
-xfs_flush_buftarg(
- xfs_buftarg_t *target,
- int wait)
+xfs_buf_delwri_submit_nowait(
+ struct list_head *buffer_list)
{
- xfs_buf_t *bp;
- int pincount = 0;
- LIST_HEAD(tmp_list);
- LIST_HEAD(wait_list);
- struct blk_plug plug;
+ LIST_HEAD (io_list);
+ return __xfs_buf_delwri_submit(buffer_list, &io_list, false);
+}
- flush_workqueue(xfslogd_workqueue);
+/*
+ * Write out a buffer list synchronously.
+ *
+ * This will take the @buffer_list, write all buffers out and wait for I/O
+ * completion on all of the buffers. @buffer_list is consumed by the function,
+ * so callers must have some other way of tracking buffers if they require such
+ * functionality.
+ */
+int
+xfs_buf_delwri_submit(
+ struct list_head *buffer_list)
+{
+ LIST_HEAD (io_list);
+ int error = 0, error2;
+ struct xfs_buf *bp;
- set_bit(XBT_FORCE_FLUSH, &target->bt_flags);
- pincount = xfs_buf_delwri_split(target, &tmp_list, 0);
+ __xfs_buf_delwri_submit(buffer_list, &io_list, true);
- /*
- * Dropped the delayed write list lock, now walk the temporary list.
- * All I/O is issued async and then if we need to wait for completion
- * we do that after issuing all the IO.
- */
- list_sort(NULL, &tmp_list, xfs_buf_cmp);
+ /* Wait for IO to complete. */
+ while (!list_empty(&io_list)) {
+ bp = list_first_entry(&io_list, struct xfs_buf, b_list);
- blk_start_plug(&plug);
- while (!list_empty(&tmp_list)) {
- bp = list_first_entry(&tmp_list, struct xfs_buf, b_list);
- ASSERT(target == bp->b_target);
list_del_init(&bp->b_list);
- if (wait) {
- bp->b_flags &= ~XBF_ASYNC;
- list_add(&bp->b_list, &wait_list);
- }
- xfs_bdstrat_cb(bp);
- }
- blk_finish_plug(&plug);
-
- if (wait) {
- /* Wait for IO to complete. */
- while (!list_empty(&wait_list)) {
- bp = list_first_entry(&wait_list, struct xfs_buf, b_list);
-
- list_del_init(&bp->b_list);
- xfs_buf_iowait(bp);
- xfs_buf_relse(bp);
- }
+ error2 = xfs_buf_iowait(bp);
+ xfs_buf_relse(bp);
+ if (!error)
+ error = error2;
}
- return pincount;
+ return error;
}
int __init
diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h
index 5bf3be45f54..7f1d1392ce3 100644
--- a/fs/xfs/xfs_buf.h
+++ b/fs/xfs/xfs_buf.h
@@ -32,11 +32,6 @@
#define XFS_BUF_DADDR_NULL ((xfs_daddr_t) (-1LL))
-#define xfs_buf_ctob(pp) ((pp) * PAGE_CACHE_SIZE)
-#define xfs_buf_btoc(dd) (((dd) + PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT)
-#define xfs_buf_btoct(dd) ((dd) >> PAGE_CACHE_SHIFT)
-#define xfs_buf_poff(aa) ((aa) & ~PAGE_CACHE_MASK)
-
typedef enum {
XBRW_READ = 1, /* transfer into target memory */
XBRW_WRITE = 2, /* transfer from target memory */
@@ -46,11 +41,9 @@ typedef enum {
#define XBF_READ (1 << 0) /* buffer intended for reading from device */
#define XBF_WRITE (1 << 1) /* buffer intended for writing to device */
#define XBF_READ_AHEAD (1 << 2) /* asynchronous read-ahead */
-#define XBF_MAPPED (1 << 3) /* buffer mapped (b_addr valid) */
#define XBF_ASYNC (1 << 4) /* initiator will not wait for completion */
#define XBF_DONE (1 << 5) /* all pages in the buffer uptodate */
-#define XBF_DELWRI (1 << 6) /* buffer has dirty pages */
-#define XBF_STALE (1 << 7) /* buffer has been staled, do not find it */
+#define XBF_STALE (1 << 6) /* buffer has been staled, do not find it */
/* I/O hints for the BIO layer */
#define XBF_SYNCIO (1 << 10)/* treat this buffer as synchronous I/O */
@@ -58,14 +51,13 @@ typedef enum {
#define XBF_FLUSH (1 << 12)/* flush the disk cache before a write */
/* flags used only as arguments to access routines */
-#define XBF_LOCK (1 << 15)/* lock requested */
#define XBF_TRYLOCK (1 << 16)/* lock requested, but do not wait */
-#define XBF_DONT_BLOCK (1 << 17)/* do not block in current thread */
+#define XBF_UNMAPPED (1 << 17)/* do not map the buffer */
/* flags used only internally */
#define _XBF_PAGES (1 << 20)/* backed by refcounted pages */
#define _XBF_KMEM (1 << 21)/* backed by heap memory */
-#define _XBF_DELWRI_Q (1 << 22)/* buffer on delwri queue */
+#define _XBF_DELWRI_Q (1 << 22)/* buffer on a delwri queue */
typedef unsigned int xfs_buf_flags_t;
@@ -73,25 +65,18 @@ typedef unsigned int xfs_buf_flags_t;
{ XBF_READ, "READ" }, \
{ XBF_WRITE, "WRITE" }, \
{ XBF_READ_AHEAD, "READ_AHEAD" }, \
- { XBF_MAPPED, "MAPPED" }, \
{ XBF_ASYNC, "ASYNC" }, \
{ XBF_DONE, "DONE" }, \
- { XBF_DELWRI, "DELWRI" }, \
{ XBF_STALE, "STALE" }, \
{ XBF_SYNCIO, "SYNCIO" }, \
{ XBF_FUA, "FUA" }, \
{ XBF_FLUSH, "FLUSH" }, \
- { XBF_LOCK, "LOCK" }, /* should never be set */\
- { XBF_TRYLOCK, "TRYLOCK" }, /* ditto */\
- { XBF_DONT_BLOCK, "DONT_BLOCK" }, /* ditto */\
+ { XBF_TRYLOCK, "TRYLOCK" }, /* should never be set */\
+ { XBF_UNMAPPED, "UNMAPPED" }, /* ditto */\
{ _XBF_PAGES, "PAGES" }, \
{ _XBF_KMEM, "KMEM" }, \
{ _XBF_DELWRI_Q, "DELWRI_Q" }
-typedef enum {
- XBT_FORCE_FLUSH = 0,
-} xfs_buftarg_flags_t;
-
typedef struct xfs_buftarg {
dev_t bt_dev;
struct block_device *bt_bdev;
@@ -101,12 +86,6 @@ typedef struct xfs_buftarg {
unsigned int bt_sshift;
size_t bt_smask;
- /* per device delwri queue */
- struct task_struct *bt_task;
- struct list_head bt_delwri_queue;
- spinlock_t bt_delwri_lock;
- unsigned long bt_flags;
-
/* LRU control structures */
struct shrinker bt_shrinker;
struct list_head bt_lru;
@@ -128,8 +107,8 @@ typedef struct xfs_buf {
* fast-path on locking.
*/
struct rb_node b_rbnode; /* rbtree node */
- xfs_off_t b_file_offset; /* offset in file */
- size_t b_buffer_length;/* size of buffer in bytes */
+ xfs_daddr_t b_bn; /* block number for I/O */
+ int b_length; /* size of buffer in BBs */
atomic_t b_hold; /* reference count */
atomic_t b_lru_ref; /* lru reclaim ref count */
xfs_buf_flags_t b_flags; /* status flags */
@@ -140,8 +119,6 @@ typedef struct xfs_buf {
struct list_head b_list;
struct xfs_perag *b_pag; /* contains rbtree root */
xfs_buftarg_t *b_target; /* buffer target (device) */
- xfs_daddr_t b_bn; /* block number for I/O */
- size_t b_count_desired;/* desired transfer size */
void *b_addr; /* virtual address of buffer */
struct work_struct b_iodone_work;
xfs_buf_iodone_t b_iodone; /* I/O completion function */
@@ -150,7 +127,7 @@ typedef struct xfs_buf {
struct xfs_trans *b_transp;
struct page **b_pages; /* array of page pointers */
struct page *b_page_array[XB_PAGES]; /* inline pages */
- unsigned long b_queuetime; /* time buffer was queued */
+ int b_io_length; /* IO size in BBs */
atomic_t b_pin_count; /* pin count */
atomic_t b_io_remaining; /* #outstanding I/O requests */
unsigned int b_page_count; /* size of page array */
@@ -163,26 +140,30 @@ typedef struct xfs_buf {
/* Finding and Reading Buffers */
-extern xfs_buf_t *_xfs_buf_find(xfs_buftarg_t *, xfs_off_t, size_t,
- xfs_buf_flags_t, xfs_buf_t *);
+struct xfs_buf *_xfs_buf_find(struct xfs_buftarg *target, xfs_daddr_t blkno,
+ size_t numblks, xfs_buf_flags_t flags,
+ struct xfs_buf *new_bp);
#define xfs_incore(buftarg,blkno,len,lockit) \
_xfs_buf_find(buftarg, blkno ,len, lockit, NULL)
-extern xfs_buf_t *xfs_buf_get(xfs_buftarg_t *, xfs_off_t, size_t,
- xfs_buf_flags_t);
-extern xfs_buf_t *xfs_buf_read(xfs_buftarg_t *, xfs_off_t, size_t,
- xfs_buf_flags_t);
-
-struct xfs_buf *xfs_buf_alloc(struct xfs_buftarg *, xfs_off_t, size_t,
- xfs_buf_flags_t);
-extern void xfs_buf_set_empty(struct xfs_buf *bp, size_t len);
-extern xfs_buf_t *xfs_buf_get_uncached(struct xfs_buftarg *, size_t, int);
-extern int xfs_buf_associate_memory(xfs_buf_t *, void *, size_t);
-extern void xfs_buf_hold(xfs_buf_t *);
-extern void xfs_buf_readahead(xfs_buftarg_t *, xfs_off_t, size_t);
-struct xfs_buf *xfs_buf_read_uncached(struct xfs_mount *mp,
- struct xfs_buftarg *target,
- xfs_daddr_t daddr, size_t length, int flags);
+struct xfs_buf *xfs_buf_get(struct xfs_buftarg *target, xfs_daddr_t blkno,
+ size_t numblks, xfs_buf_flags_t flags);
+struct xfs_buf *xfs_buf_read(struct xfs_buftarg *target, xfs_daddr_t blkno,
+ size_t numblks, xfs_buf_flags_t flags);
+void xfs_buf_readahead(struct xfs_buftarg *target, xfs_daddr_t blkno,
+ size_t numblks);
+
+struct xfs_buf *xfs_buf_get_empty(struct xfs_buftarg *target, size_t numblks);
+struct xfs_buf *xfs_buf_alloc(struct xfs_buftarg *target, xfs_daddr_t blkno,
+ size_t numblks, xfs_buf_flags_t flags);
+void xfs_buf_set_empty(struct xfs_buf *bp, size_t numblks);
+int xfs_buf_associate_memory(struct xfs_buf *bp, void *mem, size_t length);
+
+struct xfs_buf *xfs_buf_get_uncached(struct xfs_buftarg *target, size_t numblks,
+ int flags);
+struct xfs_buf *xfs_buf_read_uncached(struct xfs_buftarg *target,
+ xfs_daddr_t daddr, size_t numblks, int flags);
+void xfs_buf_hold(struct xfs_buf *bp);
/* Releasing Buffers */
extern void xfs_buf_free(xfs_buf_t *);
@@ -204,7 +185,7 @@ extern int xfs_bdstrat_cb(struct xfs_buf *);
extern void xfs_buf_ioend(xfs_buf_t *, int);
extern void xfs_buf_ioerror(xfs_buf_t *, int);
extern void xfs_buf_ioerror_alert(struct xfs_buf *, const char *func);
-extern int xfs_buf_iorequest(xfs_buf_t *);
+extern void xfs_buf_iorequest(xfs_buf_t *);
extern int xfs_buf_iowait(xfs_buf_t *);
extern void xfs_buf_iomove(xfs_buf_t *, size_t, size_t, void *,
xfs_buf_rw_t);
@@ -220,24 +201,22 @@ static inline int xfs_buf_geterror(xfs_buf_t *bp)
extern xfs_caddr_t xfs_buf_offset(xfs_buf_t *, size_t);
/* Delayed Write Buffer Routines */
-extern void xfs_buf_delwri_queue(struct xfs_buf *);
-extern void xfs_buf_delwri_dequeue(struct xfs_buf *);
-extern void xfs_buf_delwri_promote(struct xfs_buf *);
+extern bool xfs_buf_delwri_queue(struct xfs_buf *, struct list_head *);
+extern int xfs_buf_delwri_submit(struct list_head *);
+extern int xfs_buf_delwri_submit_nowait(struct list_head *);
/* Buffer Daemon Setup Routines */
extern int xfs_buf_init(void);
extern void xfs_buf_terminate(void);
#define XFS_BUF_ZEROFLAGS(bp) \
- ((bp)->b_flags &= ~(XBF_READ|XBF_WRITE|XBF_ASYNC|XBF_DELWRI| \
+ ((bp)->b_flags &= ~(XBF_READ|XBF_WRITE|XBF_ASYNC| \
XBF_SYNCIO|XBF_FUA|XBF_FLUSH))
void xfs_buf_stale(struct xfs_buf *bp);
#define XFS_BUF_UNSTALE(bp) ((bp)->b_flags &= ~XBF_STALE)
#define XFS_BUF_ISSTALE(bp) ((bp)->b_flags & XBF_STALE)
-#define XFS_BUF_ISDELAYWRITE(bp) ((bp)->b_flags & XBF_DELWRI)
-
#define XFS_BUF_DONE(bp) ((bp)->b_flags |= XBF_DONE)
#define XFS_BUF_UNDONE(bp) ((bp)->b_flags &= ~XBF_DONE)
#define XFS_BUF_ISDONE(bp) ((bp)->b_flags & XBF_DONE)
@@ -256,12 +235,6 @@ void xfs_buf_stale(struct xfs_buf *bp);
#define XFS_BUF_ADDR(bp) ((bp)->b_bn)
#define XFS_BUF_SET_ADDR(bp, bno) ((bp)->b_bn = (xfs_daddr_t)(bno))
-#define XFS_BUF_OFFSET(bp) ((bp)->b_file_offset)
-#define XFS_BUF_SET_OFFSET(bp, off) ((bp)->b_file_offset = (off))
-#define XFS_BUF_COUNT(bp) ((bp)->b_count_desired)
-#define XFS_BUF_SET_COUNT(bp, cnt) ((bp)->b_count_desired = (cnt))
-#define XFS_BUF_SIZE(bp) ((bp)->b_buffer_length)
-#define XFS_BUF_SET_SIZE(bp, cnt) ((bp)->b_buffer_length = (cnt))
static inline void xfs_buf_set_ref(struct xfs_buf *bp, int lru_ref)
{
@@ -287,7 +260,6 @@ extern xfs_buftarg_t *xfs_alloc_buftarg(struct xfs_mount *,
extern void xfs_free_buftarg(struct xfs_mount *, struct xfs_buftarg *);
extern void xfs_wait_buftarg(xfs_buftarg_t *);
extern int xfs_setsize_buftarg(xfs_buftarg_t *, unsigned int, unsigned int);
-extern int xfs_flush_buftarg(xfs_buftarg_t *, int);
#define xfs_getsize_buftarg(buftarg) block_size((buftarg)->bt_bdev)
#define xfs_readonly_buftarg(buftarg) bdev_read_only((buftarg)->bt_bdev)
diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
index eac97ef81e2..45df2b857d4 100644
--- a/fs/xfs/xfs_buf_item.c
+++ b/fs/xfs/xfs_buf_item.c
@@ -20,7 +20,6 @@
#include "xfs_types.h"
#include "xfs_bit.h"
#include "xfs_log.h"
-#include "xfs_inum.h"
#include "xfs_trans.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
@@ -123,11 +122,11 @@ xfs_buf_item_log_check(
ASSERT(bip->bli_logged != NULL);
bp = bip->bli_buf;
- ASSERT(XFS_BUF_COUNT(bp) > 0);
+ ASSERT(bp->b_length > 0);
ASSERT(bp->b_addr != NULL);
orig = bip->bli_orig;
buffer = bp->b_addr;
- for (x = 0; x < XFS_BUF_COUNT(bp); x++) {
+ for (x = 0; x < BBTOB(bp->b_length); x++) {
if (orig[x] != buffer[x] && !btst(bip->bli_logged, x)) {
xfs_emerg(bp->b_mount,
"%s: bip %x buffer %x orig %x index %d",
@@ -418,7 +417,6 @@ xfs_buf_item_unpin(
if (freed && stale) {
ASSERT(bip->bli_flags & XFS_BLI_STALE);
ASSERT(xfs_buf_islocked(bp));
- ASSERT(!(XFS_BUF_ISDELAYWRITE(bp)));
ASSERT(XFS_BUF_ISSTALE(bp));
ASSERT(bip->bli_format.blf_flags & XFS_BLF_CANCEL);
@@ -455,42 +453,42 @@ xfs_buf_item_unpin(
bp->b_iodone = NULL;
} else {
spin_lock(&ailp->xa_lock);
- xfs_trans_ail_delete(ailp, (xfs_log_item_t *)bip);
+ xfs_trans_ail_delete(ailp, lip, SHUTDOWN_LOG_IO_ERROR);
xfs_buf_item_relse(bp);
ASSERT(bp->b_fspriv == NULL);
}
xfs_buf_relse(bp);
+ } else if (freed && remove) {
+ xfs_buf_lock(bp);
+ xfs_buf_ioerror(bp, EIO);
+ XFS_BUF_UNDONE(bp);
+ xfs_buf_stale(bp);
+ xfs_buf_ioend(bp, 0);
}
}
-/*
- * This is called to attempt to lock the buffer associated with this
- * buf log item. Don't sleep on the buffer lock. If we can't get
- * the lock right away, return 0. If we can get the lock, take a
- * reference to the buffer. If this is a delayed write buffer that
- * needs AIL help to be written back, invoke the pushbuf routine
- * rather than the normal success path.
- */
STATIC uint
-xfs_buf_item_trylock(
- struct xfs_log_item *lip)
+xfs_buf_item_push(
+ struct xfs_log_item *lip,
+ struct list_head *buffer_list)
{
struct xfs_buf_log_item *bip = BUF_ITEM(lip);
struct xfs_buf *bp = bip->bli_buf;
+ uint rval = XFS_ITEM_SUCCESS;
if (xfs_buf_ispinned(bp))
return XFS_ITEM_PINNED;
if (!xfs_buf_trylock(bp))
return XFS_ITEM_LOCKED;
- /* take a reference to the buffer. */
- xfs_buf_hold(bp);
-
ASSERT(!(bip->bli_flags & XFS_BLI_STALE));
- trace_xfs_buf_item_trylock(bip);
- if (XFS_BUF_ISDELAYWRITE(bp))
- return XFS_ITEM_PUSHBUF;
- return XFS_ITEM_SUCCESS;
+
+ trace_xfs_buf_item_push(bip);
+
+ if (!xfs_buf_delwri_queue(bp, buffer_list))
+ rval = XFS_ITEM_FLUSHING;
+ xfs_buf_unlock(bp);
+ return rval;
}
/*
@@ -603,49 +601,6 @@ xfs_buf_item_committed(
return lsn;
}
-/*
- * The buffer is locked, but is not a delayed write buffer. This happens
- * if we race with IO completion and hence we don't want to try to write it
- * again. Just release the buffer.
- */
-STATIC void
-xfs_buf_item_push(
- struct xfs_log_item *lip)
-{
- struct xfs_buf_log_item *bip = BUF_ITEM(lip);
- struct xfs_buf *bp = bip->bli_buf;
-
- ASSERT(!(bip->bli_flags & XFS_BLI_STALE));
- ASSERT(!XFS_BUF_ISDELAYWRITE(bp));
-
- trace_xfs_buf_item_push(bip);
-
- xfs_buf_relse(bp);
-}
-
-/*
- * The buffer is locked and is a delayed write buffer. Promote the buffer
- * in the delayed write queue as the caller knows that they must invoke
- * the xfsbufd to get this buffer written. We have to unlock the buffer
- * to allow the xfsbufd to write it, too.
- */
-STATIC bool
-xfs_buf_item_pushbuf(
- struct xfs_log_item *lip)
-{
- struct xfs_buf_log_item *bip = BUF_ITEM(lip);
- struct xfs_buf *bp = bip->bli_buf;
-
- ASSERT(!(bip->bli_flags & XFS_BLI_STALE));
- ASSERT(XFS_BUF_ISDELAYWRITE(bp));
-
- trace_xfs_buf_item_pushbuf(bip);
-
- xfs_buf_delwri_promote(bp);
- xfs_buf_relse(bp);
- return true;
-}
-
STATIC void
xfs_buf_item_committing(
struct xfs_log_item *lip,
@@ -661,11 +616,9 @@ static const struct xfs_item_ops xfs_buf_item_ops = {
.iop_format = xfs_buf_item_format,
.iop_pin = xfs_buf_item_pin,
.iop_unpin = xfs_buf_item_unpin,
- .iop_trylock = xfs_buf_item_trylock,
.iop_unlock = xfs_buf_item_unlock,
.iop_committed = xfs_buf_item_committed,
.iop_push = xfs_buf_item_push,
- .iop_pushbuf = xfs_buf_item_pushbuf,
.iop_committing = xfs_buf_item_committing
};
@@ -703,7 +656,8 @@ xfs_buf_item_init(
* truncate any pieces. map_size is the size of the
* bitmap needed to describe the chunks of the buffer.
*/
- chunks = (int)((XFS_BUF_COUNT(bp) + (XFS_BLF_CHUNK - 1)) >> XFS_BLF_SHIFT);
+ chunks = (int)((BBTOB(bp->b_length) + (XFS_BLF_CHUNK - 1)) >>
+ XFS_BLF_SHIFT);
map_size = (int)((chunks + NBWORD) >> BIT_TO_WORD_SHIFT);
bip = (xfs_buf_log_item_t*)kmem_zone_zalloc(xfs_buf_item_zone,
@@ -713,7 +667,7 @@ xfs_buf_item_init(
xfs_buf_hold(bp);
bip->bli_format.blf_type = XFS_LI_BUF;
bip->bli_format.blf_blkno = (__int64_t)XFS_BUF_ADDR(bp);
- bip->bli_format.blf_len = (ushort)BTOBB(XFS_BUF_COUNT(bp));
+ bip->bli_format.blf_len = (ushort)bp->b_length;
bip->bli_format.blf_map_size = map_size;
#ifdef XFS_TRANS_DEBUG
@@ -725,9 +679,9 @@ xfs_buf_item_init(
* the buffer to indicate which bytes the callers have asked
* to have logged.
*/
- bip->bli_orig = (char *)kmem_alloc(XFS_BUF_COUNT(bp), KM_SLEEP);
- memcpy(bip->bli_orig, bp->b_addr, XFS_BUF_COUNT(bp));
- bip->bli_logged = (char *)kmem_zalloc(XFS_BUF_COUNT(bp) / NBBY, KM_SLEEP);
+ bip->bli_orig = kmem_alloc(BBTOB(bp->b_length), KM_SLEEP);
+ memcpy(bip->bli_orig, bp->b_addr, BBTOB(bp->b_length));
+ bip->bli_logged = kmem_zalloc(BBTOB(bp->b_length) / NBBY, KM_SLEEP);
#endif
/*
@@ -984,20 +938,27 @@ xfs_buf_iodone_callbacks(
* If the write was asynchronous then no one will be looking for the
* error. Clear the error state and write the buffer out again.
*
- * During sync or umount we'll write all pending buffers again
- * synchronous, which will catch these errors if they keep hanging
- * around.
+ * XXX: This helps against transient write errors, but we need to find
+ * a way to shut the filesystem down if the writes keep failing.
+ *
+ * In practice we'll shut the filesystem down soon as non-transient
+ * errors tend to affect the whole device and a failing log write
+ * will make us give up. But we really ought to do better here.
*/
if (XFS_BUF_ISASYNC(bp)) {
+ ASSERT(bp->b_iodone != NULL);
+
+ trace_xfs_buf_item_iodone_async(bp, _RET_IP_);
+
xfs_buf_ioerror(bp, 0); /* errno of 0 unsets the flag */
if (!XFS_BUF_ISSTALE(bp)) {
- xfs_buf_delwri_queue(bp);
- XFS_BUF_DONE(bp);
+ bp->b_flags |= XBF_WRITE | XBF_ASYNC | XBF_DONE;
+ xfs_bdstrat_cb(bp);
+ } else {
+ xfs_buf_relse(bp);
}
- ASSERT(bp->b_iodone != NULL);
- trace_xfs_buf_item_iodone_async(bp, _RET_IP_);
- xfs_buf_relse(bp);
+
return;
}
@@ -1045,6 +1006,6 @@ xfs_buf_iodone(
* Either way, AIL is useless if we're forcing a shutdown.
*/
spin_lock(&ailp->xa_lock);
- xfs_trans_ail_delete(ailp, lip);
+ xfs_trans_ail_delete(ailp, lip, SHUTDOWN_CORRUPT_INCORE);
xfs_buf_item_free(BUF_ITEM(lip));
}
diff --git a/fs/xfs/xfs_da_btree.c b/fs/xfs/xfs_da_btree.c
index 7f1a6f5b05a..015b946c580 100644
--- a/fs/xfs/xfs_da_btree.c
+++ b/fs/xfs/xfs_da_btree.c
@@ -20,7 +20,6 @@
#include "xfs_types.h"
#include "xfs_bit.h"
#include "xfs_log.h"
-#include "xfs_inum.h"
#include "xfs_trans.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
@@ -2277,20 +2276,20 @@ xfs_da_buf_make(int nbuf, xfs_buf_t **bps)
if (nbuf == 1) {
dabuf->nbuf = 1;
bp = bps[0];
- dabuf->bbcount = (short)BTOBB(XFS_BUF_COUNT(bp));
+ dabuf->bbcount = bp->b_length;
dabuf->data = bp->b_addr;
dabuf->bps[0] = bp;
} else {
dabuf->nbuf = nbuf;
for (i = 0, dabuf->bbcount = 0; i < nbuf; i++) {
dabuf->bps[i] = bp = bps[i];
- dabuf->bbcount += BTOBB(XFS_BUF_COUNT(bp));
+ dabuf->bbcount += bp->b_length;
}
dabuf->data = kmem_alloc(BBTOB(dabuf->bbcount), KM_SLEEP);
- for (i = off = 0; i < nbuf; i++, off += XFS_BUF_COUNT(bp)) {
+ for (i = off = 0; i < nbuf; i++, off += BBTOB(bp->b_length)) {
bp = bps[i];
memcpy((char *)dabuf->data + off, bp->b_addr,
- XFS_BUF_COUNT(bp));
+ BBTOB(bp->b_length));
}
}
return dabuf;
@@ -2310,10 +2309,10 @@ xfs_da_buf_clean(xfs_dabuf_t *dabuf)
ASSERT(dabuf->nbuf > 1);
dabuf->dirty = 0;
for (i = off = 0; i < dabuf->nbuf;
- i++, off += XFS_BUF_COUNT(bp)) {
+ i++, off += BBTOB(bp->b_length)) {
bp = dabuf->bps[i];
memcpy(bp->b_addr, dabuf->data + off,
- XFS_BUF_COUNT(bp));
+ BBTOB(bp->b_length));
}
}
}
@@ -2356,10 +2355,10 @@ xfs_da_log_buf(xfs_trans_t *tp, xfs_dabuf_t *dabuf, uint first, uint last)
}
dabuf->dirty = 1;
ASSERT(first <= last);
- for (i = off = 0; i < dabuf->nbuf; i++, off += XFS_BUF_COUNT(bp)) {
+ for (i = off = 0; i < dabuf->nbuf; i++, off += BBTOB(bp->b_length)) {
bp = dabuf->bps[i];
f = off;
- l = f + XFS_BUF_COUNT(bp) - 1;
+ l = f + BBTOB(bp->b_length) - 1;
if (f < first)
f = first;
if (l > last)
diff --git a/fs/xfs/xfs_dfrag.c b/fs/xfs/xfs_dfrag.c
index 1137bbc5ecc..e00de08dc8a 100644
--- a/fs/xfs/xfs_dfrag.c
+++ b/fs/xfs/xfs_dfrag.c
@@ -18,9 +18,7 @@
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_types.h"
-#include "xfs_bit.h"
#include "xfs_log.h"
-#include "xfs_inum.h"
#include "xfs_trans.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
diff --git a/fs/xfs/xfs_dir2.c b/fs/xfs/xfs_dir2.c
index a2e27010c7f..67a250c36d4 100644
--- a/fs/xfs/xfs_dir2.c
+++ b/fs/xfs/xfs_dir2.c
@@ -18,7 +18,6 @@
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_types.h"
-#include "xfs_bit.h"
#include "xfs_log.h"
#include "xfs_inum.h"
#include "xfs_trans.h"
diff --git a/fs/xfs/xfs_dir2_block.c b/fs/xfs/xfs_dir2_block.c
index d3b63aefd01..586732f2d80 100644
--- a/fs/xfs/xfs_dir2_block.c
+++ b/fs/xfs/xfs_dir2_block.c
@@ -19,7 +19,6 @@
#include "xfs_fs.h"
#include "xfs_types.h"
#include "xfs_log.h"
-#include "xfs_inum.h"
#include "xfs_trans.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
diff --git a/fs/xfs/xfs_dir2_data.c b/fs/xfs/xfs_dir2_data.c
index 5bbe2a8a023..2046988e9eb 100644
--- a/fs/xfs/xfs_dir2_data.c
+++ b/fs/xfs/xfs_dir2_data.c
@@ -19,7 +19,6 @@
#include "xfs_fs.h"
#include "xfs_types.h"
#include "xfs_log.h"
-#include "xfs_inum.h"
#include "xfs_trans.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
diff --git a/fs/xfs/xfs_dir2_leaf.c b/fs/xfs/xfs_dir2_leaf.c
index 66e108f561a..397ffbcbab1 100644
--- a/fs/xfs/xfs_dir2_leaf.c
+++ b/fs/xfs/xfs_dir2_leaf.c
@@ -20,7 +20,6 @@
#include "xfs_types.h"
#include "xfs_bit.h"
#include "xfs_log.h"
-#include "xfs_inum.h"
#include "xfs_trans.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
diff --git a/fs/xfs/xfs_dir2_node.c b/fs/xfs/xfs_dir2_node.c
index 0179a41d9e5..b0f26780449 100644
--- a/fs/xfs/xfs_dir2_node.c
+++ b/fs/xfs/xfs_dir2_node.c
@@ -19,7 +19,6 @@
#include "xfs_fs.h"
#include "xfs_types.h"
#include "xfs_log.h"
-#include "xfs_inum.h"
#include "xfs_trans.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
diff --git a/fs/xfs/xfs_dir2_sf.c b/fs/xfs/xfs_dir2_sf.c
index 79d05e84e29..19bf0c5e38f 100644
--- a/fs/xfs/xfs_dir2_sf.c
+++ b/fs/xfs/xfs_dir2_sf.c
@@ -19,7 +19,6 @@
#include "xfs_fs.h"
#include "xfs_types.h"
#include "xfs_log.h"
-#include "xfs_inum.h"
#include "xfs_trans.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
diff --git a/fs/xfs/xfs_discard.c b/fs/xfs/xfs_discard.c
index 1ad3a4b8ca4..f9c3fe304a1 100644
--- a/fs/xfs/xfs_discard.c
+++ b/fs/xfs/xfs_discard.c
@@ -17,7 +17,6 @@
*/
#include "xfs.h"
#include "xfs_sb.h"
-#include "xfs_inum.h"
#include "xfs_log.h"
#include "xfs_ag.h"
#include "xfs_mount.h"
@@ -30,6 +29,7 @@
#include "xfs_inode.h"
#include "xfs_alloc.h"
#include "xfs_error.h"
+#include "xfs_extent_busy.h"
#include "xfs_discard.h"
#include "xfs_trace.h"
@@ -118,7 +118,7 @@ xfs_trim_extents(
* If any blocks in the range are still busy, skip the
* discard and try again the next time.
*/
- if (xfs_alloc_busy_search(mp, agno, fbno, flen)) {
+ if (xfs_extent_busy_search(mp, agno, fbno, flen)) {
trace_xfs_discard_busy(mp, agno, fbno, flen);
goto next_extent;
}
@@ -212,7 +212,7 @@ xfs_discard_extents(
struct xfs_mount *mp,
struct list_head *list)
{
- struct xfs_busy_extent *busyp;
+ struct xfs_extent_busy *busyp;
int error = 0;
list_for_each_entry(busyp, list, list) {
diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c
index 1155208fa83..bf27fcca484 100644
--- a/fs/xfs/xfs_dquot.c
+++ b/fs/xfs/xfs_dquot.c
@@ -19,7 +19,6 @@
#include "xfs_fs.h"
#include "xfs_bit.h"
#include "xfs_log.h"
-#include "xfs_inum.h"
#include "xfs_trans.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
@@ -857,7 +856,7 @@ xfs_qm_dqflush_done(
/* xfs_trans_ail_delete() drops the AIL lock. */
spin_lock(&ailp->xa_lock);
if (lip->li_lsn == qip->qli_flush_lsn)
- xfs_trans_ail_delete(ailp, lip);
+ xfs_trans_ail_delete(ailp, lip, SHUTDOWN_CORRUPT_INCORE);
else
spin_unlock(&ailp->xa_lock);
}
@@ -878,8 +877,8 @@ xfs_qm_dqflush_done(
*/
int
xfs_qm_dqflush(
- xfs_dquot_t *dqp,
- uint flags)
+ struct xfs_dquot *dqp,
+ struct xfs_buf **bpp)
{
struct xfs_mount *mp = dqp->q_mount;
struct xfs_buf *bp;
@@ -891,25 +890,30 @@ xfs_qm_dqflush(
trace_xfs_dqflush(dqp);
- /*
- * If not dirty, or it's pinned and we are not supposed to block, nada.
- */
- if (!XFS_DQ_IS_DIRTY(dqp) ||
- ((flags & SYNC_TRYLOCK) && atomic_read(&dqp->q_pincount) > 0)) {
- xfs_dqfunlock(dqp);
- return 0;
- }
+ *bpp = NULL;
+
xfs_qm_dqunpin_wait(dqp);
/*
* This may have been unpinned because the filesystem is shutting
* down forcibly. If that's the case we must not write this dquot
- * to disk, because the log record didn't make it to disk!
+ * to disk, because the log record didn't make it to disk.
+ *
+ * We also have to remove the log item from the AIL in this case,
+ * as we wait for an empty AIL as part of the unmount process.
*/
if (XFS_FORCED_SHUTDOWN(mp)) {
+ struct xfs_log_item *lip = &dqp->q_logitem.qli_item;
dqp->dq_flags &= ~XFS_DQ_DIRTY;
- xfs_dqfunlock(dqp);
- return XFS_ERROR(EIO);
+
+ spin_lock(&mp->m_ail->xa_lock);
+ if (lip->li_flags & XFS_LI_IN_AIL)
+ xfs_trans_ail_delete(mp->m_ail, lip,
+ SHUTDOWN_CORRUPT_INCORE);
+ else
+ spin_unlock(&mp->m_ail->xa_lock);
+ error = XFS_ERROR(EIO);
+ goto out_unlock;
}
/*
@@ -917,11 +921,8 @@ xfs_qm_dqflush(
*/
error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, dqp->q_blkno,
mp->m_quotainfo->qi_dqchunklen, 0, &bp);
- if (error) {
- ASSERT(error != ENOENT);
- xfs_dqfunlock(dqp);
- return error;
- }
+ if (error)
+ goto out_unlock;
/*
* Calculate the location of the dquot inside the buffer.
@@ -967,20 +968,13 @@ xfs_qm_dqflush(
xfs_log_force(mp, 0);
}
- if (flags & SYNC_WAIT)
- error = xfs_bwrite(bp);
- else
- xfs_buf_delwri_queue(bp);
-
- xfs_buf_relse(bp);
-
trace_xfs_dqflush_done(dqp);
+ *bpp = bp;
+ return 0;
- /*
- * dqp is still locked, but caller is free to unlock it now.
- */
- return error;
-
+out_unlock:
+ xfs_dqfunlock(dqp);
+ return XFS_ERROR(EIO);
}
/*
@@ -1011,39 +1005,6 @@ xfs_dqlock2(
}
}
-/*
- * Give the buffer a little push if it is incore and
- * wait on the flush lock.
- */
-void
-xfs_dqflock_pushbuf_wait(
- xfs_dquot_t *dqp)
-{
- xfs_mount_t *mp = dqp->q_mount;
- xfs_buf_t *bp;
-
- /*
- * Check to see if the dquot has been flushed delayed
- * write. If so, grab its buffer and send it
- * out immediately. We'll be able to acquire
- * the flush lock when the I/O completes.
- */
- bp = xfs_incore(mp->m_ddev_targp, dqp->q_blkno,
- mp->m_quotainfo->qi_dqchunklen, XBF_TRYLOCK);
- if (!bp)
- goto out_lock;
-
- if (XFS_BUF_ISDELAYWRITE(bp)) {
- if (xfs_buf_ispinned(bp))
- xfs_log_force(mp, 0);
- xfs_buf_delwri_promote(bp);
- wake_up_process(bp->b_target->bt_task);
- }
- xfs_buf_relse(bp);
-out_lock:
- xfs_dqflock(dqp);
-}
-
int __init
xfs_qm_init(void)
{
diff --git a/fs/xfs/xfs_dquot.h b/fs/xfs/xfs_dquot.h
index ef9190bd8b3..7d20af27346 100644
--- a/fs/xfs/xfs_dquot.h
+++ b/fs/xfs/xfs_dquot.h
@@ -141,7 +141,7 @@ static inline xfs_dquot_t *xfs_inode_dquot(struct xfs_inode *ip, int type)
extern int xfs_qm_dqread(struct xfs_mount *, xfs_dqid_t, uint,
uint, struct xfs_dquot **);
extern void xfs_qm_dqdestroy(xfs_dquot_t *);
-extern int xfs_qm_dqflush(xfs_dquot_t *, uint);
+extern int xfs_qm_dqflush(struct xfs_dquot *, struct xfs_buf **);
extern void xfs_qm_dqunpin_wait(xfs_dquot_t *);
extern void xfs_qm_adjust_dqtimers(xfs_mount_t *,
xfs_disk_dquot_t *);
@@ -152,7 +152,6 @@ extern int xfs_qm_dqget(xfs_mount_t *, xfs_inode_t *,
extern void xfs_qm_dqput(xfs_dquot_t *);
extern void xfs_dqlock2(struct xfs_dquot *, struct xfs_dquot *);
-extern void xfs_dqflock_pushbuf_wait(struct xfs_dquot *dqp);
static inline struct xfs_dquot *xfs_qm_dqhold(struct xfs_dquot *dqp)
{
diff --git a/fs/xfs/xfs_dquot_item.c b/fs/xfs/xfs_dquot_item.c
index 34baeae4526..57aa4b03720 100644
--- a/fs/xfs/xfs_dquot_item.c
+++ b/fs/xfs/xfs_dquot_item.c
@@ -17,9 +17,7 @@
*/
#include "xfs.h"
#include "xfs_fs.h"
-#include "xfs_bit.h"
#include "xfs_log.h"
-#include "xfs_inum.h"
#include "xfs_trans.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
@@ -108,38 +106,6 @@ xfs_qm_dquot_logitem_unpin(
wake_up(&dqp->q_pinwait);
}
-/*
- * Given the logitem, this writes the corresponding dquot entry to disk
- * asynchronously. This is called with the dquot entry securely locked;
- * we simply get xfs_qm_dqflush() to do the work, and unlock the dquot
- * at the end.
- */
-STATIC void
-xfs_qm_dquot_logitem_push(
- struct xfs_log_item *lip)
-{
- struct xfs_dquot *dqp = DQUOT_ITEM(lip)->qli_dquot;
- int error;
-
- ASSERT(XFS_DQ_IS_LOCKED(dqp));
- ASSERT(!completion_done(&dqp->q_flush));
-
- /*
- * Since we were able to lock the dquot's flush lock and
- * we found it on the AIL, the dquot must be dirty. This
- * is because the dquot is removed from the AIL while still
- * holding the flush lock in xfs_dqflush_done(). Thus, if
- * we found it in the AIL and were able to obtain the flush
- * lock without sleeping, then there must not have been
- * anyone in the process of flushing the dquot.
- */
- error = xfs_qm_dqflush(dqp, SYNC_TRYLOCK);
- if (error)
- xfs_warn(dqp->q_mount, "%s: push error %d on dqp %p",
- __func__, error, dqp);
- xfs_dqunlock(dqp);
-}
-
STATIC xfs_lsn_t
xfs_qm_dquot_logitem_committed(
struct xfs_log_item *lip,
@@ -171,67 +137,15 @@ xfs_qm_dqunpin_wait(
wait_event(dqp->q_pinwait, (atomic_read(&dqp->q_pincount) == 0));
}
-/*
- * This is called when IOP_TRYLOCK returns XFS_ITEM_PUSHBUF to indicate that
- * the dquot is locked by us, but the flush lock isn't. So, here we are
- * going to see if the relevant dquot buffer is incore, waiting on DELWRI.
- * If so, we want to push it out to help us take this item off the AIL as soon
- * as possible.
- *
- * We must not be holding the AIL lock at this point. Calling incore() to
- * search the buffer cache can be a time consuming thing, and AIL lock is a
- * spinlock.
- */
-STATIC bool
-xfs_qm_dquot_logitem_pushbuf(
- struct xfs_log_item *lip)
-{
- struct xfs_dq_logitem *qlip = DQUOT_ITEM(lip);
- struct xfs_dquot *dqp = qlip->qli_dquot;
- struct xfs_buf *bp;
- bool ret = true;
-
- ASSERT(XFS_DQ_IS_LOCKED(dqp));
-
- /*
- * If flushlock isn't locked anymore, chances are that the
- * inode flush completed and the inode was taken off the AIL.
- * So, just get out.
- */
- if (completion_done(&dqp->q_flush) ||
- !(lip->li_flags & XFS_LI_IN_AIL)) {
- xfs_dqunlock(dqp);
- return true;
- }
-
- bp = xfs_incore(dqp->q_mount->m_ddev_targp, qlip->qli_format.qlf_blkno,
- dqp->q_mount->m_quotainfo->qi_dqchunklen, XBF_TRYLOCK);
- xfs_dqunlock(dqp);
- if (!bp)
- return true;
- if (XFS_BUF_ISDELAYWRITE(bp))
- xfs_buf_delwri_promote(bp);
- if (xfs_buf_ispinned(bp))
- ret = false;
- xfs_buf_relse(bp);
- return ret;
-}
-
-/*
- * This is called to attempt to lock the dquot associated with this
- * dquot log item. Don't sleep on the dquot lock or the flush lock.
- * If the flush lock is already held, indicating that the dquot has
- * been or is in the process of being flushed, then see if we can
- * find the dquot's buffer in the buffer cache without sleeping. If
- * we can and it is marked delayed write, then we want to send it out.
- * We delay doing so until the push routine, though, to avoid sleeping
- * in any device strategy routines.
- */
STATIC uint
-xfs_qm_dquot_logitem_trylock(
- struct xfs_log_item *lip)
+xfs_qm_dquot_logitem_push(
+ struct xfs_log_item *lip,
+ struct list_head *buffer_list)
{
struct xfs_dquot *dqp = DQUOT_ITEM(lip)->qli_dquot;
+ struct xfs_buf *bp = NULL;
+ uint rval = XFS_ITEM_SUCCESS;
+ int error;
if (atomic_read(&dqp->q_pincount) > 0)
return XFS_ITEM_PINNED;
@@ -239,16 +153,41 @@ xfs_qm_dquot_logitem_trylock(
if (!xfs_dqlock_nowait(dqp))
return XFS_ITEM_LOCKED;
+ /*
+ * Re-check the pincount now that we stabilized the value by
+ * taking the quota lock.
+ */
+ if (atomic_read(&dqp->q_pincount) > 0) {
+ rval = XFS_ITEM_PINNED;
+ goto out_unlock;
+ }
+
+ /*
+ * Someone else is already flushing the dquot. Nothing we can do
+ * here but wait for the flush to finish and remove the item from
+ * the AIL.
+ */
if (!xfs_dqflock_nowait(dqp)) {
- /*
- * dquot has already been flushed to the backing buffer,
- * leave it locked, pushbuf routine will unlock it.
- */
- return XFS_ITEM_PUSHBUF;
+ rval = XFS_ITEM_FLUSHING;
+ goto out_unlock;
}
- ASSERT(lip->li_flags & XFS_LI_IN_AIL);
- return XFS_ITEM_SUCCESS;
+ spin_unlock(&lip->li_ailp->xa_lock);
+
+ error = xfs_qm_dqflush(dqp, &bp);
+ if (error) {
+ xfs_warn(dqp->q_mount, "%s: push error %d on dqp %p",
+ __func__, error, dqp);
+ } else {
+ if (!xfs_buf_delwri_queue(bp, buffer_list))
+ rval = XFS_ITEM_FLUSHING;
+ xfs_buf_relse(bp);
+ }
+
+ spin_lock(&lip->li_ailp->xa_lock);
+out_unlock:
+ xfs_dqunlock(dqp);
+ return rval;
}
/*
@@ -299,11 +238,9 @@ static const struct xfs_item_ops xfs_dquot_item_ops = {
.iop_format = xfs_qm_dquot_logitem_format,
.iop_pin = xfs_qm_dquot_logitem_pin,
.iop_unpin = xfs_qm_dquot_logitem_unpin,
- .iop_trylock = xfs_qm_dquot_logitem_trylock,
.iop_unlock = xfs_qm_dquot_logitem_unlock,
.iop_committed = xfs_qm_dquot_logitem_committed,
.iop_push = xfs_qm_dquot_logitem_push,
- .iop_pushbuf = xfs_qm_dquot_logitem_pushbuf,
.iop_committing = xfs_qm_dquot_logitem_committing
};
@@ -398,11 +335,13 @@ xfs_qm_qoff_logitem_unpin(
}
/*
- * Quotaoff items have no locking, so just return success.
+ * There isn't much you can do to push a quotaoff item. It is simply
+ * stuck waiting for the log to be flushed to disk.
*/
STATIC uint
-xfs_qm_qoff_logitem_trylock(
- struct xfs_log_item *lip)
+xfs_qm_qoff_logitem_push(
+ struct xfs_log_item *lip,
+ struct list_head *buffer_list)
{
return XFS_ITEM_LOCKED;
}
@@ -429,17 +368,6 @@ xfs_qm_qoff_logitem_committed(
return lsn;
}
-/*
- * There isn't much you can do to push on an quotaoff item. It is simply
- * stuck waiting for the log to be flushed to disk.
- */
-STATIC void
-xfs_qm_qoff_logitem_push(
- struct xfs_log_item *lip)
-{
-}
-
-
STATIC xfs_lsn_t
xfs_qm_qoffend_logitem_committed(
struct xfs_log_item *lip,
@@ -454,7 +382,7 @@ xfs_qm_qoffend_logitem_committed(
* xfs_trans_ail_delete() drops the AIL lock.
*/
spin_lock(&ailp->xa_lock);
- xfs_trans_ail_delete(ailp, (xfs_log_item_t *)qfs);
+ xfs_trans_ail_delete(ailp, &qfs->qql_item, SHUTDOWN_LOG_IO_ERROR);
kmem_free(qfs);
kmem_free(qfe);
@@ -487,7 +415,6 @@ static const struct xfs_item_ops xfs_qm_qoffend_logitem_ops = {
.iop_format = xfs_qm_qoff_logitem_format,
.iop_pin = xfs_qm_qoff_logitem_pin,
.iop_unpin = xfs_qm_qoff_logitem_unpin,
- .iop_trylock = xfs_qm_qoff_logitem_trylock,
.iop_unlock = xfs_qm_qoff_logitem_unlock,
.iop_committed = xfs_qm_qoffend_logitem_committed,
.iop_push = xfs_qm_qoff_logitem_push,
@@ -502,7 +429,6 @@ static const struct xfs_item_ops xfs_qm_qoff_logitem_ops = {
.iop_format = xfs_qm_qoff_logitem_format,
.iop_pin = xfs_qm_qoff_logitem_pin,
.iop_unpin = xfs_qm_qoff_logitem_unpin,
- .iop_trylock = xfs_qm_qoff_logitem_trylock,
.iop_unlock = xfs_qm_qoff_logitem_unlock,
.iop_committed = xfs_qm_qoff_logitem_committed,
.iop_push = xfs_qm_qoff_logitem_push,
diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c
index 39f06336b99..610456054dc 100644
--- a/fs/xfs/xfs_error.c
+++ b/fs/xfs/xfs_error.c
@@ -19,7 +19,6 @@
#include "xfs_fs.h"
#include "xfs_types.h"
#include "xfs_log.h"
-#include "xfs_inum.h"
#include "xfs_trans.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
diff --git a/fs/xfs/xfs_export.c b/fs/xfs/xfs_export.c
index 558910f5e3c..2d25d19c4ea 100644
--- a/fs/xfs/xfs_export.c
+++ b/fs/xfs/xfs_export.c
@@ -17,7 +17,6 @@
*/
#include "xfs.h"
#include "xfs_types.h"
-#include "xfs_inum.h"
#include "xfs_log.h"
#include "xfs_trans.h"
#include "xfs_sb.h"
diff --git a/fs/xfs/xfs_extent_busy.c b/fs/xfs/xfs_extent_busy.c
new file mode 100644
index 00000000000..85e9f87a1a7
--- /dev/null
+++ b/fs/xfs/xfs_extent_busy.c
@@ -0,0 +1,603 @@
+/*
+ * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc.
+ * Copyright (c) 2010 David Chinner.
+ * Copyright (c) 2011 Christoph Hellwig.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_types.h"
+#include "xfs_log.h"
+#include "xfs_trans.h"
+#include "xfs_sb.h"
+#include "xfs_ag.h"
+#include "xfs_mount.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_alloc.h"
+#include "xfs_inode.h"
+#include "xfs_extent_busy.h"
+#include "xfs_trace.h"
+
+/*
+ * Record the extent starting at bno with length len in AG agno as busy on
+ * behalf of transaction tp.
+ *
+ * The new entry is linked into the per-AG rbtree (pag->pagb_tree), ordered by
+ * start block, and added to the transaction's t_busy list; both happen under
+ * pag->pagb_lock. If the tracking structure cannot be allocated, the
+ * transaction is marked synchronous instead and nothing is inserted.
+ */
+void
+xfs_extent_busy_insert(
+ struct xfs_trans *tp,
+ xfs_agnumber_t agno,
+ xfs_agblock_t bno,
+ xfs_extlen_t len,
+ unsigned int flags)
+{
+ struct xfs_extent_busy *new;
+ struct xfs_extent_busy *busyp;
+ struct xfs_perag *pag;
+ struct rb_node **rbp;
+ struct rb_node *parent = NULL;
+
+ new = kmem_zalloc(sizeof(struct xfs_extent_busy), KM_MAYFAIL);
+ if (!new) {
+ /*
+ * No Memory! Since it is now not possible to track the free
+ * block, make this a synchronous transaction to ensure that
+ * the block is not reused before this transaction commits.
+ */
+ trace_xfs_extent_busy_enomem(tp->t_mountp, agno, bno, len);
+ xfs_trans_set_sync(tp);
+ return;
+ }
+
+ new->agno = agno;
+ new->bno = bno;
+ new->length = len;
+ INIT_LIST_HEAD(&new->list);
+ new->flags = flags;
+
+ /* trace before insert to be able to see failed inserts */
+ trace_xfs_extent_busy(tp->t_mountp, agno, bno, len);
+
+ pag = xfs_perag_get(tp->t_mountp, new->agno);
+ spin_lock(&pag->pagb_lock);
+ rbp = &pag->pagb_tree.rb_node;
+ /*
+ * Descend to the leaf position for the new start block. The ASSERTs
+ * check that the new extent does not overlap any entry passed on the
+ * way down, and that no entry with the same start block exists.
+ */
+ while (*rbp) {
+ parent = *rbp;
+ busyp = rb_entry(parent, struct xfs_extent_busy, rb_node);
+
+ if (new->bno < busyp->bno) {
+ rbp = &(*rbp)->rb_left;
+ ASSERT(new->bno + new->length <= busyp->bno);
+ } else if (new->bno > busyp->bno) {
+ rbp = &(*rbp)->rb_right;
+ ASSERT(bno >= busyp->bno + busyp->length);
+ } else {
+ ASSERT(0);
+ }
+ }
+
+ rb_link_node(&new->rb_node, parent, rbp);
+ rb_insert_color(&new->rb_node, &pag->pagb_tree);
+
+ list_add(&new->list, &tp->t_busy);
+ spin_unlock(&pag->pagb_lock);
+ xfs_perag_put(pag);
+}
+
+/*
+ * Search for a busy extent within the range of the extent we are about to
+ * allocate. This function takes and releases the busy extent tree lock
+ * (pag->pagb_lock) itself, so callers must not hold it on entry.
+ *
+ * Returns 0 for no overlapping busy extent, -1 for an overlapping but not
+ * exact busy extent, and 1 for an exact match. This is done so that a
+ * non-zero return indicates an overlap that will require a synchronous
+ * transaction, but it can still be used to distinguish between a partial or
+ * exact match.
+ */
+int
+xfs_extent_busy_search(
+ struct xfs_mount *mp,
+ xfs_agnumber_t agno,
+ xfs_agblock_t bno,
+ xfs_extlen_t len)
+{
+ struct xfs_perag *pag;
+ struct rb_node *rbp;
+ struct xfs_extent_busy *busyp;
+ int match = 0;
+
+ pag = xfs_perag_get(mp, agno);
+ spin_lock(&pag->pagb_lock);
+
+ rbp = pag->pagb_tree.rb_node;
+
+ /* find closest start bno overlap */
+ while (rbp) {
+ busyp = rb_entry(rbp, struct xfs_extent_busy, rb_node);
+ if (bno < busyp->bno) {
+ /* may overlap, but exact start block is lower */
+ if (bno + len > busyp->bno)
+ match = -1;
+ rbp = rbp->rb_left;
+ } else if (bno > busyp->bno) {
+ /* may overlap, but exact start block is higher */
+ if (bno < busyp->bno + busyp->length)
+ match = -1;
+ rbp = rbp->rb_right;
+ } else {
+ /* bno matches busyp, length determines exact match */
+ match = (busyp->length == len) ? 1 : -1;
+ break;
+ }
+ }
+ spin_unlock(&pag->pagb_lock);
+ xfs_perag_put(pag);
+ return match;
+}
+
+/*
+ * The found free extent [fbno, fend] overlaps part or all of the given busy
+ * extent. If the overlap covers the beginning, the end, or all of the busy
+ * extent, the overlapping portion can be made unbusy and used for the
+ * allocation. We can't split a busy extent because we can't modify a
+ * transaction/CIL context busy list, but we can update an entry's block
+ * number or length.
+ *
+ * Called with pag->pagb_lock held. The lock is dropped and retaken on the
+ * paths that wait for a discard or force the log, which is why those paths
+ * return false.
+ *
+ * Returns true if the extent can safely be reused, or false if the search
+ * needs to be restarted.
+ */
+STATIC bool
+xfs_extent_busy_update_extent(
+ struct xfs_mount *mp,
+ struct xfs_perag *pag,
+ struct xfs_extent_busy *busyp,
+ xfs_agblock_t fbno,
+ xfs_extlen_t flen,
+ bool userdata)
+{
+ xfs_agblock_t fend = fbno + flen;
+ xfs_agblock_t bbno = busyp->bno;
+ xfs_agblock_t bend = bbno + busyp->length;
+
+ /*
+ * This extent is currently being discarded. Give the thread
+ * performing the discard a chance to mark the extent unbusy
+ * and retry.
+ */
+ if (busyp->flags & XFS_EXTENT_BUSY_DISCARDED) {
+ spin_unlock(&pag->pagb_lock);
+ delay(1);
+ spin_lock(&pag->pagb_lock);
+ return false;
+ }
+
+ /*
+ * If there is a busy extent overlapping a user allocation, we have
+ * no choice but to force the log and retry the search.
+ *
+ * Fortunately this does not happen during normal operation, but
+ * only if the filesystem is very low on space and has to dip into
+ * the AGFL for normal allocations.
+ */
+ if (userdata)
+ goto out_force_log;
+
+ if (bbno < fbno && bend > fend) {
+ /*
+ * Case 1:
+ *    bbno           bend
+ *    +BBBBBBBBBBBBBBBBB+
+ *        +---------+
+ *        fbno   fend
+ */
+
+ /*
+ * We would have to split the busy extent to be able to track
+ * it correctly, which we cannot do because we would have to
+ * modify the list of busy extents attached to the transaction
+ * or CIL context, which is immutable.
+ *
+ * Force out the log to clear the busy extent and retry the
+ * search.
+ */
+ goto out_force_log;
+ } else if (bbno >= fbno && bend <= fend) {
+ /*
+ * Case 2:
+ *    bbno           bend
+ *    +BBBBBBBBBBBBBBBBB+
+ *    +-----------------+
+ *    fbno           fend
+ *
+ * Case 3:
+ *    bbno           bend
+ *    +BBBBBBBBBBBBBBBBB+
+ *    +--------------------------+
+ *    fbno                    fend
+ *
+ * Case 4:
+ *             bbno           bend
+ *             +BBBBBBBBBBBBBBBBB+
+ *    +--------------------------+
+ *    fbno                    fend
+ *
+ * Case 5:
+ *             bbno           bend
+ *             +BBBBBBBBBBBBBBBBB+
+ *    +-----------------------------------+
+ *    fbno                             fend
+ *
+ */
+
+ /*
+ * The busy extent is fully covered by the extent we are
+ * allocating, and can simply be removed from the rbtree.
+ * However we cannot remove it from the immutable list
+ * tracking busy extents in the transaction or CIL context,
+ * so set the length to zero to mark it invalid.
+ *
+ * We also need to restart the busy extent search from the
+ * tree root, because erasing the node can rearrange the
+ * tree topology.
+ */
+ rb_erase(&busyp->rb_node, &pag->pagb_tree);
+ busyp->length = 0;
+ return false;
+ } else if (fend < bend) {
+ /*
+ * Case 6:
+ *             bbno           bend
+ *             +BBBBBBBBBBBBBBBBB+
+ *    +---------+
+ *    fbno   fend
+ *
+ * Case 7:
+ *             bbno           bend
+ *             +BBBBBBBBBBBBBBBBB+
+ *    +------------------+
+ *    fbno            fend
+ *
+ */
+
+ /*
+ * The front of the busy extent is being reused, so what stays
+ * busy is [fend, bend). The length has to shrink together with
+ * the start block; otherwise the record would keep its old
+ * length and reach past the original end of the busy extent.
+ */
+ busyp->bno = fend;
+ busyp->length = bend - fend;
+ } else if (bbno < fbno) {
+ /*
+ * Case 8:
+ *    bbno           bend
+ *    +BBBBBBBBBBBBBBBBB+
+ *        +-------------+
+ *        fbno       fend
+ *
+ * Case 9:
+ *    bbno           bend
+ *    +BBBBBBBBBBBBBBBBB+
+ *        +----------------------+
+ *        fbno                fend
+ */
+
+ /* The tail is being reused; [bbno, fbno) stays busy. */
+ busyp->length = fbno - busyp->bno;
+ } else {
+ ASSERT(0);
+ }
+
+ trace_xfs_extent_busy_reuse(mp, pag->pag_agno, fbno, flen);
+ return true;
+
+out_force_log:
+ spin_unlock(&pag->pagb_lock);
+ xfs_log_force(mp, XFS_LOG_SYNC);
+ trace_xfs_extent_busy_force(mp, pag->pag_agno, fbno, flen);
+ spin_lock(&pag->pagb_lock);
+ return false;
+}
+
+
+/*
+ * For a given extent [fbno, flen], make sure we can reuse it safely.
+ *
+ * Walks the busy extent rbtree of AG agno under pag->pagb_lock and passes
+ * every busy extent that overlaps the range to
+ * xfs_extent_busy_update_extent(). Whenever that helper returns false the
+ * walk is restarted from the tree root.
+ */
+void
+xfs_extent_busy_reuse(
+ struct xfs_mount *mp,
+ xfs_agnumber_t agno,
+ xfs_agblock_t fbno,
+ xfs_extlen_t flen,
+ bool userdata)
+{
+ struct xfs_perag *pag;
+ struct rb_node *rbp;
+
+ ASSERT(flen > 0);
+
+ pag = xfs_perag_get(mp, agno);
+ spin_lock(&pag->pagb_lock);
+restart:
+ rbp = pag->pagb_tree.rb_node;
+ while (rbp) {
+ struct xfs_extent_busy *busyp =
+ rb_entry(rbp, struct xfs_extent_busy, rb_node);
+ xfs_agblock_t bbno = busyp->bno;
+ xfs_agblock_t bend = bbno + busyp->length;
+
+ if (fbno + flen <= bbno) {
+ rbp = rbp->rb_left;
+ continue;
+ } else if (fbno >= bend) {
+ rbp = rbp->rb_right;
+ continue;
+ }
+
+ if (!xfs_extent_busy_update_extent(mp, pag, busyp, fbno, flen,
+ userdata))
+ goto restart;
+ }
+ spin_unlock(&pag->pagb_lock);
+ xfs_perag_put(pag);
+}
+
+/*
+ * For a given extent [bno, len], search the busy extent list to find a
+ * subset of the extent that is not busy. The result is returned through
+ * *rbno and *rlen. If *rlen is smaller than args->minlen no suitable extent
+ * could be found, and the higher level code needs to force out the log and
+ * retry the allocation.
+ */
+void
+xfs_extent_busy_trim(
+ struct xfs_alloc_arg *args,
+ xfs_agblock_t bno,
+ xfs_extlen_t len,
+ xfs_agblock_t *rbno,
+ xfs_extlen_t *rlen)
+{
+ xfs_agblock_t fbno;
+ xfs_extlen_t flen;
+ struct rb_node *rbp;
+
+ ASSERT(len > 0);
+
+ spin_lock(&args->pag->pagb_lock);
+restart:
+ fbno = bno;
+ flen = len;
+ rbp = args->pag->pagb_tree.rb_node;
+ while (rbp && flen >= args->minlen) {
+ struct xfs_extent_busy *busyp =
+ rb_entry(rbp, struct xfs_extent_busy, rb_node);
+ xfs_agblock_t fend = fbno + flen;
+ xfs_agblock_t bbno = busyp->bno;
+ xfs_agblock_t bend = bbno + busyp->length;
+
+ if (fend <= bbno) {
+ rbp = rbp->rb_left;
+ continue;
+ } else if (fbno >= bend) {
+ rbp = rbp->rb_right;
+ continue;
+ }
+
+ /*
+ * If this is a metadata allocation, try to reuse the busy
+ * extent instead of trimming the allocation.
+ */
+ if (!args->userdata &&
+ !(busyp->flags & XFS_EXTENT_BUSY_DISCARDED)) {
+ if (!xfs_extent_busy_update_extent(args->mp, args->pag,
+ busyp, fbno, flen,
+ false))
+ goto restart;
+ continue;
+ }
+
+ if (bbno <= fbno) {
+ /* start overlap */
+
+ /*
+ * Case 1:
+ *    bbno           bend
+ *    +BBBBBBBBBBBBBBBBB+
+ *        +---------+
+ *        fbno   fend
+ *
+ * Case 2:
+ *    bbno           bend
+ *    +BBBBBBBBBBBBBBBBB+
+ *    +-------------+
+ *    fbno       fend
+ *
+ * Case 3:
+ *    bbno           bend
+ *    +BBBBBBBBBBBBBBBBB+
+ *        +-------------+
+ *        fbno       fend
+ *
+ * Case 4:
+ *    bbno           bend
+ *    +BBBBBBBBBBBBBBBBB+
+ *    +-----------------+
+ *    fbno           fend
+ *
+ * No unbusy region in extent, return failure.
+ */
+ if (fend <= bend)
+ goto fail;
+
+ /*
+ * Case 5:
+ *    bbno           bend
+ *    +BBBBBBBBBBBBBBBBB+
+ *        +----------------------+
+ *        fbno                fend
+ *
+ * Case 6:
+ *    bbno           bend
+ *    +BBBBBBBBBBBBBBBBB+
+ *    +--------------------------+
+ *    fbno                    fend
+ *
+ * Needs to be trimmed to:
+ *                       +-------+
+ *                       fbno fend
+ */
+ fbno = bend;
+ } else if (bend >= fend) {
+ /* end overlap */
+
+ /*
+ * Case 7:
+ *             bbno           bend
+ *             +BBBBBBBBBBBBBBBBB+
+ *    +------------------+
+ *    fbno            fend
+ *
+ * Case 8:
+ *             bbno           bend
+ *             +BBBBBBBBBBBBBBBBB+
+ *    +--------------------------+
+ *    fbno                    fend
+ *
+ * Needs to be trimmed to:
+ *    +-------+
+ *    fbno fend
+ */
+ fend = bbno;
+ } else {
+ /* middle overlap */
+
+ /*
+ * Case 9:
+ *             bbno           bend
+ *             +BBBBBBBBBBBBBBBBB+
+ *    +-----------------------------------+
+ *    fbno                             fend
+ *
+ * Can be trimmed to:
+ *    +-------+        OR         +-------+
+ *    fbno fend                   fbno fend
+ *
+ * Backward allocation leads to significant
+ * fragmentation of directories, which degrades
+ * directory performance, therefore we always want to
+ * choose the option that produces forward allocation
+ * patterns.
+ * Preferring the lower bno extent will make the next
+ * request use "fend" as the start of the next
+ * allocation; if the segment is no longer busy at
+ * that point, we'll get a contiguous allocation, but
+ * even if it is still busy, we will get a forward
+ * allocation.
+ * We try to avoid choosing the segment at "bend",
+ * because that can lead to the next allocation
+ * taking the segment at "fbno", which would be a
+ * backward allocation. We only use the segment at
+ * "fbno" if it is much larger than the current
+ * requested size, because in that case there's a
+ * good chance subsequent allocations will be
+ * contiguous.
+ */
+ if (bbno - fbno >= args->maxlen) {
+ /* left candidate fits perfectly */
+ fend = bbno;
+ } else if (fend - bend >= args->maxlen * 4) {
+ /* right candidate has enough free space */
+ fbno = bend;
+ } else if (bbno - fbno >= args->minlen) {
+ /* left candidate fits minimum requirement */
+ fend = bbno;
+ } else {
+ goto fail;
+ }
+ }
+
+ flen = fend - fbno;
+ }
+ spin_unlock(&args->pag->pagb_lock);
+
+ if (fbno != bno || flen != len) {
+ trace_xfs_extent_busy_trim(args->mp, args->agno, bno, len,
+ fbno, flen);
+ }
+ *rbno = fbno;
+ *rlen = flen;
+ return;
+fail:
+ /*
+ * Return a zero extent length as the failure indication. All callers
+ * re-check if the trimmed extent satisfies the minlen requirement.
+ */
+ spin_unlock(&args->pag->pagb_lock);
+ trace_xfs_extent_busy_trim(args->mp, args->agno, bno, len, fbno, 0);
+ *rbno = fbno;
+ *rlen = 0;
+}
+
+/*
+ * Unlink a single busy extent from its list and free it. An entry with a
+ * non-zero length is traced and erased from the per-AG rbtree first; a
+ * zero-length entry skips the rbtree step.
+ */
+STATIC void
+xfs_extent_busy_clear_one(
+ struct xfs_mount *mp,
+ struct xfs_perag *pag,
+ struct xfs_extent_busy *busyp)
+{
+ if (busyp->length == 0)
+ goto out_free;
+
+ trace_xfs_extent_busy_clear(mp, busyp->agno, busyp->bno,
+ busyp->length);
+ rb_erase(&busyp->rb_node, &pag->pagb_tree);
+
+out_free:
+ list_del_init(&busyp->list);
+ kmem_free(busyp);
+}
+
+/*
+ * Remove all extents on the passed in list from the busy extents tree.
+ * If do_discard is set skip extents that need to be discarded, and mark
+ * these as undergoing a discard operation instead.
+ *
+ * The perag reference and pagb_lock are held across consecutive list entries
+ * that belong to the same AG and are only switched when the AG number
+ * changes.
+ */
+void
+xfs_extent_busy_clear(
+ struct xfs_mount *mp,
+ struct list_head *list,
+ bool do_discard)
+{
+ struct xfs_extent_busy *busyp, *n;
+ struct xfs_perag *pag = NULL;
+ xfs_agnumber_t agno = NULLAGNUMBER;
+
+ list_for_each_entry_safe(busyp, n, list, list) {
+ if (busyp->agno != agno) {
+ if (pag) {
+ spin_unlock(&pag->pagb_lock);
+ xfs_perag_put(pag);
+ }
+ pag = xfs_perag_get(mp, busyp->agno);
+ spin_lock(&pag->pagb_lock);
+ agno = busyp->agno;
+ }
+
+ if (do_discard && busyp->length &&
+ !(busyp->flags & XFS_EXTENT_BUSY_SKIP_DISCARD))
+ busyp->flags = XFS_EXTENT_BUSY_DISCARDED;
+ else
+ xfs_extent_busy_clear_one(mp, pag, busyp);
+ }
+
+ if (pag) {
+ spin_unlock(&pag->pagb_lock);
+ xfs_perag_put(pag);
+ }
+}
+
+/*
+ * list_sort() comparison callback: orders busy extents by the number of the
+ * AG they belong to. The result is the difference of the two AG numbers.
+ */
+int
+xfs_extent_busy_ag_cmp(
+ void *priv,
+ struct list_head *a,
+ struct list_head *b)
+{
+ struct xfs_extent_busy *busy_a =
+ container_of(a, struct xfs_extent_busy, list);
+ struct xfs_extent_busy *busy_b =
+ container_of(b, struct xfs_extent_busy, list);
+
+ return busy_a->agno - busy_b->agno;
+}
diff --git a/fs/xfs/xfs_extent_busy.h b/fs/xfs/xfs_extent_busy.h
new file mode 100644
index 00000000000..985412d65ba
--- /dev/null
+++ b/fs/xfs/xfs_extent_busy.h
@@ -0,0 +1,69 @@
+/*
+ * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc.
+ * Copyright (c) 2010 David Chinner.
+ * Copyright (c) 2011 Christoph Hellwig.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#ifndef __XFS_EXTENT_BUSY_H__
+#define __XFS_EXTENT_BUSY_H__
+
+/*
+ * Busy block/extent entry. Indexed by a rbtree in perag to mark blocks that
+ * have been freed but whose transactions aren't committed to disk yet.
+ *
+ * Each entry is also linked, through "list", onto the busy list of the
+ * transaction that freed the extent; see xfs_extent_busy_insert().
+ *
+ * NOTE(review): this comment used to say that the transaction ID is recorded
+ * in the entry. The structure below has no such field, so that remark looks
+ * stale - confirm.
+ */
+struct xfs_extent_busy {
+ struct rb_node rb_node; /* ag by-bno indexed search tree */
+ struct list_head list; /* transaction busy extent list */
+ xfs_agnumber_t agno;
+ xfs_agblock_t bno;
+ xfs_extlen_t length;
+ unsigned int flags;
+#define XFS_EXTENT_BUSY_DISCARDED 0x01 /* undergoing a discard op. */
+#define XFS_EXTENT_BUSY_SKIP_DISCARD 0x02 /* do not discard */
+};
+
+void
+xfs_extent_busy_insert(struct xfs_trans *tp, xfs_agnumber_t agno,
+ xfs_agblock_t bno, xfs_extlen_t len, unsigned int flags);
+
+void
+xfs_extent_busy_clear(struct xfs_mount *mp, struct list_head *list,
+ bool do_discard);
+
+int
+xfs_extent_busy_search(struct xfs_mount *mp, xfs_agnumber_t agno,
+ xfs_agblock_t bno, xfs_extlen_t len);
+
+void
+xfs_extent_busy_reuse(struct xfs_mount *mp, xfs_agnumber_t agno,
+ xfs_agblock_t fbno, xfs_extlen_t flen, bool userdata);
+
+void
+xfs_extent_busy_trim(struct xfs_alloc_arg *args, xfs_agblock_t bno,
+ xfs_extlen_t len, xfs_agblock_t *rbno, xfs_extlen_t *rlen);
+
+int
+xfs_extent_busy_ag_cmp(void *priv, struct list_head *a, struct list_head *b);
+
+/* Sort a list of busy extents with list_sort(), using xfs_extent_busy_ag_cmp(). */
+static inline void xfs_extent_busy_sort(struct list_head *list)
+{
+ list_sort(NULL, list, xfs_extent_busy_ag_cmp);
+}
+
+#endif /* __XFS_EXTENT_BUSY_H__ */
diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c
index 35c2aff38b2..feb36d7551a 100644
--- a/fs/xfs/xfs_extfree_item.c
+++ b/fs/xfs/xfs_extfree_item.c
@@ -19,7 +19,6 @@
#include "xfs_fs.h"
#include "xfs_types.h"
#include "xfs_log.h"
-#include "xfs_inum.h"
#include "xfs_trans.h"
#include "xfs_buf_item.h"
#include "xfs_sb.h"
@@ -64,7 +63,8 @@ __xfs_efi_release(
if (!test_and_clear_bit(XFS_EFI_COMMITTED, &efip->efi_flags)) {
spin_lock(&ailp->xa_lock);
/* xfs_trans_ail_delete() drops the AIL lock. */
- xfs_trans_ail_delete(ailp, &efip->efi_item);
+ xfs_trans_ail_delete(ailp, &efip->efi_item,
+ SHUTDOWN_LOG_IO_ERROR);
xfs_efi_item_free(efip);
}
}
@@ -147,22 +147,20 @@ xfs_efi_item_unpin(
}
/*
- * Efi items have no locking or pushing. However, since EFIs are
- * pulled from the AIL when their corresponding EFDs are committed
- * to disk, their situation is very similar to being pinned. Return
- * XFS_ITEM_PINNED so that the caller will eventually flush the log.
- * This should help in getting the EFI out of the AIL.
+ * Efi items have no locking or pushing. However, since EFIs are pulled from
+ * the AIL when their corresponding EFDs are committed to disk, their situation
+ * is very similar to being pinned. Return XFS_ITEM_PINNED so that the caller
+ * will eventually flush the log. This should help in getting the EFI out of
+ * the AIL.
*/
STATIC uint
-xfs_efi_item_trylock(
- struct xfs_log_item *lip)
+xfs_efi_item_push(
+ struct xfs_log_item *lip,
+ struct list_head *buffer_list)
{
return XFS_ITEM_PINNED;
}
-/*
- * Efi items have no locking, so just return.
- */
STATIC void
xfs_efi_item_unlock(
struct xfs_log_item *lip)
@@ -190,17 +188,6 @@ xfs_efi_item_committed(
}
/*
- * There isn't much you can do to push on an efi item. It is simply
- * stuck waiting for all of its corresponding efd items to be
- * committed to disk.
- */
-STATIC void
-xfs_efi_item_push(
- struct xfs_log_item *lip)
-{
-}
-
-/*
* The EFI dependency tracking op doesn't do squat. It can't because
* it doesn't know where the free extent is coming from. The dependency
* tracking has to be handled by the "enclosing" metadata object. For
@@ -222,7 +209,6 @@ static const struct xfs_item_ops xfs_efi_item_ops = {
.iop_format = xfs_efi_item_format,
.iop_pin = xfs_efi_item_pin,
.iop_unpin = xfs_efi_item_unpin,
- .iop_trylock = xfs_efi_item_trylock,
.iop_unlock = xfs_efi_item_unlock,
.iop_committed = xfs_efi_item_committed,
.iop_push = xfs_efi_item_push,
@@ -404,19 +390,17 @@ xfs_efd_item_unpin(
}
/*
- * Efd items have no locking, so just return success.
+ * There isn't much you can do to push on an efd item. It is simply stuck
+ * waiting for the log to be flushed to disk.
*/
STATIC uint
-xfs_efd_item_trylock(
- struct xfs_log_item *lip)
+xfs_efd_item_push(
+ struct xfs_log_item *lip,
+ struct list_head *buffer_list)
{
- return XFS_ITEM_LOCKED;
+ return XFS_ITEM_PINNED;
}
-/*
- * Efd items have no locking or pushing, so return failure
- * so that the caller doesn't bother with us.
- */
STATIC void
xfs_efd_item_unlock(
struct xfs_log_item *lip)
@@ -451,16 +435,6 @@ xfs_efd_item_committed(
}
/*
- * There isn't much you can do to push on an efd item. It is simply
- * stuck waiting for the log to be flushed to disk.
- */
-STATIC void
-xfs_efd_item_push(
- struct xfs_log_item *lip)
-{
-}
-
-/*
* The EFD dependency tracking op doesn't do squat. It can't because
* it doesn't know where the free extent is coming from. The dependency
* tracking has to be handled by the "enclosing" metadata object. For
@@ -482,7 +456,6 @@ static const struct xfs_item_ops xfs_efd_item_ops = {
.iop_format = xfs_efd_item_format,
.iop_pin = xfs_efd_item_pin,
.iop_unpin = xfs_efd_item_unpin,
- .iop_trylock = xfs_efd_item_trylock,
.iop_unlock = xfs_efd_item_unlock,
.iop_committed = xfs_efd_item_committed,
.iop_push = xfs_efd_item_push,
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 54a67dd9ac0..8d214b87f6b 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -17,9 +17,7 @@
*/
#include "xfs.h"
#include "xfs_fs.h"
-#include "xfs_bit.h"
#include "xfs_log.h"
-#include "xfs_inum.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
#include "xfs_trans.h"
@@ -396,114 +394,96 @@ xfs_file_splice_write(
}
/*
- * This routine is called to handle zeroing any space in the last
- * block of the file that is beyond the EOF. We do this since the
- * size is being increased without writing anything to that block
- * and we don't want anyone to read the garbage on the disk.
+ * This routine is called to handle zeroing any space in the last block of the
+ * file that is beyond the EOF. We do this since the size is being increased
+ * without writing anything to that block and we don't want to read the
+ * garbage on the disk.
*/
STATIC int /* error (positive) */
xfs_zero_last_block(
- xfs_inode_t *ip,
- xfs_fsize_t offset,
- xfs_fsize_t isize)
+ struct xfs_inode *ip,
+ xfs_fsize_t offset,
+ xfs_fsize_t isize)
{
- xfs_fileoff_t last_fsb;
- xfs_mount_t *mp = ip->i_mount;
- int nimaps;
- int zero_offset;
- int zero_len;
- int error = 0;
- xfs_bmbt_irec_t imap;
-
- ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
-
- zero_offset = XFS_B_FSB_OFFSET(mp, isize);
- if (zero_offset == 0) {
- /*
- * There are no extra bytes in the last block on disk to
- * zero, so return.
- */
- return 0;
- }
+ struct xfs_mount *mp = ip->i_mount;
+ xfs_fileoff_t last_fsb = XFS_B_TO_FSBT(mp, isize);
+ int zero_offset = XFS_B_FSB_OFFSET(mp, isize);
+ int zero_len;
+ int nimaps = 1;
+ int error = 0;
+ struct xfs_bmbt_irec imap;
- last_fsb = XFS_B_TO_FSBT(mp, isize);
- nimaps = 1;
+ xfs_ilock(ip, XFS_ILOCK_EXCL);
error = xfs_bmapi_read(ip, last_fsb, 1, &imap, &nimaps, 0);
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
if (error)
return error;
+
ASSERT(nimaps > 0);
+
/*
* If the block underlying isize is just a hole, then there
* is nothing to zero.
*/
- if (imap.br_startblock == HOLESTARTBLOCK) {
+ if (imap.br_startblock == HOLESTARTBLOCK)
return 0;
- }
- /*
- * Zero the part of the last block beyond the EOF, and write it
- * out sync. We need to drop the ilock while we do this so we
- * don't deadlock when the buffer cache calls back to us.
- */
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
zero_len = mp->m_sb.sb_blocksize - zero_offset;
if (isize + zero_len > offset)
zero_len = offset - isize;
- error = xfs_iozero(ip, isize, zero_len);
-
- xfs_ilock(ip, XFS_ILOCK_EXCL);
- ASSERT(error >= 0);
- return error;
+ return xfs_iozero(ip, isize, zero_len);
}
/*
- * Zero any on disk space between the current EOF and the new,
- * larger EOF. This handles the normal case of zeroing the remainder
- * of the last block in the file and the unusual case of zeroing blocks
- * out beyond the size of the file. This second case only happens
- * with fixed size extents and when the system crashes before the inode
- * size was updated but after blocks were allocated. If fill is set,
- * then any holes in the range are filled and zeroed. If not, the holes
- * are left alone as holes.
+ * Zero any on disk space between the current EOF and the new, larger EOF.
+ *
+ * This handles the normal case of zeroing the remainder of the last block in
+ * the file and the unusual case of zeroing blocks out beyond the size of the
+ * file. This second case only happens with fixed size extents and when the
+ * system crashes before the inode size was updated but after blocks were
+ * allocated.
+ *
+ * Expects the iolock to be held exclusive, and will take the ilock internally.
*/
-
int /* error (positive) */
xfs_zero_eof(
- xfs_inode_t *ip,
- xfs_off_t offset, /* starting I/O offset */
- xfs_fsize_t isize) /* current inode size */
+ struct xfs_inode *ip,
+ xfs_off_t offset, /* starting I/O offset */
+ xfs_fsize_t isize) /* current inode size */
{
- xfs_mount_t *mp = ip->i_mount;
- xfs_fileoff_t start_zero_fsb;
- xfs_fileoff_t end_zero_fsb;
- xfs_fileoff_t zero_count_fsb;
- xfs_fileoff_t last_fsb;
- xfs_fileoff_t zero_off;
- xfs_fsize_t zero_len;
- int nimaps;
- int error = 0;
- xfs_bmbt_irec_t imap;
-
- ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_IOLOCK_EXCL));
+ struct xfs_mount *mp = ip->i_mount;
+ xfs_fileoff_t start_zero_fsb;
+ xfs_fileoff_t end_zero_fsb;
+ xfs_fileoff_t zero_count_fsb;
+ xfs_fileoff_t last_fsb;
+ xfs_fileoff_t zero_off;
+ xfs_fsize_t zero_len;
+ int nimaps;
+ int error = 0;
+ struct xfs_bmbt_irec imap;
+
+ ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL));
ASSERT(offset > isize);
/*
* First handle zeroing the block on which isize resides.
+ *
* We only zero a part of that block so it is handled specially.
*/
- error = xfs_zero_last_block(ip, offset, isize);
- if (error) {
- ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_IOLOCK_EXCL));
- return error;
+ if (XFS_B_FSB_OFFSET(mp, isize) != 0) {
+ error = xfs_zero_last_block(ip, offset, isize);
+ if (error)
+ return error;
}
/*
- * Calculate the range between the new size and the old
- * where blocks needing to be zeroed may exist. To get the
- * block where the last byte in the file currently resides,
- * we need to subtract one from the size and truncate back
- * to a block boundary. We subtract 1 in case the size is
- * exactly on a block boundary.
+ * Calculate the range between the new size and the old where blocks
+ * needing to be zeroed may exist.
+ *
+ * To get the block where the last byte in the file currently resides,
+ * we need to subtract one from the size and truncate back to a block
+ * boundary. We subtract 1 in case the size is exactly on a block
+ * boundary.
*/
last_fsb = isize ? XFS_B_TO_FSBT(mp, isize - 1) : (xfs_fileoff_t)-1;
start_zero_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)isize);
@@ -521,23 +501,18 @@ xfs_zero_eof(
while (start_zero_fsb <= end_zero_fsb) {
nimaps = 1;
zero_count_fsb = end_zero_fsb - start_zero_fsb + 1;
+
+ xfs_ilock(ip, XFS_ILOCK_EXCL);
error = xfs_bmapi_read(ip, start_zero_fsb, zero_count_fsb,
&imap, &nimaps, 0);
- if (error) {
- ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_IOLOCK_EXCL));
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ if (error)
return error;
- }
+
ASSERT(nimaps > 0);
if (imap.br_state == XFS_EXT_UNWRITTEN ||
imap.br_startblock == HOLESTARTBLOCK) {
- /*
- * This loop handles initializing pages that were
- * partially initialized by the code below this
- * loop. It basically zeroes the part of the page
- * that sits on a hole and sets the page as P_HOLE
- * and calls remapf if it is a mapped file.
- */
start_zero_fsb = imap.br_startoff + imap.br_blockcount;
ASSERT(start_zero_fsb <= (end_zero_fsb + 1));
continue;
@@ -545,11 +520,7 @@ xfs_zero_eof(
/*
* There are blocks we need to zero.
- * Drop the inode lock while we're doing the I/O.
- * We'll still have the iolock to protect us.
*/
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
-
zero_off = XFS_FSB_TO_B(mp, start_zero_fsb);
zero_len = XFS_FSB_TO_B(mp, imap.br_blockcount);
@@ -557,22 +528,14 @@ xfs_zero_eof(
zero_len = offset - zero_off;
error = xfs_iozero(ip, zero_off, zero_len);
- if (error) {
- goto out_lock;
- }
+ if (error)
+ return error;
start_zero_fsb = imap.br_startoff + imap.br_blockcount;
ASSERT(start_zero_fsb <= (end_zero_fsb + 1));
-
- xfs_ilock(ip, XFS_ILOCK_EXCL);
}
return 0;
-
-out_lock:
- xfs_ilock(ip, XFS_ILOCK_EXCL);
- ASSERT(error >= 0);
- return error;
}
/*
@@ -593,35 +556,29 @@ xfs_file_aio_write_checks(
struct xfs_inode *ip = XFS_I(inode);
int error = 0;
- xfs_rw_ilock(ip, XFS_ILOCK_EXCL);
restart:
error = generic_write_checks(file, pos, count, S_ISBLK(inode->i_mode));
- if (error) {
- xfs_rw_iunlock(ip, XFS_ILOCK_EXCL);
+ if (error)
return error;
- }
/*
* If the offset is beyond the size of the file, we need to zero any
* blocks that fall between the existing EOF and the start of this
* write. If zeroing is needed and we are currently holding the
- * iolock shared, we need to update it to exclusive which involves
- * dropping all locks and relocking to maintain correct locking order.
- * If we do this, restart the function to ensure all checks and values
- * are still valid.
+ * iolock shared, we need to update it to exclusive which implies
+ * having to redo all checks before.
*/
if (*pos > i_size_read(inode)) {
if (*iolock == XFS_IOLOCK_SHARED) {
- xfs_rw_iunlock(ip, XFS_ILOCK_EXCL | *iolock);
+ xfs_rw_iunlock(ip, *iolock);
*iolock = XFS_IOLOCK_EXCL;
- xfs_rw_ilock(ip, XFS_ILOCK_EXCL | *iolock);
+ xfs_rw_ilock(ip, *iolock);
goto restart;
}
error = -xfs_zero_eof(ip, *pos, i_size_read(inode));
+ if (error)
+ return error;
}
- xfs_rw_iunlock(ip, XFS_ILOCK_EXCL);
- if (error)
- return error;
/*
* Updating the timestamps will grab the ilock again from
@@ -638,7 +595,6 @@ restart:
* people from modifying setuid and setgid binaries.
*/
return file_remove_suid(file);
-
}
/*
@@ -1007,8 +963,149 @@ xfs_vm_page_mkwrite(
return block_page_mkwrite(vma, vmf, xfs_get_blocks);
}
+STATIC loff_t
+xfs_seek_data(
+ struct file *file,
+ loff_t start,
+ u32 type)
+{
+ struct inode *inode = file->f_mapping->host;
+ struct xfs_inode *ip = XFS_I(inode);
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_bmbt_irec map[2];
+ int nmap = 2;
+ loff_t uninitialized_var(offset);
+ xfs_fsize_t isize;
+ xfs_fileoff_t fsbno;
+ xfs_filblks_t end;
+ uint lock;
+ int error;
+
+ lock = xfs_ilock_map_shared(ip);
+
+ isize = i_size_read(inode);
+ if (start >= isize) {
+ error = ENXIO;
+ goto out_unlock;
+ }
+
+ fsbno = XFS_B_TO_FSBT(mp, start);
+
+ /*
+ * Try to read extents from the first block indicated
+ * by fsbno to the end block of the file.
+ */
+ end = XFS_B_TO_FSB(mp, isize);
+
+ error = xfs_bmapi_read(ip, fsbno, end - fsbno, map, &nmap,
+ XFS_BMAPI_ENTIRE);
+ if (error)
+ goto out_unlock;
+
+ /*
+ * Treat an unwritten extent as a data extent since it might
+ * contain dirty data in the page cache.
+ */
+ if (map[0].br_startblock != HOLESTARTBLOCK) {
+ offset = max_t(loff_t, start,
+ XFS_FSB_TO_B(mp, map[0].br_startoff));
+ } else {
+ if (nmap == 1) {
+ error = ENXIO;
+ goto out_unlock;
+ }
+
+ offset = max_t(loff_t, start,
+ XFS_FSB_TO_B(mp, map[1].br_startoff));
+ }
+
+ if (offset != file->f_pos)
+ file->f_pos = offset;
+
+out_unlock:
+ xfs_iunlock_map_shared(ip, lock);
+
+ if (error)
+ return -error;
+ return offset;
+}
+
+STATIC loff_t
+xfs_seek_hole(
+ struct file *file,
+ loff_t start,
+ u32 type)
+{
+ struct inode *inode = file->f_mapping->host;
+ struct xfs_inode *ip = XFS_I(inode);
+ struct xfs_mount *mp = ip->i_mount;
+ loff_t uninitialized_var(offset);
+ loff_t holeoff;
+ xfs_fsize_t isize;
+ xfs_fileoff_t fsbno;
+ uint lock;
+ int error;
+
+ if (XFS_FORCED_SHUTDOWN(mp))
+ return -XFS_ERROR(EIO);
+
+ lock = xfs_ilock_map_shared(ip);
+
+ isize = i_size_read(inode);
+ if (start >= isize) {
+ error = ENXIO;
+ goto out_unlock;
+ }
+
+ fsbno = XFS_B_TO_FSBT(mp, start);
+ error = xfs_bmap_first_unused(NULL, ip, 1, &fsbno, XFS_DATA_FORK);
+ if (error)
+ goto out_unlock;
+
+ holeoff = XFS_FSB_TO_B(mp, fsbno);
+ if (holeoff <= start)
+ offset = start;
+ else {
+ /*
+ * xfs_bmap_first_unused() could return a value bigger than
+ * isize if there are no more holes past the supplied offset.
+ */
+ offset = min_t(loff_t, holeoff, isize);
+ }
+
+ if (offset != file->f_pos)
+ file->f_pos = offset;
+
+out_unlock:
+ xfs_iunlock_map_shared(ip, lock);
+
+ if (error)
+ return -error;
+ return offset;
+}
+
+STATIC loff_t
+xfs_file_llseek(
+ struct file *file,
+ loff_t offset,
+ int origin)
+{
+ switch (origin) {
+ case SEEK_END:
+ case SEEK_CUR:
+ case SEEK_SET:
+ return generic_file_llseek(file, offset, origin);
+ case SEEK_DATA:
+ return xfs_seek_data(file, offset, origin);
+ case SEEK_HOLE:
+ return xfs_seek_hole(file, offset, origin);
+ default:
+ return -EINVAL;
+ }
+}
+
const struct file_operations xfs_file_operations = {
- .llseek = generic_file_llseek,
+ .llseek = xfs_file_llseek,
.read = do_sync_read,
.write = do_sync_write,
.aio_read = xfs_file_aio_read,
diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index 1c6fdeb702f..c25b094efbf 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -18,8 +18,6 @@
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_types.h"
-#include "xfs_bit.h"
-#include "xfs_inum.h"
#include "xfs_log.h"
#include "xfs_trans.h"
#include "xfs_sb.h"
@@ -39,7 +37,6 @@
#include "xfs_itable.h"
#include "xfs_trans_space.h"
#include "xfs_rtalloc.h"
-#include "xfs_rw.h"
#include "xfs_filestream.h"
#include "xfs_trace.h"
@@ -147,9 +144,9 @@ xfs_growfs_data_private(
if ((error = xfs_sb_validate_fsb_count(&mp->m_sb, nb)))
return error;
dpct = pct - mp->m_sb.sb_imax_pct;
- bp = xfs_buf_read_uncached(mp, mp->m_ddev_targp,
+ bp = xfs_buf_read_uncached(mp->m_ddev_targp,
XFS_FSB_TO_BB(mp, nb) - XFS_FSS_TO_BB(mp, 1),
- BBTOB(XFS_FSS_TO_BB(mp, 1)), 0);
+ XFS_FSS_TO_BB(mp, 1), 0);
if (!bp)
return EIO;
xfs_buf_relse(bp);
@@ -193,7 +190,7 @@ xfs_growfs_data_private(
*/
bp = xfs_buf_get(mp->m_ddev_targp,
XFS_AG_DADDR(mp, agno, XFS_AGF_DADDR(mp)),
- XFS_FSS_TO_BB(mp, 1), XBF_LOCK | XBF_MAPPED);
+ XFS_FSS_TO_BB(mp, 1), 0);
if (!bp) {
error = ENOMEM;
goto error0;
@@ -230,7 +227,7 @@ xfs_growfs_data_private(
*/
bp = xfs_buf_get(mp->m_ddev_targp,
XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR(mp)),
- XFS_FSS_TO_BB(mp, 1), XBF_LOCK | XBF_MAPPED);
+ XFS_FSS_TO_BB(mp, 1), 0);
if (!bp) {
error = ENOMEM;
goto error0;
@@ -259,8 +256,7 @@ xfs_growfs_data_private(
*/
bp = xfs_buf_get(mp->m_ddev_targp,
XFS_AGB_TO_DADDR(mp, agno, XFS_BNO_BLOCK(mp)),
- BTOBB(mp->m_sb.sb_blocksize),
- XBF_LOCK | XBF_MAPPED);
+ BTOBB(mp->m_sb.sb_blocksize), 0);
if (!bp) {
error = ENOMEM;
goto error0;
@@ -286,8 +282,7 @@ xfs_growfs_data_private(
*/
bp = xfs_buf_get(mp->m_ddev_targp,
XFS_AGB_TO_DADDR(mp, agno, XFS_CNT_BLOCK(mp)),
- BTOBB(mp->m_sb.sb_blocksize),
- XBF_LOCK | XBF_MAPPED);
+ BTOBB(mp->m_sb.sb_blocksize), 0);
if (!bp) {
error = ENOMEM;
goto error0;
@@ -314,8 +309,7 @@ xfs_growfs_data_private(
*/
bp = xfs_buf_get(mp->m_ddev_targp,
XFS_AGB_TO_DADDR(mp, agno, XFS_IBT_BLOCK(mp)),
- BTOBB(mp->m_sb.sb_blocksize),
- XBF_LOCK | XBF_MAPPED);
+ BTOBB(mp->m_sb.sb_blocksize), 0);
if (!bp) {
error = ENOMEM;
goto error0;
@@ -405,7 +399,7 @@ xfs_growfs_data_private(
/* update secondary superblocks. */
for (agno = 1; agno < nagcount; agno++) {
- error = xfs_read_buf(mp, mp->m_ddev_targp,
+ error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp,
XFS_AGB_TO_DADDR(mp, agno, XFS_SB_BLOCK(mp)),
XFS_FSS_TO_BB(mp, 1), 0, &bp);
if (error) {
@@ -693,3 +687,63 @@ xfs_fs_goingdown(
return 0;
}
+
+/*
+ * Force a shutdown of the filesystem instantly while keeping the filesystem
+ * consistent. We don't do an unmount here; just shut down the shop and make
+ * sure that absolutely nothing persistent happens to this filesystem after
+ * this point.
+ */
+void
+xfs_do_force_shutdown(
+ xfs_mount_t *mp,
+ int flags,
+ char *fname,
+ int lnnum)
+{
+ int logerror;
+
+ logerror = flags & SHUTDOWN_LOG_IO_ERROR;
+
+ if (!(flags & SHUTDOWN_FORCE_UMOUNT)) {
+ xfs_notice(mp,
+ "%s(0x%x) called from line %d of file %s. Return address = 0x%p",
+ __func__, flags, lnnum, fname, __return_address);
+ }
+ /*
+ * No need to duplicate efforts.
+ */
+ if (XFS_FORCED_SHUTDOWN(mp) && !logerror)
+ return;
+
+ /*
+ * This flags XFS_MOUNT_FS_SHUTDOWN, makes sure that we don't
+ * queue up anybody new on the log reservations, and wakes up
+ * everybody who's sleeping on log reservations to tell them
+ * the bad news.
+ */
+ if (xfs_log_force_umount(mp, logerror))
+ return;
+
+ if (flags & SHUTDOWN_CORRUPT_INCORE) {
+ xfs_alert_tag(mp, XFS_PTAG_SHUTDOWN_CORRUPT,
+ "Corruption of in-memory data detected. Shutting down filesystem");
+ if (XFS_ERRLEVEL_HIGH <= xfs_error_level)
+ xfs_stack_trace();
+ } else if (!(flags & SHUTDOWN_FORCE_UMOUNT)) {
+ if (logerror) {
+ xfs_alert_tag(mp, XFS_PTAG_SHUTDOWN_LOGERROR,
+ "Log I/O Error Detected. Shutting down filesystem");
+ } else if (flags & SHUTDOWN_DEVICE_REQ) {
+ xfs_alert_tag(mp, XFS_PTAG_SHUTDOWN_IOERROR,
+ "All device paths lost. Shutting down filesystem");
+ } else if (!(flags & SHUTDOWN_REMOTE_REQ)) {
+ xfs_alert_tag(mp, XFS_PTAG_SHUTDOWN_IOERROR,
+ "I/O Error Detected. Shutting down filesystem");
+ }
+ }
+ if (!(flags & SHUTDOWN_FORCE_UMOUNT)) {
+ xfs_alert(mp,
+ "Please umount the filesystem and rectify the problem(s)");
+ }
+}
diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c
index dad1a31aa4f..177a21a7ac4 100644
--- a/fs/xfs/xfs_ialloc.c
+++ b/fs/xfs/xfs_ialloc.c
@@ -200,8 +200,7 @@ xfs_ialloc_inode_init(
*/
d = XFS_AGB_TO_DADDR(mp, agno, agbno + (j * blks_per_cluster));
fbuf = xfs_trans_get_buf(tp, mp->m_ddev_targp, d,
- mp->m_bsize * blks_per_cluster,
- XBF_LOCK);
+ mp->m_bsize * blks_per_cluster, 0);
if (!fbuf)
return ENOMEM;
/*
@@ -610,6 +609,13 @@ xfs_ialloc_get_rec(
/*
* Visible inode allocation functions.
*/
+/*
+ * Find a free (set) bit in the inode bitmask.
+ */
+static inline int xfs_ialloc_find_free(xfs_inofree_t *fp)
+{
+ return xfs_lowbit64(*fp);
+}
/*
* Allocate an inode on disk.
diff --git a/fs/xfs/xfs_ialloc.h b/fs/xfs/xfs_ialloc.h
index 666a037398d..65ac57c8063 100644
--- a/fs/xfs/xfs_ialloc.h
+++ b/fs/xfs/xfs_ialloc.h
@@ -47,15 +47,6 @@ xfs_make_iptr(struct xfs_mount *mp, struct xfs_buf *b, int o)
}
/*
- * Find a free (set) bit in the inode bitmask.
- */
-static inline int xfs_ialloc_find_free(xfs_inofree_t *fp)
-{
- return xfs_lowbit64(*fp);
-}
-
-
-/*
* Allocate an inode on disk.
* Mode is used to tell whether the new inode will need space, and whether
* it is a directory.
diff --git a/fs/xfs/xfs_ialloc_btree.c b/fs/xfs/xfs_ialloc_btree.c
index c6a75815aea..2b8b7a37aa1 100644
--- a/fs/xfs/xfs_ialloc_btree.c
+++ b/fs/xfs/xfs_ialloc_btree.c
@@ -20,7 +20,6 @@
#include "xfs_types.h"
#include "xfs_bit.h"
#include "xfs_log.h"
-#include "xfs_inum.h"
#include "xfs_trans.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
diff --git a/fs/xfs/xfs_iget.c b/fs/xfs/xfs_iget.c
index bcc6c249b2c..1bb4365e8c2 100644
--- a/fs/xfs/xfs_iget.c
+++ b/fs/xfs/xfs_iget.c
@@ -19,7 +19,6 @@
#include "xfs_fs.h"
#include "xfs_types.h"
#include "xfs_acl.h"
-#include "xfs_bit.h"
#include "xfs_log.h"
#include "xfs_inum.h"
#include "xfs_trans.h"
@@ -123,23 +122,7 @@ xfs_inode_free(
xfs_idestroy_fork(ip, XFS_ATTR_FORK);
if (ip->i_itemp) {
- /*
- * Only if we are shutting down the fs will we see an
- * inode still in the AIL. If it is there, we should remove
- * it to prevent a use-after-free from occurring.
- */
- xfs_log_item_t *lip = &ip->i_itemp->ili_item;
- struct xfs_ail *ailp = lip->li_ailp;
-
- ASSERT(((lip->li_flags & XFS_LI_IN_AIL) == 0) ||
- XFS_FORCED_SHUTDOWN(ip->i_mount));
- if (lip->li_flags & XFS_LI_IN_AIL) {
- spin_lock(&ailp->xa_lock);
- if (lip->li_flags & XFS_LI_IN_AIL)
- xfs_trans_ail_delete(ailp, lip);
- else
- spin_unlock(&ailp->xa_lock);
- }
+ ASSERT(!(ip->i_itemp->ili_item.li_flags & XFS_LI_IN_AIL));
xfs_inode_item_destroy(ip);
ip->i_itemp = NULL;
}
@@ -334,9 +317,10 @@ xfs_iget_cache_miss(
/*
* Preload the radix tree so we can insert safely under the
* write spinlock. Note that we cannot sleep inside the preload
- * region.
+ * region. Since we can be called from transaction context, don't
+ * recurse into the file system.
*/
- if (radix_tree_preload(GFP_KERNEL)) {
+ if (radix_tree_preload(GFP_NOFS)) {
error = EAGAIN;
goto out_destroy;
}
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index bc46c0a133d..a59eea09930 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -20,7 +20,6 @@
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_types.h"
-#include "xfs_bit.h"
#include "xfs_log.h"
#include "xfs_inum.h"
#include "xfs_trans.h"
@@ -61,6 +60,20 @@ STATIC int xfs_iformat_local(xfs_inode_t *, xfs_dinode_t *, int, int);
STATIC int xfs_iformat_extents(xfs_inode_t *, xfs_dinode_t *, int);
STATIC int xfs_iformat_btree(xfs_inode_t *, xfs_dinode_t *, int);
+/*
+ * helper function to extract extent size hint from inode
+ */
+xfs_extlen_t
+xfs_get_extsz_hint(
+ struct xfs_inode *ip)
+{
+ if ((ip->i_d.di_flags & XFS_DIFLAG_EXTSIZE) && ip->i_d.di_extsize)
+ return ip->i_d.di_extsize;
+ if (XFS_IS_REALTIME_INODE(ip))
+ return ip->i_mount->m_sb.sb_rextsize;
+ return 0;
+}
+
#ifdef DEBUG
/*
* Make sure that the extents in the given memory buffer
@@ -137,6 +150,7 @@ xfs_imap_to_bp(
int ni;
xfs_buf_t *bp;
+ buf_flags |= XBF_UNMAPPED;
error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, imap->im_blkno,
(int)imap->im_len, buf_flags, &bp);
if (error) {
@@ -226,7 +240,7 @@ xfs_inotobp(
if (error)
return error;
- error = xfs_imap_to_bp(mp, tp, &imap, &bp, XBF_LOCK, imap_flags);
+ error = xfs_imap_to_bp(mp, tp, &imap, &bp, 0, imap_flags);
if (error)
return error;
@@ -782,8 +796,7 @@ xfs_iread(
/*
* Get pointers to the on-disk inode and the buffer containing it.
*/
- error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &bp,
- XBF_LOCK, iget_flags);
+ error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &bp, 0, iget_flags);
if (error)
return error;
dip = (xfs_dinode_t *)xfs_buf_offset(bp, ip->i_imap.im_boffset);
@@ -1342,7 +1355,7 @@ xfs_iunlink(
* Here we put the head pointer into our next pointer,
* and then we fall through to point the head at us.
*/
- error = xfs_itobp(mp, tp, ip, &dip, &ibp, XBF_LOCK);
+ error = xfs_itobp(mp, tp, ip, &dip, &ibp, 0);
if (error)
return error;
@@ -1423,7 +1436,7 @@ xfs_iunlink_remove(
* of dealing with the buffer when there is no need to
* change it.
*/
- error = xfs_itobp(mp, tp, ip, &dip, &ibp, XBF_LOCK);
+ error = xfs_itobp(mp, tp, ip, &dip, &ibp, 0);
if (error) {
xfs_warn(mp, "%s: xfs_itobp() returned error %d.",
__func__, error);
@@ -1484,7 +1497,7 @@ xfs_iunlink_remove(
* Now last_ibp points to the buffer previous to us on
* the unlinked list. Pull us from the list.
*/
- error = xfs_itobp(mp, tp, ip, &dip, &ibp, XBF_LOCK);
+ error = xfs_itobp(mp, tp, ip, &dip, &ibp, 0);
if (error) {
xfs_warn(mp, "%s: xfs_itobp(2) returned error %d.",
__func__, error);
@@ -1566,8 +1579,7 @@ xfs_ifree_cluster(
* to mark all the active inodes on the buffer stale.
*/
bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, blkno,
- mp->m_bsize * blks_per_cluster,
- XBF_LOCK);
+ mp->m_bsize * blks_per_cluster, 0);
if (!bp)
return ENOMEM;
@@ -1737,7 +1749,7 @@ xfs_ifree(
xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
- error = xfs_itobp(ip->i_mount, tp, ip, &dip, &ibp, XBF_LOCK);
+ error = xfs_itobp(ip->i_mount, tp, ip, &dip, &ibp, 0);
if (error)
return error;
@@ -2347,11 +2359,11 @@ cluster_corrupt_out:
*/
rcu_read_unlock();
/*
- * Clean up the buffer. If it was B_DELWRI, just release it --
+ * Clean up the buffer. If it was delwri, just release it --
* brelse can handle it with no problems. If not, shut down the
* filesystem before releasing the buffer.
*/
- bufwasdelwri = XFS_BUF_ISDELAYWRITE(bp);
+ bufwasdelwri = (bp->b_flags & _XBF_DELWRI_Q);
if (bufwasdelwri)
xfs_buf_relse(bp);
@@ -2377,30 +2389,29 @@ cluster_corrupt_out:
/*
* Unlocks the flush lock
*/
- xfs_iflush_abort(iq);
+ xfs_iflush_abort(iq, false);
kmem_free(ilist);
xfs_perag_put(pag);
return XFS_ERROR(EFSCORRUPTED);
}
/*
- * xfs_iflush() will write a modified inode's changes out to the
- * inode's on disk home. The caller must have the inode lock held
- * in at least shared mode and the inode flush completion must be
- * active as well. The inode lock will still be held upon return from
- * the call and the caller is free to unlock it.
- * The inode flush will be completed when the inode reaches the disk.
- * The flags indicate how the inode's buffer should be written out.
+ * Flush dirty inode metadata into the backing buffer.
+ *
+ * The caller must have the inode lock and the inode flush lock held. The
+ * inode lock will still be held upon return to the caller, and the inode
+ * flush lock will be released after the inode has reached the disk.
+ *
+ * The caller must write out the buffer returned in *bpp and release it.
*/
int
xfs_iflush(
- xfs_inode_t *ip,
- uint flags)
+ struct xfs_inode *ip,
+ struct xfs_buf **bpp)
{
- xfs_inode_log_item_t *iip;
- xfs_buf_t *bp;
- xfs_dinode_t *dip;
- xfs_mount_t *mp;
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_buf *bp;
+ struct xfs_dinode *dip;
int error;
XFS_STATS_INC(xs_iflush_count);
@@ -2410,25 +2421,8 @@ xfs_iflush(
ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE ||
ip->i_d.di_nextents > XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK));
- iip = ip->i_itemp;
- mp = ip->i_mount;
+ *bpp = NULL;
- /*
- * We can't flush the inode until it is unpinned, so wait for it if we
- * are allowed to block. We know no one new can pin it, because we are
- * holding the inode lock shared and you need to hold it exclusively to
- * pin the inode.
- *
- * If we are not allowed to block, force the log out asynchronously so
- * that when we come back the inode will be unpinned. If other inodes
- * in the same cluster are dirty, they will probably write the inode
- * out for us if they occur after the log force completes.
- */
- if (!(flags & SYNC_WAIT) && xfs_ipincount(ip)) {
- xfs_iunpin(ip);
- xfs_ifunlock(ip);
- return EAGAIN;
- }
xfs_iunpin_wait(ip);
/*
@@ -2447,20 +2441,20 @@ xfs_iflush(
/*
* This may have been unpinned because the filesystem is shutting
* down forcibly. If that's the case we must not write this inode
- * to disk, because the log record didn't make it to disk!
+ * to disk, because the log record didn't make it to disk.
+ *
+ * We also have to remove the log item from the AIL in this case,
+ * as we wait for an empty AIL as part of the unmount process.
*/
if (XFS_FORCED_SHUTDOWN(mp)) {
- if (iip)
- iip->ili_fields = 0;
- xfs_ifunlock(ip);
- return XFS_ERROR(EIO);
+ error = XFS_ERROR(EIO);
+ goto abort_out;
}
/*
* Get the buffer containing the on-disk inode.
*/
- error = xfs_itobp(mp, NULL, ip, &dip, &bp,
- (flags & SYNC_TRYLOCK) ? XBF_TRYLOCK : XBF_LOCK);
+ error = xfs_itobp(mp, NULL, ip, &dip, &bp, XBF_TRYLOCK);
if (error || !bp) {
xfs_ifunlock(ip);
return error;
@@ -2488,23 +2482,20 @@ xfs_iflush(
if (error)
goto cluster_corrupt_out;
- if (flags & SYNC_WAIT)
- error = xfs_bwrite(bp);
- else
- xfs_buf_delwri_queue(bp);
-
- xfs_buf_relse(bp);
- return error;
+ *bpp = bp;
+ return 0;
corrupt_out:
xfs_buf_relse(bp);
xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
cluster_corrupt_out:
+ error = XFS_ERROR(EFSCORRUPTED);
+abort_out:
/*
* Unlocks the flush lock
*/
- xfs_iflush_abort(ip);
- return XFS_ERROR(EFSCORRUPTED);
+ xfs_iflush_abort(ip, false);
+ return error;
}
@@ -2706,27 +2697,6 @@ corrupt_out:
return XFS_ERROR(EFSCORRUPTED);
}
-void
-xfs_promote_inode(
- struct xfs_inode *ip)
-{
- struct xfs_buf *bp;
-
- ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED));
-
- bp = xfs_incore(ip->i_mount->m_ddev_targp, ip->i_imap.im_blkno,
- ip->i_imap.im_len, XBF_TRYLOCK);
- if (!bp)
- return;
-
- if (XFS_BUF_ISDELAYWRITE(bp)) {
- xfs_buf_delwri_promote(bp);
- wake_up_process(ip->i_mount->m_ddev_targp->bt_task);
- }
-
- xfs_buf_relse(bp);
-}
-
/*
* Return a pointer to the extent record at file index idx.
*/
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index 7fee3387e1c..1efff36a75b 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -529,11 +529,12 @@ int xfs_iunlink(struct xfs_trans *, xfs_inode_t *);
void xfs_iext_realloc(xfs_inode_t *, int, int);
void xfs_iunpin_wait(xfs_inode_t *);
-int xfs_iflush(xfs_inode_t *, uint);
-void xfs_promote_inode(struct xfs_inode *);
+int xfs_iflush(struct xfs_inode *, struct xfs_buf **);
void xfs_lock_inodes(xfs_inode_t **, int, uint);
void xfs_lock_two_inodes(xfs_inode_t *, xfs_inode_t *, uint);
+xfs_extlen_t xfs_get_extsz_hint(struct xfs_inode *ip);
+
#define IHOLD(ip) \
do { \
ASSERT(atomic_read(&VFS_I(ip)->i_count) > 0) ; \
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index 05d924efcea..6cdbf90c6f7 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -18,9 +18,7 @@
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_types.h"
-#include "xfs_bit.h"
#include "xfs_log.h"
-#include "xfs_inum.h"
#include "xfs_trans.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
@@ -480,25 +478,16 @@ xfs_inode_item_unpin(
wake_up_bit(&ip->i_flags, __XFS_IPINNED_BIT);
}
-/*
- * This is called to attempt to lock the inode associated with this
- * inode log item, in preparation for the push routine which does the actual
- * iflush. Don't sleep on the inode lock or the flush lock.
- *
- * If the flush lock is already held, indicating that the inode has
- * been or is in the process of being flushed, then (ideally) we'd like to
- * see if the inode's buffer is still incore, and if so give it a nudge.
- * We delay doing so until the pushbuf routine, though, to avoid holding
- * the AIL lock across a call to the blackhole which is the buffer cache.
- * Also we don't want to sleep in any device strategy routines, which can happen
- * if we do the subsequent bawrite in here.
- */
STATIC uint
-xfs_inode_item_trylock(
- struct xfs_log_item *lip)
+xfs_inode_item_push(
+ struct xfs_log_item *lip,
+ struct list_head *buffer_list)
{
struct xfs_inode_log_item *iip = INODE_ITEM(lip);
struct xfs_inode *ip = iip->ili_inode;
+ struct xfs_buf *bp = NULL;
+ uint rval = XFS_ITEM_SUCCESS;
+ int error;
if (xfs_ipincount(ip) > 0)
return XFS_ITEM_PINNED;
@@ -506,30 +495,50 @@ xfs_inode_item_trylock(
if (!xfs_ilock_nowait(ip, XFS_ILOCK_SHARED))
return XFS_ITEM_LOCKED;
+ /*
+ * Re-check the pincount now that we stabilized the value by
+ * taking the ilock.
+ */
+ if (xfs_ipincount(ip) > 0) {
+ rval = XFS_ITEM_PINNED;
+ goto out_unlock;
+ }
+
+ /*
+ * Someone else is already flushing the inode. Nothing we can do
+ * here but wait for the flush to finish and remove the item from
+ * the AIL.
+ */
if (!xfs_iflock_nowait(ip)) {
- /*
- * inode has already been flushed to the backing buffer,
- * leave it locked in shared mode, pushbuf routine will
- * unlock it.
- */
- return XFS_ITEM_PUSHBUF;
+ rval = XFS_ITEM_FLUSHING;
+ goto out_unlock;
}
- /* Stale items should force out the iclog */
+ /*
+ * Stale inode items should force out the iclog.
+ */
if (ip->i_flags & XFS_ISTALE) {
xfs_ifunlock(ip);
xfs_iunlock(ip, XFS_ILOCK_SHARED);
return XFS_ITEM_PINNED;
}
-#ifdef DEBUG
- if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) {
- ASSERT(iip->ili_fields != 0);
- ASSERT(iip->ili_logged == 0);
- ASSERT(lip->li_flags & XFS_LI_IN_AIL);
+ ASSERT(iip->ili_fields != 0 || XFS_FORCED_SHUTDOWN(ip->i_mount));
+ ASSERT(iip->ili_logged == 0 || XFS_FORCED_SHUTDOWN(ip->i_mount));
+
+ spin_unlock(&lip->li_ailp->xa_lock);
+
+ error = xfs_iflush(ip, &bp);
+ if (!error) {
+ if (!xfs_buf_delwri_queue(bp, buffer_list))
+ rval = XFS_ITEM_FLUSHING;
+ xfs_buf_relse(bp);
}
-#endif
- return XFS_ITEM_SUCCESS;
+
+ spin_lock(&lip->li_ailp->xa_lock);
+out_unlock:
+ xfs_iunlock(ip, XFS_ILOCK_SHARED);
+ return rval;
}
/*
@@ -614,86 +623,6 @@ xfs_inode_item_committed(
}
/*
- * This gets called by xfs_trans_push_ail(), when IOP_TRYLOCK
- * failed to get the inode flush lock but did get the inode locked SHARED.
- * Here we're trying to see if the inode buffer is incore, and if so whether it's
- * marked delayed write. If that's the case, we'll promote it and that will
- * allow the caller to write the buffer by triggering the xfsbufd to run.
- */
-STATIC bool
-xfs_inode_item_pushbuf(
- struct xfs_log_item *lip)
-{
- struct xfs_inode_log_item *iip = INODE_ITEM(lip);
- struct xfs_inode *ip = iip->ili_inode;
- struct xfs_buf *bp;
- bool ret = true;
-
- ASSERT(xfs_isilocked(ip, XFS_ILOCK_SHARED));
-
- /*
- * If a flush is not in progress anymore, chances are that the
- * inode was taken off the AIL. So, just get out.
- */
- if (!xfs_isiflocked(ip) ||
- !(lip->li_flags & XFS_LI_IN_AIL)) {
- xfs_iunlock(ip, XFS_ILOCK_SHARED);
- return true;
- }
-
- bp = xfs_incore(ip->i_mount->m_ddev_targp, iip->ili_format.ilf_blkno,
- iip->ili_format.ilf_len, XBF_TRYLOCK);
-
- xfs_iunlock(ip, XFS_ILOCK_SHARED);
- if (!bp)
- return true;
- if (XFS_BUF_ISDELAYWRITE(bp))
- xfs_buf_delwri_promote(bp);
- if (xfs_buf_ispinned(bp))
- ret = false;
- xfs_buf_relse(bp);
- return ret;
-}
-
-/*
- * This is called to asynchronously write the inode associated with this
- * inode log item out to disk. The inode will already have been locked by
- * a successful call to xfs_inode_item_trylock().
- */
-STATIC void
-xfs_inode_item_push(
- struct xfs_log_item *lip)
-{
- struct xfs_inode_log_item *iip = INODE_ITEM(lip);
- struct xfs_inode *ip = iip->ili_inode;
-
- ASSERT(xfs_isilocked(ip, XFS_ILOCK_SHARED));
- ASSERT(xfs_isiflocked(ip));
-
- /*
- * Since we were able to lock the inode's flush lock and
- * we found it on the AIL, the inode must be dirty. This
- * is because the inode is removed from the AIL while still
- * holding the flush lock in xfs_iflush_done(). Thus, if
- * we found it in the AIL and were able to obtain the flush
- * lock without sleeping, then there must not have been
- * anyone in the process of flushing the inode.
- */
- ASSERT(XFS_FORCED_SHUTDOWN(ip->i_mount) || iip->ili_fields != 0);
-
- /*
- * Push the inode to it's backing buffer. This will not remove the
- * inode from the AIL - a further push will be required to trigger a
- * buffer push. However, this allows all the dirty inodes to be pushed
- * to the buffer before it is pushed to disk. The buffer IO completion
- * will pull the inode from the AIL, mark it clean and unlock the flush
- * lock.
- */
- (void) xfs_iflush(ip, SYNC_TRYLOCK);
- xfs_iunlock(ip, XFS_ILOCK_SHARED);
-}
-
-/*
* XXX rcc - this one really has to do something. Probably needs
* to stamp in a new field in the incore inode.
*/
@@ -713,11 +642,9 @@ static const struct xfs_item_ops xfs_inode_item_ops = {
.iop_format = xfs_inode_item_format,
.iop_pin = xfs_inode_item_pin,
.iop_unpin = xfs_inode_item_unpin,
- .iop_trylock = xfs_inode_item_trylock,
.iop_unlock = xfs_inode_item_unlock,
.iop_committed = xfs_inode_item_committed,
.iop_push = xfs_inode_item_push,
- .iop_pushbuf = xfs_inode_item_pushbuf,
.iop_committing = xfs_inode_item_committing
};
@@ -848,7 +775,8 @@ xfs_iflush_done(
ASSERT(i <= need_ail);
}
/* xfs_trans_ail_delete_bulk() drops the AIL lock. */
- xfs_trans_ail_delete_bulk(ailp, log_items, i);
+ xfs_trans_ail_delete_bulk(ailp, log_items, i,
+ SHUTDOWN_CORRUPT_INCORE);
}
@@ -869,16 +797,15 @@ xfs_iflush_done(
}
/*
- * This is the inode flushing abort routine. It is called
- * from xfs_iflush when the filesystem is shutting down to clean
- * up the inode state.
- * It is responsible for removing the inode item
- * from the AIL if it has not been re-logged, and unlocking the inode's
- * flush lock.
+ * This is the inode flushing abort routine. It is called from xfs_iflush when
+ * the filesystem is shutting down to clean up the inode state. It is
+ * responsible for removing the inode item from the AIL if it has not been
+ * re-logged, and unlocking the inode's flush lock.
*/
void
xfs_iflush_abort(
- xfs_inode_t *ip)
+ xfs_inode_t *ip,
+ bool stale)
{
xfs_inode_log_item_t *iip = ip->i_itemp;
@@ -888,7 +815,10 @@ xfs_iflush_abort(
spin_lock(&ailp->xa_lock);
if (iip->ili_item.li_flags & XFS_LI_IN_AIL) {
/* xfs_trans_ail_delete() drops the AIL lock. */
- xfs_trans_ail_delete(ailp, (xfs_log_item_t *)iip);
+ xfs_trans_ail_delete(ailp, &iip->ili_item,
+ stale ?
+ SHUTDOWN_LOG_IO_ERROR :
+ SHUTDOWN_CORRUPT_INCORE);
} else
spin_unlock(&ailp->xa_lock);
}
@@ -915,7 +845,7 @@ xfs_istale_done(
struct xfs_buf *bp,
struct xfs_log_item *lip)
{
- xfs_iflush_abort(INODE_ITEM(lip)->ili_inode);
+ xfs_iflush_abort(INODE_ITEM(lip)->ili_inode, true);
}
/*
diff --git a/fs/xfs/xfs_inode_item.h b/fs/xfs/xfs_inode_item.h
index 41d61c3b7a3..376d4d0b263 100644
--- a/fs/xfs/xfs_inode_item.h
+++ b/fs/xfs/xfs_inode_item.h
@@ -165,7 +165,7 @@ extern void xfs_inode_item_init(struct xfs_inode *, struct xfs_mount *);
extern void xfs_inode_item_destroy(struct xfs_inode *);
extern void xfs_iflush_done(struct xfs_buf *, struct xfs_log_item *);
extern void xfs_istale_done(struct xfs_buf *, struct xfs_log_item *);
-extern void xfs_iflush_abort(struct xfs_inode *);
+extern void xfs_iflush_abort(struct xfs_inode *, bool);
extern int xfs_inode_item_format_convert(xfs_log_iovec_t *,
xfs_inode_log_format_t *);
diff --git a/fs/xfs/xfs_inum.h b/fs/xfs/xfs_inum.h
index b253c0ea5be..90efdaf1706 100644
--- a/fs/xfs/xfs_inum.h
+++ b/fs/xfs/xfs_inum.h
@@ -26,11 +26,6 @@
* high agno_log-agblklog-inopblog bits - 0
*/
-typedef __uint32_t xfs_agino_t; /* within allocation grp inode number */
-
-#define NULLFSINO ((xfs_ino_t)-1)
-#define NULLAGINO ((xfs_agino_t)-1)
-
struct xfs_mount;
#define XFS_INO_MASK(k) (__uint32_t)((1ULL << (k)) - 1)
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index 91f8ff547ab..3a05a41b5d7 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -17,9 +17,7 @@
*/
#include "xfs.h"
#include "xfs_fs.h"
-#include "xfs_bit.h"
#include "xfs_log.h"
-#include "xfs_inum.h"
#include "xfs_trans.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
diff --git a/fs/xfs/xfs_ioctl32.c b/fs/xfs/xfs_ioctl32.c
index a849a5473af..c4f2da0d2bf 100644
--- a/fs/xfs/xfs_ioctl32.c
+++ b/fs/xfs/xfs_ioctl32.c
@@ -22,9 +22,7 @@
#include <asm/uaccess.h>
#include "xfs.h"
#include "xfs_fs.h"
-#include "xfs_bit.h"
#include "xfs_log.h"
-#include "xfs_inum.h"
#include "xfs_trans.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 71a464503c4..aadfce6681e 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -17,9 +17,7 @@
*/
#include "xfs.h"
#include "xfs_fs.h"
-#include "xfs_bit.h"
#include "xfs_log.h"
-#include "xfs_inum.h"
#include "xfs_trans.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
@@ -37,7 +35,6 @@
#include "xfs_rtalloc.h"
#include "xfs_error.h"
#include "xfs_itable.h"
-#include "xfs_rw.h"
#include "xfs_attr.h"
#include "xfs_buf_item.h"
#include "xfs_trans_space.h"
@@ -142,11 +139,7 @@ xfs_iomap_write_direct(
int committed;
int error;
- /*
- * Make sure that the dquots are there. This doesn't hold
- * the ilock across a disk read.
- */
- error = xfs_qm_dqattach_locked(ip, 0);
+ error = xfs_qm_dqattach(ip, 0);
if (error)
return XFS_ERROR(error);
@@ -158,7 +151,7 @@ xfs_iomap_write_direct(
if ((offset + count) > XFS_ISIZE(ip)) {
error = xfs_iomap_eof_align_last_fsb(mp, ip, extsz, &last_fsb);
if (error)
- goto error_out;
+ return XFS_ERROR(error);
} else {
if (nmaps && (imap->br_startblock == HOLESTARTBLOCK))
last_fsb = MIN(last_fsb, (xfs_fileoff_t)
@@ -190,7 +183,6 @@ xfs_iomap_write_direct(
/*
* Allocate and setup the transaction
*/
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT);
error = xfs_trans_reserve(tp, resblks,
XFS_WRITE_LOG_RES(mp), resrtextents,
@@ -199,15 +191,16 @@ xfs_iomap_write_direct(
/*
* Check for running out of space, note: need lock to return
*/
- if (error)
+ if (error) {
xfs_trans_cancel(tp, 0);
+ return XFS_ERROR(error);
+ }
+
xfs_ilock(ip, XFS_ILOCK_EXCL);
- if (error)
- goto error_out;
error = xfs_trans_reserve_quota_nblks(tp, ip, qblocks, 0, quota_flag);
if (error)
- goto error1;
+ goto out_trans_cancel;
xfs_trans_ijoin(tp, ip, 0);
@@ -224,42 +217,39 @@ xfs_iomap_write_direct(
error = xfs_bmapi_write(tp, ip, offset_fsb, count_fsb, bmapi_flag,
&firstfsb, 0, imap, &nimaps, &free_list);
if (error)
- goto error0;
+ goto out_bmap_cancel;
/*
* Complete the transaction
*/
error = xfs_bmap_finish(&tp, &free_list, &committed);
if (error)
- goto error0;
+ goto out_bmap_cancel;
error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
if (error)
- goto error_out;
+ goto out_unlock;
/*
* Copy any maps to caller's array and return any error.
*/
if (nimaps == 0) {
- error = ENOSPC;
- goto error_out;
+ error = XFS_ERROR(ENOSPC);
+ goto out_unlock;
}
- if (!(imap->br_startblock || XFS_IS_REALTIME_INODE(ip))) {
+ if (!(imap->br_startblock || XFS_IS_REALTIME_INODE(ip)))
error = xfs_alert_fsblock_zero(ip, imap);
- goto error_out;
- }
- return 0;
+out_unlock:
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ return error;
-error0: /* Cancel bmap, unlock inode, unreserve quota blocks, cancel trans */
+out_bmap_cancel:
xfs_bmap_cancel(&free_list);
- xfs_trans_unreserve_quota_nblks(tp, ip, qblocks, 0, quota_flag);
-
-error1: /* Just cancel transaction */
+ xfs_trans_unreserve_quota_nblks(tp, ip, (long)qblocks, 0, quota_flag);
+out_trans_cancel:
xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
-
-error_out:
- return XFS_ERROR(error);
+ goto out_unlock;
}
/*
@@ -422,6 +412,15 @@ retry:
return error;
}
+ /*
+ * Make sure preallocation does not create extents beyond the range we
+ * actually support in this filesystem.
+ */
+ if (last_fsb > XFS_B_TO_FSB(mp, mp->m_maxioffset))
+ last_fsb = XFS_B_TO_FSB(mp, mp->m_maxioffset);
+
+ ASSERT(last_fsb > offset_fsb);
+
nimaps = XFS_WRITE_IMAPS;
error = xfs_bmapi_delay(ip, offset_fsb, last_fsb - offset_fsb,
imap, &nimaps, XFS_BMAPI_ENTIRE);
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index 3011b879f85..1a25fd80279 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -18,9 +18,7 @@
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_acl.h"
-#include "xfs_bit.h"
#include "xfs_log.h"
-#include "xfs_inum.h"
#include "xfs_trans.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
@@ -34,7 +32,6 @@
#include "xfs_rtalloc.h"
#include "xfs_error.h"
#include "xfs_itable.h"
-#include "xfs_rw.h"
#include "xfs_attr.h"
#include "xfs_buf_item.h"
#include "xfs_utils.h"
@@ -700,7 +697,7 @@ xfs_setattr_size(
xfs_off_t oldsize, newsize;
struct xfs_trans *tp;
int error;
- uint lock_flags;
+ uint lock_flags = 0;
uint commit_flags = 0;
trace_xfs_setattr(ip);
@@ -720,10 +717,10 @@ xfs_setattr_size(
ATTR_MTIME_SET|ATTR_KILL_SUID|ATTR_KILL_SGID|
ATTR_KILL_PRIV|ATTR_TIMES_SET)) == 0);
- lock_flags = XFS_ILOCK_EXCL;
- if (!(flags & XFS_ATTR_NOLOCK))
+ if (!(flags & XFS_ATTR_NOLOCK)) {
lock_flags |= XFS_IOLOCK_EXCL;
- xfs_ilock(ip, lock_flags);
+ xfs_ilock(ip, lock_flags);
+ }
oldsize = inode->i_size;
newsize = iattr->ia_size;
@@ -746,7 +743,7 @@ xfs_setattr_size(
/*
* Make sure that the dquots are attached to the inode.
*/
- error = xfs_qm_dqattach_locked(ip, 0);
+ error = xfs_qm_dqattach(ip, 0);
if (error)
goto out_unlock;
@@ -768,8 +765,6 @@ xfs_setattr_size(
if (error)
goto out_unlock;
}
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
- lock_flags &= ~XFS_ILOCK_EXCL;
/*
* We are going to log the inode size change in this transaction so
diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c
index acc2bf264da..eff577a9b67 100644
--- a/fs/xfs/xfs_itable.c
+++ b/fs/xfs/xfs_itable.c
@@ -18,7 +18,6 @@
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_types.h"
-#include "xfs_bit.h"
#include "xfs_log.h"
#include "xfs_inum.h"
#include "xfs_trans.h"
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index 6db1fef38bf..6b965bf450e 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -18,9 +18,7 @@
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_types.h"
-#include "xfs_bit.h"
#include "xfs_log.h"
-#include "xfs_inum.h"
#include "xfs_trans.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
@@ -35,7 +33,6 @@
#include "xfs_trans_priv.h"
#include "xfs_dinode.h"
#include "xfs_inode.h"
-#include "xfs_rw.h"
#include "xfs_trace.h"
kmem_zone_t *xfs_log_ticket_zone;
@@ -916,27 +913,42 @@ xfs_log_need_covered(xfs_mount_t *mp)
* We may be holding the log iclog lock upon entering this routine.
*/
xfs_lsn_t
-xlog_assign_tail_lsn(
+xlog_assign_tail_lsn_locked(
struct xfs_mount *mp)
{
- xfs_lsn_t tail_lsn;
struct log *log = mp->m_log;
+ struct xfs_log_item *lip;
+ xfs_lsn_t tail_lsn;
+
+ assert_spin_locked(&mp->m_ail->xa_lock);
/*
* To make sure we always have a valid LSN for the log tail we keep
* track of the last LSN which was committed in log->l_last_sync_lsn,
- * and use that when the AIL was empty and xfs_ail_min_lsn returns 0.
- *
- * If the AIL has been emptied we also need to wake any process
- * waiting for this condition.
+ * and use that when the AIL was empty.
*/
- tail_lsn = xfs_ail_min_lsn(mp->m_ail);
- if (!tail_lsn)
+ lip = xfs_ail_min(mp->m_ail);
+ if (lip)
+ tail_lsn = lip->li_lsn;
+ else
tail_lsn = atomic64_read(&log->l_last_sync_lsn);
atomic64_set(&log->l_tail_lsn, tail_lsn);
return tail_lsn;
}
+xfs_lsn_t
+xlog_assign_tail_lsn(
+ struct xfs_mount *mp)
+{
+ xfs_lsn_t tail_lsn;
+
+ spin_lock(&mp->m_ail->xa_lock);
+ tail_lsn = xlog_assign_tail_lsn_locked(mp);
+ spin_unlock(&mp->m_ail->xa_lock);
+
+ return tail_lsn;
+}
+
/*
* Return the space in the log between the tail and the head. The head
* is passed in the cycle/bytes formal parms. In the special case where
@@ -1172,7 +1184,7 @@ xlog_alloc_log(xfs_mount_t *mp,
xlog_get_iclog_buffer_size(mp, log);
error = ENOMEM;
- bp = xfs_buf_alloc(mp->m_logdev_targp, 0, log->l_iclog_size, 0);
+ bp = xfs_buf_alloc(mp->m_logdev_targp, 0, BTOBB(log->l_iclog_size), 0);
if (!bp)
goto out_free_log;
bp->b_iodone = xlog_iodone;
@@ -1182,9 +1194,6 @@ xlog_alloc_log(xfs_mount_t *mp,
spin_lock_init(&log->l_icloglock);
init_waitqueue_head(&log->l_flush_wait);
- /* log record size must be multiple of BBSIZE; see xlog_rec_header_t */
- ASSERT((XFS_BUF_SIZE(bp) & BBMASK) == 0);
-
iclogp = &log->l_iclog;
/*
* The amount of memory to allocate for the iclog structure is
@@ -1204,7 +1213,7 @@ xlog_alloc_log(xfs_mount_t *mp,
prev_iclog = iclog;
bp = xfs_buf_get_uncached(mp->m_logdev_targp,
- log->l_iclog_size, 0);
+ BTOBB(log->l_iclog_size), 0);
if (!bp)
goto out_free_iclog;
@@ -1224,7 +1233,7 @@ xlog_alloc_log(xfs_mount_t *mp,
head->h_fmt = cpu_to_be32(XLOG_FMT);
memcpy(&head->h_fs_uuid, &mp->m_sb.sb_uuid, sizeof(uuid_t));
- iclog->ic_size = XFS_BUF_SIZE(bp) - log->l_iclog_hsize;
+ iclog->ic_size = BBTOB(bp->b_length) - log->l_iclog_hsize;
iclog->ic_state = XLOG_STATE_ACTIVE;
iclog->ic_log = log;
atomic_set(&iclog->ic_refcnt, 0);
@@ -1475,7 +1484,7 @@ xlog_sync(xlog_t *log,
} else {
iclog->ic_bwritecnt = 1;
}
- XFS_BUF_SET_COUNT(bp, count);
+ bp->b_io_length = BTOBB(count);
bp->b_fspriv = iclog;
XFS_BUF_ZEROFLAGS(bp);
XFS_BUF_ASYNC(bp);
@@ -1573,7 +1582,7 @@ xlog_dealloc_log(xlog_t *log)
* always need to ensure that the extra buffer does not point to memory
* owned by another log buffer before we free it.
*/
- xfs_buf_set_empty(log->l_xbuf, log->l_iclog_size);
+ xfs_buf_set_empty(log->l_xbuf, BTOBB(log->l_iclog_size));
xfs_buf_free(log->l_xbuf);
iclog = log->l_iclog;
@@ -2932,6 +2941,7 @@ xfs_log_force(
{
int error;
+ trace_xfs_log_force(mp, 0);
error = _xfs_log_force(mp, flags, NULL);
if (error)
xfs_warn(mp, "%s: error %d returned.", __func__, error);
@@ -3080,6 +3090,7 @@ xfs_log_force_lsn(
{
int error;
+ trace_xfs_log_force(mp, lsn);
error = _xfs_log_force_lsn(mp, lsn, flags, NULL);
if (error)
xfs_warn(mp, "%s: error %d returned.", __func__, error);
diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h
index 2c622bedb30..748d312850e 100644
--- a/fs/xfs/xfs_log.h
+++ b/fs/xfs/xfs_log.h
@@ -152,6 +152,7 @@ int xfs_log_mount(struct xfs_mount *mp,
int num_bblocks);
int xfs_log_mount_finish(struct xfs_mount *mp);
xfs_lsn_t xlog_assign_tail_lsn(struct xfs_mount *mp);
+xfs_lsn_t xlog_assign_tail_lsn_locked(struct xfs_mount *mp);
void xfs_log_space_wake(struct xfs_mount *mp);
int xfs_log_notify(struct xfs_mount *mp,
struct xlog_in_core *iclog,
diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c
index d4fadbe8ac9..7d6197c5849 100644
--- a/fs/xfs/xfs_log_cil.c
+++ b/fs/xfs/xfs_log_cil.c
@@ -18,9 +18,7 @@
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_types.h"
-#include "xfs_bit.h"
#include "xfs_log.h"
-#include "xfs_inum.h"
#include "xfs_trans.h"
#include "xfs_trans_priv.h"
#include "xfs_log_priv.h"
@@ -29,61 +27,10 @@
#include "xfs_mount.h"
#include "xfs_error.h"
#include "xfs_alloc.h"
+#include "xfs_extent_busy.h"
#include "xfs_discard.h"
/*
- * Perform initial CIL structure initialisation.
- */
-int
-xlog_cil_init(
- struct log *log)
-{
- struct xfs_cil *cil;
- struct xfs_cil_ctx *ctx;
-
- cil = kmem_zalloc(sizeof(*cil), KM_SLEEP|KM_MAYFAIL);
- if (!cil)
- return ENOMEM;
-
- ctx = kmem_zalloc(sizeof(*ctx), KM_SLEEP|KM_MAYFAIL);
- if (!ctx) {
- kmem_free(cil);
- return ENOMEM;
- }
-
- INIT_LIST_HEAD(&cil->xc_cil);
- INIT_LIST_HEAD(&cil->xc_committing);
- spin_lock_init(&cil->xc_cil_lock);
- init_rwsem(&cil->xc_ctx_lock);
- init_waitqueue_head(&cil->xc_commit_wait);
-
- INIT_LIST_HEAD(&ctx->committing);
- INIT_LIST_HEAD(&ctx->busy_extents);
- ctx->sequence = 1;
- ctx->cil = cil;
- cil->xc_ctx = ctx;
- cil->xc_current_sequence = ctx->sequence;
-
- cil->xc_log = log;
- log->l_cilp = cil;
- return 0;
-}
-
-void
-xlog_cil_destroy(
- struct log *log)
-{
- if (log->l_cilp->xc_ctx) {
- if (log->l_cilp->xc_ctx->ticket)
- xfs_log_ticket_put(log->l_cilp->xc_ctx->ticket);
- kmem_free(log->l_cilp->xc_ctx);
- }
-
- ASSERT(list_empty(&log->l_cilp->xc_cil));
- kmem_free(log->l_cilp);
-}
-
-/*
* Allocate a new ticket. Failing to get a new ticket makes it really hard to
* recover, so we don't allow failure here. Also, we allocate in a context that
* we don't want to be issuing transactions from, so we need to tell the
@@ -390,8 +337,8 @@ xlog_cil_committed(
xfs_trans_committed_bulk(ctx->cil->xc_log->l_ailp, ctx->lv_chain,
ctx->start_lsn, abort);
- xfs_alloc_busy_sort(&ctx->busy_extents);
- xfs_alloc_busy_clear(mp, &ctx->busy_extents,
+ xfs_extent_busy_sort(&ctx->busy_extents);
+ xfs_extent_busy_clear(mp, &ctx->busy_extents,
(mp->m_flags & XFS_MOUNT_DISCARD) && !abort);
spin_lock(&ctx->cil->xc_cil_lock);
@@ -404,7 +351,7 @@ xlog_cil_committed(
ASSERT(mp->m_flags & XFS_MOUNT_DISCARD);
xfs_discard_extents(mp, &ctx->busy_extents);
- xfs_alloc_busy_clear(mp, &ctx->busy_extents, false);
+ xfs_extent_busy_clear(mp, &ctx->busy_extents, false);
}
kmem_free(ctx);
@@ -426,8 +373,7 @@ xlog_cil_committed(
*/
STATIC int
xlog_cil_push(
- struct log *log,
- xfs_lsn_t push_seq)
+ struct log *log)
{
struct xfs_cil *cil = log->l_cilp;
struct xfs_log_vec *lv;
@@ -443,39 +389,36 @@ xlog_cil_push(
struct xfs_log_iovec lhdr;
struct xfs_log_vec lvhdr = { NULL };
xfs_lsn_t commit_lsn;
+ xfs_lsn_t push_seq;
if (!cil)
return 0;
- ASSERT(!push_seq || push_seq <= cil->xc_ctx->sequence);
-
new_ctx = kmem_zalloc(sizeof(*new_ctx), KM_SLEEP|KM_NOFS);
new_ctx->ticket = xlog_cil_ticket_alloc(log);
- /*
- * Lock out transaction commit, but don't block for background pushes
- * unless we are well over the CIL space limit. See the definition of
- * XLOG_CIL_HARD_SPACE_LIMIT() for the full explanation of the logic
- * used here.
- */
- if (!down_write_trylock(&cil->xc_ctx_lock)) {
- if (!push_seq &&
- cil->xc_ctx->space_used < XLOG_CIL_HARD_SPACE_LIMIT(log))
- goto out_free_ticket;
- down_write(&cil->xc_ctx_lock);
- }
+ down_write(&cil->xc_ctx_lock);
ctx = cil->xc_ctx;
- /* check if we've anything to push */
- if (list_empty(&cil->xc_cil))
- goto out_skip;
+ spin_lock(&cil->xc_cil_lock);
+ push_seq = cil->xc_push_seq;
+ ASSERT(push_seq <= ctx->sequence);
- /* check for spurious background flush */
- if (!push_seq && cil->xc_ctx->space_used < XLOG_CIL_SPACE_LIMIT(log))
+ /*
+ * Check if we've anything to push. If there is nothing, then we don't
+ * move on to a new sequence number and so we have to be able to push
+ * this sequence again later.
+ */
+ if (list_empty(&cil->xc_cil)) {
+ cil->xc_push_seq = 0;
+ spin_unlock(&cil->xc_cil_lock);
goto out_skip;
+ }
+ spin_unlock(&cil->xc_cil_lock);
+
/* check for a previously pushed seqeunce */
- if (push_seq && push_seq < cil->xc_ctx->sequence)
+ if (push_seq < cil->xc_ctx->sequence)
goto out_skip;
/*
@@ -629,7 +572,6 @@ restart:
out_skip:
up_write(&cil->xc_ctx_lock);
-out_free_ticket:
xfs_log_ticket_put(new_ctx->ticket);
kmem_free(new_ctx);
return 0;
@@ -641,6 +583,82 @@ out_abort:
return XFS_ERROR(EIO);
}
+static void
+xlog_cil_push_work(
+ struct work_struct *work)
+{
+ struct xfs_cil *cil = container_of(work, struct xfs_cil,
+ xc_push_work);
+ xlog_cil_push(cil->xc_log);
+}
+
+/*
+ * We need to push CIL every so often so we don't cache more than we can fit in
+ * the log. The limit really is that a checkpoint can't be more than half the
+ * log (the current checkpoint is not allowed to overwrite the previous
+ * checkpoint), but commit latency and memory usage limit this to a smaller
+ * size.
+ */
+static void
+xlog_cil_push_background(
+ struct log *log)
+{
+ struct xfs_cil *cil = log->l_cilp;
+
+ /*
+ * The cil won't be empty because we are called while holding the
+ * context lock so whatever we added to the CIL will still be there
+ */
+ ASSERT(!list_empty(&cil->xc_cil));
+
+ /*
+ * don't do a background push if we haven't used up all the
+ * space available yet.
+ */
+ if (cil->xc_ctx->space_used < XLOG_CIL_SPACE_LIMIT(log))
+ return;
+
+ spin_lock(&cil->xc_cil_lock);
+ if (cil->xc_push_seq < cil->xc_current_sequence) {
+ cil->xc_push_seq = cil->xc_current_sequence;
+ queue_work(log->l_mp->m_cil_workqueue, &cil->xc_push_work);
+ }
+ spin_unlock(&cil->xc_cil_lock);
+
+}
+
+static void
+xlog_cil_push_foreground(
+ struct log *log,
+ xfs_lsn_t push_seq)
+{
+ struct xfs_cil *cil = log->l_cilp;
+
+ if (!cil)
+ return;
+
+ ASSERT(push_seq && push_seq <= cil->xc_current_sequence);
+
+ /* start on any pending background push to minimise wait time on it */
+ flush_work(&cil->xc_push_work);
+
+ /*
+ * If the CIL is empty or we've already pushed the sequence then
+ * there's no work we need to do.
+ */
+ spin_lock(&cil->xc_cil_lock);
+ if (list_empty(&cil->xc_cil) || push_seq <= cil->xc_push_seq) {
+ spin_unlock(&cil->xc_cil_lock);
+ return;
+ }
+
+ cil->xc_push_seq = push_seq;
+ spin_unlock(&cil->xc_cil_lock);
+
+ /* do the push now */
+ xlog_cil_push(log);
+}
+
/*
* Commit a transaction with the given vector to the Committed Item List.
*
@@ -667,7 +685,6 @@ xfs_log_commit_cil(
{
struct log *log = mp->m_log;
int log_flags = 0;
- int push = 0;
struct xfs_log_vec *log_vector;
if (flags & XFS_TRANS_RELEASE_LOG_RES)
@@ -719,21 +736,9 @@ xfs_log_commit_cil(
*/
xfs_trans_free_items(tp, *commit_lsn, 0);
- /* check for background commit before unlock */
- if (log->l_cilp->xc_ctx->space_used > XLOG_CIL_SPACE_LIMIT(log))
- push = 1;
+ xlog_cil_push_background(log);
up_read(&log->l_cilp->xc_ctx_lock);
-
- /*
- * We need to push CIL every so often so we don't cache more than we
- * can fit in the log. The limit really is that a checkpoint can't be
- * more than half the log (the current checkpoint is not allowed to
- * overwrite the previous checkpoint), but commit latency and memory
- * usage limit this to a smaller size in most cases.
- */
- if (push)
- xlog_cil_push(log, 0);
return 0;
}
@@ -746,9 +751,6 @@ xfs_log_commit_cil(
*
* We return the current commit lsn to allow the callers to determine if a
* iclog flush is necessary following this call.
- *
- * XXX: Initially, just push the CIL unconditionally and return whatever
- * commit lsn is there. It'll be empty, so this is broken for now.
*/
xfs_lsn_t
xlog_cil_force_lsn(
@@ -766,8 +768,7 @@ xlog_cil_force_lsn(
* xlog_cil_push() handles racing pushes for the same sequence,
* so no need to deal with it here.
*/
- if (sequence == cil->xc_current_sequence)
- xlog_cil_push(log, sequence);
+ xlog_cil_push_foreground(log, sequence);
/*
* See if we can find a previous sequence still committing.
@@ -826,3 +827,57 @@ xfs_log_item_in_current_chkpt(
return false;
return true;
}
+
+/*
+ * Perform initial CIL structure initialisation.
+ */
+int
+xlog_cil_init(
+ struct log *log)
+{
+ struct xfs_cil *cil;
+ struct xfs_cil_ctx *ctx;
+
+ cil = kmem_zalloc(sizeof(*cil), KM_SLEEP|KM_MAYFAIL);
+ if (!cil)
+ return ENOMEM;
+
+ ctx = kmem_zalloc(sizeof(*ctx), KM_SLEEP|KM_MAYFAIL);
+ if (!ctx) {
+ kmem_free(cil);
+ return ENOMEM;
+ }
+
+ INIT_WORK(&cil->xc_push_work, xlog_cil_push_work);
+ INIT_LIST_HEAD(&cil->xc_cil);
+ INIT_LIST_HEAD(&cil->xc_committing);
+ spin_lock_init(&cil->xc_cil_lock);
+ init_rwsem(&cil->xc_ctx_lock);
+ init_waitqueue_head(&cil->xc_commit_wait);
+
+ INIT_LIST_HEAD(&ctx->committing);
+ INIT_LIST_HEAD(&ctx->busy_extents);
+ ctx->sequence = 1;
+ ctx->cil = cil;
+ cil->xc_ctx = ctx;
+ cil->xc_current_sequence = ctx->sequence;
+
+ cil->xc_log = log;
+ log->l_cilp = cil;
+ return 0;
+}
+
+void
+xlog_cil_destroy(
+ struct log *log)
+{
+ if (log->l_cilp->xc_ctx) {
+ if (log->l_cilp->xc_ctx->ticket)
+ xfs_log_ticket_put(log->l_cilp->xc_ctx->ticket);
+ kmem_free(log->l_cilp->xc_ctx);
+ }
+
+ ASSERT(list_empty(&log->l_cilp->xc_cil));
+ kmem_free(log->l_cilp);
+}
+
diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h
index 2152900b79d..735ff1ee53d 100644
--- a/fs/xfs/xfs_log_priv.h
+++ b/fs/xfs/xfs_log_priv.h
@@ -417,6 +417,8 @@ struct xfs_cil {
struct list_head xc_committing;
wait_queue_head_t xc_commit_wait;
xfs_lsn_t xc_current_sequence;
+ struct work_struct xc_push_work;
+ xfs_lsn_t xc_push_seq;
};
/*
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 8ecad5bad66..ca386909131 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -40,7 +40,6 @@
#include "xfs_extfree_item.h"
#include "xfs_trans_priv.h"
#include "xfs_quota.h"
-#include "xfs_rw.h"
#include "xfs_utils.h"
#include "xfs_trace.h"
@@ -120,7 +119,7 @@ xlog_get_bp(
nbblks += log->l_sectBBsize;
nbblks = round_up(nbblks, log->l_sectBBsize);
- bp = xfs_buf_get_uncached(log->l_mp->m_logdev_targp, BBTOB(nbblks), 0);
+ bp = xfs_buf_get_uncached(log->l_mp->m_logdev_targp, nbblks, 0);
if (bp)
xfs_buf_unlock(bp);
return bp;
@@ -146,7 +145,7 @@ xlog_align(
{
xfs_daddr_t offset = blk_no & ((xfs_daddr_t)log->l_sectBBsize - 1);
- ASSERT(BBTOB(offset + nbblks) <= XFS_BUF_SIZE(bp));
+ ASSERT(offset + nbblks <= bp->b_length);
return bp->b_addr + BBTOB(offset);
}
@@ -174,11 +173,12 @@ xlog_bread_noalign(
nbblks = round_up(nbblks, log->l_sectBBsize);
ASSERT(nbblks > 0);
- ASSERT(BBTOB(nbblks) <= XFS_BUF_SIZE(bp));
+ ASSERT(nbblks <= bp->b_length);
XFS_BUF_SET_ADDR(bp, log->l_logBBstart + blk_no);
XFS_BUF_READ(bp);
- XFS_BUF_SET_COUNT(bp, BBTOB(nbblks));
+ bp->b_io_length = nbblks;
+ bp->b_error = 0;
xfsbdstrat(log->l_mp, bp);
error = xfs_buf_iowait(bp);
@@ -218,7 +218,7 @@ xlog_bread_offset(
xfs_caddr_t offset)
{
xfs_caddr_t orig_offset = bp->b_addr;
- int orig_len = bp->b_buffer_length;
+ int orig_len = BBTOB(bp->b_length);
int error, error2;
error = xfs_buf_associate_memory(bp, offset, BBTOB(nbblks));
@@ -259,13 +259,14 @@ xlog_bwrite(
nbblks = round_up(nbblks, log->l_sectBBsize);
ASSERT(nbblks > 0);
- ASSERT(BBTOB(nbblks) <= XFS_BUF_SIZE(bp));
+ ASSERT(nbblks <= bp->b_length);
XFS_BUF_SET_ADDR(bp, log->l_logBBstart + blk_no);
XFS_BUF_ZEROFLAGS(bp);
xfs_buf_hold(bp);
xfs_buf_lock(bp);
- XFS_BUF_SET_COUNT(bp, BBTOB(nbblks));
+ bp->b_io_length = nbblks;
+ bp->b_error = 0;
error = xfs_bwrite(bp);
if (error)
@@ -440,6 +441,8 @@ xlog_find_verify_cycle(
* a log sector, or we're out of luck.
*/
bufblks = 1 << ffs(nbblks);
+ while (bufblks > log->l_logBBsize)
+ bufblks >>= 1;
while (!(bp = xlog_get_bp(log, bufblks))) {
bufblks >>= 1;
if (bufblks < log->l_sectBBsize)
@@ -1225,6 +1228,8 @@ xlog_write_log_records(
* log sector, or we're out of luck.
*/
bufblks = 1 << ffs(blocks);
+ while (bufblks > log->l_logBBsize)
+ bufblks >>= 1;
while (!(bp = xlog_get_bp(log, bufblks))) {
bufblks >>= 1;
if (bufblks < sectbb)
@@ -1772,7 +1777,7 @@ xlog_recover_do_inode_buffer(
trace_xfs_log_recover_buf_inode_buf(mp->m_log, buf_f);
- inodes_per_buf = XFS_BUF_COUNT(bp) >> mp->m_sb.sb_inodelog;
+ inodes_per_buf = BBTOB(bp->b_io_length) >> mp->m_sb.sb_inodelog;
for (i = 0; i < inodes_per_buf; i++) {
next_unlinked_offset = (i * mp->m_sb.sb_inodesize) +
offsetof(xfs_dinode_t, di_next_unlinked);
@@ -1814,7 +1819,8 @@ xlog_recover_do_inode_buffer(
ASSERT(item->ri_buf[item_index].i_addr != NULL);
ASSERT((item->ri_buf[item_index].i_len % XFS_BLF_CHUNK) == 0);
- ASSERT((reg_buf_offset + reg_buf_bytes) <= XFS_BUF_COUNT(bp));
+ ASSERT((reg_buf_offset + reg_buf_bytes) <=
+ BBTOB(bp->b_io_length));
/*
* The current logged region contains a copy of the
@@ -1873,8 +1879,8 @@ xlog_recover_do_reg_buffer(
ASSERT(nbits > 0);
ASSERT(item->ri_buf[i].i_addr != NULL);
ASSERT(item->ri_buf[i].i_len % XFS_BLF_CHUNK == 0);
- ASSERT(XFS_BUF_COUNT(bp) >=
- ((uint)bit << XFS_BLF_SHIFT)+(nbits<<XFS_BLF_SHIFT));
+ ASSERT(BBTOB(bp->b_io_length) >=
+ ((uint)bit << XFS_BLF_SHIFT) + (nbits << XFS_BLF_SHIFT));
/*
* Do a sanity check if this is a dquot buffer. Just checking
@@ -2103,6 +2109,7 @@ xlog_recover_do_dquot_buffer(
STATIC int
xlog_recover_buffer_pass2(
xlog_t *log,
+ struct list_head *buffer_list,
xlog_recover_item_t *item)
{
xfs_buf_log_format_t *buf_f = item->ri_buf[0].i_addr;
@@ -2123,9 +2130,9 @@ xlog_recover_buffer_pass2(
trace_xfs_log_recover_buf_recover(log, buf_f);
- buf_flags = XBF_LOCK;
- if (!(buf_f->blf_flags & XFS_BLF_INODE_BUF))
- buf_flags |= XBF_MAPPED;
+ buf_flags = 0;
+ if (buf_f->blf_flags & XFS_BLF_INODE_BUF)
+ buf_flags |= XBF_UNMAPPED;
bp = xfs_buf_read(mp->m_ddev_targp, buf_f->blf_blkno, buf_f->blf_len,
buf_flags);
@@ -2166,14 +2173,14 @@ xlog_recover_buffer_pass2(
*/
if (XFS_DINODE_MAGIC ==
be16_to_cpu(*((__be16 *)xfs_buf_offset(bp, 0))) &&
- (XFS_BUF_COUNT(bp) != MAX(log->l_mp->m_sb.sb_blocksize,
+ (BBTOB(bp->b_io_length) != MAX(log->l_mp->m_sb.sb_blocksize,
(__uint32_t)XFS_INODE_CLUSTER_SIZE(log->l_mp)))) {
xfs_buf_stale(bp);
error = xfs_bwrite(bp);
} else {
ASSERT(bp->b_target->bt_mount == mp);
bp->b_iodone = xlog_recover_iodone;
- xfs_buf_delwri_queue(bp);
+ xfs_buf_delwri_queue(bp, buffer_list);
}
xfs_buf_relse(bp);
@@ -2183,6 +2190,7 @@ xlog_recover_buffer_pass2(
STATIC int
xlog_recover_inode_pass2(
xlog_t *log,
+ struct list_head *buffer_list,
xlog_recover_item_t *item)
{
xfs_inode_log_format_t *in_f;
@@ -2220,8 +2228,7 @@ xlog_recover_inode_pass2(
}
trace_xfs_log_recover_inode_recover(log, in_f);
- bp = xfs_buf_read(mp->m_ddev_targp, in_f->ilf_blkno, in_f->ilf_len,
- XBF_LOCK);
+ bp = xfs_buf_read(mp->m_ddev_targp, in_f->ilf_blkno, in_f->ilf_len, 0);
if (!bp) {
error = ENOMEM;
goto error;
@@ -2436,7 +2443,7 @@ xlog_recover_inode_pass2(
write_inode_buffer:
ASSERT(bp->b_target->bt_mount == mp);
bp->b_iodone = xlog_recover_iodone;
- xfs_buf_delwri_queue(bp);
+ xfs_buf_delwri_queue(bp, buffer_list);
xfs_buf_relse(bp);
error:
if (need_free)
@@ -2477,6 +2484,7 @@ xlog_recover_quotaoff_pass1(
STATIC int
xlog_recover_dquot_pass2(
xlog_t *log,
+ struct list_head *buffer_list,
xlog_recover_item_t *item)
{
xfs_mount_t *mp = log->l_mp;
@@ -2530,14 +2538,11 @@ xlog_recover_dquot_pass2(
return XFS_ERROR(EIO);
ASSERT(dq_f->qlf_len == 1);
- error = xfs_read_buf(mp, mp->m_ddev_targp,
- dq_f->qlf_blkno,
- XFS_FSB_TO_BB(mp, dq_f->qlf_len),
- 0, &bp);
- if (error) {
- xfs_buf_ioerror_alert(bp, "xlog_recover_do..(read#3)");
+ error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, dq_f->qlf_blkno,
+ XFS_FSB_TO_BB(mp, dq_f->qlf_len), 0, &bp);
+ if (error)
return error;
- }
+
ASSERT(bp);
ddq = (xfs_disk_dquot_t *)xfs_buf_offset(bp, dq_f->qlf_boffset);
@@ -2558,7 +2563,7 @@ xlog_recover_dquot_pass2(
ASSERT(dq_f->qlf_size == 2);
ASSERT(bp->b_target->bt_mount == mp);
bp->b_iodone = xlog_recover_iodone;
- xfs_buf_delwri_queue(bp);
+ xfs_buf_delwri_queue(bp, buffer_list);
xfs_buf_relse(bp);
return (0);
@@ -2642,7 +2647,8 @@ xlog_recover_efd_pass2(
* xfs_trans_ail_delete() drops the
* AIL lock.
*/
- xfs_trans_ail_delete(ailp, lip);
+ xfs_trans_ail_delete(ailp, lip,
+ SHUTDOWN_CORRUPT_INCORE);
xfs_efi_item_free(efip);
spin_lock(&ailp->xa_lock);
break;
@@ -2712,21 +2718,22 @@ STATIC int
xlog_recover_commit_pass2(
struct log *log,
struct xlog_recover *trans,
+ struct list_head *buffer_list,
xlog_recover_item_t *item)
{
trace_xfs_log_recover_item_recover(log, trans, item, XLOG_RECOVER_PASS2);
switch (ITEM_TYPE(item)) {
case XFS_LI_BUF:
- return xlog_recover_buffer_pass2(log, item);
+ return xlog_recover_buffer_pass2(log, buffer_list, item);
case XFS_LI_INODE:
- return xlog_recover_inode_pass2(log, item);
+ return xlog_recover_inode_pass2(log, buffer_list, item);
case XFS_LI_EFI:
return xlog_recover_efi_pass2(log, item, trans->r_lsn);
case XFS_LI_EFD:
return xlog_recover_efd_pass2(log, item);
case XFS_LI_DQUOT:
- return xlog_recover_dquot_pass2(log, item);
+ return xlog_recover_dquot_pass2(log, buffer_list, item);
case XFS_LI_QUOTAOFF:
/* nothing to do in pass2 */
return 0;
@@ -2750,8 +2757,9 @@ xlog_recover_commit_trans(
struct xlog_recover *trans,
int pass)
{
- int error = 0;
+ int error = 0, error2;
xlog_recover_item_t *item;
+ LIST_HEAD (buffer_list);
hlist_del(&trans->r_list);
@@ -2760,16 +2768,27 @@ xlog_recover_commit_trans(
return error;
list_for_each_entry(item, &trans->r_itemq, ri_list) {
- if (pass == XLOG_RECOVER_PASS1)
+ switch (pass) {
+ case XLOG_RECOVER_PASS1:
error = xlog_recover_commit_pass1(log, trans, item);
- else
- error = xlog_recover_commit_pass2(log, trans, item);
+ break;
+ case XLOG_RECOVER_PASS2:
+ error = xlog_recover_commit_pass2(log, trans,
+ &buffer_list, item);
+ break;
+ default:
+ ASSERT(0);
+ }
+
if (error)
- return error;
+ goto out;
}
xlog_recover_free_trans(trans);
- return 0;
+
+out:
+ error2 = xfs_buf_delwri_submit(&buffer_list);
+ return error ? error : error2;
}
STATIC int
@@ -3079,7 +3098,7 @@ xlog_recover_process_one_iunlink(
/*
* Get the on disk inode to find the next inode in the bucket.
*/
- error = xfs_itobp(mp, NULL, ip, &dip, &ibp, XBF_LOCK);
+ error = xfs_itobp(mp, NULL, ip, &dip, &ibp, 0);
if (error)
goto fail_iput;
@@ -3639,11 +3658,8 @@ xlog_do_recover(
* First replay the images in the log.
*/
error = xlog_do_log_recovery(log, head_blk, tail_blk);
- if (error) {
+ if (error)
return error;
- }
-
- xfs_flush_buftarg(log->l_mp->m_ddev_targp, 1);
/*
* If IO errors happened during recovery, bail out.
@@ -3670,7 +3686,6 @@ xlog_do_recover(
bp = xfs_getsb(log->l_mp, 0);
XFS_BUF_UNDONE(bp);
ASSERT(!(XFS_BUF_ISWRITE(bp)));
- ASSERT(!(XFS_BUF_ISDELAYWRITE(bp)));
XFS_BUF_READ(bp);
XFS_BUF_UNASYNC(bp);
xfsbdstrat(log->l_mp, bp);
diff --git a/fs/xfs/xfs_message.c b/fs/xfs/xfs_message.c
index bd672def95a..331cd9f83a7 100644
--- a/fs/xfs/xfs_message.c
+++ b/fs/xfs/xfs_message.c
@@ -19,7 +19,6 @@
#include "xfs_fs.h"
#include "xfs_types.h"
#include "xfs_log.h"
-#include "xfs_inum.h"
#include "xfs_trans.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index 1ffead4b229..536021fb3d4 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -22,6 +22,7 @@
#include "xfs_log.h"
#include "xfs_inum.h"
#include "xfs_trans.h"
+#include "xfs_trans_priv.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
#include "xfs_dir2.h"
@@ -37,7 +38,6 @@
#include "xfs_rtalloc.h"
#include "xfs_bmap.h"
#include "xfs_error.h"
-#include "xfs_rw.h"
#include "xfs_quota.h"
#include "xfs_fsops.h"
#include "xfs_utils.h"
@@ -683,8 +683,8 @@ xfs_readsb(xfs_mount_t *mp, int flags)
sector_size = xfs_getsize_buftarg(mp->m_ddev_targp);
reread:
- bp = xfs_buf_read_uncached(mp, mp->m_ddev_targp,
- XFS_SB_DADDR, sector_size, 0);
+ bp = xfs_buf_read_uncached(mp->m_ddev_targp, XFS_SB_DADDR,
+ BTOBB(sector_size), 0);
if (!bp) {
if (loud)
xfs_warn(mp, "SB buffer read failed");
@@ -1032,9 +1032,9 @@ xfs_check_sizes(xfs_mount_t *mp)
xfs_warn(mp, "filesystem size mismatch detected");
return XFS_ERROR(EFBIG);
}
- bp = xfs_buf_read_uncached(mp, mp->m_ddev_targp,
+ bp = xfs_buf_read_uncached(mp->m_ddev_targp,
d - XFS_FSS_TO_BB(mp, 1),
- BBTOB(XFS_FSS_TO_BB(mp, 1)), 0);
+ XFS_FSS_TO_BB(mp, 1), 0);
if (!bp) {
xfs_warn(mp, "last sector read failed");
return EIO;
@@ -1047,9 +1047,9 @@ xfs_check_sizes(xfs_mount_t *mp)
xfs_warn(mp, "log size mismatch detected");
return XFS_ERROR(EFBIG);
}
- bp = xfs_buf_read_uncached(mp, mp->m_logdev_targp,
+ bp = xfs_buf_read_uncached(mp->m_logdev_targp,
d - XFS_FSB_TO_BB(mp, 1),
- XFS_FSB_TO_B(mp, 1), 0);
+ XFS_FSB_TO_BB(mp, 1), 0);
if (!bp) {
xfs_warn(mp, "log device read failed");
return EIO;
@@ -1288,7 +1288,7 @@ xfs_mountfs(
XFS_FSB_TO_BB(mp, sbp->sb_logblocks));
if (error) {
xfs_warn(mp, "log mount failed");
- goto out_free_perag;
+ goto out_fail_wait;
}
/*
@@ -1315,7 +1315,7 @@ xfs_mountfs(
!mp->m_sb.sb_inprogress) {
error = xfs_initialize_perag_data(mp, sbp->sb_agcount);
if (error)
- goto out_free_perag;
+ goto out_fail_wait;
}
/*
@@ -1439,6 +1439,10 @@ xfs_mountfs(
IRELE(rip);
out_log_dealloc:
xfs_log_unmount(mp);
+ out_fail_wait:
+ if (mp->m_logdev_targp && mp->m_logdev_targp != mp->m_ddev_targp)
+ xfs_wait_buftarg(mp->m_logdev_targp);
+ xfs_wait_buftarg(mp->m_ddev_targp);
out_free_perag:
xfs_free_perag(mp);
out_remove_uuid:
@@ -1475,15 +1479,15 @@ xfs_unmountfs(
xfs_log_force(mp, XFS_LOG_SYNC);
/*
- * Do a delwri reclaim pass first so that as many dirty inodes are
- * queued up for IO as possible. Then flush the buffers before making
- * a synchronous path to catch all the remaining inodes are reclaimed.
- * This makes the reclaim process as quick as possible by avoiding
- * synchronous writeout and blocking on inodes already in the delwri
- * state as much as possible.
+ * Flush all pending changes from the AIL.
+ */
+ xfs_ail_push_all_sync(mp->m_ail);
+
+ /*
+ * And reclaim all inodes. At this point there should be no dirty
+ * inode, and none should be pinned or locked, but use synchronous
+ * reclaim just to be sure.
*/
- xfs_reclaim_inodes(mp, 0);
- xfs_flush_buftarg(mp->m_ddev_targp, 1);
xfs_reclaim_inodes(mp, SYNC_WAIT);
xfs_qm_unmount(mp);
@@ -1519,15 +1523,12 @@ xfs_unmountfs(
if (error)
xfs_warn(mp, "Unable to update superblock counters. "
"Freespace may not be correct on next mount.");
- xfs_unmountfs_writesb(mp);
/*
- * Make sure all buffers have been flushed and completed before
- * unmounting the log.
+ * At this point we might have modified the superblock again and thus
+ * added an item to the AIL, thus flush it again.
*/
- error = xfs_flush_buftarg(mp->m_ddev_targp, 1);
- if (error)
- xfs_warn(mp, "%d busy buffers during unmount.", error);
+ xfs_ail_push_all_sync(mp->m_ail);
xfs_wait_buftarg(mp->m_ddev_targp);
xfs_log_unmount_write(mp);
@@ -1588,36 +1589,6 @@ xfs_log_sbcount(xfs_mount_t *mp)
return error;
}
-int
-xfs_unmountfs_writesb(xfs_mount_t *mp)
-{
- xfs_buf_t *sbp;
- int error = 0;
-
- /*
- * skip superblock write if fs is read-only, or
- * if we are doing a forced umount.
- */
- if (!((mp->m_flags & XFS_MOUNT_RDONLY) ||
- XFS_FORCED_SHUTDOWN(mp))) {
-
- sbp = xfs_getsb(mp, 0);
-
- XFS_BUF_UNDONE(sbp);
- XFS_BUF_UNREAD(sbp);
- xfs_buf_delwri_dequeue(sbp);
- XFS_BUF_WRITE(sbp);
- XFS_BUF_UNASYNC(sbp);
- ASSERT(sbp->b_target == mp->m_ddev_targp);
- xfsbdstrat(mp, sbp);
- error = xfs_buf_iowait(sbp);
- if (error)
- xfs_buf_ioerror_alert(sbp, __func__);
- xfs_buf_relse(sbp);
- }
- return error;
-}
-
/*
* xfs_mod_sb() can be used to copy arbitrary changes to the
* in-core superblock into the superblock buffer to be logged.
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index 9eba7388782..8b89c5ac72d 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -214,6 +214,7 @@ typedef struct xfs_mount {
struct workqueue_struct *m_data_workqueue;
struct workqueue_struct *m_unwritten_workqueue;
+ struct workqueue_struct *m_cil_workqueue;
} xfs_mount_t;
/*
@@ -378,7 +379,6 @@ extern __uint64_t xfs_default_resblks(xfs_mount_t *mp);
extern int xfs_mountfs(xfs_mount_t *mp);
extern void xfs_unmountfs(xfs_mount_t *);
-extern int xfs_unmountfs_writesb(xfs_mount_t *);
extern int xfs_mod_incore_sb(xfs_mount_t *, xfs_sb_field_t, int64_t, int);
extern int xfs_mod_incore_sb_batch(xfs_mount_t *, xfs_mod_sb_t *,
uint, int);
diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c
index 55c6afedc87..249db198776 100644
--- a/fs/xfs/xfs_qm.c
+++ b/fs/xfs/xfs_qm.c
@@ -19,7 +19,6 @@
#include "xfs_fs.h"
#include "xfs_bit.h"
#include "xfs_log.h"
-#include "xfs_inum.h"
#include "xfs_trans.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
@@ -65,7 +64,8 @@ STATIC int
xfs_qm_dquot_walk(
struct xfs_mount *mp,
int type,
- int (*execute)(struct xfs_dquot *dqp))
+ int (*execute)(struct xfs_dquot *dqp, void *data),
+ void *data)
{
struct xfs_quotainfo *qi = mp->m_quotainfo;
struct radix_tree_root *tree = XFS_DQUOT_TREE(qi, type);
@@ -97,7 +97,7 @@ restart:
next_index = be32_to_cpu(dqp->q_core.d_id) + 1;
- error = execute(batch[i]);
+ error = execute(batch[i], data);
if (error == EAGAIN) {
skipped++;
continue;
@@ -129,7 +129,8 @@ restart:
*/
STATIC int
xfs_qm_dqpurge(
- struct xfs_dquot *dqp)
+ struct xfs_dquot *dqp,
+ void *data)
{
struct xfs_mount *mp = dqp->q_mount;
struct xfs_quotainfo *qi = mp->m_quotainfo;
@@ -153,21 +154,7 @@ xfs_qm_dqpurge(
dqp->dq_flags |= XFS_DQ_FREEING;
- /*
- * If we're turning off quotas, we have to make sure that, for
- * example, we don't delete quota disk blocks while dquots are
- * in the process of getting written to those disk blocks.
- * This dquot might well be on AIL, and we can't leave it there
- * if we're turning off quotas. Basically, we need this flush
- * lock, and are willing to block on it.
- */
- if (!xfs_dqflock_nowait(dqp)) {
- /*
- * Block on the flush lock after nudging dquot buffer,
- * if it is incore.
- */
- xfs_dqflock_pushbuf_wait(dqp);
- }
+ xfs_dqflock(dqp);
/*
* If we are turning this type of quotas off, we don't care
@@ -175,16 +162,21 @@ xfs_qm_dqpurge(
* we're unmounting, we do care, so we flush it and wait.
*/
if (XFS_DQ_IS_DIRTY(dqp)) {
- int error;
+ struct xfs_buf *bp = NULL;
+ int error;
/*
* We don't care about getting disk errors here. We need
* to purge this dquot anyway, so we go ahead regardless.
*/
- error = xfs_qm_dqflush(dqp, SYNC_WAIT);
- if (error)
+ error = xfs_qm_dqflush(dqp, &bp);
+ if (error) {
xfs_warn(mp, "%s: dquot %p flush failed",
__func__, dqp);
+ } else {
+ error = xfs_bwrite(bp);
+ xfs_buf_relse(bp);
+ }
xfs_dqflock(dqp);
}
@@ -226,11 +218,11 @@ xfs_qm_dqpurge_all(
uint flags)
{
if (flags & XFS_QMOPT_UQUOTA)
- xfs_qm_dquot_walk(mp, XFS_DQ_USER, xfs_qm_dqpurge);
+ xfs_qm_dquot_walk(mp, XFS_DQ_USER, xfs_qm_dqpurge, NULL);
if (flags & XFS_QMOPT_GQUOTA)
- xfs_qm_dquot_walk(mp, XFS_DQ_GROUP, xfs_qm_dqpurge);
+ xfs_qm_dquot_walk(mp, XFS_DQ_GROUP, xfs_qm_dqpurge, NULL);
if (flags & XFS_QMOPT_PQUOTA)
- xfs_qm_dquot_walk(mp, XFS_DQ_PROJ, xfs_qm_dqpurge);
+ xfs_qm_dquot_walk(mp, XFS_DQ_PROJ, xfs_qm_dqpurge, NULL);
}
/*
@@ -483,6 +475,23 @@ done:
xfs_dqunlock(udq);
}
+static bool
+xfs_qm_need_dqattach(
+ struct xfs_inode *ip)
+{
+ struct xfs_mount *mp = ip->i_mount;
+
+ if (!XFS_IS_QUOTA_RUNNING(mp))
+ return false;
+ if (!XFS_IS_QUOTA_ON(mp))
+ return false;
+ if (!XFS_NOT_DQATTACHED(mp, ip))
+ return false;
+ if (ip->i_ino == mp->m_sb.sb_uquotino ||
+ ip->i_ino == mp->m_sb.sb_gquotino)
+ return false;
+ return true;
+}
/*
* Given a locked inode, attach dquot(s) to it, taking U/G/P-QUOTAON
@@ -500,11 +509,7 @@ xfs_qm_dqattach_locked(
uint nquotas = 0;
int error = 0;
- if (!XFS_IS_QUOTA_RUNNING(mp) ||
- !XFS_IS_QUOTA_ON(mp) ||
- !XFS_NOT_DQATTACHED(mp, ip) ||
- ip->i_ino == mp->m_sb.sb_uquotino ||
- ip->i_ino == mp->m_sb.sb_gquotino)
+ if (!xfs_qm_need_dqattach(ip))
return 0;
ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
@@ -575,6 +580,9 @@ xfs_qm_dqattach(
{
int error;
+ if (!xfs_qm_need_dqattach(ip))
+ return 0;
+
xfs_ilock(ip, XFS_ILOCK_EXCL);
error = xfs_qm_dqattach_locked(ip, flags);
xfs_iunlock(ip, XFS_ILOCK_EXCL);
@@ -855,15 +863,16 @@ xfs_qm_reset_dqcounts(
STATIC int
xfs_qm_dqiter_bufs(
- xfs_mount_t *mp,
- xfs_dqid_t firstid,
- xfs_fsblock_t bno,
- xfs_filblks_t blkcnt,
- uint flags)
+ struct xfs_mount *mp,
+ xfs_dqid_t firstid,
+ xfs_fsblock_t bno,
+ xfs_filblks_t blkcnt,
+ uint flags,
+ struct list_head *buffer_list)
{
- xfs_buf_t *bp;
- int error;
- int type;
+ struct xfs_buf *bp;
+ int error;
+ int type;
ASSERT(blkcnt > 0);
type = flags & XFS_QMOPT_UQUOTA ? XFS_DQ_USER :
@@ -887,7 +896,7 @@ xfs_qm_dqiter_bufs(
break;
xfs_qm_reset_dqcounts(mp, bp, firstid, type);
- xfs_buf_delwri_queue(bp);
+ xfs_buf_delwri_queue(bp, buffer_list);
xfs_buf_relse(bp);
/*
* goto the next block.
@@ -895,6 +904,7 @@ xfs_qm_dqiter_bufs(
bno++;
firstid += mp->m_quotainfo->qi_dqperchunk;
}
+
return error;
}
@@ -904,11 +914,12 @@ xfs_qm_dqiter_bufs(
*/
STATIC int
xfs_qm_dqiterate(
- xfs_mount_t *mp,
- xfs_inode_t *qip,
- uint flags)
+ struct xfs_mount *mp,
+ struct xfs_inode *qip,
+ uint flags,
+ struct list_head *buffer_list)
{
- xfs_bmbt_irec_t *map;
+ struct xfs_bmbt_irec *map;
int i, nmaps; /* number of map entries */
int error; /* return value */
xfs_fileoff_t lblkno;
@@ -975,21 +986,17 @@ xfs_qm_dqiterate(
* Iterate thru all the blks in the extent and
* reset the counters of all the dquots inside them.
*/
- if ((error = xfs_qm_dqiter_bufs(mp,
- firstid,
- map[i].br_startblock,
- map[i].br_blockcount,
- flags))) {
- break;
- }
+ error = xfs_qm_dqiter_bufs(mp, firstid,
+ map[i].br_startblock,
+ map[i].br_blockcount,
+ flags, buffer_list);
+ if (error)
+ goto out;
}
-
- if (error)
- break;
} while (nmaps > 0);
+out:
kmem_free(map);
-
return error;
}
@@ -1182,8 +1189,11 @@ error0:
STATIC int
xfs_qm_flush_one(
- struct xfs_dquot *dqp)
+ struct xfs_dquot *dqp,
+ void *data)
{
+ struct list_head *buffer_list = data;
+ struct xfs_buf *bp = NULL;
int error = 0;
xfs_dqlock(dqp);
@@ -1192,11 +1202,13 @@ xfs_qm_flush_one(
if (!XFS_DQ_IS_DIRTY(dqp))
goto out_unlock;
- if (!xfs_dqflock_nowait(dqp))
- xfs_dqflock_pushbuf_wait(dqp);
-
- error = xfs_qm_dqflush(dqp, 0);
+ xfs_dqflock(dqp);
+ error = xfs_qm_dqflush(dqp, &bp);
+ if (error)
+ goto out_unlock;
+ xfs_buf_delwri_queue(bp, buffer_list);
+ xfs_buf_relse(bp);
out_unlock:
xfs_dqunlock(dqp);
return error;
@@ -1215,6 +1227,7 @@ xfs_qm_quotacheck(
size_t structsz;
xfs_inode_t *uip, *gip;
uint flags;
+ LIST_HEAD (buffer_list);
count = INT_MAX;
structsz = 1;
@@ -1233,7 +1246,8 @@ xfs_qm_quotacheck(
*/
uip = mp->m_quotainfo->qi_uquotaip;
if (uip) {
- error = xfs_qm_dqiterate(mp, uip, XFS_QMOPT_UQUOTA);
+ error = xfs_qm_dqiterate(mp, uip, XFS_QMOPT_UQUOTA,
+ &buffer_list);
if (error)
goto error_return;
flags |= XFS_UQUOTA_CHKD;
@@ -1242,7 +1256,8 @@ xfs_qm_quotacheck(
gip = mp->m_quotainfo->qi_gquotaip;
if (gip) {
error = xfs_qm_dqiterate(mp, gip, XFS_IS_GQUOTA_ON(mp) ?
- XFS_QMOPT_GQUOTA : XFS_QMOPT_PQUOTA);
+ XFS_QMOPT_GQUOTA : XFS_QMOPT_PQUOTA,
+ &buffer_list);
if (error)
goto error_return;
flags |= XFS_OQUOTA_CHKD;
@@ -1265,19 +1280,27 @@ xfs_qm_quotacheck(
* We've made all the changes that we need to make incore. Flush them
* down to disk buffers if everything was updated successfully.
*/
- if (XFS_IS_UQUOTA_ON(mp))
- error = xfs_qm_dquot_walk(mp, XFS_DQ_USER, xfs_qm_flush_one);
+ if (XFS_IS_UQUOTA_ON(mp)) {
+ error = xfs_qm_dquot_walk(mp, XFS_DQ_USER, xfs_qm_flush_one,
+ &buffer_list);
+ }
if (XFS_IS_GQUOTA_ON(mp)) {
- error2 = xfs_qm_dquot_walk(mp, XFS_DQ_GROUP, xfs_qm_flush_one);
+ error2 = xfs_qm_dquot_walk(mp, XFS_DQ_GROUP, xfs_qm_flush_one,
+ &buffer_list);
if (!error)
error = error2;
}
if (XFS_IS_PQUOTA_ON(mp)) {
- error2 = xfs_qm_dquot_walk(mp, XFS_DQ_PROJ, xfs_qm_flush_one);
+ error2 = xfs_qm_dquot_walk(mp, XFS_DQ_PROJ, xfs_qm_flush_one,
+ &buffer_list);
if (!error)
error = error2;
}
+ error2 = xfs_buf_delwri_submit(&buffer_list);
+ if (!error)
+ error = error2;
+
/*
* We can get this error if we couldn't do a dquot allocation inside
* xfs_qm_dqusage_adjust (via bulkstat). We don't care about the
@@ -1291,15 +1314,6 @@ xfs_qm_quotacheck(
}
/*
- * We didn't log anything, because if we crashed, we'll have to
- * start the quotacheck from scratch anyway. However, we must make
- * sure that our dquot changes are secure before we put the
- * quotacheck'd stamp on the superblock. So, here we do a synchronous
- * flush.
- */
- xfs_flush_buftarg(mp->m_ddev_targp, 1);
-
- /*
* If one type of quotas is off, then it will lose its
* quotachecked status, since we won't be doing accounting for
* that type anymore.
@@ -1308,6 +1322,13 @@ xfs_qm_quotacheck(
mp->m_qflags |= flags;
error_return:
+ while (!list_empty(&buffer_list)) {
+ struct xfs_buf *bp =
+ list_first_entry(&buffer_list, struct xfs_buf, b_list);
+ list_del_init(&bp->b_list);
+ xfs_buf_relse(bp);
+ }
+
if (error) {
xfs_warn(mp,
"Quotacheck: Unsuccessful (Error %d): Disabling quotas.",
@@ -1424,6 +1445,7 @@ xfs_qm_dqfree_one(
STATIC void
xfs_qm_dqreclaim_one(
struct xfs_dquot *dqp,
+ struct list_head *buffer_list,
struct list_head *dispose_list)
{
struct xfs_mount *mp = dqp->q_mount;
@@ -1456,25 +1478,20 @@ xfs_qm_dqreclaim_one(
if (!xfs_dqflock_nowait(dqp))
goto out_busy;
- /*
- * We have the flush lock so we know that this is not in the
- * process of being flushed. So, if this is dirty, flush it
- * DELWRI so that we don't get a freelist infested with
- * dirty dquots.
- */
if (XFS_DQ_IS_DIRTY(dqp)) {
+ struct xfs_buf *bp = NULL;
+
trace_xfs_dqreclaim_dirty(dqp);
- /*
- * We flush it delayed write, so don't bother releasing the
- * freelist lock.
- */
- error = xfs_qm_dqflush(dqp, 0);
+ error = xfs_qm_dqflush(dqp, &bp);
if (error) {
xfs_warn(mp, "%s: dquot %p flush failed",
__func__, dqp);
+ goto out_busy;
}
+ xfs_buf_delwri_queue(bp, buffer_list);
+ xfs_buf_relse(bp);
/*
* Give the dquot another try on the freelist, as the
* flushing will take some time.
@@ -1518,8 +1535,10 @@ xfs_qm_shake(
struct xfs_quotainfo *qi =
container_of(shrink, struct xfs_quotainfo, qi_shrinker);
int nr_to_scan = sc->nr_to_scan;
+ LIST_HEAD (buffer_list);
LIST_HEAD (dispose_list);
struct xfs_dquot *dqp;
+ int error;
if ((sc->gfp_mask & (__GFP_FS|__GFP_WAIT)) != (__GFP_FS|__GFP_WAIT))
return 0;
@@ -1532,15 +1551,20 @@ xfs_qm_shake(
break;
dqp = list_first_entry(&qi->qi_lru_list, struct xfs_dquot,
q_lru);
- xfs_qm_dqreclaim_one(dqp, &dispose_list);
+ xfs_qm_dqreclaim_one(dqp, &buffer_list, &dispose_list);
}
mutex_unlock(&qi->qi_lru_lock);
+ error = xfs_buf_delwri_submit(&buffer_list);
+ if (error)
+ xfs_warn(NULL, "%s: dquot reclaim failed", __func__);
+
while (!list_empty(&dispose_list)) {
dqp = list_first_entry(&dispose_list, struct xfs_dquot, q_lru);
list_del_init(&dqp->q_lru);
xfs_qm_dqfree_one(dqp);
}
+
out:
return (qi->qi_lru_count / 100) * sysctl_vfs_cache_pressure;
}
diff --git a/fs/xfs/xfs_qm_bhv.c b/fs/xfs/xfs_qm_bhv.c
index e6986b5d80d..6b39115bf14 100644
--- a/fs/xfs/xfs_qm_bhv.c
+++ b/fs/xfs/xfs_qm_bhv.c
@@ -17,9 +17,7 @@
*/
#include "xfs.h"
#include "xfs_fs.h"
-#include "xfs_bit.h"
#include "xfs_log.h"
-#include "xfs_inum.h"
#include "xfs_trans.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
diff --git a/fs/xfs/xfs_qm_syscalls.c b/fs/xfs/xfs_qm_syscalls.c
index c4f396e437a..858a3b18611 100644
--- a/fs/xfs/xfs_qm_syscalls.c
+++ b/fs/xfs/xfs_qm_syscalls.c
@@ -22,7 +22,6 @@
#include "xfs_fs.h"
#include "xfs_bit.h"
#include "xfs_log.h"
-#include "xfs_inum.h"
#include "xfs_trans.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
diff --git a/fs/xfs/xfs_quotaops.c b/fs/xfs/xfs_quotaops.c
index 7e76f537abb..fed504fc299 100644
--- a/fs/xfs/xfs_quotaops.c
+++ b/fs/xfs/xfs_quotaops.c
@@ -17,7 +17,6 @@
*/
#include "xfs.h"
#include "xfs_sb.h"
-#include "xfs_inum.h"
#include "xfs_log.h"
#include "xfs_ag.h"
#include "xfs_mount.h"
diff --git a/fs/xfs/xfs_rename.c b/fs/xfs/xfs_rename.c
index e44ef7ee8ce..30ff5f401d2 100644
--- a/fs/xfs/xfs_rename.c
+++ b/fs/xfs/xfs_rename.c
@@ -19,7 +19,6 @@
#include "xfs_fs.h"
#include "xfs_types.h"
#include "xfs_log.h"
-#include "xfs_inum.h"
#include "xfs_trans.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c
index ca4f31534a0..92d4331cd4f 100644
--- a/fs/xfs/xfs_rtalloc.c
+++ b/fs/xfs/xfs_rtalloc.c
@@ -20,7 +20,6 @@
#include "xfs_types.h"
#include "xfs_bit.h"
#include "xfs_log.h"
-#include "xfs_inum.h"
#include "xfs_trans.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
@@ -34,7 +33,6 @@
#include "xfs_rtalloc.h"
#include "xfs_fsops.h"
#include "xfs_error.h"
-#include "xfs_rw.h"
#include "xfs_inode_item.h"
#include "xfs_trans_space.h"
#include "xfs_utils.h"
@@ -1872,9 +1870,9 @@ xfs_growfs_rt(
/*
* Read in the last block of the device, make sure it exists.
*/
- bp = xfs_buf_read_uncached(mp, mp->m_rtdev_targp,
+ bp = xfs_buf_read_uncached(mp->m_rtdev_targp,
XFS_FSB_TO_BB(mp, nrblocks - 1),
- XFS_FSB_TO_B(mp, 1), 0);
+ XFS_FSB_TO_BB(mp, 1), 0);
if (!bp)
return EIO;
xfs_buf_relse(bp);
@@ -2219,9 +2217,9 @@ xfs_rtmount_init(
(unsigned long long) mp->m_sb.sb_rblocks);
return XFS_ERROR(EFBIG);
}
- bp = xfs_buf_read_uncached(mp, mp->m_rtdev_targp,
+ bp = xfs_buf_read_uncached(mp->m_rtdev_targp,
d - XFS_FSB_TO_BB(mp, 1),
- XFS_FSB_TO_B(mp, 1), 0);
+ XFS_FSB_TO_BB(mp, 1), 0);
if (!bp) {
xfs_warn(mp, "realtime device size check failed");
return EIO;
diff --git a/fs/xfs/xfs_rw.c b/fs/xfs/xfs_rw.c
deleted file mode 100644
index 597d044a09a..00000000000
--- a/fs/xfs/xfs_rw.c
+++ /dev/null
@@ -1,156 +0,0 @@
-/*
- * Copyright (c) 2000-2006 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- */
-#include "xfs.h"
-#include "xfs_fs.h"
-#include "xfs_types.h"
-#include "xfs_bit.h"
-#include "xfs_log.h"
-#include "xfs_inum.h"
-#include "xfs_trans.h"
-#include "xfs_sb.h"
-#include "xfs_ag.h"
-#include "xfs_mount.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_dinode.h"
-#include "xfs_inode.h"
-#include "xfs_error.h"
-#include "xfs_rw.h"
-
-/*
- * Force a shutdown of the filesystem instantly while keeping
- * the filesystem consistent. We don't do an unmount here; just shutdown
- * the shop, make sure that absolutely nothing persistent happens to
- * this filesystem after this point.
- */
-void
-xfs_do_force_shutdown(
- xfs_mount_t *mp,
- int flags,
- char *fname,
- int lnnum)
-{
- int logerror;
-
- logerror = flags & SHUTDOWN_LOG_IO_ERROR;
-
- if (!(flags & SHUTDOWN_FORCE_UMOUNT)) {
- xfs_notice(mp,
- "%s(0x%x) called from line %d of file %s. Return address = 0x%p",
- __func__, flags, lnnum, fname, __return_address);
- }
- /*
- * No need to duplicate efforts.
- */
- if (XFS_FORCED_SHUTDOWN(mp) && !logerror)
- return;
-
- /*
- * This flags XFS_MOUNT_FS_SHUTDOWN, makes sure that we don't
- * queue up anybody new on the log reservations, and wakes up
- * everybody who's sleeping on log reservations to tell them
- * the bad news.
- */
- if (xfs_log_force_umount(mp, logerror))
- return;
-
- if (flags & SHUTDOWN_CORRUPT_INCORE) {
- xfs_alert_tag(mp, XFS_PTAG_SHUTDOWN_CORRUPT,
- "Corruption of in-memory data detected. Shutting down filesystem");
- if (XFS_ERRLEVEL_HIGH <= xfs_error_level)
- xfs_stack_trace();
- } else if (!(flags & SHUTDOWN_FORCE_UMOUNT)) {
- if (logerror) {
- xfs_alert_tag(mp, XFS_PTAG_SHUTDOWN_LOGERROR,
- "Log I/O Error Detected. Shutting down filesystem");
- } else if (flags & SHUTDOWN_DEVICE_REQ) {
- xfs_alert_tag(mp, XFS_PTAG_SHUTDOWN_IOERROR,
- "All device paths lost. Shutting down filesystem");
- } else if (!(flags & SHUTDOWN_REMOTE_REQ)) {
- xfs_alert_tag(mp, XFS_PTAG_SHUTDOWN_IOERROR,
- "I/O Error Detected. Shutting down filesystem");
- }
- }
- if (!(flags & SHUTDOWN_FORCE_UMOUNT)) {
- xfs_alert(mp,
- "Please umount the filesystem and rectify the problem(s)");
- }
-}
-
-/*
- * This isn't an absolute requirement, but it is
- * just a good idea to call xfs_read_buf instead of
- * directly doing a read_buf call. For one, we shouldn't
- * be doing this disk read if we are in SHUTDOWN state anyway,
- * so this stops that from happening. Secondly, this does all
- * the error checking stuff and the brelse if appropriate for
- * the caller, so the code can be a little leaner.
- */
-
-int
-xfs_read_buf(
- struct xfs_mount *mp,
- xfs_buftarg_t *target,
- xfs_daddr_t blkno,
- int len,
- uint flags,
- xfs_buf_t **bpp)
-{
- xfs_buf_t *bp;
- int error;
-
- if (!flags)
- flags = XBF_LOCK | XBF_MAPPED;
-
- bp = xfs_buf_read(target, blkno, len, flags);
- if (!bp)
- return XFS_ERROR(EIO);
- error = bp->b_error;
- if (!error && !XFS_FORCED_SHUTDOWN(mp)) {
- *bpp = bp;
- } else {
- *bpp = NULL;
- if (error) {
- xfs_buf_ioerror_alert(bp, __func__);
- } else {
- error = XFS_ERROR(EIO);
- }
- if (bp) {
- XFS_BUF_UNDONE(bp);
- xfs_buf_stale(bp);
- /*
- * brelse clears B_ERROR and b_error
- */
- xfs_buf_relse(bp);
- }
- }
- return (error);
-}
-
-/*
- * helper function to extract extent size hint from inode
- */
-xfs_extlen_t
-xfs_get_extsz_hint(
- struct xfs_inode *ip)
-{
- if ((ip->i_d.di_flags & XFS_DIFLAG_EXTSIZE) && ip->i_d.di_extsize)
- return ip->i_d.di_extsize;
- if (XFS_IS_REALTIME_INODE(ip))
- return ip->i_mount->m_sb.sb_rextsize;
- return 0;
-}
diff --git a/fs/xfs/xfs_rw.h b/fs/xfs/xfs_rw.h
deleted file mode 100644
index bbdb9ad6a4b..00000000000
--- a/fs/xfs/xfs_rw.h
+++ /dev/null
@@ -1,47 +0,0 @@
-/*
- * Copyright (c) 2000-2006 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- */
-#ifndef __XFS_RW_H__
-#define __XFS_RW_H__
-
-struct xfs_buf;
-struct xfs_inode;
-struct xfs_mount;
-
-/*
- * Convert the given file system block to a disk block.
- * We have to treat it differently based on whether the
- * file is a real time file or not, because the bmap code
- * does.
- */
-static inline xfs_daddr_t
-xfs_fsb_to_db(struct xfs_inode *ip, xfs_fsblock_t fsb)
-{
- return (XFS_IS_REALTIME_INODE(ip) ? \
- (xfs_daddr_t)XFS_FSB_TO_BB((ip)->i_mount, (fsb)) : \
- XFS_FSB_TO_DADDR((ip)->i_mount, (fsb)));
-}
-
-/*
- * Prototypes for functions in xfs_rw.c.
- */
-extern int xfs_read_buf(struct xfs_mount *mp, xfs_buftarg_t *btp,
- xfs_daddr_t blkno, int len, uint flags,
- struct xfs_buf **bpp);
-extern xfs_extlen_t xfs_get_extsz_hint(struct xfs_inode *ip);
-
-#endif /* __XFS_RW_H__ */
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index dab9a5f6dfd..2fcfd5b0b04 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -17,7 +17,6 @@
*/
#include "xfs.h"
-#include "xfs_bit.h"
#include "xfs_log.h"
#include "xfs_inum.h"
#include "xfs_trans.h"
@@ -622,7 +621,7 @@ void
xfs_blkdev_issue_flush(
xfs_buftarg_t *buftarg)
{
- blkdev_issue_flush(buftarg->bt_bdev, GFP_KERNEL, NULL);
+ blkdev_issue_flush(buftarg->bt_bdev, GFP_NOFS, NULL);
}
STATIC void
@@ -773,8 +772,14 @@ xfs_init_mount_workqueues(
if (!mp->m_unwritten_workqueue)
goto out_destroy_data_iodone_queue;
+ mp->m_cil_workqueue = alloc_workqueue("xfs-cil/%s",
+ WQ_MEM_RECLAIM, 0, mp->m_fsname);
+ if (!mp->m_cil_workqueue)
+ goto out_destroy_unwritten;
return 0;
+out_destroy_unwritten:
+ destroy_workqueue(mp->m_unwritten_workqueue);
out_destroy_data_iodone_queue:
destroy_workqueue(mp->m_data_workqueue);
out:
@@ -785,6 +790,7 @@ STATIC void
xfs_destroy_mount_workqueues(
struct xfs_mount *mp)
{
+ destroy_workqueue(mp->m_cil_workqueue);
destroy_workqueue(mp->m_data_workqueue);
destroy_workqueue(mp->m_unwritten_workqueue);
}
@@ -981,18 +987,9 @@ xfs_fs_put_super(
{
struct xfs_mount *mp = XFS_M(sb);
- xfs_syncd_stop(mp);
-
- /*
- * Blow away any referenced inode in the filestreams cache.
- * This can and will cause log traffic as inodes go inactive
- * here.
- */
xfs_filestream_unmount(mp);
-
- xfs_flush_buftarg(mp->m_ddev_targp, 1);
-
xfs_unmountfs(mp);
+ xfs_syncd_stop(mp);
xfs_freesb(mp);
xfs_icsb_destroy_counters(mp);
xfs_destroy_mount_workqueues(mp);
@@ -1072,7 +1069,7 @@ xfs_fs_statfs(
spin_unlock(&mp->m_sb_lock);
- if ((ip->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) ||
+ if ((ip->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) &&
((mp->m_qflags & (XFS_PQUOTA_ACCT|XFS_OQUOTA_ENFD))) ==
(XFS_PQUOTA_ACCT|XFS_OQUOTA_ENFD))
xfs_qm_statvfs(ip, statp);
@@ -1362,31 +1359,32 @@ xfs_fs_fill_super(
sb->s_time_gran = 1;
set_posix_acl_flag(sb);
- error = xfs_mountfs(mp);
+ error = xfs_syncd_init(mp);
if (error)
goto out_filestream_unmount;
- error = xfs_syncd_init(mp);
+ error = xfs_mountfs(mp);
if (error)
- goto out_unmount;
+ goto out_syncd_stop;
root = igrab(VFS_I(mp->m_rootip));
if (!root) {
error = ENOENT;
- goto out_syncd_stop;
+ goto out_unmount;
}
if (is_bad_inode(root)) {
error = EINVAL;
- goto out_syncd_stop;
+ goto out_unmount;
}
sb->s_root = d_make_root(root);
if (!sb->s_root) {
error = ENOMEM;
- goto out_syncd_stop;
+ goto out_unmount;
}
return 0;
-
+ out_syncd_stop:
+ xfs_syncd_stop(mp);
out_filestream_unmount:
xfs_filestream_unmount(mp);
out_free_sb:
@@ -1403,19 +1401,10 @@ out_destroy_workqueues:
out:
return -error;
- out_syncd_stop:
- xfs_syncd_stop(mp);
out_unmount:
- /*
- * Blow away any referenced inode in the filestreams cache.
- * This can and will cause log traffic as inodes go inactive
- * here.
- */
xfs_filestream_unmount(mp);
-
- xfs_flush_buftarg(mp->m_ddev_targp, 1);
-
xfs_unmountfs(mp);
+ xfs_syncd_stop(mp);
goto out_free_sb;
}
diff --git a/fs/xfs/xfs_sync.c b/fs/xfs/xfs_sync.c
index 205ebcb34d9..c9d3409c5ca 100644
--- a/fs/xfs/xfs_sync.c
+++ b/fs/xfs/xfs_sync.c
@@ -18,7 +18,6 @@
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_types.h"
-#include "xfs_bit.h"
#include "xfs_log.h"
#include "xfs_inum.h"
#include "xfs_trans.h"
@@ -241,45 +240,6 @@ xfs_sync_inode_data(
return error;
}
-STATIC int
-xfs_sync_inode_attr(
- struct xfs_inode *ip,
- struct xfs_perag *pag,
- int flags)
-{
- int error = 0;
-
- xfs_ilock(ip, XFS_ILOCK_SHARED);
- if (xfs_inode_clean(ip))
- goto out_unlock;
- if (!xfs_iflock_nowait(ip)) {
- if (!(flags & SYNC_WAIT))
- goto out_unlock;
- xfs_iflock(ip);
- }
-
- if (xfs_inode_clean(ip)) {
- xfs_ifunlock(ip);
- goto out_unlock;
- }
-
- error = xfs_iflush(ip, flags);
-
- /*
- * We don't want to try again on non-blocking flushes that can't run
- * again immediately. If an inode really must be written, then that's
- * what the SYNC_WAIT flag is for.
- */
- if (error == EAGAIN) {
- ASSERT(!(flags & SYNC_WAIT));
- error = 0;
- }
-
- out_unlock:
- xfs_iunlock(ip, XFS_ILOCK_SHARED);
- return error;
-}
-
/*
* Write out pagecache data for the whole filesystem.
*/
@@ -300,19 +260,6 @@ xfs_sync_data(
return 0;
}
-/*
- * Write out inode metadata (attributes) for the whole filesystem.
- */
-STATIC int
-xfs_sync_attr(
- struct xfs_mount *mp,
- int flags)
-{
- ASSERT((flags & ~SYNC_WAIT) == 0);
-
- return xfs_inode_ag_iterator(mp, xfs_sync_inode_attr, flags);
-}
-
STATIC int
xfs_sync_fsdata(
struct xfs_mount *mp)
@@ -350,7 +297,7 @@ xfs_sync_fsdata(
* First stage of freeze - no writers will make progress now we are here,
* so we flush delwri and delalloc buffers here, then wait for all I/O to
* complete. Data is frozen at that point. Metadata is not frozen,
- * transactions can still occur here so don't bother flushing the buftarg
+ * transactions can still occur here so don't bother emptying the AIL
* because it'll just get dirty again.
*/
int
@@ -365,47 +312,13 @@ xfs_quiesce_data(
/* write superblock and hoover up shutdown errors */
error = xfs_sync_fsdata(mp);
- /* make sure all delwri buffers are written out */
- xfs_flush_buftarg(mp->m_ddev_targp, 1);
-
/* mark the log as covered if needed */
if (xfs_log_need_covered(mp))
error2 = xfs_fs_log_dummy(mp);
- /* flush data-only devices */
- if (mp->m_rtdev_targp)
- xfs_flush_buftarg(mp->m_rtdev_targp, 1);
-
return error ? error : error2;
}
-STATIC void
-xfs_quiesce_fs(
- struct xfs_mount *mp)
-{
- int count = 0, pincount;
-
- xfs_reclaim_inodes(mp, 0);
- xfs_flush_buftarg(mp->m_ddev_targp, 0);
-
- /*
- * This loop must run at least twice. The first instance of the loop
- * will flush most meta data but that will generate more meta data
- * (typically directory updates). Which then must be flushed and
- * logged before we can write the unmount record. We also so sync
- * reclaim of inodes to catch any that the above delwri flush skipped.
- */
- do {
- xfs_reclaim_inodes(mp, SYNC_WAIT);
- xfs_sync_attr(mp, SYNC_WAIT);
- pincount = xfs_flush_buftarg(mp->m_ddev_targp, 1);
- if (!pincount) {
- delay(50);
- count++;
- }
- } while (count < 2);
-}
-
/*
* Second stage of a quiesce. The data is already synced, now we have to take
* care of the metadata. New transactions are already blocked, so we need to
@@ -421,8 +334,12 @@ xfs_quiesce_attr(
while (atomic_read(&mp->m_active_trans) > 0)
delay(100);
- /* flush inodes and push all remaining buffers out to disk */
- xfs_quiesce_fs(mp);
+ /* reclaim inodes to do any IO before the freeze completes */
+ xfs_reclaim_inodes(mp, 0);
+ xfs_reclaim_inodes(mp, SYNC_WAIT);
+
+ /* flush all pending changes from the AIL */
+ xfs_ail_push_all_sync(mp->m_ail);
/*
* Just warn here till VFS can correctly support
@@ -436,7 +353,12 @@ xfs_quiesce_attr(
xfs_warn(mp, "xfs_attr_quiesce: failed to log sb changes. "
"Frozen image may not be consistent.");
xfs_log_unmount_write(mp);
- xfs_unmountfs_writesb(mp);
+
+ /*
+ * At this point we might have modified the superblock again and thus
+ * added an item to the AIL, thus flush it again.
+ */
+ xfs_ail_push_all_sync(mp->m_ail);
}
static void
@@ -460,16 +382,27 @@ xfs_sync_worker(
struct xfs_mount, m_sync_work);
int error;
- if (!(mp->m_flags & XFS_MOUNT_RDONLY)) {
- /* dgc: errors ignored here */
- if (mp->m_super->s_frozen == SB_UNFROZEN &&
- xfs_log_need_covered(mp))
- error = xfs_fs_log_dummy(mp);
- else
- xfs_log_force(mp, 0);
-
- /* start pushing all the metadata that is currently dirty */
- xfs_ail_push_all(mp->m_ail);
+ /*
+ * We shouldn't write/force the log if we are in the mount/unmount
+ * process or on a read only filesystem. The workqueue still needs to be
+ * active in both cases, however, because it is used for inode reclaim
+ * during these times. Use the s_umount semaphore to provide exclusion
+ * with unmount.
+ */
+ if (down_read_trylock(&mp->m_super->s_umount)) {
+ if (!(mp->m_flags & XFS_MOUNT_RDONLY)) {
+ /* dgc: errors ignored here */
+ if (mp->m_super->s_frozen == SB_UNFROZEN &&
+ xfs_log_need_covered(mp))
+ error = xfs_fs_log_dummy(mp);
+ else
+ xfs_log_force(mp, 0);
+
+ /* start pushing all the metadata that is currently
+ * dirty */
+ xfs_ail_push_all(mp->m_ail);
+ }
+ up_read(&mp->m_super->s_umount);
}
/* queue us up again */
@@ -488,14 +421,6 @@ xfs_syncd_queue_reclaim(
struct xfs_mount *mp)
{
- /*
- * We can have inodes enter reclaim after we've shut down the syncd
- * workqueue during unmount, so don't allow reclaim work to be queued
- * during unmount.
- */
- if (!(mp->m_super->s_flags & MS_ACTIVE))
- return;
-
rcu_read_lock();
if (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_RECLAIM_TAG)) {
queue_delayed_work(xfs_syncd_wq, &mp->m_reclaim_work,
@@ -564,7 +489,6 @@ xfs_syncd_init(
INIT_DELAYED_WORK(&mp->m_reclaim_work, xfs_reclaim_worker);
xfs_syncd_queue_sync(mp);
- xfs_syncd_queue_reclaim(mp);
return 0;
}
@@ -702,11 +626,8 @@ xfs_reclaim_inode_grab(
}
/*
- * Inodes in different states need to be treated differently, and the return
- * value of xfs_iflush is not sufficient to get this right. The following table
- * lists the inode states and the reclaim actions necessary for non-blocking
- * reclaim:
- *
+ * Inodes in different states need to be treated differently. The following
+ * table lists the inode states and the reclaim actions necessary:
*
* inode state iflush ret required action
* --------------- ---------- ---------------
@@ -716,39 +637,31 @@ xfs_reclaim_inode_grab(
* stale, unpinned 0 reclaim
* clean, pinned(*) 0 requeue
* stale, pinned EAGAIN requeue
- * dirty, delwri ok 0 requeue
- * dirty, delwri blocked EAGAIN requeue
- * dirty, sync flush 0 reclaim
+ * dirty, async - requeue
+ * dirty, sync 0 reclaim
*
* (*) dgc: I don't think the clean, pinned state is possible but it gets
* handled anyway given the order of checks implemented.
*
- * As can be seen from the table, the return value of xfs_iflush() is not
- * sufficient to correctly decide the reclaim action here. The checks in
- * xfs_iflush() might look like duplicates, but they are not.
- *
* Also, because we get the flush lock first, we know that any inode that has
* been flushed delwri has had the flush completed by the time we check that
- * the inode is clean. The clean inode check needs to be done before flushing
- * the inode delwri otherwise we would loop forever requeuing clean inodes as
- * we cannot tell apart a successful delwri flush and a clean inode from the
- * return value of xfs_iflush().
+ * the inode is clean.
*
- * Note that because the inode is flushed delayed write by background
- * writeback, the flush lock may already be held here and waiting on it can
- * result in very long latencies. Hence for sync reclaims, where we wait on the
- * flush lock, the caller should push out delayed write inodes first before
- * trying to reclaim them to minimise the amount of time spent waiting. For
- * background relaim, we just requeue the inode for the next pass.
+ * Note that because the inode is flushed delayed write by AIL pushing, the
+ * flush lock may already be held here and waiting on it can result in very
+ * long latencies. Hence for sync reclaims, where we wait on the flush lock,
+ * the caller should push the AIL first before trying to reclaim inodes to
+ * minimise the amount of time spent waiting. For background relaim, we only
+ * bother to reclaim clean inodes anyway.
*
* Hence the order of actions after gaining the locks should be:
* bad => reclaim
* shutdown => unpin and reclaim
- * pinned, delwri => requeue
+ * pinned, async => requeue
* pinned, sync => unpin
* stale => reclaim
* clean => reclaim
- * dirty, delwri => flush and requeue
+ * dirty, async => requeue
* dirty, sync => flush, wait and reclaim
*/
STATIC int
@@ -757,7 +670,8 @@ xfs_reclaim_inode(
struct xfs_perag *pag,
int sync_mode)
{
- int error;
+ struct xfs_buf *bp = NULL;
+ int error;
restart:
error = 0;
@@ -765,17 +679,6 @@ restart:
if (!xfs_iflock_nowait(ip)) {
if (!(sync_mode & SYNC_WAIT))
goto out;
-
- /*
- * If we only have a single dirty inode in a cluster there is
- * a fair chance that the AIL push may have pushed it into
- * the buffer, but xfsbufd won't touch it until 30 seconds
- * from now, and thus we will lock up here.
- *
- * Promote the inode buffer to the front of the delwri list
- * and wake up xfsbufd now.
- */
- xfs_promote_inode(ip);
xfs_iflock(ip);
}
@@ -783,13 +686,12 @@ restart:
goto reclaim;
if (XFS_FORCED_SHUTDOWN(ip->i_mount)) {
xfs_iunpin_wait(ip);
+ xfs_iflush_abort(ip, false);
goto reclaim;
}
if (xfs_ipincount(ip)) {
- if (!(sync_mode & SYNC_WAIT)) {
- xfs_ifunlock(ip);
- goto out;
- }
+ if (!(sync_mode & SYNC_WAIT))
+ goto out_ifunlock;
xfs_iunpin_wait(ip);
}
if (xfs_iflags_test(ip, XFS_ISTALE))
@@ -798,60 +700,42 @@ restart:
goto reclaim;
/*
+ * Never flush out dirty data during non-blocking reclaim, as it would
+ * just contend with AIL pushing trying to do the same job.
+ */
+ if (!(sync_mode & SYNC_WAIT))
+ goto out_ifunlock;
+
+ /*
* Now we have an inode that needs flushing.
*
- * We do a nonblocking flush here even if we are doing a SYNC_WAIT
- * reclaim as we can deadlock with inode cluster removal.
+ * Note that xfs_iflush will never block on the inode buffer lock, as
* xfs_ifree_cluster() can lock the inode buffer before it locks the
- * ip->i_lock, and we are doing the exact opposite here. As a result,
- * doing a blocking xfs_itobp() to get the cluster buffer will result
+ * ip->i_lock, and we are doing the exact opposite here. As a result,
+ * doing a blocking xfs_itobp() to get the cluster buffer would result
* in an ABBA deadlock with xfs_ifree_cluster().
*
* As xfs_ifree_cluser() must gather all inodes that are active in the
* cache to mark them stale, if we hit this case we don't actually want
* to do IO here - we want the inode marked stale so we can simply
- * reclaim it. Hence if we get an EAGAIN error on a SYNC_WAIT flush,
- * just unlock the inode, back off and try again. Hopefully the next
- * pass through will see the stale flag set on the inode.
+ * reclaim it. Hence if we get an EAGAIN error here, just unlock the
+ * inode, back off and try again. Hopefully the next pass through will
+ * see the stale flag set on the inode.
*/
- error = xfs_iflush(ip, SYNC_TRYLOCK | sync_mode);
- if (sync_mode & SYNC_WAIT) {
- if (error == EAGAIN) {
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
- /* backoff longer than in xfs_ifree_cluster */
- delay(2);
- goto restart;
- }
- xfs_iflock(ip);
- goto reclaim;
+ error = xfs_iflush(ip, &bp);
+ if (error == EAGAIN) {
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ /* backoff longer than in xfs_ifree_cluster */
+ delay(2);
+ goto restart;
}
- /*
- * When we have to flush an inode but don't have SYNC_WAIT set, we
- * flush the inode out using a delwri buffer and wait for the next
- * call into reclaim to find it in a clean state instead of waiting for
- * it now. We also don't return errors here - if the error is transient
- * then the next reclaim pass will flush the inode, and if the error
- * is permanent then the next sync reclaim will reclaim the inode and
- * pass on the error.
- */
- if (error && error != EAGAIN && !XFS_FORCED_SHUTDOWN(ip->i_mount)) {
- xfs_warn(ip->i_mount,
- "inode 0x%llx background reclaim flush failed with %d",
- (long long)ip->i_ino, error);
+ if (!error) {
+ error = xfs_bwrite(bp);
+ xfs_buf_relse(bp);
}
-out:
- xfs_iflags_clear(ip, XFS_IRECLAIM);
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
- /*
- * We could return EAGAIN here to make reclaim rescan the inode tree in
- * a short while. However, this just burns CPU time scanning the tree
- * waiting for IO to complete and xfssyncd never goes back to the idle
- * state. Instead, return 0 to let the next scheduled background reclaim
- * attempt to reclaim the inode again.
- */
- return 0;
+ xfs_iflock(ip);
reclaim:
xfs_ifunlock(ip);
xfs_iunlock(ip, XFS_ILOCK_EXCL);
@@ -884,8 +768,21 @@ reclaim:
xfs_iunlock(ip, XFS_ILOCK_EXCL);
xfs_inode_free(ip);
-
return error;
+
+out_ifunlock:
+ xfs_ifunlock(ip);
+out:
+ xfs_iflags_clear(ip, XFS_IRECLAIM);
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ /*
+ * We could return EAGAIN here to make reclaim rescan the inode tree in
+ * a short while. However, this just burns CPU time scanning the tree
+ * waiting for IO to complete and xfssyncd never goes back to the idle
+ * state. Instead, return 0 to let the next scheduled background reclaim
+ * attempt to reclaim the inode again.
+ */
+ return 0;
}
/*
diff --git a/fs/xfs/xfs_trace.c b/fs/xfs/xfs_trace.c
index 9010ce885e6..624bedd8135 100644
--- a/fs/xfs/xfs_trace.c
+++ b/fs/xfs/xfs_trace.c
@@ -18,9 +18,7 @@
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_types.h"
-#include "xfs_bit.h"
#include "xfs_log.h"
-#include "xfs_inum.h"
#include "xfs_trans.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index 06838c42b2a..7cf9d3529e5 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -281,7 +281,7 @@ DECLARE_EVENT_CLASS(xfs_buf_class,
TP_STRUCT__entry(
__field(dev_t, dev)
__field(xfs_daddr_t, bno)
- __field(size_t, buffer_length)
+ __field(int, nblks)
__field(int, hold)
__field(int, pincount)
__field(unsigned, lockval)
@@ -291,18 +291,18 @@ DECLARE_EVENT_CLASS(xfs_buf_class,
TP_fast_assign(
__entry->dev = bp->b_target->bt_dev;
__entry->bno = bp->b_bn;
- __entry->buffer_length = bp->b_buffer_length;
+ __entry->nblks = bp->b_length;
__entry->hold = atomic_read(&bp->b_hold);
__entry->pincount = atomic_read(&bp->b_pin_count);
__entry->lockval = bp->b_sema.count;
__entry->flags = bp->b_flags;
__entry->caller_ip = caller_ip;
),
- TP_printk("dev %d:%d bno 0x%llx len 0x%zx hold %d pincount %d "
+ TP_printk("dev %d:%d bno 0x%llx nblks 0x%x hold %d pincount %d "
"lock %d flags %s caller %pf",
MAJOR(__entry->dev), MINOR(__entry->dev),
(unsigned long long)__entry->bno,
- __entry->buffer_length,
+ __entry->nblks,
__entry->hold,
__entry->pincount,
__entry->lockval,
@@ -328,7 +328,7 @@ DEFINE_BUF_EVENT(xfs_buf_unlock);
DEFINE_BUF_EVENT(xfs_buf_iowait);
DEFINE_BUF_EVENT(xfs_buf_iowait_done);
DEFINE_BUF_EVENT(xfs_buf_delwri_queue);
-DEFINE_BUF_EVENT(xfs_buf_delwri_dequeue);
+DEFINE_BUF_EVENT(xfs_buf_delwri_queued);
DEFINE_BUF_EVENT(xfs_buf_delwri_split);
DEFINE_BUF_EVENT(xfs_buf_get_uncached);
DEFINE_BUF_EVENT(xfs_bdstrat_shut);
@@ -362,7 +362,7 @@ DECLARE_EVENT_CLASS(xfs_buf_flags_class,
TP_fast_assign(
__entry->dev = bp->b_target->bt_dev;
__entry->bno = bp->b_bn;
- __entry->buffer_length = bp->b_buffer_length;
+ __entry->buffer_length = BBTOB(bp->b_length);
__entry->flags = flags;
__entry->hold = atomic_read(&bp->b_hold);
__entry->pincount = atomic_read(&bp->b_pin_count);
@@ -406,7 +406,7 @@ TRACE_EVENT(xfs_buf_ioerror,
TP_fast_assign(
__entry->dev = bp->b_target->bt_dev;
__entry->bno = bp->b_bn;
- __entry->buffer_length = bp->b_buffer_length;
+ __entry->buffer_length = BBTOB(bp->b_length);
__entry->hold = atomic_read(&bp->b_hold);
__entry->pincount = atomic_read(&bp->b_pin_count);
__entry->lockval = bp->b_sema.count;
@@ -450,7 +450,7 @@ DECLARE_EVENT_CLASS(xfs_buf_item_class,
__entry->bli_recur = bip->bli_recur;
__entry->bli_refcount = atomic_read(&bip->bli_refcount);
__entry->buf_bno = bip->bli_buf->b_bn;
- __entry->buf_len = bip->bli_buf->b_buffer_length;
+ __entry->buf_len = BBTOB(bip->bli_buf->b_length);
__entry->buf_flags = bip->bli_buf->b_flags;
__entry->buf_hold = atomic_read(&bip->bli_buf->b_hold);
__entry->buf_pincount = atomic_read(&bip->bli_buf->b_pin_count);
@@ -486,12 +486,10 @@ DEFINE_BUF_ITEM_EVENT(xfs_buf_item_format_stale);
DEFINE_BUF_ITEM_EVENT(xfs_buf_item_pin);
DEFINE_BUF_ITEM_EVENT(xfs_buf_item_unpin);
DEFINE_BUF_ITEM_EVENT(xfs_buf_item_unpin_stale);
-DEFINE_BUF_ITEM_EVENT(xfs_buf_item_trylock);
DEFINE_BUF_ITEM_EVENT(xfs_buf_item_unlock);
DEFINE_BUF_ITEM_EVENT(xfs_buf_item_unlock_stale);
DEFINE_BUF_ITEM_EVENT(xfs_buf_item_committed);
DEFINE_BUF_ITEM_EVENT(xfs_buf_item_push);
-DEFINE_BUF_ITEM_EVENT(xfs_buf_item_pushbuf);
DEFINE_BUF_ITEM_EVENT(xfs_trans_get_buf);
DEFINE_BUF_ITEM_EVENT(xfs_trans_get_buf_recur);
DEFINE_BUF_ITEM_EVENT(xfs_trans_getsb);
@@ -876,15 +874,30 @@ DECLARE_EVENT_CLASS(xfs_log_item_class,
__print_flags(__entry->flags, "|", XFS_LI_FLAGS))
)
+TRACE_EVENT(xfs_log_force,
+ TP_PROTO(struct xfs_mount *mp, xfs_lsn_t lsn),
+ TP_ARGS(mp, lsn),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_lsn_t, lsn)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp->m_super->s_dev;
+ __entry->lsn = lsn;
+ ),
+ TP_printk("dev %d:%d lsn 0x%llx",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->lsn)
+)
+
#define DEFINE_LOG_ITEM_EVENT(name) \
DEFINE_EVENT(xfs_log_item_class, name, \
TP_PROTO(struct xfs_log_item *lip), \
TP_ARGS(lip))
DEFINE_LOG_ITEM_EVENT(xfs_ail_push);
-DEFINE_LOG_ITEM_EVENT(xfs_ail_pushbuf);
-DEFINE_LOG_ITEM_EVENT(xfs_ail_pushbuf_pinned);
DEFINE_LOG_ITEM_EVENT(xfs_ail_pinned);
DEFINE_LOG_ITEM_EVENT(xfs_ail_locked);
+DEFINE_LOG_ITEM_EVENT(xfs_ail_flushing);
DECLARE_EVENT_CLASS(xfs_file_class,
@@ -1145,7 +1158,7 @@ TRACE_EVENT(xfs_bunmap,
);
-DECLARE_EVENT_CLASS(xfs_busy_class,
+DECLARE_EVENT_CLASS(xfs_extent_busy_class,
TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
xfs_agblock_t agbno, xfs_extlen_t len),
TP_ARGS(mp, agno, agbno, len),
@@ -1168,17 +1181,17 @@ DECLARE_EVENT_CLASS(xfs_busy_class,
__entry->len)
);
#define DEFINE_BUSY_EVENT(name) \
-DEFINE_EVENT(xfs_busy_class, name, \
+DEFINE_EVENT(xfs_extent_busy_class, name, \
TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, \
xfs_agblock_t agbno, xfs_extlen_t len), \
TP_ARGS(mp, agno, agbno, len))
-DEFINE_BUSY_EVENT(xfs_alloc_busy);
-DEFINE_BUSY_EVENT(xfs_alloc_busy_enomem);
-DEFINE_BUSY_EVENT(xfs_alloc_busy_force);
-DEFINE_BUSY_EVENT(xfs_alloc_busy_reuse);
-DEFINE_BUSY_EVENT(xfs_alloc_busy_clear);
+DEFINE_BUSY_EVENT(xfs_extent_busy);
+DEFINE_BUSY_EVENT(xfs_extent_busy_enomem);
+DEFINE_BUSY_EVENT(xfs_extent_busy_force);
+DEFINE_BUSY_EVENT(xfs_extent_busy_reuse);
+DEFINE_BUSY_EVENT(xfs_extent_busy_clear);
-TRACE_EVENT(xfs_alloc_busy_trim,
+TRACE_EVENT(xfs_extent_busy_trim,
TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
xfs_agblock_t agbno, xfs_extlen_t len,
xfs_agblock_t tbno, xfs_extlen_t tlen),
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index 103b00c9000..cdf896fcbfa 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -19,9 +19,7 @@
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_types.h"
-#include "xfs_bit.h"
#include "xfs_log.h"
-#include "xfs_inum.h"
#include "xfs_trans.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
@@ -36,6 +34,7 @@
#include "xfs_btree.h"
#include "xfs_ialloc.h"
#include "xfs_alloc.h"
+#include "xfs_extent_busy.h"
#include "xfs_bmap.h"
#include "xfs_quota.h"
#include "xfs_trans_priv.h"
@@ -608,8 +607,8 @@ STATIC void
xfs_trans_free(
struct xfs_trans *tp)
{
- xfs_alloc_busy_sort(&tp->t_busy);
- xfs_alloc_busy_clear(tp->t_mountp, &tp->t_busy, false);
+ xfs_extent_busy_sort(&tp->t_busy);
+ xfs_extent_busy_clear(tp->t_mountp, &tp->t_busy, false);
atomic_dec(&tp->t_mountp->m_active_trans);
xfs_trans_free_dqinfo(tp);
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
index f6118703f20..7ab99e1898c 100644
--- a/fs/xfs/xfs_trans.h
+++ b/fs/xfs/xfs_trans.h
@@ -345,11 +345,9 @@ struct xfs_item_ops {
void (*iop_format)(xfs_log_item_t *, struct xfs_log_iovec *);
void (*iop_pin)(xfs_log_item_t *);
void (*iop_unpin)(xfs_log_item_t *, int remove);
- uint (*iop_trylock)(xfs_log_item_t *);
+ uint (*iop_push)(struct xfs_log_item *, struct list_head *);
void (*iop_unlock)(xfs_log_item_t *);
xfs_lsn_t (*iop_committed)(xfs_log_item_t *, xfs_lsn_t);
- void (*iop_push)(xfs_log_item_t *);
- bool (*iop_pushbuf)(xfs_log_item_t *);
void (*iop_committing)(xfs_log_item_t *, xfs_lsn_t);
};
@@ -357,20 +355,18 @@ struct xfs_item_ops {
#define IOP_FORMAT(ip,vp) (*(ip)->li_ops->iop_format)(ip, vp)
#define IOP_PIN(ip) (*(ip)->li_ops->iop_pin)(ip)
#define IOP_UNPIN(ip, remove) (*(ip)->li_ops->iop_unpin)(ip, remove)
-#define IOP_TRYLOCK(ip) (*(ip)->li_ops->iop_trylock)(ip)
+#define IOP_PUSH(ip, list) (*(ip)->li_ops->iop_push)(ip, list)
#define IOP_UNLOCK(ip) (*(ip)->li_ops->iop_unlock)(ip)
#define IOP_COMMITTED(ip, lsn) (*(ip)->li_ops->iop_committed)(ip, lsn)
-#define IOP_PUSH(ip) (*(ip)->li_ops->iop_push)(ip)
-#define IOP_PUSHBUF(ip) (*(ip)->li_ops->iop_pushbuf)(ip)
#define IOP_COMMITTING(ip, lsn) (*(ip)->li_ops->iop_committing)(ip, lsn)
/*
- * Return values for the IOP_TRYLOCK() routines.
+ * Return values for the IOP_PUSH() routines.
*/
-#define XFS_ITEM_SUCCESS 0
-#define XFS_ITEM_PINNED 1
-#define XFS_ITEM_LOCKED 2
-#define XFS_ITEM_PUSHBUF 3
+#define XFS_ITEM_SUCCESS 0
+#define XFS_ITEM_PINNED 1
+#define XFS_ITEM_LOCKED 2
+#define XFS_ITEM_FLUSHING 3
/*
* This is the type of function which can be given to xfs_trans_callback()
diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c
index 1dead07f092..9c514483e59 100644
--- a/fs/xfs/xfs_trans_ail.c
+++ b/fs/xfs/xfs_trans_ail.c
@@ -20,7 +20,6 @@
#include "xfs_fs.h"
#include "xfs_types.h"
#include "xfs_log.h"
-#include "xfs_inum.h"
#include "xfs_trans.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
@@ -79,7 +78,7 @@ xfs_ail_check(
* Return a pointer to the first item in the AIL. If the AIL is empty, then
* return NULL.
*/
-static xfs_log_item_t *
+xfs_log_item_t *
xfs_ail_min(
struct xfs_ail *ailp)
{
@@ -364,30 +363,31 @@ xfsaild_push(
xfs_log_item_t *lip;
xfs_lsn_t lsn;
xfs_lsn_t target;
- long tout = 10;
+ long tout;
int stuck = 0;
+ int flushing = 0;
int count = 0;
- int push_xfsbufd = 0;
/*
- * If last time we ran we encountered pinned items, force the log first
- * and wait for it before pushing again.
+ * If we encountered pinned items or did not finish writing out all
+ * buffers the last time we ran, force the log first and wait for it
+ * before pushing again.
*/
- spin_lock(&ailp->xa_lock);
- if (ailp->xa_last_pushed_lsn == 0 && ailp->xa_log_flush &&
- !list_empty(&ailp->xa_ail)) {
+ if (ailp->xa_log_flush && ailp->xa_last_pushed_lsn == 0 &&
+ (!list_empty_careful(&ailp->xa_buf_list) ||
+ xfs_ail_min_lsn(ailp))) {
ailp->xa_log_flush = 0;
- spin_unlock(&ailp->xa_lock);
+
XFS_STATS_INC(xs_push_ail_flush);
xfs_log_force(mp, XFS_LOG_SYNC);
- spin_lock(&ailp->xa_lock);
}
- target = ailp->xa_target;
+ spin_lock(&ailp->xa_lock);
lip = xfs_trans_ail_cursor_first(ailp, &cur, ailp->xa_last_pushed_lsn);
- if (!lip || XFS_FORCED_SHUTDOWN(mp)) {
+ if (!lip) {
/*
- * AIL is empty or our push has reached the end.
+ * If the AIL is empty or our push has reached the end we are
+ * done now.
*/
xfs_trans_ail_cursor_done(ailp, &cur);
spin_unlock(&ailp->xa_lock);
@@ -396,54 +396,42 @@ xfsaild_push(
XFS_STATS_INC(xs_push_ail);
- /*
- * While the item we are looking at is below the given threshold
- * try to flush it out. We'd like not to stop until we've at least
- * tried to push on everything in the AIL with an LSN less than
- * the given threshold.
- *
- * However, we will stop after a certain number of pushes and wait
- * for a reduced timeout to fire before pushing further. This
- * prevents use from spinning when we can't do anything or there is
- * lots of contention on the AIL lists.
- */
lsn = lip->li_lsn;
+ target = ailp->xa_target;
while ((XFS_LSN_CMP(lip->li_lsn, target) <= 0)) {
int lock_result;
+
/*
- * If we can lock the item without sleeping, unlock the AIL
- * lock and flush the item. Then re-grab the AIL lock so we
- * can look for the next item on the AIL. List changes are
- * handled by the AIL lookup functions internally
- *
- * If we can't lock the item, either its holder will flush it
- * or it is already being flushed or it is being relogged. In
- * any of these case it is being taken care of and we can just
- * skip to the next item in the list.
+ * Note that IOP_PUSH may unlock and reacquire the AIL lock. We
+ * rely on the AIL cursor implementation to be able to deal with
+ * the dropped lock.
*/
- lock_result = IOP_TRYLOCK(lip);
- spin_unlock(&ailp->xa_lock);
+ lock_result = IOP_PUSH(lip, &ailp->xa_buf_list);
switch (lock_result) {
case XFS_ITEM_SUCCESS:
XFS_STATS_INC(xs_push_ail_success);
trace_xfs_ail_push(lip);
- IOP_PUSH(lip);
ailp->xa_last_pushed_lsn = lsn;
break;
- case XFS_ITEM_PUSHBUF:
- XFS_STATS_INC(xs_push_ail_pushbuf);
- trace_xfs_ail_pushbuf(lip);
-
- if (!IOP_PUSHBUF(lip)) {
- trace_xfs_ail_pushbuf_pinned(lip);
- stuck++;
- ailp->xa_log_flush++;
- } else {
- ailp->xa_last_pushed_lsn = lsn;
- }
- push_xfsbufd = 1;
+ case XFS_ITEM_FLUSHING:
+ /*
+ * The item or its backing buffer is already being
+ * flushed. The typical reason for that is that an
+ * inode buffer is locked because we already pushed the
+ * updates to it as part of inode clustering.
+ *
+ * We do not want to stop flushing just because lots
+ * of items are already being flushed, but we need to
+ * re-try the flushing relatively soon if most of the
+ * AIL is being flushed.
+ */
+ XFS_STATS_INC(xs_push_ail_flushing);
+ trace_xfs_ail_flushing(lip);
+
+ flushing++;
+ ailp->xa_last_pushed_lsn = lsn;
break;
case XFS_ITEM_PINNED:
@@ -453,28 +441,22 @@ xfsaild_push(
stuck++;
ailp->xa_log_flush++;
break;
-
case XFS_ITEM_LOCKED:
XFS_STATS_INC(xs_push_ail_locked);
trace_xfs_ail_locked(lip);
+
stuck++;
break;
-
default:
ASSERT(0);
break;
}
- spin_lock(&ailp->xa_lock);
- /* should we bother continuing? */
- if (XFS_FORCED_SHUTDOWN(mp))
- break;
- ASSERT(mp->m_log);
-
count++;
/*
* Are there too many items we can't do anything with?
+ *
* If we we are skipping too many items because we can't flush
* them or they are already being flushed, we back off and
* given them time to complete whatever operation is being
@@ -496,42 +478,36 @@ xfsaild_push(
xfs_trans_ail_cursor_done(ailp, &cur);
spin_unlock(&ailp->xa_lock);
- if (push_xfsbufd) {
- /* we've got delayed write buffers to flush */
- wake_up_process(mp->m_ddev_targp->bt_task);
- }
+ if (xfs_buf_delwri_submit_nowait(&ailp->xa_buf_list))
+ ailp->xa_log_flush++;
- /* assume we have more work to do in a short while */
+ if (!count || XFS_LSN_CMP(lsn, target) >= 0) {
out_done:
- if (!count) {
- /* We're past our target or empty, so idle */
- ailp->xa_last_pushed_lsn = 0;
- ailp->xa_log_flush = 0;
-
- tout = 50;
- } else if (XFS_LSN_CMP(lsn, target) >= 0) {
/*
- * We reached the target so wait a bit longer for I/O to
- * complete and remove pushed items from the AIL before we
- * start the next scan from the start of the AIL.
+ * We reached the target or the AIL is empty, so wait a bit
+ * longer for I/O to complete and remove pushed items from the
+ * AIL before we start the next scan from the start of the AIL.
*/
tout = 50;
ailp->xa_last_pushed_lsn = 0;
- } else if ((stuck * 100) / count > 90) {
+ } else if (((stuck + flushing) * 100) / count > 90) {
/*
- * Either there is a lot of contention on the AIL or we
- * are stuck due to operations in progress. "Stuck" in this
- * case is defined as >90% of the items we tried to push
- * were stuck.
+ * Either there is a lot of contention on the AIL or we are
+ * stuck due to operations in progress. "Stuck" in this case
+ * is defined as >90% of the items we tried to push were stuck.
*
* Backoff a bit more to allow some I/O to complete before
- * restarting from the start of the AIL. This prevents us
- * from spinning on the same items, and if they are pinned will
- * all the restart to issue a log force to unpin the stuck
- * items.
+ * restarting from the start of the AIL. This prevents us from
+ * spinning on the same items, and if they are pinned will allow
+ * the restart to issue a log force to unpin the stuck items.
*/
tout = 20;
ailp->xa_last_pushed_lsn = 0;
+ } else {
+ /*
+ * Assume we have more work to do in a short while.
+ */
+ tout = 10;
}
return tout;
@@ -544,6 +520,8 @@ xfsaild(
struct xfs_ail *ailp = data;
long tout = 0; /* milliseconds */
+ current->flags |= PF_MEMALLOC;
+
while (!kthread_should_stop()) {
if (tout && tout <= 20)
__set_current_state(TASK_KILLABLE);
@@ -611,6 +589,30 @@ xfs_ail_push_all(
}
/*
+ * Push out all items in the AIL immediately and wait until the AIL is empty.
+ */
+void
+xfs_ail_push_all_sync(
+ struct xfs_ail *ailp)
+{
+ struct xfs_log_item *lip;
+ DEFINE_WAIT(wait);
+
+ spin_lock(&ailp->xa_lock);
+ while ((lip = xfs_ail_max(ailp)) != NULL) {
+ prepare_to_wait(&ailp->xa_empty, &wait, TASK_UNINTERRUPTIBLE);
+ ailp->xa_target = lip->li_lsn;
+ wake_up_process(ailp->xa_task);
+ spin_unlock(&ailp->xa_lock);
+ schedule();
+ spin_lock(&ailp->xa_lock);
+ }
+ spin_unlock(&ailp->xa_lock);
+
+ finish_wait(&ailp->xa_empty, &wait);
+}
+
+/*
* xfs_trans_ail_update - bulk AIL insertion operation.
*
* @xfs_trans_ail_update takes an array of log items that all need to be
@@ -667,11 +669,15 @@ xfs_trans_ail_update_bulk(
if (!list_empty(&tmp))
xfs_ail_splice(ailp, cur, &tmp, lsn);
- spin_unlock(&ailp->xa_lock);
- if (mlip_changed && !XFS_FORCED_SHUTDOWN(ailp->xa_mount)) {
- xlog_assign_tail_lsn(ailp->xa_mount);
+ if (mlip_changed) {
+ if (!XFS_FORCED_SHUTDOWN(ailp->xa_mount))
+ xlog_assign_tail_lsn_locked(ailp->xa_mount);
+ spin_unlock(&ailp->xa_lock);
+
xfs_log_space_wake(ailp->xa_mount);
+ } else {
+ spin_unlock(&ailp->xa_lock);
}
}
@@ -700,7 +706,8 @@ void
xfs_trans_ail_delete_bulk(
struct xfs_ail *ailp,
struct xfs_log_item **log_items,
- int nr_items) __releases(ailp->xa_lock)
+ int nr_items,
+ int shutdown_type) __releases(ailp->xa_lock)
{
xfs_log_item_t *mlip;
int mlip_changed = 0;
@@ -718,7 +725,7 @@ xfs_trans_ail_delete_bulk(
xfs_alert_tag(mp, XFS_PTAG_AILDELETE,
"%s: attempting to delete a log item that is not in the AIL",
__func__);
- xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
+ xfs_force_shutdown(mp, shutdown_type);
}
return;
}
@@ -729,28 +736,20 @@ xfs_trans_ail_delete_bulk(
if (mlip == lip)
mlip_changed = 1;
}
- spin_unlock(&ailp->xa_lock);
- if (mlip_changed && !XFS_FORCED_SHUTDOWN(ailp->xa_mount)) {
- xlog_assign_tail_lsn(ailp->xa_mount);
+ if (mlip_changed) {
+ if (!XFS_FORCED_SHUTDOWN(ailp->xa_mount))
+ xlog_assign_tail_lsn_locked(ailp->xa_mount);
+ if (list_empty(&ailp->xa_ail))
+ wake_up_all(&ailp->xa_empty);
+ spin_unlock(&ailp->xa_lock);
+
xfs_log_space_wake(ailp->xa_mount);
+ } else {
+ spin_unlock(&ailp->xa_lock);
}
}
-/*
- * The active item list (AIL) is a doubly linked list of log
- * items sorted by ascending lsn. The base of the list is
- * a forw/back pointer pair embedded in the xfs mount structure.
- * The base is initialized with both pointers pointing to the
- * base. This case always needs to be distinguished, because
- * the base has no lsn to look at. We almost always insert
- * at the end of the list, so on inserts we search from the
- * end of the list to find where the new item belongs.
- */
-
-/*
- * Initialize the doubly linked list to point only to itself.
- */
int
xfs_trans_ail_init(
xfs_mount_t *mp)
@@ -765,6 +764,8 @@ xfs_trans_ail_init(
INIT_LIST_HEAD(&ailp->xa_ail);
INIT_LIST_HEAD(&ailp->xa_cursors);
spin_lock_init(&ailp->xa_lock);
+ INIT_LIST_HEAD(&ailp->xa_buf_list);
+ init_waitqueue_head(&ailp->xa_empty);
ailp->xa_task = kthread_run(xfsaild, ailp, "xfsaild/%s",
ailp->xa_mount->m_fsname);
diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c
index 1302d1d95a5..21c5a5e3700 100644
--- a/fs/xfs/xfs_trans_buf.c
+++ b/fs/xfs/xfs_trans_buf.c
@@ -18,9 +18,7 @@
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_types.h"
-#include "xfs_bit.h"
#include "xfs_log.h"
-#include "xfs_inum.h"
#include "xfs_trans.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
@@ -33,7 +31,6 @@
#include "xfs_buf_item.h"
#include "xfs_trans_priv.h"
#include "xfs_error.h"
-#include "xfs_rw.h"
#include "xfs_trace.h"
/*
@@ -56,7 +53,7 @@ xfs_trans_buf_item_match(
if (blip->bli_item.li_type == XFS_LI_BUF &&
blip->bli_buf->b_target == target &&
XFS_BUF_ADDR(blip->bli_buf) == blkno &&
- XFS_BUF_COUNT(blip->bli_buf) == len)
+ BBTOB(blip->bli_buf->b_length) == len)
return blip->bli_buf;
}
@@ -141,15 +138,11 @@ xfs_trans_get_buf(xfs_trans_t *tp,
xfs_buf_t *bp;
xfs_buf_log_item_t *bip;
- if (flags == 0)
- flags = XBF_LOCK | XBF_MAPPED;
-
/*
* Default to a normal get_buf() call if the tp is NULL.
*/
if (tp == NULL)
- return xfs_buf_get(target_dev, blkno, len,
- flags | XBF_DONT_BLOCK);
+ return xfs_buf_get(target_dev, blkno, len, flags);
/*
* If we find the buffer in the cache with this transaction
@@ -165,14 +158,6 @@ xfs_trans_get_buf(xfs_trans_t *tp,
XFS_BUF_DONE(bp);
}
- /*
- * If the buffer is stale then it was binval'ed
- * since last read. This doesn't matter since the
- * caller isn't allowed to use the data anyway.
- */
- else if (XFS_BUF_ISSTALE(bp))
- ASSERT(!XFS_BUF_ISDELAYWRITE(bp));
-
ASSERT(bp->b_transp == tp);
bip = bp->b_fspriv;
ASSERT(bip != NULL);
@@ -182,15 +167,7 @@ xfs_trans_get_buf(xfs_trans_t *tp,
return (bp);
}
- /*
- * We always specify the XBF_DONT_BLOCK flag within a transaction
- * so that get_buf does not try to push out a delayed write buffer
- * which might cause another transaction to take place (if the
- * buffer was delayed alloc). Such recursive transactions can
- * easily deadlock with our current transaction as well as cause
- * us to run out of stack space.
- */
- bp = xfs_buf_get(target_dev, blkno, len, flags | XBF_DONT_BLOCK);
+ bp = xfs_buf_get(target_dev, blkno, len, flags);
if (bp == NULL) {
return NULL;
}
@@ -282,14 +259,13 @@ xfs_trans_read_buf(
xfs_buf_log_item_t *bip;
int error;
- if (flags == 0)
- flags = XBF_LOCK | XBF_MAPPED;
+ *bpp = NULL;
/*
* Default to a normal get_buf() call if the tp is NULL.
*/
if (tp == NULL) {
- bp = xfs_buf_read(target, blkno, len, flags | XBF_DONT_BLOCK);
+ bp = xfs_buf_read(target, blkno, len, flags);
if (!bp)
return (flags & XBF_TRYLOCK) ?
EAGAIN : XFS_ERROR(ENOMEM);
@@ -297,6 +273,8 @@ xfs_trans_read_buf(
if (bp->b_error) {
error = bp->b_error;
xfs_buf_ioerror_alert(bp, __func__);
+ XFS_BUF_UNDONE(bp);
+ xfs_buf_stale(bp);
xfs_buf_relse(bp);
return error;
}
@@ -371,15 +349,7 @@ xfs_trans_read_buf(
return 0;
}
- /*
- * We always specify the XBF_DONT_BLOCK flag within a transaction
- * so that get_buf does not try to push out a delayed write buffer
- * which might cause another transaction to take place (if the
- * buffer was delayed alloc). Such recursive transactions can
- * easily deadlock with our current transaction as well as cause
- * us to run out of stack space.
- */
- bp = xfs_buf_read(target, blkno, len, flags | XBF_DONT_BLOCK);
+ bp = xfs_buf_read(target, blkno, len, flags);
if (bp == NULL) {
*bpp = NULL;
return (flags & XBF_TRYLOCK) ?
@@ -418,19 +388,6 @@ xfs_trans_read_buf(
return 0;
shutdown_abort:
- /*
- * the theory here is that buffer is good but we're
- * bailing out because the filesystem is being forcibly
- * shut down. So we should leave the b_flags alone since
- * the buffer's not staled and just get out.
- */
-#if defined(DEBUG)
- if (XFS_BUF_ISSTALE(bp) && XFS_BUF_ISDELAYWRITE(bp))
- xfs_notice(mp, "about to pop assert, bp == 0x%p", bp);
-#endif
- ASSERT((bp->b_flags & (XBF_STALE|XBF_DELWRI)) !=
- (XBF_STALE|XBF_DELWRI));
-
trace_xfs_trans_read_buf_shut(bp, _RET_IP_);
xfs_buf_relse(bp);
*bpp = NULL;
@@ -606,7 +563,7 @@ xfs_trans_log_buf(xfs_trans_t *tp,
ASSERT(bp->b_transp == tp);
ASSERT(bip != NULL);
- ASSERT((first <= last) && (last < XFS_BUF_COUNT(bp)));
+ ASSERT(first <= last && last < BBTOB(bp->b_length));
ASSERT(bp->b_iodone == NULL ||
bp->b_iodone == xfs_buf_iodone_callbacks);
@@ -626,8 +583,6 @@ xfs_trans_log_buf(xfs_trans_t *tp,
bp->b_iodone = xfs_buf_iodone_callbacks;
bip->bli_item.li_cb = xfs_buf_iodone;
- xfs_buf_delwri_queue(bp);
-
trace_xfs_trans_log_buf(bip);
/*
@@ -651,22 +606,33 @@ xfs_trans_log_buf(xfs_trans_t *tp,
/*
- * This called to invalidate a buffer that is being used within
- * a transaction. Typically this is because the blocks in the
- * buffer are being freed, so we need to prevent it from being
- * written out when we're done. Allowing it to be written again
- * might overwrite data in the free blocks if they are reallocated
- * to a file.
+ * Invalidate a buffer that is being used within a transaction.
+ *
+ * Typically this is because the blocks in the buffer are being freed, so we
+ * need to prevent it from being written out when we're done. Allowing it
+ * to be written again might overwrite data in the free blocks if they are
+ * reallocated to a file.
+ *
+ * We prevent the buffer from being written out by marking it stale. We can't
+ * get rid of the buf log item at this point because the buffer may still be
+ * pinned by another transaction. If that is the case, then we'll wait until
+ * the buffer is committed to disk for the last time (we can tell by the ref
+ * count) and free it in xfs_buf_item_unpin(). Until that happens we will
+ * keep the buffer locked so that the buffer and buf log item are not reused.
+ *
+ * We also set the XFS_BLF_CANCEL flag in the buf log format structure and log
+ * the buf item. This will be used at recovery time to determine that copies
+ * of the buffer in the log before this should not be replayed.
*
- * We prevent the buffer from being written out by clearing the
- * B_DELWRI flag. We can't always
- * get rid of the buf log item at this point, though, because
- * the buffer may still be pinned by another transaction. If that
- * is the case, then we'll wait until the buffer is committed to
- * disk for the last time (we can tell by the ref count) and
- * free it in xfs_buf_item_unpin(). Until it is cleaned up we
- * will keep the buffer locked so that the buffer and buf log item
- * are not reused.
+ * We mark the item descriptor and the transaction dirty so that we'll hold
+ * the buffer until after the commit.
+ *
+ * Since we're invalidating the buffer, we also clear the state about which
+ * parts of the buffer have been logged. We also clear the flag indicating
+ * that this is an inode buffer since the data in the buffer will no longer
+ * be valid.
+ *
+ * We set the stale bit in the buffer as well since we're getting rid of it.
*/
void
xfs_trans_binval(
@@ -686,7 +652,6 @@ xfs_trans_binval(
* If the buffer is already invalidated, then
* just return.
*/
- ASSERT(!(XFS_BUF_ISDELAYWRITE(bp)));
ASSERT(XFS_BUF_ISSTALE(bp));
ASSERT(!(bip->bli_flags & (XFS_BLI_LOGGED | XFS_BLI_DIRTY)));
ASSERT(!(bip->bli_format.blf_flags & XFS_BLF_INODE_BUF));
@@ -696,27 +661,8 @@ xfs_trans_binval(
return;
}
- /*
- * Clear the dirty bit in the buffer and set the STALE flag
- * in the buf log item. The STALE flag will be used in
- * xfs_buf_item_unpin() to determine if it should clean up
- * when the last reference to the buf item is given up.
- * We set the XFS_BLF_CANCEL flag in the buf log format structure
- * and log the buf item. This will be used at recovery time
- * to determine that copies of the buffer in the log before
- * this should not be replayed.
- * We mark the item descriptor and the transaction dirty so
- * that we'll hold the buffer until after the commit.
- *
- * Since we're invalidating the buffer, we also clear the state
- * about which parts of the buffer have been logged. We also
- * clear the flag indicating that this is an inode buffer since
- * the data in the buffer will no longer be valid.
- *
- * We set the stale bit in the buffer as well since we're getting
- * rid of it.
- */
xfs_buf_stale(bp);
+
bip->bli_flags |= XFS_BLI_STALE;
bip->bli_flags &= ~(XFS_BLI_INODE_BUF | XFS_BLI_LOGGED | XFS_BLI_DIRTY);
bip->bli_format.blf_flags &= ~XFS_BLF_INODE_BUF;
diff --git a/fs/xfs/xfs_trans_dquot.c b/fs/xfs/xfs_trans_dquot.c
index 279099717ed..bcb60542fcf 100644
--- a/fs/xfs/xfs_trans_dquot.c
+++ b/fs/xfs/xfs_trans_dquot.c
@@ -17,9 +17,7 @@
*/
#include "xfs.h"
#include "xfs_fs.h"
-#include "xfs_bit.h"
#include "xfs_log.h"
-#include "xfs_inum.h"
#include "xfs_trans.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
diff --git a/fs/xfs/xfs_trans_extfree.c b/fs/xfs/xfs_trans_extfree.c
index f7590f5bade..8d71b16ecca 100644
--- a/fs/xfs/xfs_trans_extfree.c
+++ b/fs/xfs/xfs_trans_extfree.c
@@ -19,7 +19,6 @@
#include "xfs_fs.h"
#include "xfs_types.h"
#include "xfs_log.h"
-#include "xfs_inum.h"
#include "xfs_trans.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
diff --git a/fs/xfs/xfs_trans_inode.c b/fs/xfs/xfs_trans_inode.c
index 7a7442c03f2..d2eee20d5f5 100644
--- a/fs/xfs/xfs_trans_inode.c
+++ b/fs/xfs/xfs_trans_inode.c
@@ -18,9 +18,7 @@
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_types.h"
-#include "xfs_bit.h"
#include "xfs_log.h"
-#include "xfs_inum.h"
#include "xfs_trans.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
diff --git a/fs/xfs/xfs_trans_priv.h b/fs/xfs/xfs_trans_priv.h
index 8ab2ced415f..fb62377d1cb 100644
--- a/fs/xfs/xfs_trans_priv.h
+++ b/fs/xfs/xfs_trans_priv.h
@@ -71,6 +71,8 @@ struct xfs_ail {
spinlock_t xa_lock;
xfs_lsn_t xa_last_pushed_lsn;
int xa_log_flush;
+ struct list_head xa_buf_list;
+ wait_queue_head_t xa_empty;
};
/*
@@ -90,18 +92,22 @@ xfs_trans_ail_update(
}
void xfs_trans_ail_delete_bulk(struct xfs_ail *ailp,
- struct xfs_log_item **log_items, int nr_items)
+ struct xfs_log_item **log_items, int nr_items,
+ int shutdown_type)
__releases(ailp->xa_lock);
static inline void
xfs_trans_ail_delete(
struct xfs_ail *ailp,
- xfs_log_item_t *lip) __releases(ailp->xa_lock)
+ xfs_log_item_t *lip,
+ int shutdown_type) __releases(ailp->xa_lock)
{
- xfs_trans_ail_delete_bulk(ailp, &lip, 1);
+ xfs_trans_ail_delete_bulk(ailp, &lip, 1, shutdown_type);
}
void xfs_ail_push(struct xfs_ail *, xfs_lsn_t);
void xfs_ail_push_all(struct xfs_ail *);
+void xfs_ail_push_all_sync(struct xfs_ail *);
+struct xfs_log_item *xfs_ail_min(struct xfs_ail *ailp);
xfs_lsn_t xfs_ail_min_lsn(struct xfs_ail *ailp);
struct xfs_log_item * xfs_trans_ail_cursor_first(struct xfs_ail *ailp,
diff --git a/fs/xfs/xfs_types.h b/fs/xfs/xfs_types.h
index 65584b55607..398cf681d02 100644
--- a/fs/xfs/xfs_types.h
+++ b/fs/xfs/xfs_types.h
@@ -57,6 +57,7 @@ typedef __uint64_t __psunsigned_t;
#endif /* __KERNEL__ */
typedef __uint32_t xfs_agblock_t; /* blockno in alloc. group */
+typedef __uint32_t xfs_agino_t; /* inode # within allocation grp */
typedef __uint32_t xfs_extlen_t; /* extent length in blocks */
typedef __uint32_t xfs_agnumber_t; /* allocation group number */
typedef __int32_t xfs_extnum_t; /* # of extents in a file */
@@ -101,6 +102,7 @@ typedef __uint64_t xfs_fileoff_t; /* block number in a file */
typedef __int64_t xfs_sfiloff_t; /* signed block number in a file */
typedef __uint64_t xfs_filblks_t; /* number of blocks in a file */
+
/*
* Null values for the types.
*/
@@ -120,6 +122,9 @@ typedef __uint64_t xfs_filblks_t; /* number of blocks in a file */
#define NULLCOMMITLSN ((xfs_lsn_t)-1)
+#define NULLFSINO ((xfs_ino_t)-1)
+#define NULLAGINO ((xfs_agino_t)-1)
+
/*
* Max values for extlen, extnum, aextnum.
*/
diff --git a/fs/xfs/xfs_utils.c b/fs/xfs/xfs_utils.c
index 79c05ac85bf..4e5b9ad5cb9 100644
--- a/fs/xfs/xfs_utils.c
+++ b/fs/xfs/xfs_utils.c
@@ -18,9 +18,7 @@
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_types.h"
-#include "xfs_bit.h"
#include "xfs_log.h"
-#include "xfs_inum.h"
#include "xfs_trans.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c
index 64981d7e737..b6a82d817a8 100644
--- a/fs/xfs/xfs_vnodeops.c
+++ b/fs/xfs/xfs_vnodeops.c
@@ -21,7 +21,6 @@
#include "xfs_types.h"
#include "xfs_bit.h"
#include "xfs_log.h"
-#include "xfs_inum.h"
#include "xfs_trans.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
@@ -39,7 +38,6 @@
#include "xfs_bmap.h"
#include "xfs_acl.h"
#include "xfs_attr.h"
-#include "xfs_rw.h"
#include "xfs_error.h"
#include "xfs_quota.h"
#include "xfs_utils.h"
@@ -81,8 +79,7 @@ xfs_readlink_bmap(
d = XFS_FSB_TO_DADDR(mp, mval[n].br_startblock);
byte_cnt = XFS_FSB_TO_B(mp, mval[n].br_blockcount);
- bp = xfs_buf_read(mp->m_ddev_targp, d, BTOBB(byte_cnt),
- XBF_LOCK | XBF_MAPPED | XBF_DONT_BLOCK);
+ bp = xfs_buf_read(mp->m_ddev_targp, d, BTOBB(byte_cnt), 0);
if (!bp)
return XFS_ERROR(ENOMEM);
error = bp->b_error;
@@ -1919,7 +1916,7 @@ xfs_alloc_file_space(
error0: /* Cancel bmap, unlock inode, unreserve quota blocks, cancel trans */
xfs_bmap_cancel(&free_list);
- xfs_trans_unreserve_quota_nblks(tp, ip, qblocks, 0, quota_flag);
+ xfs_trans_unreserve_quota_nblks(tp, ip, (long)qblocks, 0, quota_flag);
error1: /* Just cancel transaction */
xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
@@ -1966,7 +1963,7 @@ xfs_zero_remaining_bytes(
bp = xfs_buf_get_uncached(XFS_IS_REALTIME_INODE(ip) ?
mp->m_rtdev_targp : mp->m_ddev_targp,
- mp->m_sb.sb_blocksize, XBF_DONT_BLOCK);
+ BTOBB(mp->m_sb.sb_blocksize), 0);
if (!bp)
return XFS_ERROR(ENOMEM);
@@ -2315,17 +2312,33 @@ xfs_change_file_space(
case XFS_IOC_ALLOCSP64:
case XFS_IOC_FREESP:
case XFS_IOC_FREESP64:
+ /*
+ * These operations actually do IO when extending the file, but
+ * the allocation is done separately to the zeroing that is
+ * done. This set of operations needs to be serialised against
+ * other IO operations, such as truncate and buffered IO. We
+ * need to take the IOLOCK here to serialise the allocation and
+ * zeroing IO to prevent other IOLOCK holders (e.g. getbmap,
+ * truncate, direct IO) from racing against the transient
+ * allocated but not written state we can have here.
+ */
+ xfs_ilock(ip, XFS_IOLOCK_EXCL);
if (startoffset > fsize) {
error = xfs_alloc_file_space(ip, fsize,
- startoffset - fsize, 0, attr_flags);
- if (error)
+ startoffset - fsize, 0,
+ attr_flags | XFS_ATTR_NOLOCK);
+ if (error) {
+ xfs_iunlock(ip, XFS_IOLOCK_EXCL);
break;
+ }
}
iattr.ia_valid = ATTR_SIZE;
iattr.ia_size = startoffset;
- error = xfs_setattr_size(ip, &iattr, attr_flags);
+ error = xfs_setattr_size(ip, &iattr,
+ attr_flags | XFS_ATTR_NOLOCK);
+ xfs_iunlock(ip, XFS_IOLOCK_EXCL);
if (error)
return error;