summaryrefslogtreecommitdiffstats
path: root/fs/xfs
diff options
context:
space:
mode:
Diffstat (limited to 'fs/xfs')
-rw-r--r--fs/xfs/Makefile1
-rw-r--r--fs/xfs/linux-2.6/xfs_acl.c4
-rw-r--r--fs/xfs/linux-2.6/xfs_aops.c23
-rw-r--r--fs/xfs/linux-2.6/xfs_buf.c9
-rw-r--r--fs/xfs/linux-2.6/xfs_file.c10
-rw-r--r--fs/xfs/linux-2.6/xfs_iops.c16
-rw-r--r--fs/xfs/linux-2.6/xfs_quotaops.c2
-rw-r--r--fs/xfs/linux-2.6/xfs_super.c15
-rw-r--r--fs/xfs/linux-2.6/xfs_super.h2
-rw-r--r--fs/xfs/linux-2.6/xfs_sync.c9
-rw-r--r--fs/xfs/linux-2.6/xfs_trace.c1
-rw-r--r--fs/xfs/linux-2.6/xfs_trace.h439
-rw-r--r--fs/xfs/linux-2.6/xfs_xattr.c8
-rw-r--r--fs/xfs/quota/xfs_dquot.c6
-rw-r--r--fs/xfs/quota/xfs_qm.c4
-rw-r--r--fs/xfs/xfs_acl.h4
-rw-r--r--fs/xfs/xfs_ag.h25
-rw-r--r--fs/xfs/xfs_alloc.c357
-rw-r--r--fs/xfs/xfs_alloc.h7
-rw-r--r--fs/xfs/xfs_alloc_btree.c2
-rw-r--r--fs/xfs/xfs_buf_item.c166
-rw-r--r--fs/xfs/xfs_buf_item.h18
-rw-r--r--fs/xfs/xfs_error.c2
-rw-r--r--fs/xfs/xfs_iget.c29
-rw-r--r--fs/xfs/xfs_inode.c144
-rw-r--r--fs/xfs/xfs_log.c120
-rw-r--r--fs/xfs/xfs_log.h14
-rw-r--r--fs/xfs/xfs_log_cil.c725
-rw-r--r--fs/xfs/xfs_log_priv.h118
-rw-r--r--fs/xfs/xfs_log_recover.c57
-rw-r--r--fs/xfs/xfs_log_recover.h2
-rw-r--r--fs/xfs/xfs_mount.c68
-rw-r--r--fs/xfs/xfs_mount.h1
-rw-r--r--fs/xfs/xfs_rtalloc.c4
-rw-r--r--fs/xfs/xfs_rtalloc.h11
-rw-r--r--fs/xfs/xfs_trans.c590
-rw-r--r--fs/xfs/xfs_trans.h455
-rw-r--r--fs/xfs/xfs_trans_buf.c46
-rw-r--r--fs/xfs/xfs_trans_item.c114
-rw-r--r--fs/xfs/xfs_trans_priv.h15
-rw-r--r--fs/xfs/xfs_types.h2
-rw-r--r--fs/xfs/xfs_vnodeops.c2
42 files changed, 2318 insertions, 1329 deletions
diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
index b4769e40e8b..c8fb13f83b3 100644
--- a/fs/xfs/Makefile
+++ b/fs/xfs/Makefile
@@ -77,6 +77,7 @@ xfs-y += xfs_alloc.o \
xfs_itable.o \
xfs_dfrag.o \
xfs_log.o \
+ xfs_log_cil.o \
xfs_log_recover.o \
xfs_mount.o \
xfs_mru_cache.o \
diff --git a/fs/xfs/linux-2.6/xfs_acl.c b/fs/xfs/linux-2.6/xfs_acl.c
index a7bc925c4d6..9f769b5b38f 100644
--- a/fs/xfs/linux-2.6/xfs_acl.c
+++ b/fs/xfs/linux-2.6/xfs_acl.c
@@ -440,14 +440,14 @@ xfs_xattr_acl_set(struct dentry *dentry, const char *name,
return error;
}
-struct xattr_handler xfs_xattr_acl_access_handler = {
+const struct xattr_handler xfs_xattr_acl_access_handler = {
.prefix = POSIX_ACL_XATTR_ACCESS,
.flags = ACL_TYPE_ACCESS,
.get = xfs_xattr_acl_get,
.set = xfs_xattr_acl_set,
};
-struct xattr_handler xfs_xattr_acl_default_handler = {
+const struct xattr_handler xfs_xattr_acl_default_handler = {
.prefix = POSIX_ACL_XATTR_DEFAULT,
.flags = ACL_TYPE_DEFAULT,
.get = xfs_xattr_acl_get,
diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c
index 089eaca860b..34640d6dbdc 100644
--- a/fs/xfs/linux-2.6/xfs_aops.c
+++ b/fs/xfs/linux-2.6/xfs_aops.c
@@ -1333,6 +1333,21 @@ xfs_vm_writepage(
trace_xfs_writepage(inode, page, 0);
/*
+ * Refuse to write the page out if we are called from reclaim context.
+ *
+ * This is primarily to avoid stack overflows when called from deep
+ * used stacks in random callers for direct reclaim, but disabling
+ * reclaim for kswap is a nice side-effect as kswapd causes rather
+ * suboptimal I/O patters, too.
+ *
+ * This should really be done by the core VM, but until that happens
+ * filesystems like XFS, btrfs and ext4 have to take care of this
+ * by themselves.
+ */
+ if (current->flags & PF_MEMALLOC)
+ goto out_fail;
+
+ /*
* We need a transaction if:
* 1. There are delalloc buffers on the page
* 2. The page is uptodate and we have unmapped buffers
@@ -1366,14 +1381,6 @@ xfs_vm_writepage(
if (!page_has_buffers(page))
create_empty_buffers(page, 1 << inode->i_blkbits, 0);
-
- /*
- * VM calculation for nr_to_write seems off. Bump it way
- * up, this gets simple streaming writes zippy again.
- * To be reviewed again after Jens' writeback changes.
- */
- wbc->nr_to_write *= 4;
-
/*
* Convert delayed allocate, unwritten or unmapped space
* to real space and flush out to disk.
diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c
index f01de3c55c4..649ade8ef59 100644
--- a/fs/xfs/linux-2.6/xfs_buf.c
+++ b/fs/xfs/linux-2.6/xfs_buf.c
@@ -37,6 +37,7 @@
#include "xfs_sb.h"
#include "xfs_inum.h"
+#include "xfs_log.h"
#include "xfs_ag.h"
#include "xfs_dmapi.h"
#include "xfs_mount.h"
@@ -850,6 +851,12 @@ xfs_buf_lock_value(
* Note that this in no way locks the underlying pages, so it is only
* useful for synchronizing concurrent use of buffer objects, not for
* synchronizing independent access to the underlying pages.
+ *
+ * If we come across a stale, pinned, locked buffer, we know that we
+ * are being asked to lock a buffer that has been reallocated. Because
+ * it is pinned, we know that the log has not been pushed to disk and
+ * hence it will still be locked. Rather than sleeping until someone
+ * else pushes the log, push it ourselves before trying to get the lock.
*/
void
xfs_buf_lock(
@@ -857,6 +864,8 @@ xfs_buf_lock(
{
trace_xfs_buf_lock(bp, _RET_IP_);
+ if (atomic_read(&bp->b_pin_count) && (bp->b_flags & XBF_STALE))
+ xfs_log_force(bp->b_mount, 0);
if (atomic_read(&bp->b_io_remaining))
blk_run_address_space(bp->b_target->bt_mapping);
down(&bp->b_sema);
diff --git a/fs/xfs/linux-2.6/xfs_file.c b/fs/xfs/linux-2.6/xfs_file.c
index d8fb1b5d6cb..257a56b127c 100644
--- a/fs/xfs/linux-2.6/xfs_file.c
+++ b/fs/xfs/linux-2.6/xfs_file.c
@@ -100,10 +100,10 @@ xfs_iozero(
STATIC int
xfs_file_fsync(
struct file *file,
- struct dentry *dentry,
int datasync)
{
- struct xfs_inode *ip = XFS_I(dentry->d_inode);
+ struct inode *inode = file->f_mapping->host;
+ struct xfs_inode *ip = XFS_I(inode);
struct xfs_trans *tp;
int error = 0;
int log_flushed = 0;
@@ -140,8 +140,8 @@ xfs_file_fsync(
* might gets cleared when the inode gets written out via the AIL
* or xfs_iflush_cluster.
*/
- if (((dentry->d_inode->i_state & I_DIRTY_DATASYNC) ||
- ((dentry->d_inode->i_state & I_DIRTY_SYNC) && !datasync)) &&
+ if (((inode->i_state & I_DIRTY_DATASYNC) ||
+ ((inode->i_state & I_DIRTY_SYNC) && !datasync)) &&
ip->i_update_core) {
/*
* Kick off a transaction to log the inode core to get the
@@ -868,7 +868,7 @@ write_retry:
mutex_lock(&inode->i_mutex);
xfs_ilock(ip, iolock);
- error2 = -xfs_file_fsync(file, file->f_path.dentry,
+ error2 = -xfs_file_fsync(file,
(file->f_flags & __O_SYNC) ? 0 : 1);
if (!error)
error = error2;
diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c
index 9c8019c78c9..44f0b2de153 100644
--- a/fs/xfs/linux-2.6/xfs_iops.c
+++ b/fs/xfs/linux-2.6/xfs_iops.c
@@ -585,11 +585,20 @@ xfs_vn_fallocate(
bf.l_len = len;
xfs_ilock(ip, XFS_IOLOCK_EXCL);
+
+ /* check the new inode size is valid before allocating */
+ if (!(mode & FALLOC_FL_KEEP_SIZE) &&
+ offset + len > i_size_read(inode)) {
+ new_size = offset + len;
+ error = inode_newsize_ok(inode, new_size);
+ if (error)
+ goto out_unlock;
+ }
+
error = -xfs_change_file_space(ip, XFS_IOC_RESVSP, &bf,
0, XFS_ATTR_NOLOCK);
- if (!error && !(mode & FALLOC_FL_KEEP_SIZE) &&
- offset + len > i_size_read(inode))
- new_size = offset + len;
+ if (error)
+ goto out_unlock;
/* Change file size if needed */
if (new_size) {
@@ -600,6 +609,7 @@ xfs_vn_fallocate(
error = -xfs_setattr(ip, &iattr, XFS_ATTR_NOLOCK);
}
+out_unlock:
xfs_iunlock(ip, XFS_IOLOCK_EXCL);
out_error:
return error;
diff --git a/fs/xfs/linux-2.6/xfs_quotaops.c b/fs/xfs/linux-2.6/xfs_quotaops.c
index e31bf21fe5d..067cafbfc63 100644
--- a/fs/xfs/linux-2.6/xfs_quotaops.c
+++ b/fs/xfs/linux-2.6/xfs_quotaops.c
@@ -19,10 +19,10 @@
#include "xfs_dmapi.h"
#include "xfs_sb.h"
#include "xfs_inum.h"
+#include "xfs_log.h"
#include "xfs_ag.h"
#include "xfs_mount.h"
#include "xfs_quota.h"
-#include "xfs_log.h"
#include "xfs_trans.h"
#include "xfs_bmap_btree.h"
#include "xfs_inode.h"
diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
index e9002513e08..f2d1718c916 100644
--- a/fs/xfs/linux-2.6/xfs_super.c
+++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -119,6 +119,8 @@ mempool_t *xfs_ioend_pool;
#define MNTOPT_DMAPI "dmapi" /* DMI enabled (DMAPI / XDSM) */
#define MNTOPT_XDSM "xdsm" /* DMI enabled (DMAPI / XDSM) */
#define MNTOPT_DMI "dmi" /* DMI enabled (DMAPI / XDSM) */
+#define MNTOPT_DELAYLOG "delaylog" /* Delayed loging enabled */
+#define MNTOPT_NODELAYLOG "nodelaylog" /* Delayed loging disabled */
/*
* Table driven mount option parser.
@@ -374,6 +376,13 @@ xfs_parseargs(
mp->m_flags |= XFS_MOUNT_DMAPI;
} else if (!strcmp(this_char, MNTOPT_DMI)) {
mp->m_flags |= XFS_MOUNT_DMAPI;
+ } else if (!strcmp(this_char, MNTOPT_DELAYLOG)) {
+ mp->m_flags |= XFS_MOUNT_DELAYLOG;
+ cmn_err(CE_WARN,
+ "Enabling EXPERIMENTAL delayed logging feature "
+ "- use at your own risk.\n");
+ } else if (!strcmp(this_char, MNTOPT_NODELAYLOG)) {
+ mp->m_flags &= ~XFS_MOUNT_DELAYLOG;
} else if (!strcmp(this_char, "ihashsize")) {
cmn_err(CE_WARN,
"XFS: ihashsize no longer used, option is deprecated.");
@@ -535,6 +544,7 @@ xfs_showargs(
{ XFS_MOUNT_FILESTREAMS, "," MNTOPT_FILESTREAM },
{ XFS_MOUNT_DMAPI, "," MNTOPT_DMAPI },
{ XFS_MOUNT_GRPID, "," MNTOPT_GRPID },
+ { XFS_MOUNT_DELAYLOG, "," MNTOPT_DELAYLOG },
{ 0, NULL }
};
static struct proc_xfs_info xfs_info_unset[] = {
@@ -725,7 +735,8 @@ void
xfs_blkdev_issue_flush(
xfs_buftarg_t *buftarg)
{
- blkdev_issue_flush(buftarg->bt_bdev, NULL);
+ blkdev_issue_flush(buftarg->bt_bdev, GFP_KERNEL, NULL,
+ BLKDEV_IFL_WAIT);
}
STATIC void
@@ -1754,7 +1765,7 @@ xfs_init_zones(void)
* but it is much faster.
*/
xfs_buf_item_zone = kmem_zone_init((sizeof(xfs_buf_log_item_t) +
- (((XFS_MAX_BLOCKSIZE / XFS_BLI_CHUNK) /
+ (((XFS_MAX_BLOCKSIZE / XFS_BLF_CHUNK) /
NBWORD) * sizeof(int))), "xfs_buf_item");
if (!xfs_buf_item_zone)
goto out_destroy_trans_zone;
diff --git a/fs/xfs/linux-2.6/xfs_super.h b/fs/xfs/linux-2.6/xfs_super.h
index 233d4b9881b..519618e9279 100644
--- a/fs/xfs/linux-2.6/xfs_super.h
+++ b/fs/xfs/linux-2.6/xfs_super.h
@@ -85,7 +85,7 @@ extern __uint64_t xfs_max_file_offset(unsigned int);
extern void xfs_blkdev_issue_flush(struct xfs_buftarg *);
extern const struct export_operations xfs_export_operations;
-extern struct xattr_handler *xfs_xattr_handlers[];
+extern const struct xattr_handler *xfs_xattr_handlers[];
extern const struct quotactl_ops xfs_quotactl_operations;
#define XFS_M(sb) ((struct xfs_mount *)((sb)->s_fs_info))
diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c
index 3884e20bc14..ef7f0218bcc 100644
--- a/fs/xfs/linux-2.6/xfs_sync.c
+++ b/fs/xfs/linux-2.6/xfs_sync.c
@@ -164,10 +164,6 @@ xfs_inode_ag_iterator(
struct xfs_perag *pag;
pag = xfs_perag_get(mp, ag);
- if (!pag->pag_ici_init) {
- xfs_perag_put(pag);
- continue;
- }
error = xfs_inode_ag_walk(mp, pag, execute, flags, tag,
exclusive, &nr);
xfs_perag_put(pag);
@@ -867,12 +863,7 @@ xfs_reclaim_inode_shrink(
down_read(&xfs_mount_list_lock);
list_for_each_entry(mp, &xfs_mount_list, m_mplist) {
for (ag = 0; ag < mp->m_sb.sb_agcount; ag++) {
-
pag = xfs_perag_get(mp, ag);
- if (!pag->pag_ici_init) {
- xfs_perag_put(pag);
- continue;
- }
reclaimable += pag->pag_ici_reclaimable;
xfs_perag_put(pag);
}
diff --git a/fs/xfs/linux-2.6/xfs_trace.c b/fs/xfs/linux-2.6/xfs_trace.c
index 207fa77f63a..d12be8470cb 100644
--- a/fs/xfs/linux-2.6/xfs_trace.c
+++ b/fs/xfs/linux-2.6/xfs_trace.c
@@ -50,7 +50,6 @@
#include "quota/xfs_dquot_item.h"
#include "quota/xfs_dquot.h"
#include "xfs_log_recover.h"
-#include "xfs_buf_item.h"
#include "xfs_inode_item.h"
/*
diff --git a/fs/xfs/linux-2.6/xfs_trace.h b/fs/xfs/linux-2.6/xfs_trace.h
index 8a319cfd290..73d5aa11738 100644
--- a/fs/xfs/linux-2.6/xfs_trace.h
+++ b/fs/xfs/linux-2.6/xfs_trace.h
@@ -82,33 +82,6 @@ DECLARE_EVENT_CLASS(xfs_attr_list_class,
)
)
-#define DEFINE_PERAG_REF_EVENT(name) \
-TRACE_EVENT(name, \
- TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, int refcount, \
- unsigned long caller_ip), \
- TP_ARGS(mp, agno, refcount, caller_ip), \
- TP_STRUCT__entry( \
- __field(dev_t, dev) \
- __field(xfs_agnumber_t, agno) \
- __field(int, refcount) \
- __field(unsigned long, caller_ip) \
- ), \
- TP_fast_assign( \
- __entry->dev = mp->m_super->s_dev; \
- __entry->agno = agno; \
- __entry->refcount = refcount; \
- __entry->caller_ip = caller_ip; \
- ), \
- TP_printk("dev %d:%d agno %u refcount %d caller %pf", \
- MAJOR(__entry->dev), MINOR(__entry->dev), \
- __entry->agno, \
- __entry->refcount, \
- (char *)__entry->caller_ip) \
-);
-
-DEFINE_PERAG_REF_EVENT(xfs_perag_get)
-DEFINE_PERAG_REF_EVENT(xfs_perag_put)
-
#define DEFINE_ATTR_LIST_EVENT(name) \
DEFINE_EVENT(xfs_attr_list_class, name, \
TP_PROTO(struct xfs_attr_list_context *ctx), \
@@ -122,6 +95,37 @@ DEFINE_ATTR_LIST_EVENT(xfs_attr_list_add);
DEFINE_ATTR_LIST_EVENT(xfs_attr_list_wrong_blk);
DEFINE_ATTR_LIST_EVENT(xfs_attr_list_notfound);
+DECLARE_EVENT_CLASS(xfs_perag_class,
+ TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, int refcount,
+ unsigned long caller_ip),
+ TP_ARGS(mp, agno, refcount, caller_ip),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_agnumber_t, agno)
+ __field(int, refcount)
+ __field(unsigned long, caller_ip)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp->m_super->s_dev;
+ __entry->agno = agno;
+ __entry->refcount = refcount;
+ __entry->caller_ip = caller_ip;
+ ),
+ TP_printk("dev %d:%d agno %u refcount %d caller %pf",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->agno,
+ __entry->refcount,
+ (char *)__entry->caller_ip)
+);
+
+#define DEFINE_PERAG_REF_EVENT(name) \
+DEFINE_EVENT(xfs_perag_class, name, \
+ TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, int refcount, \
+ unsigned long caller_ip), \
+ TP_ARGS(mp, agno, refcount, caller_ip))
+DEFINE_PERAG_REF_EVENT(xfs_perag_get);
+DEFINE_PERAG_REF_EVENT(xfs_perag_put);
+
TRACE_EVENT(xfs_attr_list_node_descend,
TP_PROTO(struct xfs_attr_list_context *ctx,
struct xfs_da_node_entry *btree),
@@ -775,165 +779,181 @@ DEFINE_LOGGRANT_EVENT(xfs_log_ungrant_enter);
DEFINE_LOGGRANT_EVENT(xfs_log_ungrant_exit);
DEFINE_LOGGRANT_EVENT(xfs_log_ungrant_sub);
-#define DEFINE_RW_EVENT(name) \
-TRACE_EVENT(name, \
- TP_PROTO(struct xfs_inode *ip, size_t count, loff_t offset, int flags), \
- TP_ARGS(ip, count, offset, flags), \
- TP_STRUCT__entry( \
- __field(dev_t, dev) \
- __field(xfs_ino_t, ino) \
- __field(xfs_fsize_t, size) \
- __field(xfs_fsize_t, new_size) \
- __field(loff_t, offset) \
- __field(size_t, count) \
- __field(int, flags) \
- ), \
- TP_fast_assign( \
- __entry->dev = VFS_I(ip)->i_sb->s_dev; \
- __entry->ino = ip->i_ino; \
- __entry->size = ip->i_d.di_size; \
- __entry->new_size = ip->i_new_size; \
- __entry->offset = offset; \
- __entry->count = count; \
- __entry->flags = flags; \
- ), \
- TP_printk("dev %d:%d ino 0x%llx size 0x%llx new_size 0x%llx " \
- "offset 0x%llx count 0x%zx ioflags %s", \
- MAJOR(__entry->dev), MINOR(__entry->dev), \
- __entry->ino, \
- __entry->size, \
- __entry->new_size, \
- __entry->offset, \
- __entry->count, \
- __print_flags(__entry->flags, "|", XFS_IO_FLAGS)) \
+DECLARE_EVENT_CLASS(xfs_file_class,
+ TP_PROTO(struct xfs_inode *ip, size_t count, loff_t offset, int flags),
+ TP_ARGS(ip, count, offset, flags),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, ino)
+ __field(xfs_fsize_t, size)
+ __field(xfs_fsize_t, new_size)
+ __field(loff_t, offset)
+ __field(size_t, count)
+ __field(int, flags)
+ ),
+ TP_fast_assign(
+ __entry->dev = VFS_I(ip)->i_sb->s_dev;
+ __entry->ino = ip->i_ino;
+ __entry->size = ip->i_d.di_size;
+ __entry->new_size = ip->i_new_size;
+ __entry->offset = offset;
+ __entry->count = count;
+ __entry->flags = flags;
+ ),
+ TP_printk("dev %d:%d ino 0x%llx size 0x%llx new_size 0x%llx "
+ "offset 0x%llx count 0x%zx ioflags %s",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->ino,
+ __entry->size,
+ __entry->new_size,
+ __entry->offset,
+ __entry->count,
+ __print_flags(__entry->flags, "|", XFS_IO_FLAGS))
)
+
+#define DEFINE_RW_EVENT(name) \
+DEFINE_EVENT(xfs_file_class, name, \
+ TP_PROTO(struct xfs_inode *ip, size_t count, loff_t offset, int flags), \
+ TP_ARGS(ip, count, offset, flags))
DEFINE_RW_EVENT(xfs_file_read);
DEFINE_RW_EVENT(xfs_file_buffered_write);
DEFINE_RW_EVENT(xfs_file_direct_write);
DEFINE_RW_EVENT(xfs_file_splice_read);
DEFINE_RW_EVENT(xfs_file_splice_write);
-
-#define DEFINE_PAGE_EVENT(name) \
-TRACE_EVENT(name, \
- TP_PROTO(struct inode *inode, struct page *page, unsigned long off), \
- TP_ARGS(inode, page, off), \
- TP_STRUCT__entry( \
- __field(dev_t, dev) \
- __field(xfs_ino_t, ino) \
- __field(pgoff_t, pgoff) \
- __field(loff_t, size) \
- __field(unsigned long, offset) \
- __field(int, delalloc) \
- __field(int, unmapped) \
- __field(int, unwritten) \
- ), \
- TP_fast_assign( \
- int delalloc = -1, unmapped = -1, unwritten = -1; \
- \
- if (page_has_buffers(page)) \
- xfs_count_page_state(page, &delalloc, \
- &unmapped, &unwritten); \
- __entry->dev = inode->i_sb->s_dev; \
- __entry->ino = XFS_I(inode)->i_ino; \
- __entry->pgoff = page_offset(page); \
- __entry->size = i_size_read(inode); \
- __entry->offset = off; \
- __entry->delalloc = delalloc; \
- __entry->unmapped = unmapped; \
- __entry->unwritten = unwritten; \
- ), \
- TP_printk("dev %d:%d ino 0x%llx pgoff 0x%lx size 0x%llx offset %lx " \
- "delalloc %d unmapped %d unwritten %d", \
- MAJOR(__entry->dev), MINOR(__entry->dev), \
- __entry->ino, \
- __entry->pgoff, \
- __entry->size, \
- __entry->offset, \
- __entry->delalloc, \
- __entry->unmapped, \
- __entry->unwritten) \
+DECLARE_EVENT_CLASS(xfs_page_class,
+ TP_PROTO(struct inode *inode, struct page *page, unsigned long off),
+ TP_ARGS(inode, page, off),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, ino)
+ __field(pgoff_t, pgoff)
+ __field(loff_t, size)
+ __field(unsigned long, offset)
+ __field(int, delalloc)
+ __field(int, unmapped)
+ __field(int, unwritten)
+ ),
+ TP_fast_assign(
+ int delalloc = -1, unmapped = -1, unwritten = -1;
+
+ if (page_has_buffers(page))
+ xfs_count_page_state(page, &delalloc,
+ &unmapped, &unwritten);
+ __entry->dev = inode->i_sb->s_dev;
+ __entry->ino = XFS_I(inode)->i_ino;
+ __entry->pgoff = page_offset(page);
+ __entry->size = i_size_read(inode);
+ __entry->offset = off;
+ __entry->delalloc = delalloc;
+ __entry->unmapped = unmapped;
+ __entry->unwritten = unwritten;
+ ),
+ TP_printk("dev %d:%d ino 0x%llx pgoff 0x%lx size 0x%llx offset %lx "
+ "delalloc %d unmapped %d unwritten %d",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->ino,
+ __entry->pgoff,
+ __entry->size,
+ __entry->offset,
+ __entry->delalloc,
+ __entry->unmapped,
+ __entry->unwritten)
)
+
+#define DEFINE_PAGE_EVENT(name) \
+DEFINE_EVENT(xfs_page_class, name, \
+ TP_PROTO(struct inode *inode, struct page *page, unsigned long off), \
+ TP_ARGS(inode, page, off))
DEFINE_PAGE_EVENT(xfs_writepage);
DEFINE_PAGE_EVENT(xfs_releasepage);
DEFINE_PAGE_EVENT(xfs_invalidatepage);
-#define DEFINE_IOMAP_EVENT(name) \
-TRACE_EVENT(name, \
- TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count, \
- int flags, struct xfs_bmbt_irec *irec), \
- TP_ARGS(ip, offset, count, flags, irec), \
- TP_STRUCT__entry( \
- __field(dev_t, dev) \
- __field(xfs_ino_t, ino) \
- __field(loff_t, size) \
- __field(loff_t, new_size) \
- __field(loff_t, offset) \
- __field(size_t, count) \
- __field(int, flags) \
- __field(xfs_fileoff_t, startoff) \
- __field(xfs_fsblock_t, startblock) \
- __field(xfs_filblks_t, blockcount) \
- ), \
- TP_fast_assign( \
- __entry->dev = VFS_I(ip)->i_sb->s_dev; \
- __entry->ino = ip->i_ino; \
- __entry->size = ip->i_d.di_size; \
- __entry->new_size = ip->i_new_size; \
- __entry->offset = offset; \
- __entry->count = count; \
- __entry->flags = flags; \
- __entry->startoff = irec ? irec->br_startoff : 0; \
- __entry->startblock = irec ? irec->br_startblock : 0; \
- __entry->blockcount = irec ? irec->br_blockcount : 0; \
- ), \
- TP_printk("dev %d:%d ino 0x%llx size 0x%llx new_size 0x%llx " \
- "offset 0x%llx count %zd flags %s " \
- "startoff 0x%llx startblock %lld blockcount 0x%llx", \
- MAJOR(__entry->dev), MINOR(__entry->dev), \
- __entry->ino, \
- __entry->size, \
- __entry->new_size, \
- __entry->offset, \
- __entry->count, \
- __print_flags(__entry->flags, "|", BMAPI_FLAGS), \
- __entry->startoff, \
- (__int64_t)__entry->startblock, \
- __entry->blockcount) \
+DECLARE_EVENT_CLASS(xfs_iomap_class,
+ TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count,
+ int flags, struct xfs_bmbt_irec *irec),
+ TP_ARGS(ip, offset, count, flags, irec),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, ino)
+ __field(loff_t, size)
+ __field(loff_t, new_size)
+ __field(loff_t, offset)
+ __field(size_t, count)
+ __field(int, flags)
+ __field(xfs_fileoff_t, startoff)
+ __field(xfs_fsblock_t, startblock)
+ __field(xfs_filblks_t, blockcount)
+ ),
+ TP_fast_assign(
+ __entry->dev = VFS_I(ip)->i_sb->s_dev;
+ __entry->ino = ip->i_ino;
+ __entry->size = ip->i_d.di_size;
+ __entry->new_size = ip->i_new_size;
+ __entry->offset = offset;
+ __entry->count = count;
+ __entry->flags = flags;
+ __entry->startoff = irec ? irec->br_startoff : 0;
+ __entry->startblock = irec ? irec->br_startblock : 0;
+ __entry->blockcount = irec ? irec->br_blockcount : 0;
+ ),
+ TP_printk("dev %d:%d ino 0x%llx size 0x%llx new_size 0x%llx "
+ "offset 0x%llx count %zd flags %s "
+ "startoff 0x%llx startblock %lld blockcount 0x%llx",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->ino,
+ __entry->size,
+ __entry->new_size,
+ __entry->offset,
+ __entry->count,
+ __print_flags(__entry->flags, "|", BMAPI_FLAGS),
+ __entry->startoff,
+ (__int64_t)__entry->startblock,
+ __entry->blockcount)
)
+
+#define DEFINE_IOMAP_EVENT(name) \
+DEFINE_EVENT(xfs_iomap_class, name, \
+ TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count, \
+ int flags, struct xfs_bmbt_irec *irec), \
+ TP_ARGS(ip, offset, count, flags, irec))
DEFINE_IOMAP_EVENT(xfs_iomap_enter);
DEFINE_IOMAP_EVENT(xfs_iomap_found);
DEFINE_IOMAP_EVENT(xfs_iomap_alloc);
-#define DEFINE_SIMPLE_IO_EVENT(name) \
-TRACE_EVENT(name, \
- TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count), \
- TP_ARGS(ip, offset, count), \
- TP_STRUCT__entry( \
- __field(dev_t, dev) \
- __field(xfs_ino_t, ino) \
- __field(loff_t, size) \
- __field(loff_t, new_size) \
- __field(loff_t, offset) \
- __field(size_t, count) \
- ), \
- TP_fast_assign( \
- __entry->dev = VFS_I(ip)->i_sb->s_dev; \
- __entry->ino = ip->i_ino; \
- __entry->size = ip->i_d.di_size; \
- __entry->new_size = ip->i_new_size; \
- __entry->offset = offset; \
- __entry->count = count; \
- ), \
- TP_printk("dev %d:%d ino 0x%llx size 0x%llx new_size 0x%llx " \
- "offset 0x%llx count %zd", \
- MAJOR(__entry->dev), MINOR(__entry->dev), \
- __entry->ino, \
- __entry->size, \
- __entry->new_size, \
- __entry->offset, \
- __entry->count) \
+DECLARE_EVENT_CLASS(xfs_simple_io_class,
+ TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count),
+ TP_ARGS(ip, offset, count),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, ino)
+ __field(loff_t, size)
+ __field(loff_t, new_size)
+ __field(loff_t, offset)
+ __field(size_t, count)
+ ),
+ TP_fast_assign(
+ __entry->dev = VFS_I(ip)->i_sb->s_dev;
+ __entry->ino = ip->i_ino;
+ __entry->size = ip->i_d.di_size;
+ __entry->new_size = ip->i_new_size;
+ __entry->offset = offset;
+ __entry->count = count;
+ ),
+ TP_printk("dev %d:%d ino 0x%llx size 0x%llx new_size 0x%llx "
+ "offset 0x%llx count %zd",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->ino,
+ __entry->size,
+ __entry->new_size,
+ __entry->offset,
+ __entry->count)
);
+
+#define DEFINE_SIMPLE_IO_EVENT(name) \
+DEFINE_EVENT(xfs_simple_io_class, name, \
+ TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count), \
+ TP_ARGS(ip, offset, count))
DEFINE_SIMPLE_IO_EVENT(xfs_delalloc_enospc);
DEFINE_SIMPLE_IO_EVENT(xfs_unwritten_convert);
@@ -1059,83 +1079,112 @@ TRACE_EVENT(xfs_bunmap,
);
+#define XFS_BUSY_SYNC \
+ { 0, "async" }, \
+ { 1, "sync" }
+
TRACE_EVENT(xfs_alloc_busy,
- TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, xfs_agblock_t agbno,
- xfs_extlen_t len, int slot),
- TP_ARGS(mp, agno, agbno, len, slot),
+ TP_PROTO(struct xfs_trans *trans, xfs_agnumber_t agno,
+ xfs_agblock_t agbno, xfs_extlen_t len, int sync),
+ TP_ARGS(trans, agno, agbno, len, sync),
TP_STRUCT__entry(
__field(dev_t, dev)
+ __field(struct xfs_trans *, tp)
+ __field(int, tid)
__field(xfs_agnumber_t, agno)
__field(xfs_agblock_t, agbno)
__field(xfs_extlen_t, len)
- __field(int, slot)
+ __field(int, sync)
),
TP_fast_assign(
- __entry->dev = mp->m_super->s_dev;
+ __entry->dev = trans->t_mountp->m_super->s_dev;
+ __entry->tp = trans;
+ __entry->tid = trans->t_ticket->t_tid;
__entry->agno = agno;
__entry->agbno = agbno;
__entry->len = len;
- __entry->slot = slot;
+ __entry->sync = sync;
),
- TP_printk("dev %d:%d agno %u agbno %u len %u slot %d",
+ TP_printk("dev %d:%d trans 0x%p tid 0x%x agno %u agbno %u len %u %s",
MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->tp,
+ __entry->tid,
__entry->agno,
__entry->agbno,
__entry->len,
- __entry->slot)
+ __print_symbolic(__entry->sync, XFS_BUSY_SYNC))
);
-#define XFS_BUSY_STATES \
- { 0, "found" }, \
- { 1, "missing" }
-
TRACE_EVENT(xfs_alloc_unbusy,
TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
- int slot, int found),
- TP_ARGS(mp, agno, slot, found),
+ xfs_agblock_t agbno, xfs_extlen_t len),
+ TP_ARGS(mp, agno, agbno, len),
TP_STRUCT__entry(
__field(dev_t, dev)
__field(xfs_agnumber_t, agno)
- __field(int, slot)
- __field(int, found)
+ __field(xfs_agblock_t, agbno)
+ __field(xfs_extlen_t, len)
),
TP_fast_assign(
__entry->dev = mp->m_super->s_dev;
__entry->agno = agno;
- __entry->slot = slot;
- __entry->found = found;
+ __entry->agbno = agbno;
+ __entry->len = len;
),
- TP_printk("dev %d:%d agno %u slot %d %s",
+ TP_printk("dev %d:%d agno %u agbno %u len %u",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->agno,
- __entry->slot,
- __print_symbolic(__entry->found, XFS_BUSY_STATES))
+ __entry->agbno,
+ __entry->len)
);
+#define XFS_BUSY_STATES \
+ { 0, "missing" }, \
+ { 1, "found" }
+
TRACE_EVENT(xfs_alloc_busysearch,
- TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, xfs_agblock_t agbno,
- xfs_extlen_t len, xfs_lsn_t lsn),
- TP_ARGS(mp, agno, agbno, len, lsn),
+ TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
+ xfs_agblock_t agbno, xfs_extlen_t len, int found),
+ TP_ARGS(mp, agno, agbno, len, found),
TP_STRUCT__entry(
__field(dev_t, dev)
__field(xfs_agnumber_t, agno)
__field(xfs_agblock_t, agbno)
__field(xfs_extlen_t, len)
- __field(xfs_lsn_t, lsn)
+ __field(int, found)
),
TP_fast_assign(
__entry->dev = mp->m_super->s_dev;
__entry->agno = agno;
__entry->agbno = agbno;
__entry->len = len;
- __entry->lsn = lsn;
+ __entry->found = found;
),
- TP_printk("dev %d:%d agno %u agbno %u len %u force lsn 0x%llx",
+ TP_printk("dev %d:%d agno %u agbno %u len %u %s",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->agno,
__entry->agbno,
__entry->len,
+ __print_symbolic(__entry->found, XFS_BUSY_STATES))
+);
+
+TRACE_EVENT(xfs_trans_commit_lsn,
+ TP_PROTO(struct xfs_trans *trans),
+ TP_ARGS(trans),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(struct xfs_trans *, tp)
+ __field(xfs_lsn_t, lsn)
+ ),
+ TP_fast_assign(
+ __entry->dev = trans->t_mountp->m_super->s_dev;
+ __entry->tp = trans;
+ __entry->lsn = trans->t_commit_lsn;
+ ),
+ TP_printk("dev %d:%d trans 0x%p commit_lsn 0x%llx",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->tp,
__entry->lsn)
);
diff --git a/fs/xfs/linux-2.6/xfs_xattr.c b/fs/xfs/linux-2.6/xfs_xattr.c
index fa01b9daba6..87d3e03878c 100644
--- a/fs/xfs/linux-2.6/xfs_xattr.c
+++ b/fs/xfs/linux-2.6/xfs_xattr.c
@@ -72,28 +72,28 @@ xfs_xattr_set(struct dentry *dentry, const char *name, const void *value,
(void *)value, size, xflags);
}
-static struct xattr_handler xfs_xattr_user_handler = {
+static const struct xattr_handler xfs_xattr_user_handler = {
.prefix = XATTR_USER_PREFIX,
.flags = 0, /* no flags implies user namespace */
.get = xfs_xattr_get,
.set = xfs_xattr_set,
};
-static struct xattr_handler xfs_xattr_trusted_handler = {
+static const struct xattr_handler xfs_xattr_trusted_handler = {
.prefix = XATTR_TRUSTED_PREFIX,
.flags = ATTR_ROOT,
.get = xfs_xattr_get,
.set = xfs_xattr_set,
};
-static struct xattr_handler xfs_xattr_security_handler = {
+static const struct xattr_handler xfs_xattr_security_handler = {
.prefix = XATTR_SECURITY_PREFIX,
.flags = ATTR_SECURE,
.get = xfs_xattr_get,
.set = xfs_xattr_set,
};
-struct xattr_handler *xfs_xattr_handlers[] = {
+const struct xattr_handler *xfs_xattr_handlers[] = {
&xfs_xattr_user_handler,
&xfs_xattr_trusted_handler,
&xfs_xattr_security_handler,
diff --git a/fs/xfs/quota/xfs_dquot.c b/fs/xfs/quota/xfs_dquot.c
index b89ec5df012..585e7633dfc 100644
--- a/fs/xfs/quota/xfs_dquot.c
+++ b/fs/xfs/quota/xfs_dquot.c
@@ -344,9 +344,9 @@ xfs_qm_init_dquot_blk(
for (i = 0; i < q->qi_dqperchunk; i++, d++, curid++)
xfs_qm_dqinit_core(curid, type, d);
xfs_trans_dquot_buf(tp, bp,
- (type & XFS_DQ_USER ? XFS_BLI_UDQUOT_BUF :
- ((type & XFS_DQ_PROJ) ? XFS_BLI_PDQUOT_BUF :
- XFS_BLI_GDQUOT_BUF)));
+ (type & XFS_DQ_USER ? XFS_BLF_UDQUOT_BUF :
+ ((type & XFS_DQ_PROJ) ? XFS_BLF_PDQUOT_BUF :
+ XFS_BLF_GDQUOT_BUF)));
xfs_trans_log_buf(tp, bp, 0, BBTOB(q->qi_dqchunklen) - 1);
}
diff --git a/fs/xfs/quota/xfs_qm.c b/fs/xfs/quota/xfs_qm.c
index 38e76414664..2d8b7bc792c 100644
--- a/fs/xfs/quota/xfs_qm.c
+++ b/fs/xfs/quota/xfs_qm.c
@@ -249,8 +249,10 @@ xfs_qm_hold_quotafs_ref(
if (!xfs_Gqm) {
xfs_Gqm = xfs_Gqm_init();
- if (!xfs_Gqm)
+ if (!xfs_Gqm) {
+ mutex_unlock(&xfs_Gqm_lock);
return ENOMEM;
+ }
}
/*
diff --git a/fs/xfs/xfs_acl.h b/fs/xfs/xfs_acl.h
index d13eeba2c8f..0135e2a669d 100644
--- a/fs/xfs/xfs_acl.h
+++ b/fs/xfs/xfs_acl.h
@@ -49,8 +49,8 @@ extern int xfs_acl_chmod(struct inode *inode);
extern int posix_acl_access_exists(struct inode *inode);
extern int posix_acl_default_exists(struct inode *inode);
-extern struct xattr_handler xfs_xattr_acl_access_handler;
-extern struct xattr_handler xfs_xattr_acl_default_handler;
+extern const struct xattr_handler xfs_xattr_acl_access_handler;
+extern const struct xattr_handler xfs_xattr_acl_default_handler;
#else
# define xfs_check_acl NULL
# define xfs_get_acl(inode, type) NULL
diff --git a/fs/xfs/xfs_ag.h b/fs/xfs/xfs_ag.h
index abb8222b88c..4917d4eed4e 100644
--- a/fs/xfs/xfs_ag.h
+++ b/fs/xfs/xfs_ag.h
@@ -175,14 +175,20 @@ typedef struct xfs_agfl {
} xfs_agfl_t;
/*
- * Busy block/extent entry. Used in perag to mark blocks that have been freed
- * but whose transactions aren't committed to disk yet.
+ * Busy block/extent entry. Indexed by a rbtree in perag to mark blocks that
+ * have been freed but whose transactions aren't committed to disk yet.
+ *
+ * Note that we use the transaction ID to record the transaction, not the
+ * transaction structure itself. See xfs_alloc_busy_insert() for details.
*/
-typedef struct xfs_perag_busy {
- xfs_agblock_t busy_start;
- xfs_extlen_t busy_length;
- struct xfs_trans *busy_tp; /* transaction that did the free */
-} xfs_perag_busy_t;
+struct xfs_busy_extent {
+ struct rb_node rb_node; /* ag by-bno indexed search tree */
+ struct list_head list; /* transaction busy extent list */
+ xfs_agnumber_t agno;
+ xfs_agblock_t bno;
+ xfs_extlen_t length;
+ xlog_tid_t tid; /* transaction that created this */
+};
/*
* Per-ag incore structure, copies of information in agf and agi,
@@ -216,17 +222,16 @@ typedef struct xfs_perag {
xfs_agino_t pagl_leftrec;
xfs_agino_t pagl_rightrec;
#ifdef __KERNEL__
- spinlock_t pagb_lock; /* lock for pagb_list */
+ spinlock_t pagb_lock; /* lock for pagb_tree */
+ struct rb_root pagb_tree; /* ordered tree of busy extents */
atomic_t pagf_fstrms; /* # of filestreams active in this AG */
- int pag_ici_init; /* incore inode cache initialised */
rwlock_t pag_ici_lock; /* incore inode lock */
struct radix_tree_root pag_ici_root; /* incore inode cache root */
int pag_ici_reclaimable; /* reclaimable inodes */
#endif
int pagb_count; /* pagb slots in use */
- xfs_perag_busy_t pagb_list[XFS_PAGB_NUM_SLOTS]; /* unstable blocks */
} xfs_perag_t;
/*
diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c
index 94cddbfb256..a7fbe8a99b1 100644
--- a/fs/xfs/xfs_alloc.c
+++ b/fs/xfs/xfs_alloc.c
@@ -46,11 +46,9 @@
#define XFSA_FIXUP_BNO_OK 1
#define XFSA_FIXUP_CNT_OK 2
-STATIC void
-xfs_alloc_search_busy(xfs_trans_t *tp,
- xfs_agnumber_t agno,
- xfs_agblock_t bno,
- xfs_extlen_t len);
+static int
+xfs_alloc_busy_search(struct xfs_mount *mp, xfs_agnumber_t agno,
+ xfs_agblock_t bno, xfs_extlen_t len);
/*
* Prototypes for per-ag allocation routines
@@ -540,9 +538,16 @@ xfs_alloc_ag_vextent(
be32_to_cpu(agf->agf_length));
xfs_alloc_log_agf(args->tp, args->agbp,
XFS_AGF_FREEBLKS);
- /* search the busylist for these blocks */
- xfs_alloc_search_busy(args->tp, args->agno,
- args->agbno, args->len);
+ /*
+ * Search the busylist for these blocks and mark the
+ * transaction as synchronous if blocks are found. This
+ * avoids the need to block due to a synchronous log
+ * force to ensure correct ordering as the synchronous
+ * transaction will guarantee that for us.
+ */
+ if (xfs_alloc_busy_search(args->mp, args->agno,
+ args->agbno, args->len))
+ xfs_trans_set_sync(args->tp);
}
if (!args->isfl)
xfs_trans_mod_sb(args->tp,
@@ -1693,7 +1698,7 @@ xfs_free_ag_extent(
* when the iclog commits to disk. If a busy block is allocated,
* the iclog is pushed up to the LSN that freed the block.
*/
- xfs_alloc_mark_busy(tp, agno, bno, len);
+ xfs_alloc_busy_insert(tp, agno, bno, len);
return 0;
error0:
@@ -1989,14 +1994,20 @@ xfs_alloc_get_freelist(
*bnop = bno;
/*
- * As blocks are freed, they are added to the per-ag busy list
- * and remain there until the freeing transaction is committed to
- * disk. Now that we have allocated blocks, this list must be
- * searched to see if a block is being reused. If one is, then
- * the freeing transaction must be pushed to disk NOW by forcing
- * to disk all iclogs up that transaction's LSN.
+ * As blocks are freed, they are added to the per-ag busy list and
+ * remain there until the freeing transaction is committed to disk.
+ * Now that we have allocated blocks, this list must be searched to see
+ * if a block is being reused. If one is, then the freeing transaction
+ * must be pushed to disk before this transaction.
+ *
+ * We do this by setting the current transaction to a sync transaction
+ * which guarantees that the freeing transaction is on disk before this
+ * transaction. This is done instead of a synchronous log force here so
+ * that we don't sit and wait with the AGF locked in the transaction
+ * during the log force.
*/
- xfs_alloc_search_busy(tp, be32_to_cpu(agf->agf_seqno), bno, 1);
+ if (xfs_alloc_busy_search(mp, be32_to_cpu(agf->agf_seqno), bno, 1))
+ xfs_trans_set_sync(tp);
return 0;
}
@@ -2201,7 +2212,7 @@ xfs_alloc_read_agf(
be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNTi]);
spin_lock_init(&pag->pagb_lock);
pag->pagb_count = 0;
- memset(pag->pagb_list, 0, sizeof(pag->pagb_list));
+ pag->pagb_tree = RB_ROOT;
pag->pagf_init = 1;
}
#ifdef DEBUG
@@ -2479,127 +2490,263 @@ error0:
* list is reused, the transaction that freed it must be forced to disk
* before continuing to use the block.
*
- * xfs_alloc_mark_busy - add to the per-ag busy list
- * xfs_alloc_clear_busy - remove an item from the per-ag busy list
+ * xfs_alloc_busy_insert - add to the per-ag busy list
+ * xfs_alloc_busy_clear - remove an item from the per-ag busy list
+ * xfs_alloc_busy_search - search for a busy extent
+ */
+
+/*
+ * Insert a new extent into the busy tree.
+ *
+ * The busy extent tree is indexed by the start block of the busy extent.
+ * there can be multiple overlapping ranges in the busy extent tree but only
+ * ever one entry at a given start block. The reason for this is that
+ * multi-block extents can be freed, then smaller chunks of that extent
+ * allocated and freed again before the first transaction commit is on disk.
+ * If the exact same start block is freed a second time, we have to wait for
+ * that busy extent to pass out of the tree before the new extent is inserted.
+ * There are two main cases we have to handle here.
+ *
+ * The first case is a transaction that triggers a "free - allocate - free"
+ * cycle. This can occur during btree manipulations as a btree block is freed
+ * to the freelist, then allocated from the free list, then freed again. In
+ * this case, the second extxpnet free is what triggers the duplicate and as
+ * such the transaction IDs should match. Because the extent was allocated in
+ * this transaction, the transaction must be marked as synchronous. This is
+ * true for all cases where the free/alloc/free occurs in the one transaction,
+ * hence the addition of the ASSERT(tp->t_flags & XFS_TRANS_SYNC) to this case.
+ * This serves to catch violations of the second case quite effectively.
+ *
+ * The second case is where the free/alloc/free occur in different
+ * transactions. In this case, the thread freeing the extent the second time
+ * can't mark the extent busy immediately because it is already tracked in a
+ * transaction that may be committing. When the log commit for the existing
+ * busy extent completes, the busy extent will be removed from the tree. If we
+ * allow the second busy insert to continue using that busy extent structure,
+ * it can be freed before this transaction is safely in the log. Hence our
+ * only option in this case is to force the log to remove the existing busy
+ * extent from the list before we insert the new one with the current
+ * transaction ID.
+ *
+ * The problem we are trying to avoid in the free-alloc-free in separate
+ * transactions is most easily described with a timeline:
+ *
+ * Thread 1 Thread 2 Thread 3 xfslogd
+ * xact alloc
+ * free X
+ * mark busy
+ * commit xact
+ * free xact
+ * xact alloc
+ * alloc X
+ * busy search
+ * mark xact sync
+ * commit xact
+ * free xact
+ * force log
+ * checkpoint starts
+ * ....
+ * xact alloc
+ * free X
+ * mark busy
+ * finds match
+ * *** KABOOM! ***
+ * ....
+ * log IO completes
+ * unbusy X
+ * checkpoint completes
+ *
+ * By issuing a log force in thread 3 @ "KABOOM", the thread will block until
+ * the checkpoint completes, and the busy extent it matched will have been
+ * removed from the tree when it is woken. Hence it can then continue safely.
+ *
+ * However, to ensure this matching process is robust, we need to use the
+ * transaction ID for identifying transaction, as delayed logging results in
+ * the busy extent and transaction lifecycles being different. i.e. the busy
+ * extent is active for a lot longer than the transaction. Hence the
+ * transaction structure can be freed and reallocated, then mark the same
+ * extent busy again in the new transaction. In this case the new transaction
+ * will have a different tid but can have the same address, and hence we need
+ * to check against the tid.
+ *
+ * Future: for delayed logging, we could avoid the log force if the extent was
+ * first freed in the current checkpoint sequence. This, however, requires the
+ * ability to pin the current checkpoint in memory until this transaction
+ * commits to ensure that both the original free and the current one combine
+ * logically into the one checkpoint. If the checkpoint sequences are
+ * different, however, we still need to wait on a log force.
*/
void
-xfs_alloc_mark_busy(xfs_trans_t *tp,
- xfs_agnumber_t agno,
- xfs_agblock_t bno,
- xfs_extlen_t len)
+xfs_alloc_busy_insert(
+ struct xfs_trans *tp,
+ xfs_agnumber_t agno,
+ xfs_agblock_t bno,
+ xfs_extlen_t len)
{
- xfs_perag_busy_t *bsy;
+ struct xfs_busy_extent *new;
+ struct xfs_busy_extent *busyp;
struct xfs_perag *pag;
- int n;
+ struct rb_node **rbp;
+ struct rb_node *parent;
+ int match;
- pag = xfs_perag_get(tp->t_mountp, agno);
- spin_lock(&pag->pagb_lock);
- /* search pagb_list for an open slot */
- for (bsy = pag->pagb_list, n = 0;
- n < XFS_PAGB_NUM_SLOTS;
- bsy++, n++) {
- if (bsy->busy_tp == NULL) {
- break;
- }
+ new = kmem_zalloc(sizeof(struct xfs_busy_extent), KM_MAYFAIL);
+ if (!new) {
+ /*
+ * No Memory! Since it is now not possible to track the free
+ * block, make this a synchronous transaction to insure that
+ * the block is not reused before this transaction commits.
+ */
+ trace_xfs_alloc_busy(tp, agno, bno, len, 1);
+ xfs_trans_set_sync(tp);
+ return;
}
- trace_xfs_alloc_busy(tp->t_mountp, agno, bno, len, n);
+ new->agno = agno;
+ new->bno = bno;
+ new->length = len;
+ new->tid = xfs_log_get_trans_ident(tp);
- if (n < XFS_PAGB_NUM_SLOTS) {
- bsy = &pag->pagb_list[n];
- pag->pagb_count++;
- bsy->busy_start = bno;
- bsy->busy_length = len;
- bsy->busy_tp = tp;
- xfs_trans_add_busy(tp, agno, n);
- } else {
+ INIT_LIST_HEAD(&new->list);
+
+ /* trace before insert to be able to see failed inserts */
+ trace_xfs_alloc_busy(tp, agno, bno, len, 0);
+
+ pag = xfs_perag_get(tp->t_mountp, new->agno);
+restart:
+ spin_lock(&pag->pagb_lock);
+ rbp = &pag->pagb_tree.rb_node;
+ parent = NULL;
+ busyp = NULL;
+ match = 0;
+ while (*rbp && match >= 0) {
+ parent = *rbp;
+ busyp = rb_entry(parent, struct xfs_busy_extent, rb_node);
+
+ if (new->bno < busyp->bno) {
+ /* may overlap, but exact start block is lower */
+ rbp = &(*rbp)->rb_left;
+ if (new->bno + new->length > busyp->bno)
+ match = busyp->tid == new->tid ? 1 : -1;
+ } else if (new->bno > busyp->bno) {
+ /* may overlap, but exact start block is higher */
+ rbp = &(*rbp)->rb_right;
+ if (bno < busyp->bno + busyp->length)
+ match = busyp->tid == new->tid ? 1 : -1;
+ } else {
+ match = busyp->tid == new->tid ? 1 : -1;
+ break;
+ }
+ }
+ if (match < 0) {
+ /* overlap marked busy in different transaction */
+ spin_unlock(&pag->pagb_lock);
+ xfs_log_force(tp->t_mountp, XFS_LOG_SYNC);
+ goto restart;
+ }
+ if (match > 0) {
/*
- * The busy list is full! Since it is now not possible to
- * track the free block, make this a synchronous transaction
- * to insure that the block is not reused before this
- * transaction commits.
+ * overlap marked busy in same transaction. Update if exact
+ * start block match, otherwise combine the busy extents into
+ * a single range.
*/
- xfs_trans_set_sync(tp);
- }
+ if (busyp->bno == new->bno) {
+ busyp->length = max(busyp->length, new->length);
+ spin_unlock(&pag->pagb_lock);
+ ASSERT(tp->t_flags & XFS_TRANS_SYNC);
+ xfs_perag_put(pag);
+ kmem_free(new);
+ return;
+ }
+ rb_erase(&busyp->rb_node, &pag->pagb_tree);
+ new->length = max(busyp->bno + busyp->length,
+ new->bno + new->length) -
+ min(busyp->bno, new->bno);
+ new->bno = min(busyp->bno, new->bno);
+ } else
+ busyp = NULL;
+ rb_link_node(&new->rb_node, parent, rbp);
+ rb_insert_color(&new->rb_node, &pag->pagb_tree);
+
+ list_add(&new->list, &tp->t_busy);
spin_unlock(&pag->pagb_lock);
xfs_perag_put(pag);
+ kmem_free(busyp);
}
-void
-xfs_alloc_clear_busy(xfs_trans_t *tp,
- xfs_agnumber_t agno,
- int idx)
+/*
+ * Search for a busy extent within the range of the extent we are about to
+ * allocate. You need to be holding the busy extent tree lock when calling
+ * xfs_alloc_busy_search(). This function returns 0 for no overlapping busy
+ * extent, -1 for an overlapping but not exact busy extent, and 1 for an exact
+ * match. This is done so that a non-zero return indicates an overlap that
+ * will require a synchronous transaction, but it can still be
+ * used to distinguish between a partial or exact match.
+ */
+static int
+xfs_alloc_busy_search(
+ struct xfs_mount *mp,
+ xfs_agnumber_t agno,
+ xfs_agblock_t bno,
+ xfs_extlen_t len)
{
struct xfs_perag *pag;
- xfs_perag_busy_t *list;
+ struct rb_node *rbp;
+ struct xfs_busy_extent *busyp;
+ int match = 0;
- ASSERT(idx < XFS_PAGB_NUM_SLOTS);
- pag = xfs_perag_get(tp->t_mountp, agno);
+ pag = xfs_perag_get(mp, agno);
spin_lock(&pag->pagb_lock);
- list = pag->pagb_list;
- trace_xfs_alloc_unbusy(tp->t_mountp, agno, idx, list[idx].busy_tp == tp);
-
- if (list[idx].busy_tp == tp) {
- list[idx].busy_tp = NULL;
- pag->pagb_count--;
+ rbp = pag->pagb_tree.rb_node;
+
+ /* find closest start bno overlap */
+ while (rbp) {
+ busyp = rb_entry(rbp, struct xfs_busy_extent, rb_node);
+ if (bno < busyp->bno) {
+ /* may overlap, but exact start block is lower */
+ if (bno + len > busyp->bno)
+ match = -1;
+ rbp = rbp->rb_left;
+ } else if (bno > busyp->bno) {
+ /* may overlap, but exact start block is higher */
+ if (bno < busyp->bno + busyp->length)
+ match = -1;
+ rbp = rbp->rb_right;
+ } else {
+ /* bno matches busyp, length determines exact match */
+ match = (busyp->length == len) ? 1 : -1;
+ break;
+ }
}
-
spin_unlock(&pag->pagb_lock);
+ trace_xfs_alloc_busysearch(mp, agno, bno, len, !!match);
xfs_perag_put(pag);
+ return match;
}
-
-/*
- * If we find the extent in the busy list, force the log out to get the
- * extent out of the busy list so the caller can use it straight away.
- */
-STATIC void
-xfs_alloc_search_busy(xfs_trans_t *tp,
- xfs_agnumber_t agno,
- xfs_agblock_t bno,
- xfs_extlen_t len)
+void
+xfs_alloc_busy_clear(
+ struct xfs_mount *mp,
+ struct xfs_busy_extent *busyp)
{
struct xfs_perag *pag;
- xfs_perag_busy_t *bsy;
- xfs_agblock_t uend, bend;
- xfs_lsn_t lsn = 0;
- int cnt;
- pag = xfs_perag_get(tp->t_mountp, agno);
- spin_lock(&pag->pagb_lock);
- cnt = pag->pagb_count;
+ trace_xfs_alloc_unbusy(mp, busyp->agno, busyp->bno,
+ busyp->length);
- /*
- * search pagb_list for this slot, skipping open slots. We have to
- * search the entire array as there may be multiple overlaps and
- * we have to get the most recent LSN for the log force to push out
- * all the transactions that span the range.
- */
- uend = bno + len - 1;
- for (cnt = 0; cnt < pag->pagb_count; cnt++) {
- bsy = &pag->pagb_list[cnt];
- if (!bsy->busy_tp)
- continue;
+ ASSERT(xfs_alloc_busy_search(mp, busyp->agno, busyp->bno,
+ busyp->length) == 1);
- bend = bsy->busy_start + bsy->busy_length - 1;
- if (bno > bend || uend < bsy->busy_start)
- continue;
+ list_del_init(&busyp->list);
- /* (start1,length1) within (start2, length2) */
- if (XFS_LSN_CMP(bsy->busy_tp->t_commit_lsn, lsn) > 0)
- lsn = bsy->busy_tp->t_commit_lsn;
- }
+ pag = xfs_perag_get(mp, busyp->agno);
+ spin_lock(&pag->pagb_lock);
+ rb_erase(&busyp->rb_node, &pag->pagb_tree);
spin_unlock(&pag->pagb_lock);
xfs_perag_put(pag);
- trace_xfs_alloc_busysearch(tp->t_mountp, agno, bno, len, lsn);
- /*
- * If a block was found, force the log through the LSN of the
- * transaction that freed the block
- */
- if (lsn)
- xfs_log_force_lsn(tp->t_mountp, lsn, XFS_LOG_SYNC);
+ kmem_free(busyp);
}
diff --git a/fs/xfs/xfs_alloc.h b/fs/xfs/xfs_alloc.h
index 599bffa3978..6d05199b667 100644
--- a/fs/xfs/xfs_alloc.h
+++ b/fs/xfs/xfs_alloc.h
@@ -22,6 +22,7 @@ struct xfs_buf;
struct xfs_mount;
struct xfs_perag;
struct xfs_trans;
+struct xfs_busy_extent;
/*
* Freespace allocation types. Argument to xfs_alloc_[v]extent.
@@ -119,15 +120,13 @@ xfs_alloc_longest_free_extent(struct xfs_mount *mp,
#ifdef __KERNEL__
void
-xfs_alloc_mark_busy(xfs_trans_t *tp,
+xfs_alloc_busy_insert(xfs_trans_t *tp,
xfs_agnumber_t agno,
xfs_agblock_t bno,
xfs_extlen_t len);
void
-xfs_alloc_clear_busy(xfs_trans_t *tp,
- xfs_agnumber_t ag,
- int idx);
+xfs_alloc_busy_clear(struct xfs_mount *mp, struct xfs_busy_extent *busyp);
#endif /* __KERNEL__ */
diff --git a/fs/xfs/xfs_alloc_btree.c b/fs/xfs/xfs_alloc_btree.c
index b726e10d2c1..83f49421875 100644
--- a/fs/xfs/xfs_alloc_btree.c
+++ b/fs/xfs/xfs_alloc_btree.c
@@ -134,7 +134,7 @@ xfs_allocbt_free_block(
* disk. If a busy block is allocated, the iclog is pushed up to the
* LSN that freed the block.
*/
- xfs_alloc_mark_busy(cur->bc_tp, be32_to_cpu(agf->agf_seqno), bno, 1);
+ xfs_alloc_busy_insert(cur->bc_tp, be32_to_cpu(agf->agf_seqno), bno, 1);
xfs_trans_agbtree_delta(cur->bc_tp, -1);
return 0;
}
diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
index 240340a4727..02a80984aa0 100644
--- a/fs/xfs/xfs_buf_item.c
+++ b/fs/xfs/xfs_buf_item.c
@@ -64,7 +64,7 @@ xfs_buf_item_log_debug(
nbytes = last - first + 1;
bfset(bip->bli_logged, first, nbytes);
for (x = 0; x < nbytes; x++) {
- chunk_num = byte >> XFS_BLI_SHIFT;
+ chunk_num = byte >> XFS_BLF_SHIFT;
word_num = chunk_num >> BIT_TO_WORD_SHIFT;
bit_num = chunk_num & (NBWORD - 1);
wordp = &(bip->bli_format.blf_data_map[word_num]);
@@ -166,7 +166,7 @@ xfs_buf_item_size(
* cancel flag in it.
*/
trace_xfs_buf_item_size_stale(bip);
- ASSERT(bip->bli_format.blf_flags & XFS_BLI_CANCEL);
+ ASSERT(bip->bli_format.blf_flags & XFS_BLF_CANCEL);
return 1;
}
@@ -197,9 +197,9 @@ xfs_buf_item_size(
} else if (next_bit != last_bit + 1) {
last_bit = next_bit;
nvecs++;
- } else if (xfs_buf_offset(bp, next_bit * XFS_BLI_CHUNK) !=
- (xfs_buf_offset(bp, last_bit * XFS_BLI_CHUNK) +
- XFS_BLI_CHUNK)) {
+ } else if (xfs_buf_offset(bp, next_bit * XFS_BLF_CHUNK) !=
+ (xfs_buf_offset(bp, last_bit * XFS_BLF_CHUNK) +
+ XFS_BLF_CHUNK)) {
last_bit = next_bit;
nvecs++;
} else {
@@ -254,6 +254,20 @@ xfs_buf_item_format(
vecp++;
nvecs = 1;
+ /*
+ * If it is an inode buffer, transfer the in-memory state to the
+ * format flags and clear the in-memory state. We do not transfer
+ * this state if the inode buffer allocation has not yet been committed
+ * to the log as setting the XFS_BLI_INODE_BUF flag will prevent
+ * correct replay of the inode allocation.
+ */
+ if (bip->bli_flags & XFS_BLI_INODE_BUF) {
+ if (!((bip->bli_flags & XFS_BLI_INODE_ALLOC_BUF) &&
+ xfs_log_item_in_current_chkpt(&bip->bli_item)))
+ bip->bli_format.blf_flags |= XFS_BLF_INODE_BUF;
+ bip->bli_flags &= ~XFS_BLI_INODE_BUF;
+ }
+
if (bip->bli_flags & XFS_BLI_STALE) {
/*
* The buffer is stale, so all we need to log
@@ -261,7 +275,7 @@ xfs_buf_item_format(
* cancel flag in it.
*/
trace_xfs_buf_item_format_stale(bip);
- ASSERT(bip->bli_format.blf_flags & XFS_BLI_CANCEL);
+ ASSERT(bip->bli_format.blf_flags & XFS_BLF_CANCEL);
bip->bli_format.blf_size = nvecs;
return;
}
@@ -294,28 +308,28 @@ xfs_buf_item_format(
* keep counting and scanning.
*/
if (next_bit == -1) {
- buffer_offset = first_bit * XFS_BLI_CHUNK;
+ buffer_offset = first_bit * XFS_BLF_CHUNK;
vecp->i_addr = xfs_buf_offset(bp, buffer_offset);
- vecp->i_len = nbits * XFS_BLI_CHUNK;
+ vecp->i_len = nbits * XFS_BLF_CHUNK;
vecp->i_type = XLOG_REG_TYPE_BCHUNK;
nvecs++;
break;
} else if (next_bit != last_bit + 1) {
- buffer_offset = first_bit * XFS_BLI_CHUNK;
+ buffer_offset = first_bit * XFS_BLF_CHUNK;
vecp->i_addr = xfs_buf_offset(bp, buffer_offset);
- vecp->i_len = nbits * XFS_BLI_CHUNK;
+ vecp->i_len = nbits * XFS_BLF_CHUNK;
vecp->i_type = XLOG_REG_TYPE_BCHUNK;
nvecs++;
vecp++;
first_bit = next_bit;
last_bit = next_bit;
nbits = 1;
- } else if (xfs_buf_offset(bp, next_bit << XFS_BLI_SHIFT) !=
- (xfs_buf_offset(bp, last_bit << XFS_BLI_SHIFT) +
- XFS_BLI_CHUNK)) {
- buffer_offset = first_bit * XFS_BLI_CHUNK;
+ } else if (xfs_buf_offset(bp, next_bit << XFS_BLF_SHIFT) !=
+ (xfs_buf_offset(bp, last_bit << XFS_BLF_SHIFT) +
+ XFS_BLF_CHUNK)) {
+ buffer_offset = first_bit * XFS_BLF_CHUNK;
vecp->i_addr = xfs_buf_offset(bp, buffer_offset);
- vecp->i_len = nbits * XFS_BLI_CHUNK;
+ vecp->i_len = nbits * XFS_BLF_CHUNK;
vecp->i_type = XLOG_REG_TYPE_BCHUNK;
/* You would think we need to bump the nvecs here too, but we do not
* this number is used by recovery, and it gets confused by the boundary
@@ -341,10 +355,15 @@ xfs_buf_item_format(
}
/*
- * This is called to pin the buffer associated with the buf log
- * item in memory so it cannot be written out. Simply call bpin()
- * on the buffer to do this.
+ * This is called to pin the buffer associated with the buf log item in memory
+ * so it cannot be written out. Simply call bpin() on the buffer to do this.
+ *
+ * We also always take a reference to the buffer log item here so that the bli
+ * is held while the item is pinned in memory. This means that we can
+ * unconditionally drop the reference count a transaction holds when the
+ * transaction is completed.
*/
+
STATIC void
xfs_buf_item_pin(
xfs_buf_log_item_t *bip)
@@ -356,6 +375,7 @@ xfs_buf_item_pin(
ASSERT(atomic_read(&bip->bli_refcount) > 0);
ASSERT((bip->bli_flags & XFS_BLI_LOGGED) ||
(bip->bli_flags & XFS_BLI_STALE));
+ atomic_inc(&bip->bli_refcount);
trace_xfs_buf_item_pin(bip);
xfs_bpin(bp);
}
@@ -393,7 +413,7 @@ xfs_buf_item_unpin(
ASSERT(XFS_BUF_VALUSEMA(bp) <= 0);
ASSERT(!(XFS_BUF_ISDELAYWRITE(bp)));
ASSERT(XFS_BUF_ISSTALE(bp));
- ASSERT(bip->bli_format.blf_flags & XFS_BLI_CANCEL);
+ ASSERT(bip->bli_format.blf_flags & XFS_BLF_CANCEL);
trace_xfs_buf_item_unpin_stale(bip);
/*
@@ -489,20 +509,23 @@ xfs_buf_item_trylock(
}
/*
- * Release the buffer associated with the buf log item.
- * If there is no dirty logged data associated with the
- * buffer recorded in the buf log item, then free the
- * buf log item and remove the reference to it in the
- * buffer.
+ * Release the buffer associated with the buf log item. If there is no dirty
+ * logged data associated with the buffer recorded in the buf log item, then
+ * free the buf log item and remove the reference to it in the buffer.
+ *
+ * This call ignores the recursion count. It is only called when the buffer
+ * should REALLY be unlocked, regardless of the recursion count.
*
- * This call ignores the recursion count. It is only called
- * when the buffer should REALLY be unlocked, regardless
- * of the recursion count.
+ * We unconditionally drop the transaction's reference to the log item. If the
+ * item was logged, then another reference was taken when it was pinned, so we
+ * can safely drop the transaction reference now. This also allows us to avoid
+ * potential races with the unpin code freeing the bli by not referencing the
+ * bli after we've dropped the reference count.
*
- * If the XFS_BLI_HOLD flag is set in the buf log item, then
- * free the log item if necessary but do not unlock the buffer.
- * This is for support of xfs_trans_bhold(). Make sure the
- * XFS_BLI_HOLD field is cleared if we don't free the item.
+ * If the XFS_BLI_HOLD flag is set in the buf log item, then free the log item
+ * if necessary but do not unlock the buffer. This is for support of
+ * xfs_trans_bhold(). Make sure the XFS_BLI_HOLD field is cleared if we don't
+ * free the item.
*/
STATIC void
xfs_buf_item_unlock(
@@ -514,73 +537,54 @@ xfs_buf_item_unlock(
bp = bip->bli_buf;
- /*
- * Clear the buffer's association with this transaction.
- */
+ /* Clear the buffer's association with this transaction. */
XFS_BUF_SET_FSPRIVATE2(bp, NULL);
/*
- * If this is a transaction abort, don't return early.
- * Instead, allow the brelse to happen.
- * Normally it would be done for stale (cancelled) buffers
- * at unpin time, but we'll never go through the pin/unpin
- * cycle if we abort inside commit.
+ * If this is a transaction abort, don't return early. Instead, allow
+ * the brelse to happen. Normally it would be done for stale
+ * (cancelled) buffers at unpin time, but we'll never go through the
+ * pin/unpin cycle if we abort inside commit.
*/
aborted = (bip->bli_item.li_flags & XFS_LI_ABORTED) != 0;
/*
- * If the buf item is marked stale, then don't do anything.
- * We'll unlock the buffer and free the buf item when the
- * buffer is unpinned for the last time.
+ * Before possibly freeing the buf item, determine if we should
+ * release the buffer at the end of this routine.
*/
- if (bip->bli_flags & XFS_BLI_STALE) {
- bip->bli_flags &= ~XFS_BLI_LOGGED;
- trace_xfs_buf_item_unlock_stale(bip);
- ASSERT(bip->bli_format.blf_flags & XFS_BLI_CANCEL);
- if (!aborted)
- return;
- }
+ hold = bip->bli_flags & XFS_BLI_HOLD;
+
+ /* Clear the per transaction state. */
+ bip->bli_flags &= ~(XFS_BLI_LOGGED | XFS_BLI_HOLD);
/*
- * Drop the transaction's reference to the log item if
- * it was not logged as part of the transaction. Otherwise
- * we'll drop the reference in xfs_buf_item_unpin() when
- * the transaction is really through with the buffer.
+ * If the buf item is marked stale, then don't do anything. We'll
+ * unlock the buffer and free the buf item when the buffer is unpinned
+ * for the last time.
*/
- if (!(bip->bli_flags & XFS_BLI_LOGGED)) {
- atomic_dec(&bip->bli_refcount);
- } else {
- /*
- * Clear the logged flag since this is per
- * transaction state.
- */
- bip->bli_flags &= ~XFS_BLI_LOGGED;
+ if (bip->bli_flags & XFS_BLI_STALE) {
+ trace_xfs_buf_item_unlock_stale(bip);
+ ASSERT(bip->bli_format.blf_flags & XFS_BLF_CANCEL);
+ if (!aborted) {
+ atomic_dec(&bip->bli_refcount);
+ return;
+ }
}
- /*
- * Before possibly freeing the buf item, determine if we should
- * release the buffer at the end of this routine.
- */
- hold = bip->bli_flags & XFS_BLI_HOLD;
trace_xfs_buf_item_unlock(bip);
/*
- * If the buf item isn't tracking any data, free it.
- * Otherwise, if XFS_BLI_HOLD is set clear it.
+ * If the buf item isn't tracking any data, free it, otherwise drop the
+ * reference we hold to it.
*/
if (xfs_bitmap_empty(bip->bli_format.blf_data_map,
- bip->bli_format.blf_map_size)) {
+ bip->bli_format.blf_map_size))
xfs_buf_item_relse(bp);
- } else if (hold) {
- bip->bli_flags &= ~XFS_BLI_HOLD;
- }
+ else
+ atomic_dec(&bip->bli_refcount);
- /*
- * Release the buffer if XFS_BLI_HOLD was not set.
- */
- if (!hold) {
+ if (!hold)
xfs_buf_relse(bp);
- }
}
/*
@@ -717,12 +721,12 @@ xfs_buf_item_init(
}
/*
- * chunks is the number of XFS_BLI_CHUNK size pieces
+ * chunks is the number of XFS_BLF_CHUNK size pieces
* the buffer can be divided into. Make sure not to
* truncate any pieces. map_size is the size of the
* bitmap needed to describe the chunks of the buffer.
*/
- chunks = (int)((XFS_BUF_COUNT(bp) + (XFS_BLI_CHUNK - 1)) >> XFS_BLI_SHIFT);
+ chunks = (int)((XFS_BUF_COUNT(bp) + (XFS_BLF_CHUNK - 1)) >> XFS_BLF_SHIFT);
map_size = (int)((chunks + NBWORD) >> BIT_TO_WORD_SHIFT);
bip = (xfs_buf_log_item_t*)kmem_zone_zalloc(xfs_buf_item_zone,
@@ -790,8 +794,8 @@ xfs_buf_item_log(
/*
* Convert byte offsets to bit numbers.
*/
- first_bit = first >> XFS_BLI_SHIFT;
- last_bit = last >> XFS_BLI_SHIFT;
+ first_bit = first >> XFS_BLF_SHIFT;
+ last_bit = last >> XFS_BLF_SHIFT;
/*
* Calculate the total number of bits to be set.
diff --git a/fs/xfs/xfs_buf_item.h b/fs/xfs/xfs_buf_item.h
index df4454511f7..f20bb472d58 100644
--- a/fs/xfs/xfs_buf_item.h
+++ b/fs/xfs/xfs_buf_item.h
@@ -41,22 +41,22 @@ typedef struct xfs_buf_log_format {
* This flag indicates that the buffer contains on disk inodes
* and requires special recovery handling.
*/
-#define XFS_BLI_INODE_BUF 0x1
+#define XFS_BLF_INODE_BUF 0x1
/*
* This flag indicates that the buffer should not be replayed
* during recovery because its blocks are being freed.
*/
-#define XFS_BLI_CANCEL 0x2
+#define XFS_BLF_CANCEL 0x2
/*
* This flag indicates that the buffer contains on disk
* user or group dquots and may require special recovery handling.
*/
-#define XFS_BLI_UDQUOT_BUF 0x4
-#define XFS_BLI_PDQUOT_BUF 0x8
-#define XFS_BLI_GDQUOT_BUF 0x10
+#define XFS_BLF_UDQUOT_BUF 0x4
+#define XFS_BLF_PDQUOT_BUF 0x8
+#define XFS_BLF_GDQUOT_BUF 0x10
-#define XFS_BLI_CHUNK 128
-#define XFS_BLI_SHIFT 7
+#define XFS_BLF_CHUNK 128
+#define XFS_BLF_SHIFT 7
#define BIT_TO_WORD_SHIFT 5
#define NBWORD (NBBY * sizeof(unsigned int))
@@ -69,6 +69,7 @@ typedef struct xfs_buf_log_format {
#define XFS_BLI_LOGGED 0x08
#define XFS_BLI_INODE_ALLOC_BUF 0x10
#define XFS_BLI_STALE_INODE 0x20
+#define XFS_BLI_INODE_BUF 0x40
#define XFS_BLI_FLAGS \
{ XFS_BLI_HOLD, "HOLD" }, \
@@ -76,7 +77,8 @@ typedef struct xfs_buf_log_format {
{ XFS_BLI_STALE, "STALE" }, \
{ XFS_BLI_LOGGED, "LOGGED" }, \
{ XFS_BLI_INODE_ALLOC_BUF, "INODE_ALLOC" }, \
- { XFS_BLI_STALE_INODE, "STALE_INODE" }
+ { XFS_BLI_STALE_INODE, "STALE_INODE" }, \
+ { XFS_BLI_INODE_BUF, "INODE_BUF" }
#ifdef __KERNEL__
diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c
index ef96175c074..047b8a8e5c2 100644
--- a/fs/xfs/xfs_error.c
+++ b/fs/xfs/xfs_error.c
@@ -170,7 +170,7 @@ xfs_cmn_err(int panic_tag, int level, xfs_mount_t *mp, char *fmt, ...)
va_list ap;
#ifdef DEBUG
- xfs_panic_mask |= XFS_PTAG_SHUTDOWN_CORRUPT;
+ xfs_panic_mask |= (XFS_PTAG_SHUTDOWN_CORRUPT | XFS_PTAG_LOGRES);
#endif
if (xfs_panic_mask && (xfs_panic_mask & panic_tag)
diff --git a/fs/xfs/xfs_iget.c b/fs/xfs/xfs_iget.c
index 6845db90818..75df75f43d4 100644
--- a/fs/xfs/xfs_iget.c
+++ b/fs/xfs/xfs_iget.c
@@ -382,9 +382,6 @@ xfs_iget(
/* get the perag structure and ensure that it's inode capable */
pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ino));
- if (!pag->pagi_inodeok)
- return EINVAL;
- ASSERT(pag->pag_ici_init);
agino = XFS_INO_TO_AGINO(mp, ino);
again:
@@ -744,30 +741,24 @@ xfs_ilock_demote(
}
#ifdef DEBUG
-/*
- * Debug-only routine, without additional rw_semaphore APIs, we can
- * now only answer requests regarding whether we hold the lock for write
- * (reader state is outside our visibility, we only track writer state).
- *
- * Note: this means !xfs_isilocked would give false positives, so don't do that.
- */
int
xfs_isilocked(
xfs_inode_t *ip,
uint lock_flags)
{
- if ((lock_flags & (XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)) ==
- XFS_ILOCK_EXCL) {
- if (!ip->i_lock.mr_writer)
- return 0;
+ if (lock_flags & (XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)) {
+ if (!(lock_flags & XFS_ILOCK_SHARED))
+ return !!ip->i_lock.mr_writer;
+ return rwsem_is_locked(&ip->i_lock.mr_lock);
}
- if ((lock_flags & (XFS_IOLOCK_EXCL|XFS_IOLOCK_SHARED)) ==
- XFS_IOLOCK_EXCL) {
- if (!ip->i_iolock.mr_writer)
- return 0;
+ if (lock_flags & (XFS_IOLOCK_EXCL|XFS_IOLOCK_SHARED)) {
+ if (!(lock_flags & XFS_IOLOCK_SHARED))
+ return !!ip->i_iolock.mr_writer;
+ return rwsem_is_locked(&ip->i_iolock.mr_lock);
}
- return 1;
+ ASSERT(0);
+ return 0;
}
#endif
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 8cd6e8d8fe9..d53c39de7d0 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -1940,10 +1940,10 @@ xfs_ifree_cluster(
int blks_per_cluster;
int nbufs;
int ninodes;
- int i, j, found, pre_flushed;
+ int i, j;
xfs_daddr_t blkno;
xfs_buf_t *bp;
- xfs_inode_t *ip, **ip_found;
+ xfs_inode_t *ip;
xfs_inode_log_item_t *iip;
xfs_log_item_t *lip;
struct xfs_perag *pag;
@@ -1960,114 +1960,97 @@ xfs_ifree_cluster(
nbufs = XFS_IALLOC_BLOCKS(mp) / blks_per_cluster;
}
- ip_found = kmem_alloc(ninodes * sizeof(xfs_inode_t *), KM_NOFS);
-
for (j = 0; j < nbufs; j++, inum += ninodes) {
+ int found = 0;
+
blkno = XFS_AGB_TO_DADDR(mp, XFS_INO_TO_AGNO(mp, inum),
XFS_INO_TO_AGBNO(mp, inum));
+ /*
+ * We obtain and lock the backing buffer first in the process
+ * here, as we have to ensure that any dirty inode that we
+ * can't get the flush lock on is attached to the buffer.
+ * If we scan the in-memory inodes first, then buffer IO can
+ * complete before we get a lock on it, and hence we may fail
+ * to mark all the active inodes on the buffer stale.
+ */
+ bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, blkno,
+ mp->m_bsize * blks_per_cluster,
+ XBF_LOCK);
+
+ /*
+ * Walk the inodes already attached to the buffer and mark them
+ * stale. These will all have the flush locks held, so an
+ * in-memory inode walk can't lock them.
+ */
+ lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *);
+ while (lip) {
+ if (lip->li_type == XFS_LI_INODE) {
+ iip = (xfs_inode_log_item_t *)lip;
+ ASSERT(iip->ili_logged == 1);
+ lip->li_cb = (void(*)(xfs_buf_t*,xfs_log_item_t*)) xfs_istale_done;
+ xfs_trans_ail_copy_lsn(mp->m_ail,
+ &iip->ili_flush_lsn,
+ &iip->ili_item.li_lsn);
+ xfs_iflags_set(iip->ili_inode, XFS_ISTALE);
+ found++;
+ }
+ lip = lip->li_bio_list;
+ }
/*
- * Look for each inode in memory and attempt to lock it,
- * we can be racing with flush and tail pushing here.
- * any inode we get the locks on, add to an array of
- * inode items to process later.
+ * For each inode in memory attempt to add it to the inode
+ * buffer and set it up for being staled on buffer IO
+ * completion. This is safe as we've locked out tail pushing
+ * and flushing by locking the buffer.
*
- * The get the buffer lock, we could beat a flush
- * or tail pushing thread to the lock here, in which
- * case they will go looking for the inode buffer
- * and fail, we need some other form of interlock
- * here.
+ * We have already marked every inode that was part of a
+ * transaction stale above, which means there is no point in
+ * even trying to lock them.
*/
- found = 0;
for (i = 0; i < ninodes; i++) {
read_lock(&pag->pag_ici_lock);
ip = radix_tree_lookup(&pag->pag_ici_root,
XFS_INO_TO_AGINO(mp, (inum + i)));
- /* Inode not in memory or we found it already,
- * nothing to do
- */
+ /* Inode not in memory or stale, nothing to do */
if (!ip || xfs_iflags_test(ip, XFS_ISTALE)) {
read_unlock(&pag->pag_ici_lock);
continue;
}
- if (xfs_inode_clean(ip)) {
- read_unlock(&pag->pag_ici_lock);
- continue;
- }
-
- /* If we can get the locks then add it to the
- * list, otherwise by the time we get the bp lock
- * below it will already be attached to the
- * inode buffer.
- */
-
- /* This inode will already be locked - by us, lets
- * keep it that way.
- */
-
- if (ip == free_ip) {
- if (xfs_iflock_nowait(ip)) {
- xfs_iflags_set(ip, XFS_ISTALE);
- if (xfs_inode_clean(ip)) {
- xfs_ifunlock(ip);
- } else {
- ip_found[found++] = ip;
- }
- }
+ /* don't try to lock/unlock the current inode */
+ if (ip != free_ip &&
+ !xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) {
read_unlock(&pag->pag_ici_lock);
continue;
}
+ read_unlock(&pag->pag_ici_lock);
- if (xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) {
- if (xfs_iflock_nowait(ip)) {
- xfs_iflags_set(ip, XFS_ISTALE);
-
- if (xfs_inode_clean(ip)) {
- xfs_ifunlock(ip);
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
- } else {
- ip_found[found++] = ip;
- }
- } else {
+ if (!xfs_iflock_nowait(ip)) {
+ if (ip != free_ip)
xfs_iunlock(ip, XFS_ILOCK_EXCL);
- }
+ continue;
}
- read_unlock(&pag->pag_ici_lock);
- }
- bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, blkno,
- mp->m_bsize * blks_per_cluster,
- XBF_LOCK);
-
- pre_flushed = 0;
- lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *);
- while (lip) {
- if (lip->li_type == XFS_LI_INODE) {
- iip = (xfs_inode_log_item_t *)lip;
- ASSERT(iip->ili_logged == 1);
- lip->li_cb = (void(*)(xfs_buf_t*,xfs_log_item_t*)) xfs_istale_done;
- xfs_trans_ail_copy_lsn(mp->m_ail,
- &iip->ili_flush_lsn,
- &iip->ili_item.li_lsn);
- xfs_iflags_set(iip->ili_inode, XFS_ISTALE);
- pre_flushed++;
+ xfs_iflags_set(ip, XFS_ISTALE);
+ if (xfs_inode_clean(ip)) {
+ ASSERT(ip != free_ip);
+ xfs_ifunlock(ip);
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ continue;
}
- lip = lip->li_bio_list;
- }
- for (i = 0; i < found; i++) {
- ip = ip_found[i];
iip = ip->i_itemp;
-
if (!iip) {
+ /* inode with unlogged changes only */
+ ASSERT(ip != free_ip);
ip->i_update_core = 0;
xfs_ifunlock(ip);
xfs_iunlock(ip, XFS_ILOCK_EXCL);
continue;
}
+ found++;
iip->ili_last_fields = iip->ili_format.ilf_fields;
iip->ili_format.ilf_fields = 0;
@@ -2078,17 +2061,16 @@ xfs_ifree_cluster(
xfs_buf_attach_iodone(bp,
(void(*)(xfs_buf_t*,xfs_log_item_t*))
xfs_istale_done, (xfs_log_item_t *)iip);
- if (ip != free_ip) {
+
+ if (ip != free_ip)
xfs_iunlock(ip, XFS_ILOCK_EXCL);
- }
}
- if (found || pre_flushed)
+ if (found)
xfs_trans_stale_inode_buf(tp, bp);
xfs_trans_binval(tp, bp);
}
- kmem_free(ip_found);
xfs_perag_put(pag);
}
@@ -2649,8 +2631,6 @@ xfs_iflush_cluster(
int i;
pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
- ASSERT(pag->pagi_inodeok);
- ASSERT(pag->pag_ici_init);
inodes_per_cluster = XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_inodelog;
ilist_size = inodes_per_cluster * sizeof(xfs_inode_t *);
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index 3038dd52c72..5215abc8023 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -54,9 +54,6 @@ STATIC xlog_t * xlog_alloc_log(xfs_mount_t *mp,
STATIC int xlog_space_left(xlog_t *log, int cycle, int bytes);
STATIC int xlog_sync(xlog_t *log, xlog_in_core_t *iclog);
STATIC void xlog_dealloc_log(xlog_t *log);
-STATIC int xlog_write(struct log *log, struct xfs_log_vec *log_vector,
- struct xlog_ticket *tic, xfs_lsn_t *start_lsn,
- xlog_in_core_t **commit_iclog, uint flags);
/* local state machine functions */
STATIC void xlog_state_done_syncing(xlog_in_core_t *iclog, int);
@@ -86,14 +83,6 @@ STATIC int xlog_regrant_write_log_space(xlog_t *log,
STATIC void xlog_ungrant_log_space(xlog_t *log,
xlog_ticket_t *ticket);
-
-/* local ticket functions */
-STATIC xlog_ticket_t *xlog_ticket_alloc(xlog_t *log,
- int unit_bytes,
- int count,
- char clientid,
- uint flags);
-
#if defined(DEBUG)
STATIC void xlog_verify_dest_ptr(xlog_t *log, char *ptr);
STATIC void xlog_verify_grant_head(xlog_t *log, int equals);
@@ -360,6 +349,15 @@ xfs_log_reserve(
ASSERT(flags & XFS_LOG_PERM_RESERV);
internal_ticket = *ticket;
+ /*
+ * this is a new transaction on the ticket, so we need to
+ * change the transaction ID so that the next transaction has a
+ * different TID in the log. Just add one to the existing tid
+ * so that we can see chains of rolling transactions in the log
+ * easily.
+ */
+ internal_ticket->t_tid++;
+
trace_xfs_log_reserve(log, internal_ticket);
xlog_grant_push_ail(mp, internal_ticket->t_unit_res);
@@ -367,7 +365,8 @@ xfs_log_reserve(
} else {
/* may sleep if need to allocate more tickets */
internal_ticket = xlog_ticket_alloc(log, unit_bytes, cnt,
- client, flags);
+ client, flags,
+ KM_SLEEP|KM_MAYFAIL);
if (!internal_ticket)
return XFS_ERROR(ENOMEM);
internal_ticket->t_trans_type = t_type;
@@ -452,6 +451,13 @@ xfs_log_mount(
/* Normal transactions can now occur */
mp->m_log->l_flags &= ~XLOG_ACTIVE_RECOVERY;
+ /*
+ * Now the log has been fully initialised and we know were our
+ * space grant counters are, we can initialise the permanent ticket
+ * needed for delayed logging to work.
+ */
+ xlog_cil_init_post_recovery(mp->m_log);
+
return 0;
out_destroy_ail:
@@ -658,6 +664,10 @@ xfs_log_item_init(
item->li_ailp = mp->m_ail;
item->li_type = type;
item->li_ops = ops;
+ item->li_lv = NULL;
+
+ INIT_LIST_HEAD(&item->li_ail);
+ INIT_LIST_HEAD(&item->li_cil);
}
/*
@@ -1168,6 +1178,9 @@ xlog_alloc_log(xfs_mount_t *mp,
*iclogp = log->l_iclog; /* complete ring */
log->l_iclog->ic_prev = prev_iclog; /* re-write 1st prev ptr */
+ error = xlog_cil_init(log);
+ if (error)
+ goto out_free_iclog;
return log;
out_free_iclog:
@@ -1494,6 +1507,8 @@ xlog_dealloc_log(xlog_t *log)
xlog_in_core_t *iclog, *next_iclog;
int i;
+ xlog_cil_destroy(log);
+
iclog = log->l_iclog;
for (i=0; i<log->l_iclog_bufs; i++) {
sv_destroy(&iclog->ic_force_wait);
@@ -1536,8 +1551,10 @@ xlog_state_finish_copy(xlog_t *log,
* print out info relating to regions written which consume
* the reservation
*/
-STATIC void
-xlog_print_tic_res(xfs_mount_t *mp, xlog_ticket_t *ticket)
+void
+xlog_print_tic_res(
+ struct xfs_mount *mp,
+ struct xlog_ticket *ticket)
{
uint i;
uint ophdr_spc = ticket->t_res_num_ophdrs * (uint)sizeof(xlog_op_header_t);
@@ -1637,6 +1654,10 @@ xlog_print_tic_res(xfs_mount_t *mp, xlog_ticket_t *ticket)
"bad-rtype" : res_type_str[r_type-1]),
ticket->t_res_arr[i].r_len);
}
+
+ xfs_cmn_err(XFS_PTAG_LOGRES, CE_ALERT, mp,
+ "xfs_log_write: reservation ran out. Need to up reservation");
+ xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
}
/*
@@ -1865,7 +1886,7 @@ xlog_write_copy_finish(
* we don't update ic_offset until the end when we know exactly how many
* bytes have been written out.
*/
-STATIC int
+int
xlog_write(
struct log *log,
struct xfs_log_vec *log_vector,
@@ -1889,22 +1910,26 @@ xlog_write(
*start_lsn = 0;
len = xlog_write_calc_vec_length(ticket, log_vector);
- if (ticket->t_curr_res < len) {
- xlog_print_tic_res(log->l_mp, ticket);
-#ifdef DEBUG
- xlog_panic(
- "xfs_log_write: reservation ran out. Need to up reservation");
-#else
- /* Customer configurable panic */
- xfs_cmn_err(XFS_PTAG_LOGRES, CE_ALERT, log->l_mp,
- "xfs_log_write: reservation ran out. Need to up reservation");
+ if (log->l_cilp) {
+ /*
+ * Region headers and bytes are already accounted for.
+ * We only need to take into account start records and
+ * split regions in this function.
+ */
+ if (ticket->t_flags & XLOG_TIC_INITED)
+ ticket->t_curr_res -= sizeof(xlog_op_header_t);
- /* If we did not panic, shutdown the filesystem */
- xfs_force_shutdown(log->l_mp, SHUTDOWN_CORRUPT_INCORE);
-#endif
- }
+ /*
+ * Commit record headers need to be accounted for. These
+ * come in as separate writes so are easy to detect.
+ */
+ if (flags & (XLOG_COMMIT_TRANS | XLOG_UNMOUNT_TRANS))
+ ticket->t_curr_res -= sizeof(xlog_op_header_t);
+ } else
+ ticket->t_curr_res -= len;
- ticket->t_curr_res -= len;
+ if (ticket->t_curr_res < 0)
+ xlog_print_tic_res(log->l_mp, ticket);
index = 0;
lv = log_vector;
@@ -3000,6 +3025,8 @@ _xfs_log_force(
XFS_STATS_INC(xs_log_force);
+ xlog_cil_push(log, 1);
+
spin_lock(&log->l_icloglock);
iclog = log->l_iclog;
@@ -3149,6 +3176,12 @@ _xfs_log_force_lsn(
XFS_STATS_INC(xs_log_force);
+ if (log->l_cilp) {
+ lsn = xlog_cil_push_lsn(log, lsn);
+ if (lsn == NULLCOMMITLSN)
+ return 0;
+ }
+
try_again:
spin_lock(&log->l_icloglock);
iclog = log->l_iclog;
@@ -3313,22 +3346,30 @@ xfs_log_ticket_get(
return ticket;
}
+xlog_tid_t
+xfs_log_get_trans_ident(
+ struct xfs_trans *tp)
+{
+ return tp->t_ticket->t_tid;
+}
+
/*
* Allocate and initialise a new log ticket.
*/
-STATIC xlog_ticket_t *
+xlog_ticket_t *
xlog_ticket_alloc(
struct log *log,
int unit_bytes,
int cnt,
char client,
- uint xflags)
+ uint xflags,
+ int alloc_flags)
{
struct xlog_ticket *tic;
uint num_headers;
int iclog_space;
- tic = kmem_zone_zalloc(xfs_log_ticket_zone, KM_SLEEP|KM_MAYFAIL);
+ tic = kmem_zone_zalloc(xfs_log_ticket_zone, alloc_flags);
if (!tic)
return NULL;
@@ -3647,6 +3688,11 @@ xlog_state_ioerror(
* c. nothing new gets queued up after (a) and (b) are done.
* d. if !logerror, flush the iclogs to disk, then seal them off
* for business.
+ *
+ * Note: for delayed logging the !logerror case needs to flush the regions
+ * held in memory out to the iclogs before flushing them to disk. This needs
+ * to be done before the log is marked as shutdown, otherwise the flush to the
+ * iclogs will fail.
*/
int
xfs_log_force_umount(
@@ -3680,6 +3726,16 @@ xfs_log_force_umount(
return 1;
}
retval = 0;
+
+ /*
+ * Flush the in memory commit item list before marking the log as
+ * being shut down. We need to do it in this order to ensure all the
+ * completed transactions are flushed to disk with the xfs_log_force()
+ * call below.
+ */
+ if (!logerror && (mp->m_flags & XFS_MOUNT_DELAYLOG))
+ xlog_cil_push(log, 1);
+
/*
* We must hold both the GRANT lock and the LOG lock,
* before we mark the filesystem SHUTDOWN and wake
diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h
index 229d1f36ba9..04c78e642cc 100644
--- a/fs/xfs/xfs_log.h
+++ b/fs/xfs/xfs_log.h
@@ -19,7 +19,6 @@
#define __XFS_LOG_H__
/* get lsn fields */
-
#define CYCLE_LSN(lsn) ((uint)((lsn)>>32))
#define BLOCK_LSN(lsn) ((uint)(lsn))
@@ -114,6 +113,9 @@ struct xfs_log_vec {
struct xfs_log_vec *lv_next; /* next lv in build list */
int lv_niovecs; /* number of iovecs in lv */
struct xfs_log_iovec *lv_iovecp; /* iovec array */
+ struct xfs_log_item *lv_item; /* owner */
+ char *lv_buf; /* formatted buffer */
+ int lv_buf_len; /* size of formatted buffer */
};
/*
@@ -134,6 +136,7 @@ struct xlog_in_core;
struct xlog_ticket;
struct xfs_log_item;
struct xfs_item_ops;
+struct xfs_trans;
void xfs_log_item_init(struct xfs_mount *mp,
struct xfs_log_item *item,
@@ -187,9 +190,16 @@ int xfs_log_need_covered(struct xfs_mount *mp);
void xlog_iodone(struct xfs_buf *);
-struct xlog_ticket * xfs_log_ticket_get(struct xlog_ticket *ticket);
+struct xlog_ticket *xfs_log_ticket_get(struct xlog_ticket *ticket);
void xfs_log_ticket_put(struct xlog_ticket *ticket);
+xlog_tid_t xfs_log_get_trans_ident(struct xfs_trans *tp);
+
+int xfs_log_commit_cil(struct xfs_mount *mp, struct xfs_trans *tp,
+ struct xfs_log_vec *log_vector,
+ xfs_lsn_t *commit_lsn, int flags);
+bool xfs_log_item_in_current_chkpt(struct xfs_log_item *lip);
+
#endif
diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c
new file mode 100644
index 00000000000..bb17cc044bf
--- /dev/null
+++ b/fs/xfs/xfs_log_cil.c
@@ -0,0 +1,725 @@
+/*
+ * Copyright (c) 2010 Red Hat, Inc. All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_types.h"
+#include "xfs_bit.h"
+#include "xfs_log.h"
+#include "xfs_inum.h"
+#include "xfs_trans.h"
+#include "xfs_trans_priv.h"
+#include "xfs_log_priv.h"
+#include "xfs_sb.h"
+#include "xfs_ag.h"
+#include "xfs_dir2.h"
+#include "xfs_dmapi.h"
+#include "xfs_mount.h"
+#include "xfs_error.h"
+#include "xfs_alloc.h"
+
+/*
+ * Perform initial CIL structure initialisation. If the CIL is not
+ * enabled in this filesystem, ensure the log->l_cilp is null so
+ * we can check this conditional to determine if we are doing delayed
+ * logging or not.
+ */
+int
+xlog_cil_init(
+ struct log *log)
+{
+ struct xfs_cil *cil;
+ struct xfs_cil_ctx *ctx;
+
+ log->l_cilp = NULL;
+ if (!(log->l_mp->m_flags & XFS_MOUNT_DELAYLOG))
+ return 0;
+
+ cil = kmem_zalloc(sizeof(*cil), KM_SLEEP|KM_MAYFAIL);
+ if (!cil)
+ return ENOMEM;
+
+ ctx = kmem_zalloc(sizeof(*ctx), KM_SLEEP|KM_MAYFAIL);
+ if (!ctx) {
+ kmem_free(cil);
+ return ENOMEM;
+ }
+
+ INIT_LIST_HEAD(&cil->xc_cil);
+ INIT_LIST_HEAD(&cil->xc_committing);
+ spin_lock_init(&cil->xc_cil_lock);
+ init_rwsem(&cil->xc_ctx_lock);
+ sv_init(&cil->xc_commit_wait, SV_DEFAULT, "cilwait");
+
+ INIT_LIST_HEAD(&ctx->committing);
+ INIT_LIST_HEAD(&ctx->busy_extents);
+ ctx->sequence = 1;
+ ctx->cil = cil;
+ cil->xc_ctx = ctx;
+
+ cil->xc_log = log;
+ log->l_cilp = cil;
+ return 0;
+}
+
+void
+xlog_cil_destroy(
+ struct log *log)
+{
+ if (!log->l_cilp)
+ return;
+
+ if (log->l_cilp->xc_ctx) {
+ if (log->l_cilp->xc_ctx->ticket)
+ xfs_log_ticket_put(log->l_cilp->xc_ctx->ticket);
+ kmem_free(log->l_cilp->xc_ctx);
+ }
+
+ ASSERT(list_empty(&log->l_cilp->xc_cil));
+ kmem_free(log->l_cilp);
+}
+
+/*
+ * Allocate a new ticket. Failing to get a new ticket makes it really hard to
+ * recover, so we don't allow failure here. Also, we allocate in a context that
+ * we don't want to be issuing transactions from, so we need to tell the
+ * allocation code this as well.
+ *
+ * We don't reserve any space for the ticket - we are going to steal whatever
+ * space we require from transactions as they commit. To ensure we reserve all
+ * the space required, we need to set the current reservation of the ticket to
+ * zero so that we know to steal the initial transaction overhead from the
+ * first transaction commit.
+ */
+static struct xlog_ticket *
+xlog_cil_ticket_alloc(
+ struct log *log)
+{
+ struct xlog_ticket *tic;
+
+ tic = xlog_ticket_alloc(log, 0, 1, XFS_TRANSACTION, 0,
+ KM_SLEEP|KM_NOFS);
+ tic->t_trans_type = XFS_TRANS_CHECKPOINT;
+
+ /*
+ * set the current reservation to zero so we know to steal the basic
+ * transaction overhead reservation from the first transaction commit.
+ */
+ tic->t_curr_res = 0;
+ return tic;
+}
+
+/*
+ * After the first stage of log recovery is done, we know where the head and
+ * tail of the log are. We need this log initialisation done before we can
+ * initialise the first CIL checkpoint context.
+ *
+ * Here we allocate a log ticket to track space usage during a CIL push. This
+ * ticket is passed to xlog_write() directly so that we don't slowly leak log
+ * space by failing to account for space used by log headers and additional
+ * region headers for split regions.
+ */
+void
+xlog_cil_init_post_recovery(
+ struct log *log)
+{
+ if (!log->l_cilp)
+ return;
+
+ log->l_cilp->xc_ctx->ticket = xlog_cil_ticket_alloc(log);
+ log->l_cilp->xc_ctx->sequence = 1;
+ log->l_cilp->xc_ctx->commit_lsn = xlog_assign_lsn(log->l_curr_cycle,
+ log->l_curr_block);
+}
+
+/*
+ * Insert the log item into the CIL and calculate the difference in space
+ * consumed by the item. Add the space to the checkpoint ticket and calculate
+ * if the change requires additional log metadata. If it does, take that space
+ * as well. Remove the amount of space we addded to the checkpoint ticket from
+ * the current transaction ticket so that the accounting works out correctly.
+ *
+ * If this is the first time the item is being placed into the CIL in this
+ * context, pin it so it can't be written to disk until the CIL is flushed to
+ * the iclog and the iclog written to disk.
+ */
+static void
+xlog_cil_insert(
+ struct log *log,
+ struct xlog_ticket *ticket,
+ struct xfs_log_item *item,
+ struct xfs_log_vec *lv)
+{
+ struct xfs_cil *cil = log->l_cilp;
+ struct xfs_log_vec *old = lv->lv_item->li_lv;
+ struct xfs_cil_ctx *ctx = cil->xc_ctx;
+ int len;
+ int diff_iovecs;
+ int iclog_space;
+
+ if (old) {
+ /* existing lv on log item, space used is a delta */
+ ASSERT(!list_empty(&item->li_cil));
+ ASSERT(old->lv_buf && old->lv_buf_len && old->lv_niovecs);
+
+ len = lv->lv_buf_len - old->lv_buf_len;
+ diff_iovecs = lv->lv_niovecs - old->lv_niovecs;
+ kmem_free(old->lv_buf);
+ kmem_free(old);
+ } else {
+ /* new lv, must pin the log item */
+ ASSERT(!lv->lv_item->li_lv);
+ ASSERT(list_empty(&item->li_cil));
+
+ len = lv->lv_buf_len;
+ diff_iovecs = lv->lv_niovecs;
+ IOP_PIN(lv->lv_item);
+
+ }
+ len += diff_iovecs * sizeof(xlog_op_header_t);
+
+ /* attach new log vector to log item */
+ lv->lv_item->li_lv = lv;
+
+ spin_lock(&cil->xc_cil_lock);
+ list_move_tail(&item->li_cil, &cil->xc_cil);
+ ctx->nvecs += diff_iovecs;
+
+ /*
+ * If this is the first time the item is being committed to the CIL,
+ * store the sequence number on the log item so we can tell
+ * in future commits whether this is the first checkpoint the item is
+ * being committed into.
+ */
+ if (!item->li_seq)
+ item->li_seq = ctx->sequence;
+
+ /*
+ * Now transfer enough transaction reservation to the context ticket
+ * for the checkpoint. The context ticket is special - the unit
+ * reservation has to grow as well as the current reservation as we
+ * steal from tickets so we can correctly determine the space used
+ * during the transaction commit.
+ */
+ if (ctx->ticket->t_curr_res == 0) {
+ /* first commit in checkpoint, steal the header reservation */
+ ASSERT(ticket->t_curr_res >= ctx->ticket->t_unit_res + len);
+ ctx->ticket->t_curr_res = ctx->ticket->t_unit_res;
+ ticket->t_curr_res -= ctx->ticket->t_unit_res;
+ }
+
+ /* do we need space for more log record headers? */
+ iclog_space = log->l_iclog_size - log->l_iclog_hsize;
+ if (len > 0 && (ctx->space_used / iclog_space !=
+ (ctx->space_used + len) / iclog_space)) {
+ int hdrs;
+
+ hdrs = (len + iclog_space - 1) / iclog_space;
+ /* need to take into account split region headers, too */
+ hdrs *= log->l_iclog_hsize + sizeof(struct xlog_op_header);
+ ctx->ticket->t_unit_res += hdrs;
+ ctx->ticket->t_curr_res += hdrs;
+ ticket->t_curr_res -= hdrs;
+ ASSERT(ticket->t_curr_res >= len);
+ }
+ ticket->t_curr_res -= len;
+ ctx->space_used += len;
+
+ spin_unlock(&cil->xc_cil_lock);
+}
+
+/*
+ * Format log item into a flat buffers
+ *
+ * For delayed logging, we need to hold a formatted buffer containing all the
+ * changes on the log item. This enables us to relog the item in memory and
+ * write it out asynchronously without needing to relock the object that was
+ * modified at the time it gets written into the iclog.
+ *
+ * This function builds a vector for the changes in each log item in the
+ * transaction. It then works out the length of the buffer needed for each log
+ * item, allocates them and formats the vector for the item into the buffer.
+ * The buffer is then attached to the log item are then inserted into the
+ * Committed Item List for tracking until the next checkpoint is written out.
+ *
+ * We don't set up region headers during this process; we simply copy the
+ * regions into the flat buffer. We can do this because we still have to do a
+ * formatting step to write the regions into the iclog buffer. Writing the
+ * ophdrs during the iclog write means that we can support splitting large
+ * regions across iclog boundares without needing a change in the format of the
+ * item/region encapsulation.
+ *
+ * Hence what we need to do now is change the rewrite the vector array to point
+ * to the copied region inside the buffer we just allocated. This allows us to
+ * format the regions into the iclog as though they are being formatted
+ * directly out of the objects themselves.
+ */
+static void
+xlog_cil_format_items(
+ struct log *log,
+ struct xfs_log_vec *log_vector,
+ struct xlog_ticket *ticket,
+ xfs_lsn_t *start_lsn)
+{
+ struct xfs_log_vec *lv;
+
+ if (start_lsn)
+ *start_lsn = log->l_cilp->xc_ctx->sequence;
+
+ ASSERT(log_vector);
+ for (lv = log_vector; lv; lv = lv->lv_next) {
+ void *ptr;
+ int index;
+ int len = 0;
+
+ /* build the vector array and calculate it's length */
+ IOP_FORMAT(lv->lv_item, lv->lv_iovecp);
+ for (index = 0; index < lv->lv_niovecs; index++)
+ len += lv->lv_iovecp[index].i_len;
+
+ lv->lv_buf_len = len;
+ lv->lv_buf = kmem_zalloc(lv->lv_buf_len, KM_SLEEP|KM_NOFS);
+ ptr = lv->lv_buf;
+
+ for (index = 0; index < lv->lv_niovecs; index++) {
+ struct xfs_log_iovec *vec = &lv->lv_iovecp[index];
+
+ memcpy(ptr, vec->i_addr, vec->i_len);
+ vec->i_addr = ptr;
+ ptr += vec->i_len;
+ }
+ ASSERT(ptr == lv->lv_buf + lv->lv_buf_len);
+
+ xlog_cil_insert(log, ticket, lv->lv_item, lv);
+ }
+}
+
+static void
+xlog_cil_free_logvec(
+ struct xfs_log_vec *log_vector)
+{
+ struct xfs_log_vec *lv;
+
+ for (lv = log_vector; lv; ) {
+ struct xfs_log_vec *next = lv->lv_next;
+ kmem_free(lv->lv_buf);
+ kmem_free(lv);
+ lv = next;
+ }
+}
+
+/*
+ * Commit a transaction with the given vector to the Committed Item List.
+ *
+ * To do this, we need to format the item, pin it in memory if required and
+ * account for the space used by the transaction. Once we have done that we
+ * need to release the unused reservation for the transaction, attach the
+ * transaction to the checkpoint context so we carry the busy extents through
+ * to checkpoint completion, and then unlock all the items in the transaction.
+ *
+ * For more specific information about the order of operations in
+ * xfs_log_commit_cil() please refer to the comments in
+ * xfs_trans_commit_iclog().
+ *
+ * Called with the context lock already held in read mode to lock out
+ * background commit, returns without it held once background commits are
+ * allowed again.
+ */
+int
+xfs_log_commit_cil(
+ struct xfs_mount *mp,
+ struct xfs_trans *tp,
+ struct xfs_log_vec *log_vector,
+ xfs_lsn_t *commit_lsn,
+ int flags)
+{
+ struct log *log = mp->m_log;
+ int log_flags = 0;
+ int push = 0;
+
+ if (flags & XFS_TRANS_RELEASE_LOG_RES)
+ log_flags = XFS_LOG_REL_PERM_RESERV;
+
+ if (XLOG_FORCED_SHUTDOWN(log)) {
+ xlog_cil_free_logvec(log_vector);
+ return XFS_ERROR(EIO);
+ }
+
+ /* lock out background commit */
+ down_read(&log->l_cilp->xc_ctx_lock);
+ xlog_cil_format_items(log, log_vector, tp->t_ticket, commit_lsn);
+
+ /* check we didn't blow the reservation */
+ if (tp->t_ticket->t_curr_res < 0)
+ xlog_print_tic_res(log->l_mp, tp->t_ticket);
+
+ /* attach the transaction to the CIL if it has any busy extents */
+ if (!list_empty(&tp->t_busy)) {
+ spin_lock(&log->l_cilp->xc_cil_lock);
+ list_splice_init(&tp->t_busy,
+ &log->l_cilp->xc_ctx->busy_extents);
+ spin_unlock(&log->l_cilp->xc_cil_lock);
+ }
+
+ tp->t_commit_lsn = *commit_lsn;
+ xfs_log_done(mp, tp->t_ticket, NULL, log_flags);
+ xfs_trans_unreserve_and_mod_sb(tp);
+
+ /* check for background commit before unlock */
+ if (log->l_cilp->xc_ctx->space_used > XLOG_CIL_SPACE_LIMIT(log))
+ push = 1;
+ up_read(&log->l_cilp->xc_ctx_lock);
+
+ /*
+ * We need to push CIL every so often so we don't cache more than we
+ * can fit in the log. The limit really is that a checkpoint can't be
+ * more than half the log (the current checkpoint is not allowed to
+ * overwrite the previous checkpoint), but commit latency and memory
+ * usage limit this to a smaller size in most cases.
+ */
+ if (push)
+ xlog_cil_push(log, 0);
+ return 0;
+}
+
+/*
+ * Mark all items committed and clear busy extents. We free the log vector
+ * chains in a separate pass so that we unpin the log items as quickly as
+ * possible.
+ */
+static void
+xlog_cil_committed(
+ void *args,
+ int abort)
+{
+ struct xfs_cil_ctx *ctx = args;
+ struct xfs_log_vec *lv;
+ int abortflag = abort ? XFS_LI_ABORTED : 0;
+ struct xfs_busy_extent *busyp, *n;
+
+ /* unpin all the log items */
+ for (lv = ctx->lv_chain; lv; lv = lv->lv_next ) {
+ xfs_trans_item_committed(lv->lv_item, ctx->start_lsn,
+ abortflag);
+ }
+
+ list_for_each_entry_safe(busyp, n, &ctx->busy_extents, list)
+ xfs_alloc_busy_clear(ctx->cil->xc_log->l_mp, busyp);
+
+ spin_lock(&ctx->cil->xc_cil_lock);
+ list_del(&ctx->committing);
+ spin_unlock(&ctx->cil->xc_cil_lock);
+
+ xlog_cil_free_logvec(ctx->lv_chain);
+ kmem_free(ctx);
+}
+
+/*
+ * Push the Committed Item List to the log. If the push_now flag is not set,
+ * then it is a background flush and so we can chose to ignore it.
+ */
+int
+xlog_cil_push(
+ struct log *log,
+ int push_now)
+{
+ struct xfs_cil *cil = log->l_cilp;
+ struct xfs_log_vec *lv;
+ struct xfs_cil_ctx *ctx;
+ struct xfs_cil_ctx *new_ctx;
+ struct xlog_in_core *commit_iclog;
+ struct xlog_ticket *tic;
+ int num_lv;
+ int num_iovecs;
+ int len;
+ int error = 0;
+ struct xfs_trans_header thdr;
+ struct xfs_log_iovec lhdr;
+ struct xfs_log_vec lvhdr = { NULL };
+ xfs_lsn_t commit_lsn;
+
+ if (!cil)
+ return 0;
+
+ new_ctx = kmem_zalloc(sizeof(*new_ctx), KM_SLEEP|KM_NOFS);
+ new_ctx->ticket = xlog_cil_ticket_alloc(log);
+
+ /* lock out transaction commit, but don't block on background push */
+ if (!down_write_trylock(&cil->xc_ctx_lock)) {
+ if (!push_now)
+ goto out_free_ticket;
+ down_write(&cil->xc_ctx_lock);
+ }
+ ctx = cil->xc_ctx;
+
+ /* check if we've anything to push */
+ if (list_empty(&cil->xc_cil))
+ goto out_skip;
+
+ /* check for spurious background flush */
+ if (!push_now && cil->xc_ctx->space_used < XLOG_CIL_SPACE_LIMIT(log))
+ goto out_skip;
+
+ /*
+ * pull all the log vectors off the items in the CIL, and
+ * remove the items from the CIL. We don't need the CIL lock
+ * here because it's only needed on the transaction commit
+ * side which is currently locked out by the flush lock.
+ */
+ lv = NULL;
+ num_lv = 0;
+ num_iovecs = 0;
+ len = 0;
+ while (!list_empty(&cil->xc_cil)) {
+ struct xfs_log_item *item;
+ int i;
+
+ item = list_first_entry(&cil->xc_cil,
+ struct xfs_log_item, li_cil);
+ list_del_init(&item->li_cil);
+ if (!ctx->lv_chain)
+ ctx->lv_chain = item->li_lv;
+ else
+ lv->lv_next = item->li_lv;
+ lv = item->li_lv;
+ item->li_lv = NULL;
+
+ num_lv++;
+ num_iovecs += lv->lv_niovecs;
+ for (i = 0; i < lv->lv_niovecs; i++)
+ len += lv->lv_iovecp[i].i_len;
+ }
+
+ /*
+ * initialise the new context and attach it to the CIL. Then attach
+ * the current context to the CIL committing lsit so it can be found
+ * during log forces to extract the commit lsn of the sequence that
+ * needs to be forced.
+ */
+ INIT_LIST_HEAD(&new_ctx->committing);
+ INIT_LIST_HEAD(&new_ctx->busy_extents);
+ new_ctx->sequence = ctx->sequence + 1;
+ new_ctx->cil = cil;
+ cil->xc_ctx = new_ctx;
+
+ /*
+ * The switch is now done, so we can drop the context lock and move out
+ * of a shared context. We can't just go straight to the commit record,
+ * though - we need to synchronise with previous and future commits so
+ * that the commit records are correctly ordered in the log to ensure
+ * that we process items during log IO completion in the correct order.
+ *
+ * For example, if we get an EFI in one checkpoint and the EFD in the
+ * next (e.g. due to log forces), we do not want the checkpoint with
+ * the EFD to be committed before the checkpoint with the EFI. Hence
+ * we must strictly order the commit records of the checkpoints so
+ * that: a) the checkpoint callbacks are attached to the iclogs in the
+ * correct order; and b) the checkpoints are replayed in correct order
+ * in log recovery.
+ *
+ * Hence we need to add this context to the committing context list so
+ * that higher sequences will wait for us to write out a commit record
+ * before they do.
+ */
+ spin_lock(&cil->xc_cil_lock);
+ list_add(&ctx->committing, &cil->xc_committing);
+ spin_unlock(&cil->xc_cil_lock);
+ up_write(&cil->xc_ctx_lock);
+
+ /*
+ * Build a checkpoint transaction header and write it to the log to
+ * begin the transaction. We need to account for the space used by the
+ * transaction header here as it is not accounted for in xlog_write().
+ *
+ * The LSN we need to pass to the log items on transaction commit is
+ * the LSN reported by the first log vector write. If we use the commit
+ * record lsn then we can move the tail beyond the grant write head.
+ */
+ tic = ctx->ticket;
+ thdr.th_magic = XFS_TRANS_HEADER_MAGIC;
+ thdr.th_type = XFS_TRANS_CHECKPOINT;
+ thdr.th_tid = tic->t_tid;
+ thdr.th_num_items = num_iovecs;
+ lhdr.i_addr = (xfs_caddr_t)&thdr;
+ lhdr.i_len = sizeof(xfs_trans_header_t);
+ lhdr.i_type = XLOG_REG_TYPE_TRANSHDR;
+ tic->t_curr_res -= lhdr.i_len + sizeof(xlog_op_header_t);
+
+ lvhdr.lv_niovecs = 1;
+ lvhdr.lv_iovecp = &lhdr;
+ lvhdr.lv_next = ctx->lv_chain;
+
+ error = xlog_write(log, &lvhdr, tic, &ctx->start_lsn, NULL, 0);
+ if (error)
+ goto out_abort;
+
+ /*
+ * now that we've written the checkpoint into the log, strictly
+ * order the commit records so replay will get them in the right order.
+ */
+restart:
+ spin_lock(&cil->xc_cil_lock);
+ list_for_each_entry(new_ctx, &cil->xc_committing, committing) {
+ /*
+ * Higher sequences will wait for this one so skip them.
+ * Don't wait for own own sequence, either.
+ */
+ if (new_ctx->sequence >= ctx->sequence)
+ continue;
+ if (!new_ctx->commit_lsn) {
+ /*
+ * It is still being pushed! Wait for the push to
+ * complete, then start again from the beginning.
+ */
+ sv_wait(&cil->xc_commit_wait, 0, &cil->xc_cil_lock, 0);
+ goto restart;
+ }
+ }
+ spin_unlock(&cil->xc_cil_lock);
+
+ commit_lsn = xfs_log_done(log->l_mp, tic, &commit_iclog, 0);
+ if (error || commit_lsn == -1)
+ goto out_abort;
+
+ /* attach all the transactions w/ busy extents to iclog */
+ ctx->log_cb.cb_func = xlog_cil_committed;
+ ctx->log_cb.cb_arg = ctx;
+ error = xfs_log_notify(log->l_mp, commit_iclog, &ctx->log_cb);
+ if (error)
+ goto out_abort;
+
+ /*
+ * now the checkpoint commit is complete and we've attached the
+ * callbacks to the iclog we can assign the commit LSN to the context
+ * and wake up anyone who is waiting for the commit to complete.
+ */
+ spin_lock(&cil->xc_cil_lock);
+ ctx->commit_lsn = commit_lsn;
+ sv_broadcast(&cil->xc_commit_wait);
+ spin_unlock(&cil->xc_cil_lock);
+
+ /* release the hounds! */
+ return xfs_log_release_iclog(log->l_mp, commit_iclog);
+
+out_skip:
+ up_write(&cil->xc_ctx_lock);
+out_free_ticket:
+ xfs_log_ticket_put(new_ctx->ticket);
+ kmem_free(new_ctx);
+ return 0;
+
+out_abort:
+ xlog_cil_committed(ctx, XFS_LI_ABORTED);
+ return XFS_ERROR(EIO);
+}
+
+/*
+ * Conditionally push the CIL based on the sequence passed in.
+ *
+ * We only need to push if we haven't already pushed the sequence
+ * number given. Hence the only time we will trigger a push here is
+ * if the push sequence is the same as the current context.
+ *
+ * We return the current commit lsn to allow the callers to determine if a
+ * iclog flush is necessary following this call.
+ *
+ * XXX: Initially, just push the CIL unconditionally and return whatever
+ * commit lsn is there. It'll be empty, so this is broken for now.
+ */
+xfs_lsn_t
+xlog_cil_push_lsn(
+ struct log *log,
+ xfs_lsn_t push_seq)
+{
+ struct xfs_cil *cil = log->l_cilp;
+ struct xfs_cil_ctx *ctx;
+ xfs_lsn_t commit_lsn = NULLCOMMITLSN;
+
+restart:
+ down_write(&cil->xc_ctx_lock);
+ ASSERT(push_seq <= cil->xc_ctx->sequence);
+
+ /* check to see if we need to force out the current context */
+ if (push_seq == cil->xc_ctx->sequence) {
+ up_write(&cil->xc_ctx_lock);
+ xlog_cil_push(log, 1);
+ goto restart;
+ }
+
+ /*
+ * See if we can find a previous sequence still committing.
+ * We can drop the flush lock as soon as we have the cil lock
+ * because we are now only comparing contexts protected by
+ * the cil lock.
+ *
+ * We need to wait for all previous sequence commits to complete
+ * before allowing the force of push_seq to go ahead. Hence block
+ * on commits for those as well.
+ */
+ spin_lock(&cil->xc_cil_lock);
+ up_write(&cil->xc_ctx_lock);
+ list_for_each_entry(ctx, &cil->xc_committing, committing) {
+ if (ctx->sequence > push_seq)
+ continue;
+ if (!ctx->commit_lsn) {
+ /*
+ * It is still being pushed! Wait for the push to
+ * complete, then start again from the beginning.
+ */
+ sv_wait(&cil->xc_commit_wait, 0, &cil->xc_cil_lock, 0);
+ goto restart;
+ }
+ if (ctx->sequence != push_seq)
+ continue;
+ /* found it! */
+ commit_lsn = ctx->commit_lsn;
+ }
+ spin_unlock(&cil->xc_cil_lock);
+ return commit_lsn;
+}
+
+/*
+ * Check if the current log item was first committed in this sequence.
+ * We can't rely on just the log item being in the CIL, we have to check
+ * the recorded commit sequence number.
+ *
+ * Note: for this to be used in a non-racy manner, it has to be called with
+ * CIL flushing locked out. As a result, it should only be used during the
+ * transaction commit process when deciding what to format into the item.
+ */
+bool
+xfs_log_item_in_current_chkpt(
+ struct xfs_log_item *lip)
+{
+ struct xfs_cil_ctx *ctx;
+
+ if (!(lip->li_mountp->m_flags & XFS_MOUNT_DELAYLOG))
+ return false;
+ if (list_empty(&lip->li_cil))
+ return false;
+
+ ctx = lip->li_mountp->m_log->l_cilp->xc_ctx;
+
+ /*
+ * li_seq is written on the first commit of a log item to record the
+ * first checkpoint it is written to. Hence if it is different to the
+ * current sequence, we're in a new checkpoint.
+ */
+ if (XFS_LSN_CMP(lip->li_seq, ctx->sequence) != 0)
+ return false;
+ return true;
+}
diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h
index 9cf69515445..8c072618965 100644
--- a/fs/xfs/xfs_log_priv.h
+++ b/fs/xfs/xfs_log_priv.h
@@ -152,8 +152,6 @@ static inline uint xlog_get_client_id(__be32 i)
#define XLOG_RECOVERY_NEEDED 0x4 /* log was recovered */
#define XLOG_IO_ERROR 0x8 /* log hit an I/O error, and being
shutdown */
-typedef __uint32_t xlog_tid_t;
-
#ifdef __KERNEL__
/*
@@ -379,6 +377,99 @@ typedef struct xlog_in_core {
} xlog_in_core_t;
/*
+ * The CIL context is used to aggregate per-transaction details as well be
+ * passed to the iclog for checkpoint post-commit processing. After being
+ * passed to the iclog, another context needs to be allocated for tracking the
+ * next set of transactions to be aggregated into a checkpoint.
+ */
+struct xfs_cil;
+
+struct xfs_cil_ctx {
+ struct xfs_cil *cil;
+ xfs_lsn_t sequence; /* chkpt sequence # */
+ xfs_lsn_t start_lsn; /* first LSN of chkpt commit */
+ xfs_lsn_t commit_lsn; /* chkpt commit record lsn */
+ struct xlog_ticket *ticket; /* chkpt ticket */
+ int nvecs; /* number of regions */
+ int space_used; /* aggregate size of regions */
+ struct list_head busy_extents; /* busy extents in chkpt */
+ struct xfs_log_vec *lv_chain; /* logvecs being pushed */
+ xfs_log_callback_t log_cb; /* completion callback hook. */
+ struct list_head committing; /* ctx committing list */
+};
+
+/*
+ * Committed Item List structure
+ *
+ * This structure is used to track log items that have been committed but not
+ * yet written into the log. It is used only when the delayed logging mount
+ * option is enabled.
+ *
+ * This structure tracks the list of committing checkpoint contexts so
+ * we can avoid the problem of having to hold out new transactions during a
+ * flush until we have a the commit record LSN of the checkpoint. We can
+ * traverse the list of committing contexts in xlog_cil_push_lsn() to find a
+ * sequence match and extract the commit LSN directly from there. If the
+ * checkpoint is still in the process of committing, we can block waiting for
+ * the commit LSN to be determined as well. This should make synchronous
+ * operations almost as efficient as the old logging methods.
+ */
+struct xfs_cil {
+ struct log *xc_log;
+ struct list_head xc_cil;
+ spinlock_t xc_cil_lock;
+ struct xfs_cil_ctx *xc_ctx;
+ struct rw_semaphore xc_ctx_lock;
+ struct list_head xc_committing;
+ sv_t xc_commit_wait;
+};
+
+/*
+ * The amount of log space we should the CIL to aggregate is difficult to size.
+ * Whatever we chose we have to make we can get a reservation for the log space
+ * effectively, that it is large enough to capture sufficient relogging to
+ * reduce log buffer IO significantly, but it is not too large for the log or
+ * induces too much latency when writing out through the iclogs. We track both
+ * space consumed and the number of vectors in the checkpoint context, so we
+ * need to decide which to use for limiting.
+ *
+ * Every log buffer we write out during a push needs a header reserved, which
+ * is at least one sector and more for v2 logs. Hence we need a reservation of
+ * at least 512 bytes per 32k of log space just for the LR headers. That means
+ * 16KB of reservation per megabyte of delayed logging space we will consume,
+ * plus various headers. The number of headers will vary based on the num of
+ * io vectors, so limiting on a specific number of vectors is going to result
+ * in transactions of varying size. IOWs, it is more consistent to track and
+ * limit space consumed in the log rather than by the number of objects being
+ * logged in order to prevent checkpoint ticket overruns.
+ *
+ * Further, use of static reservations through the log grant mechanism is
+ * problematic. It introduces a lot of complexity (e.g. reserve grant vs write
+ * grant) and a significant deadlock potential because regranting write space
+ * can block on log pushes. Hence if we have to regrant log space during a log
+ * push, we can deadlock.
+ *
+ * However, we can avoid this by use of a dynamic "reservation stealing"
+ * technique during transaction commit whereby unused reservation space in the
+ * transaction ticket is transferred to the CIL ctx commit ticket to cover the
+ * space needed by the checkpoint transaction. This means that we never need to
+ * specifically reserve space for the CIL checkpoint transaction, nor do we
+ * need to regrant space once the checkpoint completes. This also means the
+ * checkpoint transaction ticket is specific to the checkpoint context, rather
+ * than the CIL itself.
+ *
+ * With dynamic reservations, we can basically make up arbitrary limits for the
+ * checkpoint size so long as they don't violate any other size rules. Hence
+ * the initial maximum size for the checkpoint transaction will be set to a
+ * quarter of the log or 8MB, which ever is smaller. 8MB is an arbitrary limit
+ * right now based on the latency of writing out a large amount of data through
+ * the circular iclog buffers.
+ */
+
+#define XLOG_CIL_SPACE_LIMIT(log) \
+ (min((log->l_logsize >> 2), (8 * 1024 * 1024)))
+
+/*
* The reservation head lsn is not made up of a cycle number and block number.
* Instead, it uses a cycle number and byte number. Logs don't expect to
* overflow 31 bits worth of byte offset, so using a byte number will mean
@@ -388,6 +479,7 @@ typedef struct log {
/* The following fields don't need locking */
struct xfs_mount *l_mp; /* mount point */
struct xfs_ail *l_ailp; /* AIL log is working with */
+ struct xfs_cil *l_cilp; /* CIL log is working with */
struct xfs_buf *l_xbuf; /* extra buffer for log
* wrapping */
struct xfs_buftarg *l_targ; /* buftarg of log */
@@ -438,14 +530,17 @@ typedef struct log {
#define XLOG_FORCED_SHUTDOWN(log) ((log)->l_flags & XLOG_IO_ERROR)
-
/* common routines */
extern xfs_lsn_t xlog_assign_tail_lsn(struct xfs_mount *mp);
extern int xlog_recover(xlog_t *log);
extern int xlog_recover_finish(xlog_t *log);
extern void xlog_pack_data(xlog_t *log, xlog_in_core_t *iclog, int);
-extern kmem_zone_t *xfs_log_ticket_zone;
+extern kmem_zone_t *xfs_log_ticket_zone;
+struct xlog_ticket *xlog_ticket_alloc(struct log *log, int unit_bytes,
+ int count, char client, uint xflags,
+ int alloc_flags);
+
static inline void
xlog_write_adv_cnt(void **ptr, int *len, int *off, size_t bytes)
@@ -455,6 +550,21 @@ xlog_write_adv_cnt(void **ptr, int *len, int *off, size_t bytes)
*off += bytes;
}
+void xlog_print_tic_res(struct xfs_mount *mp, struct xlog_ticket *ticket);
+int xlog_write(struct log *log, struct xfs_log_vec *log_vector,
+ struct xlog_ticket *tic, xfs_lsn_t *start_lsn,
+ xlog_in_core_t **commit_iclog, uint flags);
+
+/*
+ * Committed Item List interfaces
+ */
+int xlog_cil_init(struct log *log);
+void xlog_cil_init_post_recovery(struct log *log);
+void xlog_cil_destroy(struct log *log);
+
+int xlog_cil_push(struct log *log, int push_now);
+xfs_lsn_t xlog_cil_push_lsn(struct log *log, xfs_lsn_t push_sequence);
+
/*
* Unmount record type is used as a pseudo transaction type for the ticket.
* It's value must be outside the range of XFS_TRANS_* values.
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 0de08e36631..ed0684cc50e 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -132,15 +132,10 @@ xlog_align(
int nbblks,
xfs_buf_t *bp)
{
- xfs_daddr_t offset;
- xfs_caddr_t ptr;
+ xfs_daddr_t offset = blk_no & ((xfs_daddr_t)log->l_sectBBsize - 1);
- offset = blk_no & ((xfs_daddr_t) log->l_sectBBsize - 1);
- ptr = XFS_BUF_PTR(bp) + BBTOB(offset);
-
- ASSERT(ptr + BBTOB(nbblks) <= XFS_BUF_PTR(bp) + XFS_BUF_SIZE(bp));
-
- return ptr;
+ ASSERT(BBTOB(offset + nbblks) <= XFS_BUF_SIZE(bp));
+ return XFS_BUF_PTR(bp) + BBTOB(offset);
}
@@ -1576,7 +1571,7 @@ xlog_recover_reorder_trans(
switch (ITEM_TYPE(item)) {
case XFS_LI_BUF:
- if (!(buf_f->blf_flags & XFS_BLI_CANCEL)) {
+ if (!(buf_f->blf_flags & XFS_BLF_CANCEL)) {
trace_xfs_log_recover_item_reorder_head(log,
trans, item, pass);
list_move(&item->ri_list, &trans->r_itemq);
@@ -1638,7 +1633,7 @@ xlog_recover_do_buffer_pass1(
/*
* If this isn't a cancel buffer item, then just return.
*/
- if (!(flags & XFS_BLI_CANCEL)) {
+ if (!(flags & XFS_BLF_CANCEL)) {
trace_xfs_log_recover_buf_not_cancel(log, buf_f);
return;
}
@@ -1696,7 +1691,7 @@ xlog_recover_do_buffer_pass1(
* Check to see whether the buffer being recovered has a corresponding
* entry in the buffer cancel record table. If it does then return 1
* so that it will be cancelled, otherwise return 0. If the buffer is
- * actually a buffer cancel item (XFS_BLI_CANCEL is set), then decrement
+ * actually a buffer cancel item (XFS_BLF_CANCEL is set), then decrement
* the refcount on the entry in the table and remove it from the table
* if this is the last reference.
*
@@ -1721,7 +1716,7 @@ xlog_check_buffer_cancelled(
* There is nothing in the table built in pass one,
* so this buffer must not be cancelled.
*/
- ASSERT(!(flags & XFS_BLI_CANCEL));
+ ASSERT(!(flags & XFS_BLF_CANCEL));
return 0;
}
@@ -1733,7 +1728,7 @@ xlog_check_buffer_cancelled(
* There is no corresponding entry in the table built
* in pass one, so this buffer has not been cancelled.
*/
- ASSERT(!(flags & XFS_BLI_CANCEL));
+ ASSERT(!(flags & XFS_BLF_CANCEL));
return 0;
}
@@ -1752,7 +1747,7 @@ xlog_check_buffer_cancelled(
* one in the table and remove it if this is the
* last reference.
*/
- if (flags & XFS_BLI_CANCEL) {
+ if (flags & XFS_BLF_CANCEL) {
bcp->bc_refcount--;
if (bcp->bc_refcount == 0) {
if (prevp == NULL) {
@@ -1772,7 +1767,7 @@ xlog_check_buffer_cancelled(
* We didn't find a corresponding entry in the table, so
* return 0 so that the buffer is NOT cancelled.
*/
- ASSERT(!(flags & XFS_BLI_CANCEL));
+ ASSERT(!(flags & XFS_BLF_CANCEL));
return 0;
}
@@ -1874,8 +1869,8 @@ xlog_recover_do_inode_buffer(
nbits = xfs_contig_bits(data_map, map_size,
bit);
ASSERT(nbits > 0);
- reg_buf_offset = bit << XFS_BLI_SHIFT;
- reg_buf_bytes = nbits << XFS_BLI_SHIFT;
+ reg_buf_offset = bit << XFS_BLF_SHIFT;
+ reg_buf_bytes = nbits << XFS_BLF_SHIFT;
item_index++;
}
@@ -1889,7 +1884,7 @@ xlog_recover_do_inode_buffer(
}
ASSERT(item->ri_buf[item_index].i_addr != NULL);
- ASSERT((item->ri_buf[item_index].i_len % XFS_BLI_CHUNK) == 0);
+ ASSERT((item->ri_buf[item_index].i_len % XFS_BLF_CHUNK) == 0);
ASSERT((reg_buf_offset + reg_buf_bytes) <= XFS_BUF_COUNT(bp));
/*
@@ -1955,9 +1950,9 @@ xlog_recover_do_reg_buffer(
nbits = xfs_contig_bits(data_map, map_size, bit);
ASSERT(nbits > 0);
ASSERT(item->ri_buf[i].i_addr != NULL);
- ASSERT(item->ri_buf[i].i_len % XFS_BLI_CHUNK == 0);
+ ASSERT(item->ri_buf[i].i_len % XFS_BLF_CHUNK == 0);
ASSERT(XFS_BUF_COUNT(bp) >=
- ((uint)bit << XFS_BLI_SHIFT)+(nbits<<XFS_BLI_SHIFT));
+ ((uint)bit << XFS_BLF_SHIFT)+(nbits<<XFS_BLF_SHIFT));
/*
* Do a sanity check if this is a dquot buffer. Just checking
@@ -1966,7 +1961,7 @@ xlog_recover_do_reg_buffer(
*/
error = 0;
if (buf_f->blf_flags &
- (XFS_BLI_UDQUOT_BUF|XFS_BLI_PDQUOT_BUF|XFS_BLI_GDQUOT_BUF)) {
+ (XFS_BLF_UDQUOT_BUF|XFS_BLF_PDQUOT_BUF|XFS_BLF_GDQUOT_BUF)) {
if (item->ri_buf[i].i_addr == NULL) {
cmn_err(CE_ALERT,
"XFS: NULL dquot in %s.", __func__);
@@ -1987,9 +1982,9 @@ xlog_recover_do_reg_buffer(
}
memcpy(xfs_buf_offset(bp,
- (uint)bit << XFS_BLI_SHIFT), /* dest */
+ (uint)bit << XFS_BLF_SHIFT), /* dest */
item->ri_buf[i].i_addr, /* source */
- nbits<<XFS_BLI_SHIFT); /* length */
+ nbits<<XFS_BLF_SHIFT); /* length */
next:
i++;
bit += nbits;
@@ -2148,11 +2143,11 @@ xlog_recover_do_dquot_buffer(
}
type = 0;
- if (buf_f->blf_flags & XFS_BLI_UDQUOT_BUF)
+ if (buf_f->blf_flags & XFS_BLF_UDQUOT_BUF)
type |= XFS_DQ_USER;
- if (buf_f->blf_flags & XFS_BLI_PDQUOT_BUF)
+ if (buf_f->blf_flags & XFS_BLF_PDQUOT_BUF)
type |= XFS_DQ_PROJ;
- if (buf_f->blf_flags & XFS_BLI_GDQUOT_BUF)
+ if (buf_f->blf_flags & XFS_BLF_GDQUOT_BUF)
type |= XFS_DQ_GROUP;
/*
* This type of quotas was turned off, so ignore this buffer
@@ -2173,7 +2168,7 @@ xlog_recover_do_dquot_buffer(
* here which overlaps that may be stale.
*
* When meta-data buffers are freed at run time we log a buffer item
- * with the XFS_BLI_CANCEL bit set to indicate that previous copies
+ * with the XFS_BLF_CANCEL bit set to indicate that previous copies
* of the buffer in the log should not be replayed at recovery time.
* This is so that if the blocks covered by the buffer are reused for
* file data before we crash we don't end up replaying old, freed
@@ -2207,7 +2202,7 @@ xlog_recover_do_buffer_trans(
if (pass == XLOG_RECOVER_PASS1) {
/*
* In this pass we're only looking for buf items
- * with the XFS_BLI_CANCEL bit set.
+ * with the XFS_BLF_CANCEL bit set.
*/
xlog_recover_do_buffer_pass1(log, buf_f);
return 0;
@@ -2244,7 +2239,7 @@ xlog_recover_do_buffer_trans(
mp = log->l_mp;
buf_flags = XBF_LOCK;
- if (!(flags & XFS_BLI_INODE_BUF))
+ if (!(flags & XFS_BLF_INODE_BUF))
buf_flags |= XBF_MAPPED;
bp = xfs_buf_read(mp->m_ddev_targp, blkno, len, buf_flags);
@@ -2257,10 +2252,10 @@ xlog_recover_do_buffer_trans(
}
error = 0;
- if (flags & XFS_BLI_INODE_BUF) {
+ if (flags & XFS_BLF_INODE_BUF) {
error = xlog_recover_do_inode_buffer(mp, item, bp, buf_f);
} else if (flags &
- (XFS_BLI_UDQUOT_BUF|XFS_BLI_PDQUOT_BUF|XFS_BLI_GDQUOT_BUF)) {
+ (XFS_BLF_UDQUOT_BUF|XFS_BLF_PDQUOT_BUF|XFS_BLF_GDQUOT_BUF)) {
xlog_recover_do_dquot_buffer(mp, log, item, bp, buf_f);
} else {
xlog_recover_do_reg_buffer(mp, item, bp, buf_f);
diff --git a/fs/xfs/xfs_log_recover.h b/fs/xfs/xfs_log_recover.h
index 75d74920725..1c55ccbb379 100644
--- a/fs/xfs/xfs_log_recover.h
+++ b/fs/xfs/xfs_log_recover.h
@@ -28,7 +28,7 @@
#define XLOG_RHASH(tid) \
((((__uint32_t)tid)>>XLOG_RHASH_SHIFT) & (XLOG_RHASH_SIZE-1))
-#define XLOG_MAX_REGIONS_IN_ITEM (XFS_MAX_BLOCKSIZE / XFS_BLI_CHUNK / 2 + 1)
+#define XLOG_MAX_REGIONS_IN_ITEM (XFS_MAX_BLOCKSIZE / XFS_BLF_CHUNK / 2 + 1)
/*
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index d7bf38c8cd1..d59f4e8bedc 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -268,10 +268,10 @@ xfs_sb_validate_fsb_count(
#if XFS_BIG_BLKNOS /* Limited by ULONG_MAX of page cache index */
if (nblocks >> (PAGE_CACHE_SHIFT - sbp->sb_blocklog) > ULONG_MAX)
- return E2BIG;
+ return EFBIG;
#else /* Limited by UINT_MAX of sectors */
if (nblocks << (sbp->sb_blocklog - BBSHIFT) > UINT_MAX)
- return E2BIG;
+ return EFBIG;
#endif
return 0;
}
@@ -393,7 +393,7 @@ xfs_mount_validate_sb(
xfs_sb_validate_fsb_count(sbp, sbp->sb_rblocks)) {
xfs_fs_mount_cmn_err(flags,
"file system too large to be mounted on this system.");
- return XFS_ERROR(E2BIG);
+ return XFS_ERROR(EFBIG);
}
if (unlikely(sbp->sb_inprogress)) {
@@ -413,17 +413,6 @@ xfs_mount_validate_sb(
return 0;
}
-STATIC void
-xfs_initialize_perag_icache(
- xfs_perag_t *pag)
-{
- if (!pag->pag_ici_init) {
- rwlock_init(&pag->pag_ici_lock);
- INIT_RADIX_TREE(&pag->pag_ici_root, GFP_ATOMIC);
- pag->pag_ici_init = 1;
- }
-}
-
int
xfs_initialize_perag(
xfs_mount_t *mp,
@@ -436,13 +425,8 @@ xfs_initialize_perag(
xfs_agino_t agino;
xfs_ino_t ino;
xfs_sb_t *sbp = &mp->m_sb;
- xfs_ino_t max_inum = XFS_MAXINUMBER_32;
int error = -ENOMEM;
- /* Check to see if the filesystem can overflow 32 bit inodes */
- agino = XFS_OFFBNO_TO_AGINO(mp, sbp->sb_agblocks - 1, 0);
- ino = XFS_AGINO_TO_INO(mp, agcount - 1, agino);
-
/*
* Walk the current per-ag tree so we don't try to initialise AGs
* that already exist (growfs case). Allocate and insert all the
@@ -456,11 +440,18 @@ xfs_initialize_perag(
}
if (!first_initialised)
first_initialised = index;
+
pag = kmem_zalloc(sizeof(*pag), KM_MAYFAIL);
if (!pag)
goto out_unwind;
+ pag->pag_agno = index;
+ pag->pag_mount = mp;
+ rwlock_init(&pag->pag_ici_lock);
+ INIT_RADIX_TREE(&pag->pag_ici_root, GFP_ATOMIC);
+
if (radix_tree_preload(GFP_NOFS))
goto out_unwind;
+
spin_lock(&mp->m_perag_lock);
if (radix_tree_insert(&mp->m_perag_tree, index, pag)) {
BUG();
@@ -469,25 +460,26 @@ xfs_initialize_perag(
error = -EEXIST;
goto out_unwind;
}
- pag->pag_agno = index;
- pag->pag_mount = mp;
spin_unlock(&mp->m_perag_lock);
radix_tree_preload_end();
}
- /* Clear the mount flag if no inode can overflow 32 bits
- * on this filesystem, or if specifically requested..
+ /*
+ * If we mount with the inode64 option, or no inode overflows
+ * the legacy 32-bit address space clear the inode32 option.
*/
- if ((mp->m_flags & XFS_MOUNT_SMALL_INUMS) && ino > max_inum) {
+ agino = XFS_OFFBNO_TO_AGINO(mp, sbp->sb_agblocks - 1, 0);
+ ino = XFS_AGINO_TO_INO(mp, agcount - 1, agino);
+
+ if ((mp->m_flags & XFS_MOUNT_SMALL_INUMS) && ino > XFS_MAXINUMBER_32)
mp->m_flags |= XFS_MOUNT_32BITINODES;
- } else {
+ else
mp->m_flags &= ~XFS_MOUNT_32BITINODES;
- }
- /* If we can overflow then setup the ag headers accordingly */
if (mp->m_flags & XFS_MOUNT_32BITINODES) {
- /* Calculate how much should be reserved for inodes to
- * meet the max inode percentage.
+ /*
+ * Calculate how much should be reserved for inodes to meet
+ * the max inode percentage.
*/
if (mp->m_maxicount) {
__uint64_t icount;
@@ -500,30 +492,28 @@ xfs_initialize_perag(
} else {
max_metadata = agcount;
}
+
for (index = 0; index < agcount; index++) {
ino = XFS_AGINO_TO_INO(mp, index, agino);
- if (ino > max_inum) {
+ if (ino > XFS_MAXINUMBER_32) {
index++;
break;
}
- /* This ag is preferred for inodes */
pag = xfs_perag_get(mp, index);
pag->pagi_inodeok = 1;
if (index < max_metadata)
pag->pagf_metadata = 1;
- xfs_initialize_perag_icache(pag);
xfs_perag_put(pag);
}
} else {
- /* Setup default behavior for smaller filesystems */
for (index = 0; index < agcount; index++) {
pag = xfs_perag_get(mp, index);
pag->pagi_inodeok = 1;
- xfs_initialize_perag_icache(pag);
xfs_perag_put(pag);
}
}
+
if (maxagi)
*maxagi = index;
return 0;
@@ -1009,7 +999,7 @@ xfs_check_sizes(xfs_mount_t *mp)
d = (xfs_daddr_t)XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks);
if (XFS_BB_TO_FSB(mp, d) != mp->m_sb.sb_dblocks) {
cmn_err(CE_WARN, "XFS: size check 1 failed");
- return XFS_ERROR(E2BIG);
+ return XFS_ERROR(EFBIG);
}
error = xfs_read_buf(mp, mp->m_ddev_targp,
d - XFS_FSS_TO_BB(mp, 1),
@@ -1019,7 +1009,7 @@ xfs_check_sizes(xfs_mount_t *mp)
} else {
cmn_err(CE_WARN, "XFS: size check 2 failed");
if (error == ENOSPC)
- error = XFS_ERROR(E2BIG);
+ error = XFS_ERROR(EFBIG);
return error;
}
@@ -1027,7 +1017,7 @@ xfs_check_sizes(xfs_mount_t *mp)
d = (xfs_daddr_t)XFS_FSB_TO_BB(mp, mp->m_sb.sb_logblocks);
if (XFS_BB_TO_FSB(mp, d) != mp->m_sb.sb_logblocks) {
cmn_err(CE_WARN, "XFS: size check 3 failed");
- return XFS_ERROR(E2BIG);
+ return XFS_ERROR(EFBIG);
}
error = xfs_read_buf(mp, mp->m_logdev_targp,
d - XFS_FSB_TO_BB(mp, 1),
@@ -1037,7 +1027,7 @@ xfs_check_sizes(xfs_mount_t *mp)
} else {
cmn_err(CE_WARN, "XFS: size check 3 failed");
if (error == ENOSPC)
- error = XFS_ERROR(E2BIG);
+ error = XFS_ERROR(EFBIG);
return error;
}
}
@@ -1254,7 +1244,7 @@ xfs_mountfs(
* Allocate and initialize the per-ag data.
*/
spin_lock_init(&mp->m_perag_lock);
- INIT_RADIX_TREE(&mp->m_perag_tree, GFP_NOFS);
+ INIT_RADIX_TREE(&mp->m_perag_tree, GFP_ATOMIC);
error = xfs_initialize_perag(mp, sbp->sb_agcount, &mp->m_maxagi);
if (error) {
cmn_err(CE_WARN, "XFS: Failed per-ag init: %d", error);
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index 9ff48a16a7e..1d2c7eed4ed 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -268,6 +268,7 @@ typedef struct xfs_mount {
#define XFS_MOUNT_WSYNC (1ULL << 0) /* for nfs - all metadata ops
must be synchronous except
for space allocations */
+#define XFS_MOUNT_DELAYLOG (1ULL << 1) /* delayed logging is enabled */
#define XFS_MOUNT_DMAPI (1ULL << 2) /* dmapi is enabled */
#define XFS_MOUNT_WAS_CLEAN (1ULL << 3)
#define XFS_MOUNT_FS_SHUTDOWN (1ULL << 4) /* atomic stop of all filesystem
diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c
index 6be05f756d5..16445518506 100644
--- a/fs/xfs/xfs_rtalloc.c
+++ b/fs/xfs/xfs_rtalloc.c
@@ -2247,7 +2247,7 @@ xfs_rtmount_init(
cmn_err(CE_WARN, "XFS: realtime mount -- %llu != %llu",
(unsigned long long) XFS_BB_TO_FSB(mp, d),
(unsigned long long) mp->m_sb.sb_rblocks);
- return XFS_ERROR(E2BIG);
+ return XFS_ERROR(EFBIG);
}
error = xfs_read_buf(mp, mp->m_rtdev_targp,
d - XFS_FSB_TO_BB(mp, 1),
@@ -2256,7 +2256,7 @@ xfs_rtmount_init(
cmn_err(CE_WARN,
"XFS: realtime mount -- xfs_read_buf failed, returned %d", error);
if (error == ENOSPC)
- return XFS_ERROR(E2BIG);
+ return XFS_ERROR(EFBIG);
return error;
}
xfs_buf_relse(bp);
diff --git a/fs/xfs/xfs_rtalloc.h b/fs/xfs/xfs_rtalloc.h
index b2d67adb6a0..ff614c29b44 100644
--- a/fs/xfs/xfs_rtalloc.h
+++ b/fs/xfs/xfs_rtalloc.h
@@ -147,7 +147,16 @@ xfs_growfs_rt(
# define xfs_rtfree_extent(t,b,l) (ENOSYS)
# define xfs_rtpick_extent(m,t,l,rb) (ENOSYS)
# define xfs_growfs_rt(mp,in) (ENOSYS)
-# define xfs_rtmount_init(m) (((mp)->m_sb.sb_rblocks == 0)? 0 : (ENOSYS))
+static inline int /* error */
+xfs_rtmount_init(
+ xfs_mount_t *mp) /* file system mount structure */
+{
+ if (mp->m_sb.sb_rblocks == 0)
+ return 0;
+
+ cmn_err(CE_WARN, "XFS: Not built with CONFIG_XFS_RT");
+ return ENOSYS;
+}
# define xfs_rtmount_inodes(m) (((mp)->m_sb.sb_rblocks == 0)? 0 : (ENOSYS))
# define xfs_rtunmount_inodes(m)
#endif /* CONFIG_XFS_RT */
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index be578ecb4af..28547dfce03 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -44,137 +44,493 @@
#include "xfs_trans_priv.h"
#include "xfs_trans_space.h"
#include "xfs_inode_item.h"
+#include "xfs_trace.h"
kmem_zone_t *xfs_trans_zone;
+
+/*
+ * Various log reservation values.
+ *
+ * These are based on the size of the file system block because that is what
+ * most transactions manipulate. Each adds in an additional 128 bytes per
+ * item logged to try to account for the overhead of the transaction mechanism.
+ *
+ * Note: Most of the reservations underestimate the number of allocation
+ * groups into which they could free extents in the xfs_bmap_finish() call.
+ * This is because the number in the worst case is quite high and quite
+ * unusual. In order to fix this we need to change xfs_bmap_finish() to free
+ * extents in only a single AG at a time. This will require changes to the
+ * EFI code as well, however, so that the EFI for the extents not freed is
+ * logged again in each transaction. See SGI PV #261917.
+ *
+ * Reservation functions here avoid a huge stack in xfs_trans_init due to
+ * register overflow from temporaries in the calculations.
+ */
+
+
/*
- * Reservation functions here avoid a huge stack in xfs_trans_init
- * due to register overflow from temporaries in the calculations.
+ * In a write transaction we can allocate a maximum of 2
+ * extents. This gives:
+ * the inode getting the new extents: inode size
+ * the inode's bmap btree: max depth * block size
+ * the agfs of the ags from which the extents are allocated: 2 * sector
+ * the superblock free block counter: sector size
+ * the allocation btrees: 2 exts * 2 trees * (2 * max depth - 1) * block size
+ * And the bmap_finish transaction can free bmap blocks in a join:
+ * the agfs of the ags containing the blocks: 2 * sector size
+ * the agfls of the ags containing the blocks: 2 * sector size
+ * the super block free block counter: sector size
+ * the allocation btrees: 2 exts * 2 trees * (2 * max depth - 1) * block size
*/
STATIC uint
-xfs_calc_write_reservation(xfs_mount_t *mp)
+xfs_calc_write_reservation(
+ struct xfs_mount *mp)
{
- return XFS_CALC_WRITE_LOG_RES(mp) + XFS_DQUOT_LOGRES(mp);
+ return XFS_DQUOT_LOGRES(mp) +
+ MAX((mp->m_sb.sb_inodesize +
+ XFS_FSB_TO_B(mp, XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK)) +
+ 2 * mp->m_sb.sb_sectsize +
+ mp->m_sb.sb_sectsize +
+ XFS_ALLOCFREE_LOG_RES(mp, 2) +
+ 128 * (4 + XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) +
+ XFS_ALLOCFREE_LOG_COUNT(mp, 2))),
+ (2 * mp->m_sb.sb_sectsize +
+ 2 * mp->m_sb.sb_sectsize +
+ mp->m_sb.sb_sectsize +
+ XFS_ALLOCFREE_LOG_RES(mp, 2) +
+ 128 * (5 + XFS_ALLOCFREE_LOG_COUNT(mp, 2))));
}
+/*
+ * In truncating a file we free up to two extents at once. We can modify:
+ * the inode being truncated: inode size
+ * the inode's bmap btree: (max depth + 1) * block size
+ * And the bmap_finish transaction can free the blocks and bmap blocks:
+ * the agf for each of the ags: 4 * sector size
+ * the agfl for each of the ags: 4 * sector size
+ * the super block to reflect the freed blocks: sector size
+ * worst case split in allocation btrees per extent assuming 4 extents:
+ * 4 exts * 2 trees * (2 * max depth - 1) * block size
+ * the inode btree: max depth * blocksize
+ * the allocation btrees: 2 trees * (max depth - 1) * block size
+ */
STATIC uint
-xfs_calc_itruncate_reservation(xfs_mount_t *mp)
+xfs_calc_itruncate_reservation(
+ struct xfs_mount *mp)
{
- return XFS_CALC_ITRUNCATE_LOG_RES(mp) + XFS_DQUOT_LOGRES(mp);
+ return XFS_DQUOT_LOGRES(mp) +
+ MAX((mp->m_sb.sb_inodesize +
+ XFS_FSB_TO_B(mp, XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) + 1) +
+ 128 * (2 + XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK))),
+ (4 * mp->m_sb.sb_sectsize +
+ 4 * mp->m_sb.sb_sectsize +
+ mp->m_sb.sb_sectsize +
+ XFS_ALLOCFREE_LOG_RES(mp, 4) +
+ 128 * (9 + XFS_ALLOCFREE_LOG_COUNT(mp, 4)) +
+ 128 * 5 +
+ XFS_ALLOCFREE_LOG_RES(mp, 1) +
+ 128 * (2 + XFS_IALLOC_BLOCKS(mp) + mp->m_in_maxlevels +
+ XFS_ALLOCFREE_LOG_COUNT(mp, 1))));
}
+/*
+ * In renaming a files we can modify:
+ * the four inodes involved: 4 * inode size
+ * the two directory btrees: 2 * (max depth + v2) * dir block size
+ * the two directory bmap btrees: 2 * max depth * block size
+ * And the bmap_finish transaction can free dir and bmap blocks (two sets
+ * of bmap blocks) giving:
+ * the agf for the ags in which the blocks live: 3 * sector size
+ * the agfl for the ags in which the blocks live: 3 * sector size
+ * the superblock for the free block count: sector size
+ * the allocation btrees: 3 exts * 2 trees * (2 * max depth - 1) * block size
+ */
STATIC uint
-xfs_calc_rename_reservation(xfs_mount_t *mp)
+xfs_calc_rename_reservation(
+ struct xfs_mount *mp)
{
- return XFS_CALC_RENAME_LOG_RES(mp) + XFS_DQUOT_LOGRES(mp);
+ return XFS_DQUOT_LOGRES(mp) +
+ MAX((4 * mp->m_sb.sb_inodesize +
+ 2 * XFS_DIROP_LOG_RES(mp) +
+ 128 * (4 + 2 * XFS_DIROP_LOG_COUNT(mp))),
+ (3 * mp->m_sb.sb_sectsize +
+ 3 * mp->m_sb.sb_sectsize +
+ mp->m_sb.sb_sectsize +
+ XFS_ALLOCFREE_LOG_RES(mp, 3) +
+ 128 * (7 + XFS_ALLOCFREE_LOG_COUNT(mp, 3))));
}
+/*
+ * For creating a link to an inode:
+ * the parent directory inode: inode size
+ * the linked inode: inode size
+ * the directory btree could split: (max depth + v2) * dir block size
+ * the directory bmap btree could join or split: (max depth + v2) * blocksize
+ * And the bmap_finish transaction can free some bmap blocks giving:
+ * the agf for the ag in which the blocks live: sector size
+ * the agfl for the ag in which the blocks live: sector size
+ * the superblock for the free block count: sector size
+ * the allocation btrees: 2 trees * (2 * max depth - 1) * block size
+ */
STATIC uint
-xfs_calc_link_reservation(xfs_mount_t *mp)
+xfs_calc_link_reservation(
+ struct xfs_mount *mp)
{
- return XFS_CALC_LINK_LOG_RES(mp) + XFS_DQUOT_LOGRES(mp);
+ return XFS_DQUOT_LOGRES(mp) +
+ MAX((mp->m_sb.sb_inodesize +
+ mp->m_sb.sb_inodesize +
+ XFS_DIROP_LOG_RES(mp) +
+ 128 * (2 + XFS_DIROP_LOG_COUNT(mp))),
+ (mp->m_sb.sb_sectsize +
+ mp->m_sb.sb_sectsize +
+ mp->m_sb.sb_sectsize +
+ XFS_ALLOCFREE_LOG_RES(mp, 1) +
+ 128 * (3 + XFS_ALLOCFREE_LOG_COUNT(mp, 1))));
}
+/*
+ * For removing a directory entry we can modify:
+ * the parent directory inode: inode size
+ * the removed inode: inode size
+ * the directory btree could join: (max depth + v2) * dir block size
+ * the directory bmap btree could join or split: (max depth + v2) * blocksize
+ * And the bmap_finish transaction can free the dir and bmap blocks giving:
+ * the agf for the ag in which the blocks live: 2 * sector size
+ * the agfl for the ag in which the blocks live: 2 * sector size
+ * the superblock for the free block count: sector size
+ * the allocation btrees: 2 exts * 2 trees * (2 * max depth - 1) * block size
+ */
STATIC uint
-xfs_calc_remove_reservation(xfs_mount_t *mp)
+xfs_calc_remove_reservation(
+ struct xfs_mount *mp)
{
- return XFS_CALC_REMOVE_LOG_RES(mp) + XFS_DQUOT_LOGRES(mp);
+ return XFS_DQUOT_LOGRES(mp) +
+ MAX((mp->m_sb.sb_inodesize +
+ mp->m_sb.sb_inodesize +
+ XFS_DIROP_LOG_RES(mp) +
+ 128 * (2 + XFS_DIROP_LOG_COUNT(mp))),
+ (2 * mp->m_sb.sb_sectsize +
+ 2 * mp->m_sb.sb_sectsize +
+ mp->m_sb.sb_sectsize +
+ XFS_ALLOCFREE_LOG_RES(mp, 2) +
+ 128 * (5 + XFS_ALLOCFREE_LOG_COUNT(mp, 2))));
}
+/*
+ * For symlink we can modify:
+ * the parent directory inode: inode size
+ * the new inode: inode size
+ * the inode btree entry: 1 block
+ * the directory btree: (max depth + v2) * dir block size
+ * the directory inode's bmap btree: (max depth + v2) * block size
+ * the blocks for the symlink: 1 kB
+ * Or in the first xact we allocate some inodes giving:
+ * the agi and agf of the ag getting the new inodes: 2 * sectorsize
+ * the inode blocks allocated: XFS_IALLOC_BLOCKS * blocksize
+ * the inode btree: max depth * blocksize
+ * the allocation btrees: 2 trees * (2 * max depth - 1) * block size
+ */
STATIC uint
-xfs_calc_symlink_reservation(xfs_mount_t *mp)
+xfs_calc_symlink_reservation(
+ struct xfs_mount *mp)
{
- return XFS_CALC_SYMLINK_LOG_RES(mp) + XFS_DQUOT_LOGRES(mp);
+ return XFS_DQUOT_LOGRES(mp) +
+ MAX((mp->m_sb.sb_inodesize +
+ mp->m_sb.sb_inodesize +
+ XFS_FSB_TO_B(mp, 1) +
+ XFS_DIROP_LOG_RES(mp) +
+ 1024 +
+ 128 * (4 + XFS_DIROP_LOG_COUNT(mp))),
+ (2 * mp->m_sb.sb_sectsize +
+ XFS_FSB_TO_B(mp, XFS_IALLOC_BLOCKS(mp)) +
+ XFS_FSB_TO_B(mp, mp->m_in_maxlevels) +
+ XFS_ALLOCFREE_LOG_RES(mp, 1) +
+ 128 * (2 + XFS_IALLOC_BLOCKS(mp) + mp->m_in_maxlevels +
+ XFS_ALLOCFREE_LOG_COUNT(mp, 1))));
}
+/*
+ * For create we can modify:
+ * the parent directory inode: inode size
+ * the new inode: inode size
+ * the inode btree entry: block size
+ * the superblock for the nlink flag: sector size
+ * the directory btree: (max depth + v2) * dir block size
+ * the directory inode's bmap btree: (max depth + v2) * block size
+ * Or in the first xact we allocate some inodes giving:
+ * the agi and agf of the ag getting the new inodes: 2 * sectorsize
+ * the superblock for the nlink flag: sector size
+ * the inode blocks allocated: XFS_IALLOC_BLOCKS * blocksize
+ * the inode btree: max depth * blocksize
+ * the allocation btrees: 2 trees * (max depth - 1) * block size
+ */
STATIC uint
-xfs_calc_create_reservation(xfs_mount_t *mp)
+xfs_calc_create_reservation(
+ struct xfs_mount *mp)
{
- return XFS_CALC_CREATE_LOG_RES(mp) + XFS_DQUOT_LOGRES(mp);
+ return XFS_DQUOT_LOGRES(mp) +
+ MAX((mp->m_sb.sb_inodesize +
+ mp->m_sb.sb_inodesize +
+ mp->m_sb.sb_sectsize +
+ XFS_FSB_TO_B(mp, 1) +
+ XFS_DIROP_LOG_RES(mp) +
+ 128 * (3 + XFS_DIROP_LOG_COUNT(mp))),
+ (3 * mp->m_sb.sb_sectsize +
+ XFS_FSB_TO_B(mp, XFS_IALLOC_BLOCKS(mp)) +
+ XFS_FSB_TO_B(mp, mp->m_in_maxlevels) +
+ XFS_ALLOCFREE_LOG_RES(mp, 1) +
+ 128 * (2 + XFS_IALLOC_BLOCKS(mp) + mp->m_in_maxlevels +
+ XFS_ALLOCFREE_LOG_COUNT(mp, 1))));
}
+/*
+ * Making a new directory is the same as creating a new file.
+ */
STATIC uint
-xfs_calc_mkdir_reservation(xfs_mount_t *mp)
+xfs_calc_mkdir_reservation(
+ struct xfs_mount *mp)
{
- return XFS_CALC_MKDIR_LOG_RES(mp) + XFS_DQUOT_LOGRES(mp);
+ return xfs_calc_create_reservation(mp);
}
+/*
+ * In freeing an inode we can modify:
+ * the inode being freed: inode size
+ * the super block free inode counter: sector size
+ * the agi hash list and counters: sector size
+ * the inode btree entry: block size
+ * the on disk inode before ours in the agi hash list: inode cluster size
+ * the inode btree: max depth * blocksize
+ * the allocation btrees: 2 trees * (max depth - 1) * block size
+ */
STATIC uint
-xfs_calc_ifree_reservation(xfs_mount_t *mp)
+xfs_calc_ifree_reservation(
+ struct xfs_mount *mp)
{
- return XFS_CALC_IFREE_LOG_RES(mp) + XFS_DQUOT_LOGRES(mp);
+ return XFS_DQUOT_LOGRES(mp) +
+ mp->m_sb.sb_inodesize +
+ mp->m_sb.sb_sectsize +
+ mp->m_sb.sb_sectsize +
+ XFS_FSB_TO_B(mp, 1) +
+ MAX((__uint16_t)XFS_FSB_TO_B(mp, 1),
+ XFS_INODE_CLUSTER_SIZE(mp)) +
+ 128 * 5 +
+ XFS_ALLOCFREE_LOG_RES(mp, 1) +
+ 128 * (2 + XFS_IALLOC_BLOCKS(mp) + mp->m_in_maxlevels +
+ XFS_ALLOCFREE_LOG_COUNT(mp, 1));
}
+/*
+ * When only changing the inode we log the inode and possibly the superblock
+ * We also add a bit of slop for the transaction stuff.
+ */
STATIC uint
-xfs_calc_ichange_reservation(xfs_mount_t *mp)
+xfs_calc_ichange_reservation(
+ struct xfs_mount *mp)
{
- return XFS_CALC_ICHANGE_LOG_RES(mp) + XFS_DQUOT_LOGRES(mp);
+ return XFS_DQUOT_LOGRES(mp) +
+ mp->m_sb.sb_inodesize +
+ mp->m_sb.sb_sectsize +
+ 512;
+
}
+/*
+ * Growing the data section of the filesystem.
+ * superblock
+ * agi and agf
+ * allocation btrees
+ */
STATIC uint
-xfs_calc_growdata_reservation(xfs_mount_t *mp)
+xfs_calc_growdata_reservation(
+ struct xfs_mount *mp)
{
- return XFS_CALC_GROWDATA_LOG_RES(mp);
+ return mp->m_sb.sb_sectsize * 3 +
+ XFS_ALLOCFREE_LOG_RES(mp, 1) +
+ 128 * (3 + XFS_ALLOCFREE_LOG_COUNT(mp, 1));
}
+/*
+ * Growing the rt section of the filesystem.
+ * In the first set of transactions (ALLOC) we allocate space to the
+ * bitmap or summary files.
+ * superblock: sector size
+ * agf of the ag from which the extent is allocated: sector size
+ * bmap btree for bitmap/summary inode: max depth * blocksize
+ * bitmap/summary inode: inode size
+ * allocation btrees for 1 block alloc: 2 * (2 * maxdepth - 1) * blocksize
+ */
STATIC uint
-xfs_calc_growrtalloc_reservation(xfs_mount_t *mp)
+xfs_calc_growrtalloc_reservation(
+ struct xfs_mount *mp)
{
- return XFS_CALC_GROWRTALLOC_LOG_RES(mp);
+ return 2 * mp->m_sb.sb_sectsize +
+ XFS_FSB_TO_B(mp, XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK)) +
+ mp->m_sb.sb_inodesize +
+ XFS_ALLOCFREE_LOG_RES(mp, 1) +
+ 128 * (3 + XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) +
+ XFS_ALLOCFREE_LOG_COUNT(mp, 1));
}
+/*
+ * Growing the rt section of the filesystem.
+ * In the second set of transactions (ZERO) we zero the new metadata blocks.
+ * one bitmap/summary block: blocksize
+ */
STATIC uint
-xfs_calc_growrtzero_reservation(xfs_mount_t *mp)
+xfs_calc_growrtzero_reservation(
+ struct xfs_mount *mp)
{
- return XFS_CALC_GROWRTZERO_LOG_RES(mp);
+ return mp->m_sb.sb_blocksize + 128;
}
+/*
+ * Growing the rt section of the filesystem.
+ * In the third set of transactions (FREE) we update metadata without
+ * allocating any new blocks.
+ * superblock: sector size
+ * bitmap inode: inode size
+ * summary inode: inode size
+ * one bitmap block: blocksize
+ * summary blocks: new summary size
+ */
STATIC uint
-xfs_calc_growrtfree_reservation(xfs_mount_t *mp)
+xfs_calc_growrtfree_reservation(
+ struct xfs_mount *mp)
{
- return XFS_CALC_GROWRTFREE_LOG_RES(mp);
+ return mp->m_sb.sb_sectsize +
+ 2 * mp->m_sb.sb_inodesize +
+ mp->m_sb.sb_blocksize +
+ mp->m_rsumsize +
+ 128 * 5;
}
+/*
+ * Logging the inode modification timestamp on a synchronous write.
+ * inode
+ */
STATIC uint
-xfs_calc_swrite_reservation(xfs_mount_t *mp)
+xfs_calc_swrite_reservation(
+ struct xfs_mount *mp)
{
- return XFS_CALC_SWRITE_LOG_RES(mp);
+ return mp->m_sb.sb_inodesize + 128;
}
+/*
+ * Logging the inode mode bits when writing a setuid/setgid file
+ * inode
+ */
STATIC uint
xfs_calc_writeid_reservation(xfs_mount_t *mp)
{
- return XFS_CALC_WRITEID_LOG_RES(mp);
+ return mp->m_sb.sb_inodesize + 128;
}
+/*
+ * Converting the inode from non-attributed to attributed.
+ * the inode being converted: inode size
+ * agf block and superblock (for block allocation)
+ * the new block (directory sized)
+ * bmap blocks for the new directory block
+ * allocation btrees
+ */
STATIC uint
-xfs_calc_addafork_reservation(xfs_mount_t *mp)
+xfs_calc_addafork_reservation(
+ struct xfs_mount *mp)
{
- return XFS_CALC_ADDAFORK_LOG_RES(mp) + XFS_DQUOT_LOGRES(mp);
+ return XFS_DQUOT_LOGRES(mp) +
+ mp->m_sb.sb_inodesize +
+ mp->m_sb.sb_sectsize * 2 +
+ mp->m_dirblksize +
+ XFS_FSB_TO_B(mp, XFS_DAENTER_BMAP1B(mp, XFS_DATA_FORK) + 1) +
+ XFS_ALLOCFREE_LOG_RES(mp, 1) +
+ 128 * (4 + XFS_DAENTER_BMAP1B(mp, XFS_DATA_FORK) + 1 +
+ XFS_ALLOCFREE_LOG_COUNT(mp, 1));
}
+/*
+ * Removing the attribute fork of a file
+ * the inode being truncated: inode size
+ * the inode's bmap btree: max depth * block size
+ * And the bmap_finish transaction can free the blocks and bmap blocks:
+ * the agf for each of the ags: 4 * sector size
+ * the agfl for each of the ags: 4 * sector size
+ * the super block to reflect the freed blocks: sector size
+ * worst case split in allocation btrees per extent assuming 4 extents:
+ * 4 exts * 2 trees * (2 * max depth - 1) * block size
+ */
STATIC uint
-xfs_calc_attrinval_reservation(xfs_mount_t *mp)
+xfs_calc_attrinval_reservation(
+ struct xfs_mount *mp)
{
- return XFS_CALC_ATTRINVAL_LOG_RES(mp);
+ return MAX((mp->m_sb.sb_inodesize +
+ XFS_FSB_TO_B(mp, XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK)) +
+ 128 * (1 + XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK))),
+ (4 * mp->m_sb.sb_sectsize +
+ 4 * mp->m_sb.sb_sectsize +
+ mp->m_sb.sb_sectsize +
+ XFS_ALLOCFREE_LOG_RES(mp, 4) +
+ 128 * (9 + XFS_ALLOCFREE_LOG_COUNT(mp, 4))));
}
+/*
+ * Setting an attribute.
+ * the inode getting the attribute
+ * the superblock for allocations
+ * the agfs extents are allocated from
+ * the attribute btree * max depth
+ * the inode allocation btree
+ * Since attribute transaction space is dependent on the size of the attribute,
+ * the calculation is done partially at mount time and partially at runtime.
+ */
STATIC uint
-xfs_calc_attrset_reservation(xfs_mount_t *mp)
+xfs_calc_attrset_reservation(
+ struct xfs_mount *mp)
{
- return XFS_CALC_ATTRSET_LOG_RES(mp) + XFS_DQUOT_LOGRES(mp);
+ return XFS_DQUOT_LOGRES(mp) +
+ mp->m_sb.sb_inodesize +
+ mp->m_sb.sb_sectsize +
+ XFS_FSB_TO_B(mp, XFS_DA_NODE_MAXDEPTH) +
+ 128 * (2 + XFS_DA_NODE_MAXDEPTH);
}
+/*
+ * Removing an attribute.
+ * the inode: inode size
+ * the attribute btree could join: max depth * block size
+ * the inode bmap btree could join or split: max depth * block size
+ * And the bmap_finish transaction can free the attr blocks freed giving:
+ * the agf for the ag in which the blocks live: 2 * sector size
+ * the agfl for the ag in which the blocks live: 2 * sector size
+ * the superblock for the free block count: sector size
+ * the allocation btrees: 2 exts * 2 trees * (2 * max depth - 1) * block size
+ */
STATIC uint
-xfs_calc_attrrm_reservation(xfs_mount_t *mp)
+xfs_calc_attrrm_reservation(
+ struct xfs_mount *mp)
{
- return XFS_CALC_ATTRRM_LOG_RES(mp) + XFS_DQUOT_LOGRES(mp);
+ return XFS_DQUOT_LOGRES(mp) +
+ MAX((mp->m_sb.sb_inodesize +
+ XFS_FSB_TO_B(mp, XFS_DA_NODE_MAXDEPTH) +
+ XFS_FSB_TO_B(mp, XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK)) +
+ 128 * (1 + XFS_DA_NODE_MAXDEPTH +
+ XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK))),
+ (2 * mp->m_sb.sb_sectsize +
+ 2 * mp->m_sb.sb_sectsize +
+ mp->m_sb.sb_sectsize +
+ XFS_ALLOCFREE_LOG_RES(mp, 2) +
+ 128 * (5 + XFS_ALLOCFREE_LOG_COUNT(mp, 2))));
}
+/*
+ * Clearing a bad agino number in an agi hash bucket.
+ */
STATIC uint
-xfs_calc_clear_agi_bucket_reservation(xfs_mount_t *mp)
+xfs_calc_clear_agi_bucket_reservation(
+ struct xfs_mount *mp)
{
- return XFS_CALC_CLEAR_AGI_BUCKET_LOG_RES(mp);
+ return mp->m_sb.sb_sectsize + 128;
}
/*
@@ -183,11 +539,10 @@ xfs_calc_clear_agi_bucket_reservation(xfs_mount_t *mp)
*/
void
xfs_trans_init(
- xfs_mount_t *mp)
+ struct xfs_mount *mp)
{
- xfs_trans_reservations_t *resp;
+ struct xfs_trans_reservations *resp = &mp->m_reservations;
- resp = &(mp->m_reservations);
resp->tr_write = xfs_calc_write_reservation(mp);
resp->tr_itruncate = xfs_calc_itruncate_reservation(mp);
resp->tr_rename = xfs_calc_rename_reservation(mp);
@@ -243,9 +598,8 @@ _xfs_trans_alloc(
tp->t_type = type;
tp->t_mountp = mp;
tp->t_items_free = XFS_LIC_NUM_SLOTS;
- tp->t_busy_free = XFS_LBC_NUM_SLOTS;
xfs_lic_init(&(tp->t_items));
- XFS_LBC_INIT(&(tp->t_busy));
+ INIT_LIST_HEAD(&tp->t_busy);
return tp;
}
@@ -255,8 +609,13 @@ _xfs_trans_alloc(
*/
STATIC void
xfs_trans_free(
- xfs_trans_t *tp)
+ struct xfs_trans *tp)
{
+ struct xfs_busy_extent *busyp, *n;
+
+ list_for_each_entry_safe(busyp, n, &tp->t_busy, list)
+ xfs_alloc_busy_clear(tp->t_mountp, busyp);
+
atomic_dec(&tp->t_mountp->m_active_trans);
xfs_trans_free_dqinfo(tp);
kmem_zone_free(xfs_trans_zone, tp);
@@ -285,9 +644,8 @@ xfs_trans_dup(
ntp->t_type = tp->t_type;
ntp->t_mountp = tp->t_mountp;
ntp->t_items_free = XFS_LIC_NUM_SLOTS;
- ntp->t_busy_free = XFS_LBC_NUM_SLOTS;
xfs_lic_init(&(ntp->t_items));
- XFS_LBC_INIT(&(ntp->t_busy));
+ INIT_LIST_HEAD(&ntp->t_busy);
ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES);
ASSERT(tp->t_ticket != NULL);
@@ -423,7 +781,6 @@ undo_blocks:
return error;
}
-
/*
* Record the indicated change to the given field for application
* to the file system's superblock when the transaction commits.
@@ -652,7 +1009,7 @@ xfs_trans_apply_sb_deltas(
* XFS_TRANS_SB_DIRTY will not be set when the transaction is updated but we
* still need to update the incore superblock with the changes.
*/
-STATIC void
+void
xfs_trans_unreserve_and_mod_sb(
xfs_trans_t *tp)
{
@@ -880,7 +1237,7 @@ xfs_trans_fill_vecs(
* they could be immediately flushed and we'd have to race with the flusher
* trying to pull the item from the AIL as we add it.
*/
-static void
+void
xfs_trans_item_committed(
struct xfs_log_item *lip,
xfs_lsn_t commit_lsn,
@@ -930,26 +1287,6 @@ xfs_trans_item_committed(
IOP_UNPIN(lip);
}
-/* Clear all the per-AG busy list items listed in this transaction */
-static void
-xfs_trans_clear_busy_extents(
- struct xfs_trans *tp)
-{
- xfs_log_busy_chunk_t *lbcp;
- xfs_log_busy_slot_t *lbsp;
- int i;
-
- for (lbcp = &tp->t_busy; lbcp != NULL; lbcp = lbcp->lbc_next) {
- i = 0;
- for (lbsp = lbcp->lbc_busy; i < lbcp->lbc_unused; i++, lbsp++) {
- if (XFS_LBC_ISFREE(lbcp, i))
- continue;
- xfs_alloc_clear_busy(tp, lbsp->lbc_ag, lbsp->lbc_idx);
- }
- }
- xfs_trans_free_busy(tp);
-}
-
/*
* This is typically called by the LM when a transaction has been fully
* committed to disk. It needs to unpin the items which have
@@ -984,7 +1321,6 @@ xfs_trans_committed(
kmem_free(licp);
}
- xfs_trans_clear_busy_extents(tp);
xfs_trans_free(tp);
}
@@ -1012,8 +1348,7 @@ xfs_trans_uncommit(
xfs_trans_unreserve_and_mod_sb(tp);
xfs_trans_unreserve_and_mod_dquots(tp);
- xfs_trans_free_items(tp, flags);
- xfs_trans_free_busy(tp);
+ xfs_trans_free_items(tp, NULLCOMMITLSN, flags);
xfs_trans_free(tp);
}
@@ -1075,6 +1410,8 @@ xfs_trans_commit_iclog(
*commit_lsn = xfs_log_done(mp, tp->t_ticket, &commit_iclog, log_flags);
tp->t_commit_lsn = *commit_lsn;
+ trace_xfs_trans_commit_lsn(tp);
+
if (nvec > XFS_TRANS_LOGVEC_COUNT)
kmem_free(log_vector);
@@ -1161,6 +1498,93 @@ xfs_trans_commit_iclog(
return xfs_log_release_iclog(mp, commit_iclog);
}
+/*
+ * Walk the log items and allocate log vector structures for
+ * each item large enough to fit all the vectors they require.
+ * Note that this format differs from the old log vector format in
+ * that there is no transaction header in these log vectors.
+ */
+STATIC struct xfs_log_vec *
+xfs_trans_alloc_log_vecs(
+ xfs_trans_t *tp)
+{
+ xfs_log_item_desc_t *lidp;
+ struct xfs_log_vec *lv = NULL;
+ struct xfs_log_vec *ret_lv = NULL;
+
+ lidp = xfs_trans_first_item(tp);
+
+ /* Bail out if we didn't find a log item. */
+ if (!lidp) {
+ ASSERT(0);
+ return NULL;
+ }
+
+ while (lidp != NULL) {
+ struct xfs_log_vec *new_lv;
+
+ /* Skip items which aren't dirty in this transaction. */
+ if (!(lidp->lid_flags & XFS_LID_DIRTY)) {
+ lidp = xfs_trans_next_item(tp, lidp);
+ continue;
+ }
+
+ /* Skip items that do not have any vectors for writing */
+ lidp->lid_size = IOP_SIZE(lidp->lid_item);
+ if (!lidp->lid_size) {
+ lidp = xfs_trans_next_item(tp, lidp);
+ continue;
+ }
+
+ new_lv = kmem_zalloc(sizeof(*new_lv) +
+ lidp->lid_size * sizeof(struct xfs_log_iovec),
+ KM_SLEEP);
+
+ /* The allocated iovec region lies beyond the log vector. */
+ new_lv->lv_iovecp = (struct xfs_log_iovec *)&new_lv[1];
+ new_lv->lv_niovecs = lidp->lid_size;
+ new_lv->lv_item = lidp->lid_item;
+ if (!ret_lv)
+ ret_lv = new_lv;
+ else
+ lv->lv_next = new_lv;
+ lv = new_lv;
+ lidp = xfs_trans_next_item(tp, lidp);
+ }
+
+ return ret_lv;
+}
+
+static int
+xfs_trans_commit_cil(
+ struct xfs_mount *mp,
+ struct xfs_trans *tp,
+ xfs_lsn_t *commit_lsn,
+ int flags)
+{
+ struct xfs_log_vec *log_vector;
+ int error;
+
+ /*
+ * Get each log item to allocate a vector structure for
+ * the log item to to pass to the log write code. The
+ * CIL commit code will format the vector and save it away.
+ */
+ log_vector = xfs_trans_alloc_log_vecs(tp);
+ if (!log_vector)
+ return ENOMEM;
+
+ error = xfs_log_commit_cil(mp, tp, log_vector, commit_lsn, flags);
+ if (error)
+ return error;
+
+ current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS);
+
+ /* xfs_trans_free_items() unlocks them first */
+ xfs_trans_free_items(tp, *commit_lsn, 0);
+ xfs_trans_free(tp);
+ return 0;
+}
/*
* xfs_trans_commit
@@ -1221,7 +1645,11 @@ _xfs_trans_commit(
xfs_trans_apply_sb_deltas(tp);
xfs_trans_apply_dquot_deltas(tp);
- error = xfs_trans_commit_iclog(mp, tp, &commit_lsn, flags);
+ if (mp->m_flags & XFS_MOUNT_DELAYLOG)
+ error = xfs_trans_commit_cil(mp, tp, &commit_lsn, flags);
+ else
+ error = xfs_trans_commit_iclog(mp, tp, &commit_lsn, flags);
+
if (error == ENOMEM) {
xfs_force_shutdown(mp, SHUTDOWN_LOG_IO_ERROR);
error = XFS_ERROR(EIO);
@@ -1259,8 +1687,7 @@ out_unreserve:
error = XFS_ERROR(EIO);
}
current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS);
- xfs_trans_free_items(tp, error ? XFS_TRANS_ABORT : 0);
- xfs_trans_free_busy(tp);
+ xfs_trans_free_items(tp, NULLCOMMITLSN, error ? XFS_TRANS_ABORT : 0);
xfs_trans_free(tp);
XFS_STATS_INC(xs_trans_empty);
@@ -1338,8 +1765,7 @@ xfs_trans_cancel(
/* mark this thread as no longer being in a transaction */
current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS);
- xfs_trans_free_items(tp, flags);
- xfs_trans_free_busy(tp);
+ xfs_trans_free_items(tp, NULLCOMMITLSN, flags);
xfs_trans_free(tp);
}
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
index c62beee0921..e639e8e9a2a 100644
--- a/fs/xfs/xfs_trans.h
+++ b/fs/xfs/xfs_trans.h
@@ -106,7 +106,8 @@ typedef struct xfs_trans_header {
#define XFS_TRANS_GROWFSRT_FREE 39
#define XFS_TRANS_SWAPEXT 40
#define XFS_TRANS_SB_COUNT 41
-#define XFS_TRANS_TYPE_MAX 41
+#define XFS_TRANS_CHECKPOINT 42
+#define XFS_TRANS_TYPE_MAX 42
/* new transaction types need to be reflected in xfs_logprint(8) */
#define XFS_TRANS_TYPES \
@@ -148,6 +149,7 @@ typedef struct xfs_trans_header {
{ XFS_TRANS_GROWFSRT_FREE, "GROWFSRT_FREE" }, \
{ XFS_TRANS_SWAPEXT, "SWAPEXT" }, \
{ XFS_TRANS_SB_COUNT, "SB_COUNT" }, \
+ { XFS_TRANS_CHECKPOINT, "CHECKPOINT" }, \
{ XFS_TRANS_DUMMY1, "DUMMY1" }, \
{ XFS_TRANS_DUMMY2, "DUMMY2" }, \
{ XLOG_UNMOUNT_REC_TYPE, "UNMOUNT" }
@@ -298,24 +300,6 @@ xfs_lic_desc_to_chunk(xfs_log_item_desc_t *dp)
/*
- * Various log reservation values.
- * These are based on the size of the file system block
- * because that is what most transactions manipulate.
- * Each adds in an additional 128 bytes per item logged to
- * try to account for the overhead of the transaction mechanism.
- *
- * Note:
- * Most of the reservations underestimate the number of allocation
- * groups into which they could free extents in the xfs_bmap_finish()
- * call. This is because the number in the worst case is quite high
- * and quite unusual. In order to fix this we need to change
- * xfs_bmap_finish() to free extents in only a single AG at a time.
- * This will require changes to the EFI code as well, however, so that
- * the EFI for the extents not freed is logged again in each transaction.
- * See bug 261917.
- */
-
-/*
* Per-extent log reservation for the allocation btree changes
* involved in freeing or allocating an extent.
* 2 trees * (2 blocks/level * max depth - 1) * block size
@@ -339,429 +323,36 @@ xfs_lic_desc_to_chunk(xfs_log_item_desc_t *dp)
(XFS_DAENTER_BLOCKS(mp, XFS_DATA_FORK) + \
XFS_DAENTER_BMAPS(mp, XFS_DATA_FORK) + 1)
-/*
- * In a write transaction we can allocate a maximum of 2
- * extents. This gives:
- * the inode getting the new extents: inode size
- * the inode's bmap btree: max depth * block size
- * the agfs of the ags from which the extents are allocated: 2 * sector
- * the superblock free block counter: sector size
- * the allocation btrees: 2 exts * 2 trees * (2 * max depth - 1) * block size
- * And the bmap_finish transaction can free bmap blocks in a join:
- * the agfs of the ags containing the blocks: 2 * sector size
- * the agfls of the ags containing the blocks: 2 * sector size
- * the super block free block counter: sector size
- * the allocation btrees: 2 exts * 2 trees * (2 * max depth - 1) * block size
- */
-#define XFS_CALC_WRITE_LOG_RES(mp) \
- (MAX( \
- ((mp)->m_sb.sb_inodesize + \
- XFS_FSB_TO_B((mp), XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK)) + \
- (2 * (mp)->m_sb.sb_sectsize) + \
- (mp)->m_sb.sb_sectsize + \
- XFS_ALLOCFREE_LOG_RES(mp, 2) + \
- (128 * (4 + XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) + XFS_ALLOCFREE_LOG_COUNT(mp, 2)))),\
- ((2 * (mp)->m_sb.sb_sectsize) + \
- (2 * (mp)->m_sb.sb_sectsize) + \
- (mp)->m_sb.sb_sectsize + \
- XFS_ALLOCFREE_LOG_RES(mp, 2) + \
- (128 * (5 + XFS_ALLOCFREE_LOG_COUNT(mp, 2))))))
#define XFS_WRITE_LOG_RES(mp) ((mp)->m_reservations.tr_write)
-
-/*
- * In truncating a file we free up to two extents at once. We can modify:
- * the inode being truncated: inode size
- * the inode's bmap btree: (max depth + 1) * block size
- * And the bmap_finish transaction can free the blocks and bmap blocks:
- * the agf for each of the ags: 4 * sector size
- * the agfl for each of the ags: 4 * sector size
- * the super block to reflect the freed blocks: sector size
- * worst case split in allocation btrees per extent assuming 4 extents:
- * 4 exts * 2 trees * (2 * max depth - 1) * block size
- * the inode btree: max depth * blocksize
- * the allocation btrees: 2 trees * (max depth - 1) * block size
- */
-#define XFS_CALC_ITRUNCATE_LOG_RES(mp) \
- (MAX( \
- ((mp)->m_sb.sb_inodesize + \
- XFS_FSB_TO_B((mp), XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) + 1) + \
- (128 * (2 + XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK)))), \
- ((4 * (mp)->m_sb.sb_sectsize) + \
- (4 * (mp)->m_sb.sb_sectsize) + \
- (mp)->m_sb.sb_sectsize + \
- XFS_ALLOCFREE_LOG_RES(mp, 4) + \
- (128 * (9 + XFS_ALLOCFREE_LOG_COUNT(mp, 4))) + \
- (128 * 5) + \
- XFS_ALLOCFREE_LOG_RES(mp, 1) + \
- (128 * (2 + XFS_IALLOC_BLOCKS(mp) + (mp)->m_in_maxlevels + \
- XFS_ALLOCFREE_LOG_COUNT(mp, 1))))))
-
#define XFS_ITRUNCATE_LOG_RES(mp) ((mp)->m_reservations.tr_itruncate)
-
-/*
- * In renaming a files we can modify:
- * the four inodes involved: 4 * inode size
- * the two directory btrees: 2 * (max depth + v2) * dir block size
- * the two directory bmap btrees: 2 * max depth * block size
- * And the bmap_finish transaction can free dir and bmap blocks (two sets
- * of bmap blocks) giving:
- * the agf for the ags in which the blocks live: 3 * sector size
- * the agfl for the ags in which the blocks live: 3 * sector size
- * the superblock for the free block count: sector size
- * the allocation btrees: 3 exts * 2 trees * (2 * max depth - 1) * block size
- */
-#define XFS_CALC_RENAME_LOG_RES(mp) \
- (MAX( \
- ((4 * (mp)->m_sb.sb_inodesize) + \
- (2 * XFS_DIROP_LOG_RES(mp)) + \
- (128 * (4 + 2 * XFS_DIROP_LOG_COUNT(mp)))), \
- ((3 * (mp)->m_sb.sb_sectsize) + \
- (3 * (mp)->m_sb.sb_sectsize) + \
- (mp)->m_sb.sb_sectsize + \
- XFS_ALLOCFREE_LOG_RES(mp, 3) + \
- (128 * (7 + XFS_ALLOCFREE_LOG_COUNT(mp, 3))))))
-
#define XFS_RENAME_LOG_RES(mp) ((mp)->m_reservations.tr_rename)
-
-/*
- * For creating a link to an inode:
- * the parent directory inode: inode size
- * the linked inode: inode size
- * the directory btree could split: (max depth + v2) * dir block size
- * the directory bmap btree could join or split: (max depth + v2) * blocksize
- * And the bmap_finish transaction can free some bmap blocks giving:
- * the agf for the ag in which the blocks live: sector size
- * the agfl for the ag in which the blocks live: sector size
- * the superblock for the free block count: sector size
- * the allocation btrees: 2 trees * (2 * max depth - 1) * block size
- */
-#define XFS_CALC_LINK_LOG_RES(mp) \
- (MAX( \
- ((mp)->m_sb.sb_inodesize + \
- (mp)->m_sb.sb_inodesize + \
- XFS_DIROP_LOG_RES(mp) + \
- (128 * (2 + XFS_DIROP_LOG_COUNT(mp)))), \
- ((mp)->m_sb.sb_sectsize + \
- (mp)->m_sb.sb_sectsize + \
- (mp)->m_sb.sb_sectsize + \
- XFS_ALLOCFREE_LOG_RES(mp, 1) + \
- (128 * (3 + XFS_ALLOCFREE_LOG_COUNT(mp, 1))))))
-
#define XFS_LINK_LOG_RES(mp) ((mp)->m_reservations.tr_link)
-
-/*
- * For removing a directory entry we can modify:
- * the parent directory inode: inode size
- * the removed inode: inode size
- * the directory btree could join: (max depth + v2) * dir block size
- * the directory bmap btree could join or split: (max depth + v2) * blocksize
- * And the bmap_finish transaction can free the dir and bmap blocks giving:
- * the agf for the ag in which the blocks live: 2 * sector size
- * the agfl for the ag in which the blocks live: 2 * sector size
- * the superblock for the free block count: sector size
- * the allocation btrees: 2 exts * 2 trees * (2 * max depth - 1) * block size
- */
-#define XFS_CALC_REMOVE_LOG_RES(mp) \
- (MAX( \
- ((mp)->m_sb.sb_inodesize + \
- (mp)->m_sb.sb_inodesize + \
- XFS_DIROP_LOG_RES(mp) + \
- (128 * (2 + XFS_DIROP_LOG_COUNT(mp)))), \
- ((2 * (mp)->m_sb.sb_sectsize) + \
- (2 * (mp)->m_sb.sb_sectsize) + \
- (mp)->m_sb.sb_sectsize + \
- XFS_ALLOCFREE_LOG_RES(mp, 2) + \
- (128 * (5 + XFS_ALLOCFREE_LOG_COUNT(mp, 2))))))
-
#define XFS_REMOVE_LOG_RES(mp) ((mp)->m_reservations.tr_remove)
-
-/*
- * For symlink we can modify:
- * the parent directory inode: inode size
- * the new inode: inode size
- * the inode btree entry: 1 block
- * the directory btree: (max depth + v2) * dir block size
- * the directory inode's bmap btree: (max depth + v2) * block size
- * the blocks for the symlink: 1 kB
- * Or in the first xact we allocate some inodes giving:
- * the agi and agf of the ag getting the new inodes: 2 * sectorsize
- * the inode blocks allocated: XFS_IALLOC_BLOCKS * blocksize
- * the inode btree: max depth * blocksize
- * the allocation btrees: 2 trees * (2 * max depth - 1) * block size
- */
-#define XFS_CALC_SYMLINK_LOG_RES(mp) \
- (MAX( \
- ((mp)->m_sb.sb_inodesize + \
- (mp)->m_sb.sb_inodesize + \
- XFS_FSB_TO_B(mp, 1) + \
- XFS_DIROP_LOG_RES(mp) + \
- 1024 + \
- (128 * (4 + XFS_DIROP_LOG_COUNT(mp)))), \
- (2 * (mp)->m_sb.sb_sectsize + \
- XFS_FSB_TO_B((mp), XFS_IALLOC_BLOCKS((mp))) + \
- XFS_FSB_TO_B((mp), (mp)->m_in_maxlevels) + \
- XFS_ALLOCFREE_LOG_RES(mp, 1) + \
- (128 * (2 + XFS_IALLOC_BLOCKS(mp) + (mp)->m_in_maxlevels + \
- XFS_ALLOCFREE_LOG_COUNT(mp, 1))))))
-
#define XFS_SYMLINK_LOG_RES(mp) ((mp)->m_reservations.tr_symlink)
-
-/*
- * For create we can modify:
- * the parent directory inode: inode size
- * the new inode: inode size
- * the inode btree entry: block size
- * the superblock for the nlink flag: sector size
- * the directory btree: (max depth + v2) * dir block size
- * the directory inode's bmap btree: (max depth + v2) * block size
- * Or in the first xact we allocate some inodes giving:
- * the agi and agf of the ag getting the new inodes: 2 * sectorsize
- * the superblock for the nlink flag: sector size
- * the inode blocks allocated: XFS_IALLOC_BLOCKS * blocksize
- * the inode btree: max depth * blocksize
- * the allocation btrees: 2 trees * (max depth - 1) * block size
- */
-#define XFS_CALC_CREATE_LOG_RES(mp) \
- (MAX( \
- ((mp)->m_sb.sb_inodesize + \
- (mp)->m_sb.sb_inodesize + \
- (mp)->m_sb.sb_sectsize + \
- XFS_FSB_TO_B(mp, 1) + \
- XFS_DIROP_LOG_RES(mp) + \
- (128 * (3 + XFS_DIROP_LOG_COUNT(mp)))), \
- (3 * (mp)->m_sb.sb_sectsize + \
- XFS_FSB_TO_B((mp), XFS_IALLOC_BLOCKS((mp))) + \
- XFS_FSB_TO_B((mp), (mp)->m_in_maxlevels) + \
- XFS_ALLOCFREE_LOG_RES(mp, 1) + \
- (128 * (2 + XFS_IALLOC_BLOCKS(mp) + (mp)->m_in_maxlevels + \
- XFS_ALLOCFREE_LOG_COUNT(mp, 1))))))
-
#define XFS_CREATE_LOG_RES(mp) ((mp)->m_reservations.tr_create)
-
-/*
- * Making a new directory is the same as creating a new file.
- */
-#define XFS_CALC_MKDIR_LOG_RES(mp) XFS_CALC_CREATE_LOG_RES(mp)
-
#define XFS_MKDIR_LOG_RES(mp) ((mp)->m_reservations.tr_mkdir)
-
-/*
- * In freeing an inode we can modify:
- * the inode being freed: inode size
- * the super block free inode counter: sector size
- * the agi hash list and counters: sector size
- * the inode btree entry: block size
- * the on disk inode before ours in the agi hash list: inode cluster size
- * the inode btree: max depth * blocksize
- * the allocation btrees: 2 trees * (max depth - 1) * block size
- */
-#define XFS_CALC_IFREE_LOG_RES(mp) \
- ((mp)->m_sb.sb_inodesize + \
- (mp)->m_sb.sb_sectsize + \
- (mp)->m_sb.sb_sectsize + \
- XFS_FSB_TO_B((mp), 1) + \
- MAX((__uint16_t)XFS_FSB_TO_B((mp), 1), XFS_INODE_CLUSTER_SIZE(mp)) + \
- (128 * 5) + \
- XFS_ALLOCFREE_LOG_RES(mp, 1) + \
- (128 * (2 + XFS_IALLOC_BLOCKS(mp) + (mp)->m_in_maxlevels + \
- XFS_ALLOCFREE_LOG_COUNT(mp, 1))))
-
-
#define XFS_IFREE_LOG_RES(mp) ((mp)->m_reservations.tr_ifree)
-
-/*
- * When only changing the inode we log the inode and possibly the superblock
- * We also add a bit of slop for the transaction stuff.
- */
-#define XFS_CALC_ICHANGE_LOG_RES(mp) ((mp)->m_sb.sb_inodesize + \
- (mp)->m_sb.sb_sectsize + 512)
-
#define XFS_ICHANGE_LOG_RES(mp) ((mp)->m_reservations.tr_ichange)
-
-/*
- * Growing the data section of the filesystem.
- * superblock
- * agi and agf
- * allocation btrees
- */
-#define XFS_CALC_GROWDATA_LOG_RES(mp) \
- ((mp)->m_sb.sb_sectsize * 3 + \
- XFS_ALLOCFREE_LOG_RES(mp, 1) + \
- (128 * (3 + XFS_ALLOCFREE_LOG_COUNT(mp, 1))))
-
#define XFS_GROWDATA_LOG_RES(mp) ((mp)->m_reservations.tr_growdata)
-
-/*
- * Growing the rt section of the filesystem.
- * In the first set of transactions (ALLOC) we allocate space to the
- * bitmap or summary files.
- * superblock: sector size
- * agf of the ag from which the extent is allocated: sector size
- * bmap btree for bitmap/summary inode: max depth * blocksize
- * bitmap/summary inode: inode size
- * allocation btrees for 1 block alloc: 2 * (2 * maxdepth - 1) * blocksize
- */
-#define XFS_CALC_GROWRTALLOC_LOG_RES(mp) \
- (2 * (mp)->m_sb.sb_sectsize + \
- XFS_FSB_TO_B((mp), XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK)) + \
- (mp)->m_sb.sb_inodesize + \
- XFS_ALLOCFREE_LOG_RES(mp, 1) + \
- (128 * \
- (3 + XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) + \
- XFS_ALLOCFREE_LOG_COUNT(mp, 1))))
-
#define XFS_GROWRTALLOC_LOG_RES(mp) ((mp)->m_reservations.tr_growrtalloc)
-
-/*
- * Growing the rt section of the filesystem.
- * In the second set of transactions (ZERO) we zero the new metadata blocks.
- * one bitmap/summary block: blocksize
- */
-#define XFS_CALC_GROWRTZERO_LOG_RES(mp) \
- ((mp)->m_sb.sb_blocksize + 128)
-
#define XFS_GROWRTZERO_LOG_RES(mp) ((mp)->m_reservations.tr_growrtzero)
-
-/*
- * Growing the rt section of the filesystem.
- * In the third set of transactions (FREE) we update metadata without
- * allocating any new blocks.
- * superblock: sector size
- * bitmap inode: inode size
- * summary inode: inode size
- * one bitmap block: blocksize
- * summary blocks: new summary size
- */
-#define XFS_CALC_GROWRTFREE_LOG_RES(mp) \
- ((mp)->m_sb.sb_sectsize + \
- 2 * (mp)->m_sb.sb_inodesize + \
- (mp)->m_sb.sb_blocksize + \
- (mp)->m_rsumsize + \
- (128 * 5))
-
#define XFS_GROWRTFREE_LOG_RES(mp) ((mp)->m_reservations.tr_growrtfree)
-
-/*
- * Logging the inode modification timestamp on a synchronous write.
- * inode
- */
-#define XFS_CALC_SWRITE_LOG_RES(mp) \
- ((mp)->m_sb.sb_inodesize + 128)
-
#define XFS_SWRITE_LOG_RES(mp) ((mp)->m_reservations.tr_swrite)
-
/*
* Logging the inode timestamps on an fsync -- same as SWRITE
* as long as SWRITE logs the entire inode core
*/
#define XFS_FSYNC_TS_LOG_RES(mp) ((mp)->m_reservations.tr_swrite)
-
-/*
- * Logging the inode mode bits when writing a setuid/setgid file
- * inode
- */
-#define XFS_CALC_WRITEID_LOG_RES(mp) \
- ((mp)->m_sb.sb_inodesize + 128)
-
#define XFS_WRITEID_LOG_RES(mp) ((mp)->m_reservations.tr_swrite)
-
-/*
- * Converting the inode from non-attributed to attributed.
- * the inode being converted: inode size
- * agf block and superblock (for block allocation)
- * the new block (directory sized)
- * bmap blocks for the new directory block
- * allocation btrees
- */
-#define XFS_CALC_ADDAFORK_LOG_RES(mp) \
- ((mp)->m_sb.sb_inodesize + \
- (mp)->m_sb.sb_sectsize * 2 + \
- (mp)->m_dirblksize + \
- XFS_FSB_TO_B(mp, (XFS_DAENTER_BMAP1B(mp, XFS_DATA_FORK) + 1)) + \
- XFS_ALLOCFREE_LOG_RES(mp, 1) + \
- (128 * (4 + (XFS_DAENTER_BMAP1B(mp, XFS_DATA_FORK) + 1) + \
- XFS_ALLOCFREE_LOG_COUNT(mp, 1))))
-
#define XFS_ADDAFORK_LOG_RES(mp) ((mp)->m_reservations.tr_addafork)
-
-/*
- * Removing the attribute fork of a file
- * the inode being truncated: inode size
- * the inode's bmap btree: max depth * block size
- * And the bmap_finish transaction can free the blocks and bmap blocks:
- * the agf for each of the ags: 4 * sector size
- * the agfl for each of the ags: 4 * sector size
- * the super block to reflect the freed blocks: sector size
- * worst case split in allocation btrees per extent assuming 4 extents:
- * 4 exts * 2 trees * (2 * max depth - 1) * block size
- */
-#define XFS_CALC_ATTRINVAL_LOG_RES(mp) \
- (MAX( \
- ((mp)->m_sb.sb_inodesize + \
- XFS_FSB_TO_B((mp), XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK)) + \
- (128 * (1 + XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK)))), \
- ((4 * (mp)->m_sb.sb_sectsize) + \
- (4 * (mp)->m_sb.sb_sectsize) + \
- (mp)->m_sb.sb_sectsize + \
- XFS_ALLOCFREE_LOG_RES(mp, 4) + \
- (128 * (9 + XFS_ALLOCFREE_LOG_COUNT(mp, 4))))))
-
#define XFS_ATTRINVAL_LOG_RES(mp) ((mp)->m_reservations.tr_attrinval)
-
-/*
- * Setting an attribute.
- * the inode getting the attribute
- * the superblock for allocations
- * the agfs extents are allocated from
- * the attribute btree * max depth
- * the inode allocation btree
- * Since attribute transaction space is dependent on the size of the attribute,
- * the calculation is done partially at mount time and partially at runtime.
- */
-#define XFS_CALC_ATTRSET_LOG_RES(mp) \
- ((mp)->m_sb.sb_inodesize + \
- (mp)->m_sb.sb_sectsize + \
- XFS_FSB_TO_B((mp), XFS_DA_NODE_MAXDEPTH) + \
- (128 * (2 + XFS_DA_NODE_MAXDEPTH)))
-
#define XFS_ATTRSET_LOG_RES(mp, ext) \
((mp)->m_reservations.tr_attrset + \
(ext * (mp)->m_sb.sb_sectsize) + \
(ext * XFS_FSB_TO_B((mp), XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK))) + \
(128 * (ext + (ext * XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK)))))
-
-/*
- * Removing an attribute.
- * the inode: inode size
- * the attribute btree could join: max depth * block size
- * the inode bmap btree could join or split: max depth * block size
- * And the bmap_finish transaction can free the attr blocks freed giving:
- * the agf for the ag in which the blocks live: 2 * sector size
- * the agfl for the ag in which the blocks live: 2 * sector size
- * the superblock for the free block count: sector size
- * the allocation btrees: 2 exts * 2 trees * (2 * max depth - 1) * block size
- */
-#define XFS_CALC_ATTRRM_LOG_RES(mp) \
- (MAX( \
- ((mp)->m_sb.sb_inodesize + \
- XFS_FSB_TO_B((mp), XFS_DA_NODE_MAXDEPTH) + \
- XFS_FSB_TO_B((mp), XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK)) + \
- (128 * (1 + XFS_DA_NODE_MAXDEPTH + XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK)))), \
- ((2 * (mp)->m_sb.sb_sectsize) + \
- (2 * (mp)->m_sb.sb_sectsize) + \
- (mp)->m_sb.sb_sectsize + \
- XFS_ALLOCFREE_LOG_RES(mp, 2) + \
- (128 * (5 + XFS_ALLOCFREE_LOG_COUNT(mp, 2))))))
-
#define XFS_ATTRRM_LOG_RES(mp) ((mp)->m_reservations.tr_attrrm)
-
-/*
- * Clearing a bad agino number in an agi hash bucket.
- */
-#define XFS_CALC_CLEAR_AGI_BUCKET_LOG_RES(mp) \
- ((mp)->m_sb.sb_sectsize + 128)
-
#define XFS_CLEAR_AGI_BUCKET_LOG_RES(mp) ((mp)->m_reservations.tr_clearagi)
@@ -813,6 +404,7 @@ struct xfs_log_item_desc;
struct xfs_mount;
struct xfs_trans;
struct xfs_dquot_acct;
+struct xfs_busy_extent;
typedef struct xfs_log_item {
struct list_head li_ail; /* AIL pointers */
@@ -828,6 +420,11 @@ typedef struct xfs_log_item {
/* buffer item iodone */
/* callback func */
struct xfs_item_ops *li_ops; /* function list */
+
+ /* delayed logging */
+ struct list_head li_cil; /* CIL pointers */
+ struct xfs_log_vec *li_lv; /* active log vector */
+ xfs_lsn_t li_seq; /* CIL commit seq */
} xfs_log_item_t;
#define XFS_LI_IN_AIL 0x1
@@ -872,34 +469,6 @@ typedef struct xfs_item_ops {
#define XFS_ITEM_PUSHBUF 3
/*
- * This structure is used to maintain a list of block ranges that have been
- * freed in the transaction. The ranges are listed in the perag[] busy list
- * between when they're freed and the transaction is committed to disk.
- */
-
-typedef struct xfs_log_busy_slot {
- xfs_agnumber_t lbc_ag;
- ushort lbc_idx; /* index in perag.busy[] */
-} xfs_log_busy_slot_t;
-
-#define XFS_LBC_NUM_SLOTS 31
-typedef struct xfs_log_busy_chunk {
- struct xfs_log_busy_chunk *lbc_next;
- uint lbc_free; /* free slots bitmask */
- ushort lbc_unused; /* first unused */
- xfs_log_busy_slot_t lbc_busy[XFS_LBC_NUM_SLOTS];
-} xfs_log_busy_chunk_t;
-
-#define XFS_LBC_MAX_SLOT (XFS_LBC_NUM_SLOTS - 1)
-#define XFS_LBC_FREEMASK ((1U << XFS_LBC_NUM_SLOTS) - 1)
-
-#define XFS_LBC_INIT(cp) ((cp)->lbc_free = XFS_LBC_FREEMASK)
-#define XFS_LBC_CLAIM(cp, slot) ((cp)->lbc_free &= ~(1 << (slot)))
-#define XFS_LBC_SLOT(cp, slot) (&((cp)->lbc_busy[(slot)]))
-#define XFS_LBC_VACANCY(cp) (((cp)->lbc_free) & XFS_LBC_FREEMASK)
-#define XFS_LBC_ISFREE(cp, slot) ((cp)->lbc_free & (1 << (slot)))
-
-/*
* This is the type of function which can be given to xfs_trans_callback()
* to be called upon the transaction's commit to disk.
*/
@@ -950,8 +519,7 @@ typedef struct xfs_trans {
unsigned int t_items_free; /* log item descs free */
xfs_log_item_chunk_t t_items; /* first log item desc chunk */
xfs_trans_header_t t_header; /* header for in-log trans */
- unsigned int t_busy_free; /* busy descs free */
- xfs_log_busy_chunk_t t_busy; /* busy/async free blocks */
+ struct list_head t_busy; /* list of busy extents */
unsigned long t_pflags; /* saved process flags state */
} xfs_trans_t;
@@ -1025,9 +593,6 @@ int _xfs_trans_commit(xfs_trans_t *,
void xfs_trans_cancel(xfs_trans_t *, int);
int xfs_trans_ail_init(struct xfs_mount *);
void xfs_trans_ail_destroy(struct xfs_mount *);
-xfs_log_busy_slot_t *xfs_trans_add_busy(xfs_trans_t *tp,
- xfs_agnumber_t ag,
- xfs_extlen_t idx);
extern kmem_zone_t *xfs_trans_zone;
diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c
index 9cd809025f3..63d81a22f4f 100644
--- a/fs/xfs/xfs_trans_buf.c
+++ b/fs/xfs/xfs_trans_buf.c
@@ -114,7 +114,7 @@ _xfs_trans_bjoin(
xfs_buf_item_init(bp, tp->t_mountp);
bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *);
ASSERT(!(bip->bli_flags & XFS_BLI_STALE));
- ASSERT(!(bip->bli_format.blf_flags & XFS_BLI_CANCEL));
+ ASSERT(!(bip->bli_format.blf_flags & XFS_BLF_CANCEL));
ASSERT(!(bip->bli_flags & XFS_BLI_LOGGED));
if (reset_recur)
bip->bli_recur = 0;
@@ -511,7 +511,7 @@ xfs_trans_brelse(xfs_trans_t *tp,
bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *);
ASSERT(bip->bli_item.li_type == XFS_LI_BUF);
ASSERT(!(bip->bli_flags & XFS_BLI_STALE));
- ASSERT(!(bip->bli_format.blf_flags & XFS_BLI_CANCEL));
+ ASSERT(!(bip->bli_format.blf_flags & XFS_BLF_CANCEL));
ASSERT(atomic_read(&bip->bli_refcount) > 0);
/*
@@ -619,7 +619,7 @@ xfs_trans_bhold(xfs_trans_t *tp,
bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *);
ASSERT(!(bip->bli_flags & XFS_BLI_STALE));
- ASSERT(!(bip->bli_format.blf_flags & XFS_BLI_CANCEL));
+ ASSERT(!(bip->bli_format.blf_flags & XFS_BLF_CANCEL));
ASSERT(atomic_read(&bip->bli_refcount) > 0);
bip->bli_flags |= XFS_BLI_HOLD;
trace_xfs_trans_bhold(bip);
@@ -641,7 +641,7 @@ xfs_trans_bhold_release(xfs_trans_t *tp,
bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *);
ASSERT(!(bip->bli_flags & XFS_BLI_STALE));
- ASSERT(!(bip->bli_format.blf_flags & XFS_BLI_CANCEL));
+ ASSERT(!(bip->bli_format.blf_flags & XFS_BLF_CANCEL));
ASSERT(atomic_read(&bip->bli_refcount) > 0);
ASSERT(bip->bli_flags & XFS_BLI_HOLD);
bip->bli_flags &= ~XFS_BLI_HOLD;
@@ -704,7 +704,7 @@ xfs_trans_log_buf(xfs_trans_t *tp,
bip->bli_flags &= ~XFS_BLI_STALE;
ASSERT(XFS_BUF_ISSTALE(bp));
XFS_BUF_UNSTALE(bp);
- bip->bli_format.blf_flags &= ~XFS_BLI_CANCEL;
+ bip->bli_format.blf_flags &= ~XFS_BLF_CANCEL;
}
lidp = xfs_trans_find_item(tp, (xfs_log_item_t*)bip);
@@ -762,8 +762,8 @@ xfs_trans_binval(
ASSERT(!(XFS_BUF_ISDELAYWRITE(bp)));
ASSERT(XFS_BUF_ISSTALE(bp));
ASSERT(!(bip->bli_flags & (XFS_BLI_LOGGED | XFS_BLI_DIRTY)));
- ASSERT(!(bip->bli_format.blf_flags & XFS_BLI_INODE_BUF));
- ASSERT(bip->bli_format.blf_flags & XFS_BLI_CANCEL);
+ ASSERT(!(bip->bli_format.blf_flags & XFS_BLF_INODE_BUF));
+ ASSERT(bip->bli_format.blf_flags & XFS_BLF_CANCEL);
ASSERT(lidp->lid_flags & XFS_LID_DIRTY);
ASSERT(tp->t_flags & XFS_TRANS_DIRTY);
return;
@@ -774,7 +774,7 @@ xfs_trans_binval(
* in the buf log item. The STALE flag will be used in
* xfs_buf_item_unpin() to determine if it should clean up
* when the last reference to the buf item is given up.
- * We set the XFS_BLI_CANCEL flag in the buf log format structure
+ * We set the XFS_BLF_CANCEL flag in the buf log format structure
* and log the buf item. This will be used at recovery time
* to determine that copies of the buffer in the log before
* this should not be replayed.
@@ -792,9 +792,9 @@ xfs_trans_binval(
XFS_BUF_UNDELAYWRITE(bp);
XFS_BUF_STALE(bp);
bip->bli_flags |= XFS_BLI_STALE;
- bip->bli_flags &= ~(XFS_BLI_LOGGED | XFS_BLI_DIRTY);
- bip->bli_format.blf_flags &= ~XFS_BLI_INODE_BUF;
- bip->bli_format.blf_flags |= XFS_BLI_CANCEL;
+ bip->bli_flags &= ~(XFS_BLI_INODE_BUF | XFS_BLI_LOGGED | XFS_BLI_DIRTY);
+ bip->bli_format.blf_flags &= ~XFS_BLF_INODE_BUF;
+ bip->bli_format.blf_flags |= XFS_BLF_CANCEL;
memset((char *)(bip->bli_format.blf_data_map), 0,
(bip->bli_format.blf_map_size * sizeof(uint)));
lidp->lid_flags |= XFS_LID_DIRTY;
@@ -802,16 +802,16 @@ xfs_trans_binval(
}
/*
- * This call is used to indicate that the buffer contains on-disk
- * inodes which must be handled specially during recovery. They
- * require special handling because only the di_next_unlinked from
- * the inodes in the buffer should be recovered. The rest of the
- * data in the buffer is logged via the inodes themselves.
+ * This call is used to indicate that the buffer contains on-disk inodes which
+ * must be handled specially during recovery. They require special handling
+ * because only the di_next_unlinked from the inodes in the buffer should be
+ * recovered. The rest of the data in the buffer is logged via the inodes
+ * themselves.
*
- * All we do is set the XFS_BLI_INODE_BUF flag in the buffer's log
- * format structure so that we'll know what to do at recovery time.
+ * All we do is set the XFS_BLI_INODE_BUF flag in the items flags so it can be
+ * transferred to the buffer's log format structure so that we'll know what to
+ * do at recovery time.
*/
-/* ARGSUSED */
void
xfs_trans_inode_buf(
xfs_trans_t *tp,
@@ -826,7 +826,7 @@ xfs_trans_inode_buf(
bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *);
ASSERT(atomic_read(&bip->bli_refcount) > 0);
- bip->bli_format.blf_flags |= XFS_BLI_INODE_BUF;
+ bip->bli_flags |= XFS_BLI_INODE_BUF;
}
/*
@@ -908,9 +908,9 @@ xfs_trans_dquot_buf(
ASSERT(XFS_BUF_ISBUSY(bp));
ASSERT(XFS_BUF_FSPRIVATE2(bp, xfs_trans_t *) == tp);
ASSERT(XFS_BUF_FSPRIVATE(bp, void *) != NULL);
- ASSERT(type == XFS_BLI_UDQUOT_BUF ||
- type == XFS_BLI_PDQUOT_BUF ||
- type == XFS_BLI_GDQUOT_BUF);
+ ASSERT(type == XFS_BLF_UDQUOT_BUF ||
+ type == XFS_BLF_PDQUOT_BUF ||
+ type == XFS_BLF_GDQUOT_BUF);
bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *);
ASSERT(atomic_read(&bip->bli_refcount) > 0);
diff --git a/fs/xfs/xfs_trans_item.c b/fs/xfs/xfs_trans_item.c
index eb3fc57f9ee..f11d37d06dc 100644
--- a/fs/xfs/xfs_trans_item.c
+++ b/fs/xfs/xfs_trans_item.c
@@ -299,6 +299,7 @@ xfs_trans_next_item(xfs_trans_t *tp, xfs_log_item_desc_t *lidp)
void
xfs_trans_free_items(
xfs_trans_t *tp,
+ xfs_lsn_t commit_lsn,
int flags)
{
xfs_log_item_chunk_t *licp;
@@ -311,7 +312,7 @@ xfs_trans_free_items(
* Special case the embedded chunk so we don't free it below.
*/
if (!xfs_lic_are_all_free(licp)) {
- (void) xfs_trans_unlock_chunk(licp, 1, abort, NULLCOMMITLSN);
+ (void) xfs_trans_unlock_chunk(licp, 1, abort, commit_lsn);
xfs_lic_all_free(licp);
licp->lic_unused = 0;
}
@@ -322,7 +323,7 @@ xfs_trans_free_items(
*/
while (licp != NULL) {
ASSERT(!xfs_lic_are_all_free(licp));
- (void) xfs_trans_unlock_chunk(licp, 1, abort, NULLCOMMITLSN);
+ (void) xfs_trans_unlock_chunk(licp, 1, abort, commit_lsn);
next_licp = licp->lic_next;
kmem_free(licp);
licp = next_licp;
@@ -438,112 +439,3 @@ xfs_trans_unlock_chunk(
return freed;
}
-
-
-/*
- * This is called to add the given busy item to the transaction's
- * list of busy items. It must find a free busy item descriptor
- * or allocate a new one and add the item to that descriptor.
- * The function returns a pointer to busy descriptor used to point
- * to the new busy entry. The log busy entry will now point to its new
- * descriptor with its ???? field.
- */
-xfs_log_busy_slot_t *
-xfs_trans_add_busy(xfs_trans_t *tp, xfs_agnumber_t ag, xfs_extlen_t idx)
-{
- xfs_log_busy_chunk_t *lbcp;
- xfs_log_busy_slot_t *lbsp;
- int i=0;
-
- /*
- * If there are no free descriptors, allocate a new chunk
- * of them and put it at the front of the chunk list.
- */
- if (tp->t_busy_free == 0) {
- lbcp = (xfs_log_busy_chunk_t*)
- kmem_alloc(sizeof(xfs_log_busy_chunk_t), KM_SLEEP);
- ASSERT(lbcp != NULL);
- /*
- * Initialize the chunk, and then
- * claim the first slot in the newly allocated chunk.
- */
- XFS_LBC_INIT(lbcp);
- XFS_LBC_CLAIM(lbcp, 0);
- lbcp->lbc_unused = 1;
- lbsp = XFS_LBC_SLOT(lbcp, 0);
-
- /*
- * Link in the new chunk and update the free count.
- */
- lbcp->lbc_next = tp->t_busy.lbc_next;
- tp->t_busy.lbc_next = lbcp;
- tp->t_busy_free = XFS_LIC_NUM_SLOTS - 1;
-
- /*
- * Initialize the descriptor and the generic portion
- * of the log item.
- *
- * Point the new slot at this item and return it.
- * Also point the log item at its currently active
- * descriptor and set the item's mount pointer.
- */
- lbsp->lbc_ag = ag;
- lbsp->lbc_idx = idx;
- return lbsp;
- }
-
- /*
- * Find the free descriptor. It is somewhere in the chunklist
- * of descriptors.
- */
- lbcp = &tp->t_busy;
- while (lbcp != NULL) {
- if (XFS_LBC_VACANCY(lbcp)) {
- if (lbcp->lbc_unused <= XFS_LBC_MAX_SLOT) {
- i = lbcp->lbc_unused;
- break;
- } else {
- /* out-of-order vacancy */
- cmn_err(CE_DEBUG, "OOO vacancy lbcp 0x%p\n", lbcp);
- ASSERT(0);
- }
- }
- lbcp = lbcp->lbc_next;
- }
- ASSERT(lbcp != NULL);
- /*
- * If we find a free descriptor, claim it,
- * initialize it, and return it.
- */
- XFS_LBC_CLAIM(lbcp, i);
- if (lbcp->lbc_unused <= i) {
- lbcp->lbc_unused = i + 1;
- }
- lbsp = XFS_LBC_SLOT(lbcp, i);
- tp->t_busy_free--;
- lbsp->lbc_ag = ag;
- lbsp->lbc_idx = idx;
- return lbsp;
-}
-
-
-/*
- * xfs_trans_free_busy
- * Free all of the busy lists from a transaction
- */
-void
-xfs_trans_free_busy(xfs_trans_t *tp)
-{
- xfs_log_busy_chunk_t *lbcp;
- xfs_log_busy_chunk_t *lbcq;
-
- lbcp = tp->t_busy.lbc_next;
- while (lbcp != NULL) {
- lbcq = lbcp->lbc_next;
- kmem_free(lbcp);
- lbcp = lbcq;
- }
-
- XFS_LBC_INIT(&tp->t_busy);
- tp->t_busy.lbc_unused = 0;
-}
diff --git a/fs/xfs/xfs_trans_priv.h b/fs/xfs/xfs_trans_priv.h
index 73e2ad39743..c6e4f2c8de6 100644
--- a/fs/xfs/xfs_trans_priv.h
+++ b/fs/xfs/xfs_trans_priv.h
@@ -35,13 +35,14 @@ struct xfs_log_item_desc *xfs_trans_find_item(struct xfs_trans *,
struct xfs_log_item_desc *xfs_trans_first_item(struct xfs_trans *);
struct xfs_log_item_desc *xfs_trans_next_item(struct xfs_trans *,
struct xfs_log_item_desc *);
-void xfs_trans_free_items(struct xfs_trans *, int);
-void xfs_trans_unlock_items(struct xfs_trans *,
- xfs_lsn_t);
-void xfs_trans_free_busy(xfs_trans_t *tp);
-xfs_log_busy_slot_t *xfs_trans_add_busy(xfs_trans_t *tp,
- xfs_agnumber_t ag,
- xfs_extlen_t idx);
+
+void xfs_trans_unlock_items(struct xfs_trans *tp, xfs_lsn_t commit_lsn);
+void xfs_trans_free_items(struct xfs_trans *tp, xfs_lsn_t commit_lsn,
+ int flags);
+
+void xfs_trans_item_committed(struct xfs_log_item *lip,
+ xfs_lsn_t commit_lsn, int aborted);
+void xfs_trans_unreserve_and_mod_sb(struct xfs_trans *tp);
/*
* AIL traversal cursor.
diff --git a/fs/xfs/xfs_types.h b/fs/xfs/xfs_types.h
index b09904555d0..320775295e3 100644
--- a/fs/xfs/xfs_types.h
+++ b/fs/xfs/xfs_types.h
@@ -75,6 +75,8 @@ typedef __uint32_t xfs_dahash_t; /* dir/attr hash value */
typedef __uint16_t xfs_prid_t; /* prid_t truncated to 16bits in XFS */
+typedef __uint32_t xlog_tid_t; /* transaction ID type */
+
/*
* These types are 64 bits on disk but are either 32 or 64 bits in memory.
* Disk based types:
diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c
index 9d376be0ea3..a06bd62504f 100644
--- a/fs/xfs/xfs_vnodeops.c
+++ b/fs/xfs/xfs_vnodeops.c
@@ -267,7 +267,7 @@ xfs_setattr(
if (code) {
ASSERT(tp == NULL);
lock_flags &= ~XFS_ILOCK_EXCL;
- ASSERT(lock_flags == XFS_IOLOCK_EXCL);
+ ASSERT(lock_flags == XFS_IOLOCK_EXCL || !need_iolock);
goto error_return;
}
tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_SIZE);