From f661f1e0bf5002bdcc8b5810ad0a184a1841537f Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 8 Oct 2012 21:56:02 +1100 Subject: xfs: sync work is now only periodic log work The only thing the periodic sync work does now is flush the AIL and idle the log. These are really functions of the log code, so move the work to xfs_log.c and rename it appropriately. The only wart that this leaves behind is the xfssyncd_centisecs sysctl; otherwise, the xfssyncd is dead. Clean up any comments that relate to xfssyncd to reflect its passing. Signed-off-by: Dave Chinner Reviewed-by: Mark Tinguely Reviewed-by: Christoph Hellwig Signed-off-by: Ben Myers --- fs/xfs/xfs_log.c | 61 ++++++++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 51 insertions(+), 10 deletions(-) (limited to 'fs/xfs/xfs_log.c') diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 7f4f9370d0e..efea12bfbd6 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -34,6 +34,7 @@ #include "xfs_dinode.h" #include "xfs_inode.h" #include "xfs_trace.h" +#include "xfs_fsops.h" kmem_zone_t *xfs_log_ticket_zone; @@ -679,25 +680,29 @@ out: } /* - * Finish the recovery of the file system. This is separate from - * the xfs_log_mount() call, because it depends on the code in - * xfs_mountfs() to read in the root and real-time bitmap inodes - * between calling xfs_log_mount() and here. + * Finish the recovery of the file system. This is separate from the + * xfs_log_mount() call, because it depends on the code in xfs_mountfs() to read + * in the root and real-time bitmap inodes between calling xfs_log_mount() and + * here. * - * mp - ubiquitous xfs mount point structure + * If we finish recovery successfully, start the background log work. If we are + * not doing recovery, then we have a RO filesystem and we don't need to start + * it. */ int xfs_log_mount_finish(xfs_mount_t *mp) { - int error; + int error = 0; - if (!(mp->m_flags & XFS_MOUNT_NORECOVERY)) + if (!(mp->m_flags & XFS_MOUNT_NORECOVERY)) { error = xlog_recover_finish(mp->m_log); - else { - error = 0; + if (!error) + xfs_log_work_queue(mp); + } else { ASSERT(mp->m_flags & XFS_MOUNT_RDONLY); } + return error; } @@ -858,7 +863,7 @@ xfs_log_unmount_write(xfs_mount_t *mp) void xfs_log_unmount(xfs_mount_t *mp) { - cancel_delayed_work_sync(&mp->m_sync_work); + cancel_delayed_work_sync(&mp->m_log->l_work); xfs_trans_ail_destroy(mp); xlog_dealloc_log(mp->m_log); } @@ -1161,6 +1166,40 @@ done: } /* xlog_get_iclog_buffer_size */ +void +xfs_log_work_queue( + struct xfs_mount *mp) +{ + queue_delayed_work(xfs_syncd_wq, &mp->m_log->l_work, + msecs_to_jiffies(xfs_syncd_centisecs * 10)); +} + +/* + * Every sync period we need to unpin all items in the AIL and push them to + * disk. If there is nothing dirty, then we might need to cover the log to + * indicate that the filesystem is idle. + */ +void +xfs_log_worker( + struct work_struct *work) +{ + struct xlog *log = container_of(to_delayed_work(work), + struct xlog, l_work); + struct xfs_mount *mp = log->l_mp; + + /* dgc: errors ignored - not fatal and nowhere to report them */ + if (xfs_log_need_covered(mp)) + xfs_fs_log_dummy(mp); + else + xfs_log_force(mp, 0); + + /* start pushing all the metadata that is currently dirty */ + xfs_ail_push_all(mp->m_ail); + + /* queue us up again */ + xfs_log_work_queue(mp); +} + /* * This routine initializes some of the log structure for a given mount point. * Its primary purpose is to fill in enough, so recovery can occur.
However, @@ -1195,6 +1234,7 @@ xlog_alloc_log( log->l_logBBsize = num_bblks; log->l_covered_state = XLOG_STATE_COVER_IDLE; log->l_flags |= XLOG_ACTIVE_RECOVERY; + INIT_DELAYED_WORK(&log->l_work, xfs_log_worker); log->l_prev_block = -1; /* log->l_tail_lsn = 0x100000000LL; cycle = 1; current block = 0 */ @@ -3700,3 +3740,4 @@ xlog_iclogs_empty( } while (iclog != log->l_iclog); return 1; } + -- cgit v1.2.3-70-g09d2 From cf2931db2d189ce0583be7ae880d7e3f8c15f623 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 8 Oct 2012 21:56:03 +1100 Subject: xfs: Bring some sanity to log unmounting When unmounting the filesystem, there are lots of operations that need to be done in a specific order, and they are spread across a couple of functions. We have to drain the AIL before we write the unmount record, and we have to shut down the background log work before we do either of them. But this is all split haphazardly across xfs_unmountfs() and xfs_log_unmount(). Move all the AIL flushing and log manipulations to xfs_log_unmount() so that the responsibilities of each function are clear and the operations they perform are obvious. Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Reviewed-by: Mark Tinguely Signed-off-by: Ben Myers --- fs/xfs/xfs_log.c | 29 ++++++++++++++++++++++++++--- fs/xfs/xfs_mount.c | 24 ------------------------ 2 files changed, 26 insertions(+), 27 deletions(-) (limited to 'fs/xfs/xfs_log.c') diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index efea12bfbd6..e788f39721e 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -855,15 +855,38 @@ xfs_log_unmount_write(xfs_mount_t *mp) } /* xfs_log_unmount_write */ /* - * Deallocate log structures for unmount/relocation. + * Shut down and release the AIL and Log. * - * We need to stop the aild from running before we destroy - * and deallocate the log as the aild references the log. + * During unmount, we need to ensure we flush all the dirty metadata objects + * from the AIL so that the log is empty before we write the unmount record to + * the log. + * + * To do this, we first need to shut down the background log work so it is not + * trying to cover the log as we clean up. We then need to unpin all objects in + * the log so we can then flush them out. Once they have completed their IO and + * run the callbacks removing themselves from the AIL, we can write the unmount + * record, tear down the AIL and finally free the log. */ void xfs_log_unmount(xfs_mount_t *mp) { cancel_delayed_work_sync(&mp->m_log->l_work); + xfs_log_force(mp, XFS_LOG_SYNC); + + /* + * The superblock buffer is uncached and while xfs_ail_push_all_sync() + * will push it, xfs_wait_buftarg() will not wait for it. Further, + * xfs_buf_iowait() cannot be used because it was pushed with the + * XBF_ASYNC flag set, so we need to use a lock/unlock pair to wait for + * the IO to complete. + */ + xfs_ail_push_all_sync(mp->m_ail); + xfs_wait_buftarg(mp->m_ddev_targp); + xfs_buf_lock(mp->m_sb_bp); + xfs_buf_unlock(mp->m_sb_bp); + + xfs_log_unmount_write(mp); + xfs_trans_ail_destroy(mp); xlog_dealloc_log(mp->m_log); } diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index d9a31c6a0c5..c195ec85c72 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -1459,13 +1459,6 @@ xfs_unmountfs( xfs_qm_unmount(mp); - /* - * Flush out the log synchronously so that we know for sure - * that nothing is pinned. This is important because bflush() - * will skip pinned buffers.
- */ - xfs_log_force(mp, XFS_LOG_SYNC); - /* * Unreserve any blocks we have so that when we unmount we don't account * the reserved free space as used. This is really only necessary for @@ -1491,23 +1484,6 @@ xfs_unmountfs( xfs_warn(mp, "Unable to update superblock counters. " "Freespace may not be correct on next mount."); - /* - * At this point we might have modified the superblock again and thus - * added an item to the AIL, thus flush it again. - */ - xfs_ail_push_all_sync(mp->m_ail); - xfs_wait_buftarg(mp->m_ddev_targp); - - /* - * The superblock buffer is uncached and xfsaild_push() will lock and - * set the XBF_ASYNC flag on the buffer. We cannot do xfs_buf_iowait() - * here but a lock on the superblock buffer will block until iodone() - * has completed. - */ - xfs_buf_lock(mp->m_sb_bp); - xfs_buf_unlock(mp->m_sb_bp); - - xfs_log_unmount_write(mp); xfs_log_unmount(mp); xfs_uuid_unmount(mp); -- cgit v1.2.3-70-g09d2 From 5889608df35783590251cfd440fa5d48f1855179 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 8 Oct 2012 21:56:05 +1100 Subject: xfs: syncd workqueue is no more With the syncd functions moved to the log and/or removed, the syncd workqueue is the only remaining bit left. It is used by the log covering/ail pushing work, as well as by the inode reclaim work. Given how cheap workqueues are these days, give the log and inode reclaim work their own work queues and kill the syncd work queue. Signed-off-by: Dave Chinner Reviewed-by: Mark Tinguely Reviewed-by: Christoph Hellwig Signed-off-by: Ben Myers --- fs/xfs/xfs_log.c | 2 +- fs/xfs/xfs_mount.h | 2 ++ fs/xfs/xfs_super.c | 38 ++++++++++++++++++-------------------- fs/xfs/xfs_sync.c | 20 +++++++++----------- fs/xfs/xfs_sync.h | 2 -- 5 files changed, 30 insertions(+), 34 deletions(-) (limited to 'fs/xfs/xfs_log.c') diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index e788f39721e..b6ce4d4b6de 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -1193,7 +1193,7 @@ void xfs_log_work_queue( struct xfs_mount *mp) { - queue_delayed_work(xfs_syncd_wq, &mp->m_log->l_work, + queue_delayed_work(mp->m_log_workqueue, &mp->m_log->l_work, msecs_to_jiffies(xfs_syncd_centisecs * 10)); } diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index a54b5aa498d..7c417b6b99e 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -207,6 +207,8 @@ typedef struct xfs_mount { struct workqueue_struct *m_data_workqueue; struct workqueue_struct *m_unwritten_workqueue; struct workqueue_struct *m_cil_workqueue; + struct workqueue_struct *m_reclaim_workqueue; + struct workqueue_struct *m_log_workqueue; } xfs_mount_t; /* diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 9468c687846..27d5a92e121 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -863,8 +863,23 @@ xfs_init_mount_workqueues( WQ_MEM_RECLAIM, 0, mp->m_fsname); if (!mp->m_cil_workqueue) goto out_destroy_unwritten; + + mp->m_reclaim_workqueue = alloc_workqueue("xfs-reclaim/%s", + WQ_NON_REENTRANT, 0, mp->m_fsname); + if (!mp->m_reclaim_workqueue) + goto out_destroy_cil; + + mp->m_log_workqueue = alloc_workqueue("xfs-log/%s", + WQ_NON_REENTRANT, 0, mp->m_fsname); + if (!mp->m_log_workqueue) + goto out_destroy_reclaim; + return 0; +out_destroy_reclaim: + destroy_workqueue(mp->m_reclaim_workqueue); +out_destroy_cil: + destroy_workqueue(mp->m_cil_workqueue); out_destroy_unwritten: destroy_workqueue(mp->m_unwritten_workqueue); out_destroy_data_iodone_queue: @@ -877,6 +892,8 @@ STATIC void xfs_destroy_mount_workqueues( struct xfs_mount *mp) { + 
destroy_workqueue(mp->m_log_workqueue); + destroy_workqueue(mp->m_reclaim_workqueue); destroy_workqueue(mp->m_cil_workqueue); destroy_workqueue(mp->m_data_workqueue); destroy_workqueue(mp->m_unwritten_workqueue); @@ -1391,10 +1408,6 @@ xfs_fs_fill_super( /* * we must configure the block size in the superblock before we run the * full mount process as the mount process can lookup and cache inodes. - * For the same reason we must also initialise the syncd and register - * the inode cache shrinker so that inodes can be reclaimed during - * operations like a quotacheck that iterate all inodes in the - * filesystem. */ sb->s_magic = XFS_SB_MAGIC; sb->s_blocksize = mp->m_sb.sb_blocksize; @@ -1638,16 +1651,6 @@ xfs_destroy_zones(void) STATIC int __init xfs_init_workqueues(void) { - /* - * We never want to the same work item to run twice, reclaiming inodes - * or idling the log is not going to get any faster by multiple CPUs - * competing for ressources. Use the default large max_active value - * so that even lots of filesystems can perform these task in parallel. - */ - xfs_syncd_wq = alloc_workqueue("xfssyncd", WQ_NON_REENTRANT, 0); - if (!xfs_syncd_wq) - return -ENOMEM; - /* * The allocation workqueue can be used in memory reclaim situations * (writepage path), and parallelism is only limited by the number of @@ -1656,20 +1659,15 @@ xfs_init_workqueues(void) */ xfs_alloc_wq = alloc_workqueue("xfsalloc", WQ_MEM_RECLAIM, 0); if (!xfs_alloc_wq) - goto out_destroy_syncd; + return -ENOMEM; return 0; - -out_destroy_syncd: - destroy_workqueue(xfs_syncd_wq); - return -ENOMEM; } STATIC void xfs_destroy_workqueues(void) { destroy_workqueue(xfs_alloc_wq); - destroy_workqueue(xfs_syncd_wq); } STATIC int __init diff --git a/fs/xfs/xfs_sync.c b/fs/xfs/xfs_sync.c index 6a2ada37916..15be21f074f 100644 --- a/fs/xfs/xfs_sync.c +++ b/fs/xfs/xfs_sync.c @@ -40,8 +40,6 @@ #include #include -struct workqueue_struct *xfs_syncd_wq; /* sync workqueue */ - /* * The inode lookup is done in batches to keep the amount of lock traffic and * radix tree lookups to a minimum. The batch size is a trade off between @@ -335,18 +333,18 @@ xfs_quiesce_attr( /* * Queue a new inode reclaim pass if there are reclaimable inodes and there * isn't a reclaim pass already in progress. By default it runs every 5s based - * on the xfs syncd work default of 30s. Perhaps this should have it's own + * on the xfs periodic sync default of 30s. Perhaps this should have it's own * tunable, but that can be done if this method proves to be ineffective or too * aggressive. */ static void -xfs_syncd_queue_reclaim( +xfs_reclaim_work_queue( struct xfs_mount *mp) { rcu_read_lock(); if (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_RECLAIM_TAG)) { - queue_delayed_work(xfs_syncd_wq, &mp->m_reclaim_work, + queue_delayed_work(mp->m_reclaim_workqueue, &mp->m_reclaim_work, msecs_to_jiffies(xfs_syncd_centisecs / 6 * 10)); } rcu_read_unlock(); @@ -367,7 +365,7 @@ xfs_reclaim_worker( struct xfs_mount, m_reclaim_work); xfs_reclaim_inodes(mp, SYNC_TRYLOCK); - xfs_syncd_queue_reclaim(mp); + xfs_reclaim_work_queue(mp); } void @@ -388,7 +386,7 @@ __xfs_inode_set_reclaim_tag( spin_unlock(&ip->i_mount->m_perag_lock); /* schedule periodic background inode reclaim */ - xfs_syncd_queue_reclaim(ip->i_mount); + xfs_reclaim_work_queue(ip->i_mount); trace_xfs_perag_set_reclaim(ip->i_mount, pag->pag_agno, -1, _RET_IP_); @@ -646,9 +644,9 @@ out: /* * We could return EAGAIN here to make reclaim rescan the inode tree in * a short while. 
However, this just burns CPU time scanning the tree - waiting for IO to complete and xfssyncd never goes back to the idle - state. Instead, return 0 to let the next scheduled background reclaim - attempt to reclaim the inode again. + waiting for IO to complete and the reclaim work never goes back to + the idle state. Instead, return 0 to let the next scheduled + background reclaim attempt to reclaim the inode again. */ return 0; } @@ -804,7 +802,7 @@ xfs_reclaim_inodes_nr( int nr_to_scan) { /* kick background reclaimer and push the AIL */ - xfs_syncd_queue_reclaim(mp); + xfs_reclaim_work_queue(mp); xfs_ail_push_all(mp->m_ail); xfs_reclaim_inodes_ag(mp, SYNC_TRYLOCK | SYNC_WAIT, &nr_to_scan); diff --git a/fs/xfs/xfs_sync.h b/fs/xfs/xfs_sync.h index 0018e846f0d..0beabea99e7 100644 --- a/fs/xfs/xfs_sync.h +++ b/fs/xfs/xfs_sync.h @@ -24,8 +24,6 @@ struct xfs_perag; #define SYNC_WAIT 0x0001 /* wait for i/o to complete */ #define SYNC_TRYLOCK 0x0002 /* only try to lock inodes */ -extern struct workqueue_struct *xfs_syncd_wq; /* sync workqueue */ - void xfs_reclaim_worker(struct work_struct *work); int xfs_quiesce_data(struct xfs_mount *mp); -- cgit v1.2.3-70-g09d2 From c75921a72a7c4bb73a5e09a697a672722e5543f1 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 8 Oct 2012 21:56:08 +1100 Subject: xfs: xfs_quiesce_attr() should quiesce the log like unmount xfs_quiesce_attr() is supposed to leave the log empty with an unmount record written. Right now it does not wait for the AIL to be emptied before writing the unmount record, nor does it wait for metadata IO completion, either. Fix it to use the same method and code as xfs_log_unmount(). Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Reviewed-by: Mark Tinguely Signed-off-by: Ben Myers --- fs/xfs/xfs_log.c | 25 ++++++++++++++++++------- fs/xfs/xfs_log.h | 1 + fs/xfs/xfs_super.c | 41 ++++++++--------------------------------- 3 files changed, 27 insertions(+), 40 deletions(-) (limited to 'fs/xfs/xfs_log.c') diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index b6ce4d4b6de..d2d59692739 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -855,20 +855,17 @@ xfs_log_unmount_write(xfs_mount_t *mp) } /* xfs_log_unmount_write */ /* - * Shut down and release the AIL and Log. - * - * During unmount, we need to ensure we flush all the dirty metadata objects - * from the AIL so that the log is empty before we write the unmount record to - * the log. + * Empty the log for unmount/freeze. * * To do this, we first need to shut down the background log work so it is not * trying to cover the log as we clean up. We then need to unpin all objects in * the log so we can then flush them out. Once they have completed their IO and * run the callbacks removing themselves from the AIL, we can write the unmount - * record, tear down the AIL and finally free the log. + * record. */ void -xfs_log_unmount(xfs_mount_t *mp) +xfs_log_quiesce( + struct xfs_mount *mp) { cancel_delayed_work_sync(&mp->m_log->l_work); xfs_log_force(mp, XFS_LOG_SYNC); @@ -886,6 +883,20 @@ xfs_log_unmount(xfs_mount_t *mp) xfs_buf_unlock(mp->m_sb_bp); xfs_log_unmount_write(mp); +} + +/* + * Shut down and release the AIL and Log. + * + * During unmount, we need to ensure we flush all the dirty metadata objects + * from the AIL so that the log is empty before we write the unmount record to + * the log. Once this is done, we can tear down the AIL and the log.
+ */ +void +xfs_log_unmount( + struct xfs_mount *mp) +{ + xfs_log_quiesce(mp); xfs_trans_ail_destroy(mp); xlog_dealloc_log(mp->m_log); diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h index 26ed7de352d..5caee96059d 100644 --- a/fs/xfs/xfs_log.h +++ b/fs/xfs/xfs_log.h @@ -183,6 +183,7 @@ bool xfs_log_item_in_current_chkpt(struct xfs_log_item *lip); void xfs_log_work_queue(struct xfs_mount *mp); void xfs_log_worker(struct work_struct *work); +void xfs_log_quiesce(struct xfs_mount *mp); #endif #endif /* __XFS_LOG_H__ */ diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 3bafe66227f..fdedf2cabae 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -1153,15 +1153,11 @@ xfs_restore_resvblks(struct xfs_mount *mp) * * This ensures that the metadata is written to their location on disk rather * than just existing in transactions in the log. This means after a quiesce - * there is no log replay required to write the inodes to disk (this is the main - * difference between a sync and a quiesce). + * there is no log replay required to write the inodes to disk - this is the + * primary difference between a sync and a quiesce. * - * This shoul deffectively mimic the code in xfs_unmountfs() and - * xfs_log_umount() but without tearing down any structures. - * XXX: bug fixes needed! - * - * Note: this stops background log work - the callers must ensure it is started - * again when appropriate. + * Note: xfs_log_quiesce() stops background log work - the callers must ensure + * it is started again when appropriate. */ void xfs_quiesce_attr( @@ -1180,39 +1176,18 @@ xfs_quiesce_attr( xfs_reclaim_inodes(mp, 0); xfs_reclaim_inodes(mp, SYNC_WAIT); - /* flush all pending changes from the AIL */ - xfs_ail_push_all_sync(mp->m_ail); - - /* stop background log work */ - cancel_delayed_work_sync(&mp->m_log->l_work); - - /* - * Just warn here till VFS can correctly support - * read-only remount without racing. - */ - WARN_ON(atomic_read(&mp->m_active_trans) != 0); - /* Push the superblock and write an unmount record */ error = xfs_log_sbcount(mp); if (error) xfs_warn(mp, "xfs_attr_quiesce: failed to log sb changes. " "Frozen image may not be consistent."); - xfs_log_unmount_write(mp); - /* - * At this point we might have modified the superblock again and thus - * added an item to the AIL, thus flush it again. + * Just warn here till VFS can correctly support + * read-only remount without racing. */ - xfs_ail_push_all_sync(mp->m_ail); + WARN_ON(atomic_read(&mp->m_active_trans) != 0); - /* - * The superblock buffer is uncached and xfsaild_push() will lock and - * set the XBF_ASYNC flag on the buffer. We cannot do xfs_buf_iowait() - * here but a lock on the superblock buffer will block until iodone() - * has completed. - */ - xfs_buf_lock(mp->m_sb_bp); - xfs_buf_unlock(mp->m_sb_bp); + xfs_log_quiesce(mp); } STATIC int -- cgit v1.2.3-70-g09d2 From d35e88faa3b0fc2cea35c3b2dca358b5cd09b45f Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 8 Oct 2012 21:56:12 +1100 Subject: xfs: only update the last_sync_lsn when a transaction completes The log write code stamps each iclog with the current tail LSN in the iclog header so that recovery knows where to find the tail of the log once it has found the head. Normally this is taken from the first item on the AIL - the log item that corresponds to the oldest active item in the log. The problem is that when the AIL is empty, the tail lsn is derived from the l_last_sync_lsn, which is the LSN of the last iclog to be written to the log.
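[Editor's note: to make that derivation concrete, here is a minimal sketch of the tail LSN assignment, illustrative only and not part of this patch; xfs_ail_min_lsn() stands in for the AIL helper that returns the LSN of the oldest AIL item, or 0 when the AIL is empty.]

xfs_lsn_t
xlog_assign_tail_lsn(
	struct xfs_mount	*mp)
{
	struct xlog		*log = mp->m_log;
	xfs_lsn_t		tail_lsn;

	/* The oldest active item in the AIL pins the tail of the log. */
	tail_lsn = xfs_ail_min_lsn(mp->m_ail);

	/* Empty AIL: fall back to the LSN of the last iclog written out. */
	if (!tail_lsn)
		tail_lsn = atomic64_read(&log->l_last_sync_lsn);

	atomic64_set(&log->l_tail_lsn, tail_lsn);
	return tail_lsn;
}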
In most cases this doesn't happen, because the AIL is rarely empty on an active filesystem. However, when it does, it opens up an interesting case when the transaction being committed to the iclog spans multiple iclogs. That is, the first iclog is stamped with the l_last_sync_lsn, and IO is issued. Then the next iclog is set up, the changes copied into the iclog (takes some time), and then the l_last_sync_lsn is stamped into the header and IO is issued. This is still the same transaction, so the tail lsn of both iclogs must be the same for log recovery to find the entire transaction to be able to replay it. The problem arises in that the iclog buffer IO completion updates the l_last_sync_lsn with its own LSN. Therefore, if the first iclog completes its IO before the second iclog is filled and has the tail lsn stamped in it, it will stamp the LSN of the first iclog into its tail lsn field. If the system fails at this point, log recovery will not see a complete transaction, so the transaction will not be replayed. The fix is simple - the l_last_sync_lsn is updated when an iclog buffer IO completes, and this is incorrect. The l_last_sync_lsn should be updated when a transaction is completed by an iclog buffer IO. That is, only iclog buffers that have transaction commit callbacks attached to them should update the l_last_sync_lsn. This means that the last_sync_lsn will only move forward when a commit record is written, not in the middle of a large transaction that is rolling through multiple iclog buffers. Signed-off-by: Dave Chinner Reviewed-by: Mark Tinguely Reviewed-by: Christoph Hellwig Signed-off-by: Ben Myers --- fs/xfs/xfs_log.c | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) (limited to 'fs/xfs/xfs_log.c') diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index d2d59692739..46b6986e39b 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -2461,14 +2461,27 @@ xlog_state_do_callback( /* - * update the last_sync_lsn before we drop the + * Completion of a iclog IO does not imply that + * a transaction has completed, as transactions + * can be large enough to span many iclogs. We + * cannot change the tail of the log half way + * through a transaction as this may be the only + * transaction in the log and moving the tail to + * point to the middle of it will prevent + * recovery from finding the start of the + * transaction. Hence we should only update the + * last_sync_lsn if this iclog contains + * transaction completion callbacks on it. + * + * We have to do this before we drop the * icloglock to ensure we are the only one that * can update it. */ ASSERT(XFS_LSN_CMP(atomic64_read(&log->l_last_sync_lsn), be64_to_cpu(iclog->ic_header.h_lsn)) <= 0); - atomic64_set(&log->l_last_sync_lsn, - be64_to_cpu(iclog->ic_header.h_lsn)); + if (iclog->ic_callback) + atomic64_set(&log->l_last_sync_lsn, + be64_to_cpu(iclog->ic_header.h_lsn)); } else ioerrors++; -- cgit v1.2.3-70-g09d2 From c3f8fc73ac97b76a12692088ef9cace9af8422c0 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 12 Nov 2012 22:54:01 +1100 Subject: xfs: make buffer read verification an IO completion function Add a verifier function callback capability to the buffer read interfaces. This will be used by the callers to supply a function that verifies the contents of the buffer when it is read from disk. This patch does not provide callback functions, but simply modifies the interfaces to allow them to be called.
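[Editor's note: a hedged sketch of how a caller would eventually use the new argument. xfs_foo_read_verify() and its content check are hypothetical - this patch itself passes NULL everywhere - and the completion plumbing follows the pattern the follow-up verifier patches adopt.]

/* Hypothetical verifier: runs at I/O completion, only for reads from disk. */
static void
xfs_foo_read_verify(
	struct xfs_buf	*bp)
{
	if (!xfs_foo_magic_ok(bp))		/* hypothetical content check */
		xfs_buf_ioerror(bp, EFSCORRUPTED);

	/* The verifier takes over b_iodone, so it must finish the I/O. */
	bp->b_iodone = NULL;
	xfs_buf_ioend(bp, 0);
}

	/* A read that supplies the verifier instead of NULL: */
	error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, blkno,
				   numblks, 0, &bp, xfs_foo_read_verify);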
The reason for adding this to the read interfaces is that it is very difficult to tell from the outside whether a buffer was just read from disk or whether we just pulled it out of cache. Supplying a callback allows the buffer cache to use its internal knowledge of the buffer to execute it only when the buffer is read from disk. It is intended that the verifier functions will mark the buffer with an EFSCORRUPTED error when verification fails. This allows the reading context to distinguish a verification error from an IO error, and potentially take further actions on the buffer (e.g. attempt repair) based on the error reported. Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Reviewed-by: Phil White Signed-off-by: Ben Myers --- fs/xfs/xfs_alloc.c | 4 ++-- fs/xfs/xfs_attr.c | 2 +- fs/xfs/xfs_btree.c | 21 ++++++++++++--------- fs/xfs/xfs_buf.c | 13 +++++++++---- fs/xfs/xfs_buf.h | 20 ++++++++++++-------- fs/xfs/xfs_da_btree.c | 4 ++-- fs/xfs/xfs_dir2_leaf.c | 2 +- fs/xfs/xfs_dquot.c | 4 ++-- fs/xfs/xfs_fsops.c | 4 ++-- fs/xfs/xfs_ialloc.c | 2 +- fs/xfs/xfs_inode.c | 2 +- fs/xfs/xfs_log.c | 3 +-- fs/xfs/xfs_log_recover.c | 8 +++++--- fs/xfs/xfs_mount.c | 6 +++--- fs/xfs/xfs_qm.c | 5 +++-- fs/xfs/xfs_rtalloc.c | 6 +++--- fs/xfs/xfs_trans.h | 19 ++++++++----------- fs/xfs/xfs_trans_buf.c | 9 ++++++--- fs/xfs/xfs_vnodeops.c | 2 +- 19 files changed, 75 insertions(+), 61 deletions(-) (limited to 'fs/xfs/xfs_log.c') diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c index 335206a9c69..21c3db08fd0 100644 --- a/fs/xfs/xfs_alloc.c +++ b/fs/xfs/xfs_alloc.c @@ -447,7 +447,7 @@ xfs_alloc_read_agfl( error = xfs_trans_read_buf( mp, tp, mp->m_ddev_targp, XFS_AG_DADDR(mp, agno, XFS_AGFL_DADDR(mp)), - XFS_FSS_TO_BB(mp, 1), 0, &bp); + XFS_FSS_TO_BB(mp, 1), 0, &bp, NULL); if (error) return error; ASSERT(!xfs_buf_geterror(bp)); @@ -2110,7 +2110,7 @@ xfs_read_agf( error = xfs_trans_read_buf( mp, tp, mp->m_ddev_targp, XFS_AG_DADDR(mp, agno, XFS_AGF_DADDR(mp)), - XFS_FSS_TO_BB(mp, 1), flags, bpp); + XFS_FSS_TO_BB(mp, 1), flags, bpp, NULL); if (error) return error; if (!*bpp) diff --git a/fs/xfs/xfs_attr.c b/fs/xfs/xfs_attr.c index 55bbe98e8f8..474c57a43cc 100644 --- a/fs/xfs/xfs_attr.c +++ b/fs/xfs/xfs_attr.c @@ -1994,7 +1994,7 @@ xfs_attr_rmtval_get(xfs_da_args_t *args) dblkno = XFS_FSB_TO_DADDR(mp, map[i].br_startblock); blkcnt = XFS_FSB_TO_BB(mp, map[i].br_blockcount); error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, - dblkno, blkcnt, 0, &bp); + dblkno, blkcnt, 0, &bp, NULL); if (error) return(error); diff --git a/fs/xfs/xfs_btree.c b/fs/xfs/xfs_btree.c index 121ea99e615..7e791160092 100644 --- a/fs/xfs/xfs_btree.c +++ b/fs/xfs/xfs_btree.c @@ -266,9 +266,12 @@ xfs_btree_dup_cursor( for (i = 0; i < new->bc_nlevels; i++) { new->bc_ptrs[i] = cur->bc_ptrs[i]; new->bc_ra[i] = cur->bc_ra[i]; - if ((bp = cur->bc_bufs[i])) { - if ((error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, - XFS_BUF_ADDR(bp), mp->m_bsize, 0, &bp))) { + bp = cur->bc_bufs[i]; + if (bp) { + error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, + XFS_BUF_ADDR(bp), mp->m_bsize, + 0, &bp, NULL); + if (error) { xfs_btree_del_cursor(new, error); *ncur = NULL; return error; @@ -624,10 +627,10 @@ xfs_btree_read_bufl( ASSERT(fsbno != NULLFSBLOCK); d = XFS_FSB_TO_DADDR(mp, fsbno); - if ((error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, d, - mp->m_bsize, lock, &bp))) { + error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, d, + mp->m_bsize, lock, &bp, NULL); + if (error) return error; - } ASSERT(!xfs_buf_geterror(bp)); if (bp)
xfs_buf_set_ref(bp, refval); @@ -650,7 +653,7 @@ xfs_btree_reada_bufl( ASSERT(fsbno != NULLFSBLOCK); d = XFS_FSB_TO_DADDR(mp, fsbno); - xfs_buf_readahead(mp->m_ddev_targp, d, mp->m_bsize * count); + xfs_buf_readahead(mp->m_ddev_targp, d, mp->m_bsize * count, NULL); } /* @@ -670,7 +673,7 @@ xfs_btree_reada_bufs( ASSERT(agno != NULLAGNUMBER); ASSERT(agbno != NULLAGBLOCK); d = XFS_AGB_TO_DADDR(mp, agno, agbno); - xfs_buf_readahead(mp->m_ddev_targp, d, mp->m_bsize * count); + xfs_buf_readahead(mp->m_ddev_targp, d, mp->m_bsize * count, NULL); } STATIC int @@ -1013,7 +1016,7 @@ xfs_btree_read_buf_block( d = xfs_btree_ptr_to_daddr(cur, ptr); error = xfs_trans_read_buf(mp, cur->bc_tp, mp->m_ddev_targp, d, - mp->m_bsize, flags, bpp); + mp->m_bsize, flags, bpp, NULL); if (error) return error; diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 4b0b8dd1b7b..0298dd68479 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -654,7 +654,8 @@ xfs_buf_read_map( struct xfs_buftarg *target, struct xfs_buf_map *map, int nmaps, - xfs_buf_flags_t flags) + xfs_buf_flags_t flags, + xfs_buf_iodone_t verify) { struct xfs_buf *bp; @@ -666,6 +667,7 @@ xfs_buf_read_map( if (!XFS_BUF_ISDONE(bp)) { XFS_STATS_INC(xb_get_read); + bp->b_iodone = verify; _xfs_buf_read(bp, flags); } else if (flags & XBF_ASYNC) { /* @@ -691,13 +693,14 @@ void xfs_buf_readahead_map( struct xfs_buftarg *target, struct xfs_buf_map *map, - int nmaps) + int nmaps, + xfs_buf_iodone_t verify) { if (bdi_read_congested(target->bt_bdi)) return; xfs_buf_read_map(target, map, nmaps, - XBF_TRYLOCK|XBF_ASYNC|XBF_READ_AHEAD); + XBF_TRYLOCK|XBF_ASYNC|XBF_READ_AHEAD, verify); } /* @@ -709,7 +712,8 @@ xfs_buf_read_uncached( struct xfs_buftarg *target, xfs_daddr_t daddr, size_t numblks, - int flags) + int flags, + xfs_buf_iodone_t verify) { xfs_buf_t *bp; int error; @@ -723,6 +727,7 @@ xfs_buf_read_uncached( bp->b_bn = daddr; bp->b_maps[0].bm_bn = daddr; bp->b_flags |= XBF_READ; + bp->b_iodone = verify; xfsbdstrat(target->bt_mount, bp); error = xfs_buf_iowait(bp); diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h index 7c0b6a0a155..677b1dc822f 100644 --- a/fs/xfs/xfs_buf.h +++ b/fs/xfs/xfs_buf.h @@ -100,6 +100,7 @@ typedef struct xfs_buftarg { struct xfs_buf; typedef void (*xfs_buf_iodone_t)(struct xfs_buf *); + #define XB_PAGES 2 struct xfs_buf_map { @@ -159,7 +160,6 @@ typedef struct xfs_buf { #endif } xfs_buf_t; - /* Finding and Reading Buffers */ struct xfs_buf *_xfs_buf_find(struct xfs_buftarg *target, struct xfs_buf_map *map, int nmaps, @@ -196,9 +196,10 @@ struct xfs_buf *xfs_buf_get_map(struct xfs_buftarg *target, xfs_buf_flags_t flags); struct xfs_buf *xfs_buf_read_map(struct xfs_buftarg *target, struct xfs_buf_map *map, int nmaps, - xfs_buf_flags_t flags); + xfs_buf_flags_t flags, xfs_buf_iodone_t verify); void xfs_buf_readahead_map(struct xfs_buftarg *target, - struct xfs_buf_map *map, int nmaps); + struct xfs_buf_map *map, int nmaps, + xfs_buf_iodone_t verify); static inline struct xfs_buf * xfs_buf_get( @@ -216,20 +217,22 @@ xfs_buf_read( struct xfs_buftarg *target, xfs_daddr_t blkno, size_t numblks, - xfs_buf_flags_t flags) + xfs_buf_flags_t flags, + xfs_buf_iodone_t verify) { DEFINE_SINGLE_BUF_MAP(map, blkno, numblks); - return xfs_buf_read_map(target, &map, 1, flags); + return xfs_buf_read_map(target, &map, 1, flags, verify); } static inline void xfs_buf_readahead( struct xfs_buftarg *target, xfs_daddr_t blkno, - size_t numblks) + size_t numblks, + xfs_buf_iodone_t verify) { DEFINE_SINGLE_BUF_MAP(map, blkno, numblks); - return 
xfs_buf_readahead_map(target, &map, 1); + return xfs_buf_readahead_map(target, &map, 1, verify); } struct xfs_buf *xfs_buf_get_empty(struct xfs_buftarg *target, size_t numblks); @@ -239,7 +242,8 @@ int xfs_buf_associate_memory(struct xfs_buf *bp, void *mem, size_t length); struct xfs_buf *xfs_buf_get_uncached(struct xfs_buftarg *target, size_t numblks, int flags); struct xfs_buf *xfs_buf_read_uncached(struct xfs_buftarg *target, - xfs_daddr_t daddr, size_t numblks, int flags); + xfs_daddr_t daddr, size_t numblks, int flags, + xfs_buf_iodone_t verify); void xfs_buf_hold(struct xfs_buf *bp); /* Releasing Buffers */ diff --git a/fs/xfs/xfs_da_btree.c b/fs/xfs/xfs_da_btree.c index c62e7e6ff50..4af8bad7068 100644 --- a/fs/xfs/xfs_da_btree.c +++ b/fs/xfs/xfs_da_btree.c @@ -2161,7 +2161,7 @@ xfs_da_read_buf( error = xfs_trans_read_buf_map(dp->i_mount, trans, dp->i_mount->m_ddev_targp, - mapp, nmap, 0, &bp); + mapp, nmap, 0, &bp, NULL); if (error) goto out_free; @@ -2237,7 +2237,7 @@ xfs_da_reada_buf( } mappedbno = mapp[0].bm_bn; - xfs_buf_readahead_map(dp->i_mount->m_ddev_targp, mapp, nmap); + xfs_buf_readahead_map(dp->i_mount->m_ddev_targp, mapp, nmap, NULL); out_free: if (mapp != &map) diff --git a/fs/xfs/xfs_dir2_leaf.c b/fs/xfs/xfs_dir2_leaf.c index 0b296253bd0..bac86984e40 100644 --- a/fs/xfs/xfs_dir2_leaf.c +++ b/fs/xfs/xfs_dir2_leaf.c @@ -926,7 +926,7 @@ xfs_dir2_leaf_readbuf( XFS_FSB_TO_DADDR(mp, map[mip->ra_index].br_startblock + mip->ra_offset), - (int)BTOBB(mp->m_dirblksize)); + (int)BTOBB(mp->m_dirblksize), NULL); mip->ra_current = i; } diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c index bf27fcca484..e95f800333d 100644 --- a/fs/xfs/xfs_dquot.c +++ b/fs/xfs/xfs_dquot.c @@ -439,7 +439,7 @@ xfs_qm_dqtobp( error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, dqp->q_blkno, mp->m_quotainfo->qi_dqchunklen, - 0, &bp); + 0, &bp, NULL); if (error || !bp) return XFS_ERROR(error); } @@ -920,7 +920,7 @@ xfs_qm_dqflush( * Get the buffer containing the on-disk dquot */ error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, dqp->q_blkno, - mp->m_quotainfo->qi_dqchunklen, 0, &bp); + mp->m_quotainfo->qi_dqchunklen, 0, &bp, NULL); if (error) goto out_unlock; diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index bd9cb7f0b07..5440768ec41 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c @@ -168,7 +168,7 @@ xfs_growfs_data_private( dpct = pct - mp->m_sb.sb_imax_pct; bp = xfs_buf_read_uncached(mp->m_ddev_targp, XFS_FSB_TO_BB(mp, nb) - XFS_FSS_TO_BB(mp, 1), - XFS_FSS_TO_BB(mp, 1), 0); + XFS_FSS_TO_BB(mp, 1), 0, NULL); if (!bp) return EIO; xfs_buf_relse(bp); @@ -439,7 +439,7 @@ xfs_growfs_data_private( if (agno < oagcount) { error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, XFS_AGB_TO_DADDR(mp, agno, XFS_SB_BLOCK(mp)), - XFS_FSS_TO_BB(mp, 1), 0, &bp); + XFS_FSS_TO_BB(mp, 1), 0, &bp, NULL); } else { bp = xfs_trans_get_buf(NULL, mp->m_ddev_targp, XFS_AGB_TO_DADDR(mp, agno, XFS_SB_BLOCK(mp)), diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c index 37753e1c853..12e3dead439 100644 --- a/fs/xfs/xfs_ialloc.c +++ b/fs/xfs/xfs_ialloc.c @@ -1490,7 +1490,7 @@ xfs_read_agi( error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR(mp)), - XFS_FSS_TO_BB(mp, 1), 0, bpp); + XFS_FSS_TO_BB(mp, 1), 0, bpp, NULL); if (error) return error; diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 7449cb943ef..8d696301048 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -408,7 +408,7 @@ xfs_imap_to_bp( buf_flags |= XBF_UNMAPPED; error = 
xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, imap->im_blkno, - (int)imap->im_len, buf_flags, &bp); + (int)imap->im_len, buf_flags, &bp, NULL); if (error) { if (error != EAGAIN) { xfs_warn(mp, diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 46b6986e39b..1d6d2ee0849 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -1129,8 +1129,7 @@ xlog_iodone(xfs_buf_t *bp) * with it being freed after writing the unmount record to the * log. */ - -} /* xlog_iodone */ +} /* * Return size of each in-core log record buffer. diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 3e06333d4bd..eb1e29ff0c7 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -2144,7 +2144,7 @@ xlog_recover_buffer_pass2( buf_flags |= XBF_UNMAPPED; bp = xfs_buf_read(mp->m_ddev_targp, buf_f->blf_blkno, buf_f->blf_len, - buf_flags); + buf_flags, NULL); if (!bp) return XFS_ERROR(ENOMEM); error = bp->b_error; @@ -2237,7 +2237,8 @@ xlog_recover_inode_pass2( } trace_xfs_log_recover_inode_recover(log, in_f); - bp = xfs_buf_read(mp->m_ddev_targp, in_f->ilf_blkno, in_f->ilf_len, 0); + bp = xfs_buf_read(mp->m_ddev_targp, in_f->ilf_blkno, in_f->ilf_len, 0, + NULL); if (!bp) { error = ENOMEM; goto error; @@ -2548,7 +2549,8 @@ xlog_recover_dquot_pass2( ASSERT(dq_f->qlf_len == 1); error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, dq_f->qlf_blkno, - XFS_FSB_TO_BB(mp, dq_f->qlf_len), 0, &bp); + XFS_FSB_TO_BB(mp, dq_f->qlf_len), 0, &bp, + NULL); if (error) return error; diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index 41ae7e1590f..d5402b0eb6a 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -652,7 +652,7 @@ xfs_readsb(xfs_mount_t *mp, int flags) reread: bp = xfs_buf_read_uncached(mp->m_ddev_targp, XFS_SB_DADDR, - BTOBB(sector_size), 0); + BTOBB(sector_size), 0, NULL); if (!bp) { if (loud) xfs_warn(mp, "SB buffer read failed"); @@ -1002,7 +1002,7 @@ xfs_check_sizes(xfs_mount_t *mp) } bp = xfs_buf_read_uncached(mp->m_ddev_targp, d - XFS_FSS_TO_BB(mp, 1), - XFS_FSS_TO_BB(mp, 1), 0); + XFS_FSS_TO_BB(mp, 1), 0, NULL); if (!bp) { xfs_warn(mp, "last sector read failed"); return EIO; @@ -1017,7 +1017,7 @@ xfs_check_sizes(xfs_mount_t *mp) } bp = xfs_buf_read_uncached(mp->m_logdev_targp, d - XFS_FSB_TO_BB(mp, 1), - XFS_FSB_TO_BB(mp, 1), 0); + XFS_FSB_TO_BB(mp, 1), 0, NULL); if (!bp) { xfs_warn(mp, "log device read failed"); return EIO; diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index 48c750b0e83..688f608b366 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c @@ -892,7 +892,7 @@ xfs_qm_dqiter_bufs( while (blkcnt--) { error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, XFS_FSB_TO_DADDR(mp, bno), - mp->m_quotainfo->qi_dqchunklen, 0, &bp); + mp->m_quotainfo->qi_dqchunklen, 0, &bp, NULL); if (error) break; @@ -979,7 +979,8 @@ xfs_qm_dqiterate( while (rablkcnt--) { xfs_buf_readahead(mp->m_ddev_targp, XFS_FSB_TO_DADDR(mp, rablkno), - mp->m_quotainfo->qi_dqchunklen); + mp->m_quotainfo->qi_dqchunklen, + NULL); rablkno++; } } diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c index a69e0b4750a..b271ed939d7 100644 --- a/fs/xfs/xfs_rtalloc.c +++ b/fs/xfs/xfs_rtalloc.c @@ -870,7 +870,7 @@ xfs_rtbuf_get( ASSERT(map.br_startblock != NULLFSBLOCK); error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, XFS_FSB_TO_DADDR(mp, map.br_startblock), - mp->m_bsize, 0, &bp); + mp->m_bsize, 0, &bp, NULL); if (error) return error; ASSERT(!xfs_buf_geterror(bp)); @@ -1873,7 +1873,7 @@ xfs_growfs_rt( */ bp = xfs_buf_read_uncached(mp->m_rtdev_targp, XFS_FSB_TO_BB(mp, nrblocks - 1), - XFS_FSB_TO_BB(mp, 1), 
0); + XFS_FSB_TO_BB(mp, 1), 0, NULL); if (!bp) return EIO; xfs_buf_relse(bp); @@ -2220,7 +2220,7 @@ xfs_rtmount_init( } bp = xfs_buf_read_uncached(mp->m_rtdev_targp, d - XFS_FSB_TO_BB(mp, 1), - XFS_FSB_TO_BB(mp, 1), 0); + XFS_FSB_TO_BB(mp, 1), 0, NULL); if (!bp) { xfs_warn(mp, "realtime device size check failed"); return EIO; diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h index db056544cbb..f02d4029650 100644 --- a/fs/xfs/xfs_trans.h +++ b/fs/xfs/xfs_trans.h @@ -464,10 +464,7 @@ xfs_trans_get_buf( int numblks, uint flags) { - struct xfs_buf_map map = { - .bm_bn = blkno, - .bm_len = numblks, - }; + DEFINE_SINGLE_BUF_MAP(map, blkno, numblks); return xfs_trans_get_buf_map(tp, target, &map, 1, flags); } @@ -476,7 +473,8 @@ int xfs_trans_read_buf_map(struct xfs_mount *mp, struct xfs_buftarg *target, struct xfs_buf_map *map, int nmaps, xfs_buf_flags_t flags, - struct xfs_buf **bpp); + struct xfs_buf **bpp, + xfs_buf_iodone_t verify); static inline int xfs_trans_read_buf( @@ -486,13 +484,12 @@ xfs_trans_read_buf( xfs_daddr_t blkno, int numblks, xfs_buf_flags_t flags, - struct xfs_buf **bpp) + struct xfs_buf **bpp, + xfs_buf_iodone_t verify) { - struct xfs_buf_map map = { - .bm_bn = blkno, - .bm_len = numblks, - }; - return xfs_trans_read_buf_map(mp, tp, target, &map, 1, flags, bpp); + DEFINE_SINGLE_BUF_MAP(map, blkno, numblks); + return xfs_trans_read_buf_map(mp, tp, target, &map, 1, + flags, bpp, verify); } struct xfs_buf *xfs_trans_getsb(xfs_trans_t *, struct xfs_mount *, int); diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c index 6311b99c267..977628207b4 100644 --- a/fs/xfs/xfs_trans_buf.c +++ b/fs/xfs/xfs_trans_buf.c @@ -257,7 +257,8 @@ xfs_trans_read_buf_map( struct xfs_buf_map *map, int nmaps, xfs_buf_flags_t flags, - struct xfs_buf **bpp) + struct xfs_buf **bpp, + xfs_buf_iodone_t verify) { xfs_buf_t *bp; xfs_buf_log_item_t *bip; @@ -265,7 +266,7 @@ xfs_trans_read_buf_map( *bpp = NULL; if (!tp) { - bp = xfs_buf_read_map(target, map, nmaps, flags); + bp = xfs_buf_read_map(target, map, nmaps, flags, verify); if (!bp) return (flags & XBF_TRYLOCK) ? EAGAIN : XFS_ERROR(ENOMEM); @@ -312,7 +313,9 @@ xfs_trans_read_buf_map( if (!(XFS_BUF_ISDONE(bp))) { trace_xfs_trans_read_buf_io(bp, _RET_IP_); ASSERT(!XFS_BUF_ISASYNC(bp)); + ASSERT(bp->b_iodone == NULL); XFS_BUF_READ(bp); + bp->b_iodone = verify; xfsbdstrat(tp->t_mountp, bp); error = xfs_buf_iowait(bp); if (error) { @@ -349,7 +352,7 @@ xfs_trans_read_buf_map( return 0; } - bp = xfs_buf_read_map(target, map, nmaps, flags); + bp = xfs_buf_read_map(target, map, nmaps, flags, verify); if (bp == NULL) { *bpp = NULL; return (flags & XBF_TRYLOCK) ? diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c index 81c61fd1789..26880793fec 100644 --- a/fs/xfs/xfs_vnodeops.c +++ b/fs/xfs/xfs_vnodeops.c @@ -80,7 +80,7 @@ xfs_readlink_bmap( d = XFS_FSB_TO_DADDR(mp, mval[n].br_startblock); byte_cnt = XFS_FSB_TO_B(mp, mval[n].br_blockcount); - bp = xfs_buf_read(mp->m_ddev_targp, d, BTOBB(byte_cnt), 0); + bp = xfs_buf_read(mp->m_ddev_targp, d, BTOBB(byte_cnt), 0, NULL); if (!bp) return XFS_ERROR(ENOMEM); error = bp->b_error; -- cgit v1.2.3-70-g09d2 From 0e446be44806240c779666591bb9e8cb0e86a50d Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 12 Nov 2012 22:54:24 +1100 Subject: xfs: add CRC checks to the log Implement CRCs for the log buffers. We re-use a field in struct xlog_rec_header that was used for a weak checksum of the log buffer payload in debug builds before. 
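[Editor's note: a plausible sketch of the xfs_start_cksum()/xfs_end_cksum() helpers the new code relies on. The patch only shows their callers, so treat the details below as assumptions about the new xfs_cksum.h rather than its actual contents.]

static inline __uint32_t
xfs_start_cksum(char *buffer, size_t length, unsigned long crc_offset)
{
	__uint32_t zero = 0;
	__uint32_t crc;

	/* CRC the bytes leading up to the checksum field ... */
	crc = crc32c(~(__uint32_t)0, buffer, crc_offset);

	/* ... feed zeros in place of the checksum field itself ... */
	crc = crc32c(crc, &zero, sizeof(zero));

	/* ... then CRC the remainder of the buffer. */
	return crc32c(crc, &buffer[crc_offset + sizeof(zero)],
		      length - (crc_offset + sizeof(zero)));
}

static inline __be32
xfs_end_cksum(__uint32_t crc)
{
	/* Inverted, LE-swapped result; this annotation mismatch is what
	 * the sparse endian fix at the end of this series cleans up. */
	return ~cpu_to_le32(crc);
}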
The new checksumming uses the crc32c checksum we will use elsewhere in XFS, and also protects the record header and additional cycle data. Due to this there are some interesting changes in xlog_sync, as we need to do the cycle wrapping for the split buffer case much earlier, as we would touch the buffer after generating the checksum otherwise. The CRC calculation is always enabled, even for non-CRC filesystems, as adding this CRC does not change the log format. On non-CRC filesystems, only issue an alert if a CRC mismatch is found and allow recovery to continue - this will act as an indicator that log recovery problems are a result of log corruption. On CRC enabled filesystems, however, log recovery will fail. Note that existing debug kernels will write a simple checksum value to the log, so the first time this is run on a filesystem that was last used on a debug kernel it will throw CRC mismatch warnings. These can be ignored. Initially based on a patch from Dave Chinner, then modified significantly by Christoph Hellwig. Modified again by Dave Chinner to get to this version. Signed-off-by: Christoph Hellwig Signed-off-by: Dave Chinner Reviewed-by: Mark Tinguely Signed-off-by: Ben Myers --- fs/xfs/xfs_log.c | 132 ++++++++++++++++++++++++++++++++++++++--------- fs/xfs/xfs_log_priv.h | 11 ++-- fs/xfs/xfs_log_recover.c | 132 ++++++++++++++++++++++------------------------- 3 files changed, 176 insertions(+), 99 deletions(-) (limited to 'fs/xfs/xfs_log.c') diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 1d6d2ee0849..c6d6e136ba7 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -35,6 +35,7 @@ #include "xfs_inode.h" #include "xfs_trace.h" #include "xfs_fsops.h" +#include "xfs_cksum.h" kmem_zone_t *xfs_log_ticket_zone; @@ -1489,6 +1490,84 @@ xlog_grant_push_ail( xfs_ail_push(log->l_ailp, threshold_lsn); } +/* + * Stamp cycle number in every block + */ +STATIC void +xlog_pack_data( + struct xlog *log, + struct xlog_in_core *iclog, + int roundoff) +{ + int i, j, k; + int size = iclog->ic_offset + roundoff; + __be32 cycle_lsn; + xfs_caddr_t dp; + + cycle_lsn = CYCLE_LSN_DISK(iclog->ic_header.h_lsn); + + dp = iclog->ic_datap; + for (i = 0; i < BTOBB(size); i++) { + if (i >= (XLOG_HEADER_CYCLE_SIZE / BBSIZE)) + break; + iclog->ic_header.h_cycle_data[i] = *(__be32 *)dp; + *(__be32 *)dp = cycle_lsn; + dp += BBSIZE; + } + + if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) { + xlog_in_core_2_t *xhdr = iclog->ic_data; + + for ( ; i < BTOBB(size); i++) { + j = i / (XLOG_HEADER_CYCLE_SIZE / BBSIZE); + k = i % (XLOG_HEADER_CYCLE_SIZE / BBSIZE); + xhdr[j].hic_xheader.xh_cycle_data[k] = *(__be32 *)dp; + *(__be32 *)dp = cycle_lsn; + dp += BBSIZE; + } + + for (i = 1; i < log->l_iclog_heads; i++) + xhdr[i].hic_xheader.xh_cycle = cycle_lsn; + } +} + +/* + * Calculate the checksum for a log buffer. + * + * This is a little more complicated than it should be because the various + * headers and the actual data are non-contiguous. + */ +__be32 +xlog_cksum( + struct xlog *log, + struct xlog_rec_header *rhead, + char *dp, + int size) +{ + __uint32_t crc; + + /* first generate the crc for the record header ... */ + crc = xfs_start_cksum((char *)rhead, + sizeof(struct xlog_rec_header), + offsetof(struct xlog_rec_header, h_crc)); + + /* ... then for additional cycle data for v2 logs ... 
*/ + if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) { + union xlog_in_core2 *xhdr = (union xlog_in_core2 *)rhead; + int i; + + for (i = 1; i < log->l_iclog_heads; i++) { + crc = crc32c(crc, &xhdr[i].hic_xheader, + sizeof(struct xlog_rec_ext_header)); + } + } + + /* ... and finally for the payload */ + crc = crc32c(crc, dp, size); + + return xfs_end_cksum(crc); +} + /* * The bdstrat callback function for log bufs. This gives us a central * place to trap bufs in case we get hit by a log I/O error and need to @@ -1549,7 +1628,6 @@ xlog_sync( struct xlog *log, struct xlog_in_core *iclog) { - xfs_caddr_t dptr; /* pointer to byte sized element */ xfs_buf_t *bp; int i; uint count; /* byte count of bwrite */ @@ -1558,6 +1636,7 @@ xlog_sync( int split = 0; /* split write into two regions */ int error; int v2 = xfs_sb_version_haslogv2(&log->l_mp->m_sb); + int size; XFS_STATS_INC(xs_log_writes); ASSERT(atomic_read(&iclog->ic_refcnt) == 0); @@ -1588,13 +1667,10 @@ xlog_sync( xlog_pack_data(log, iclog, roundoff); /* real byte length */ - if (v2) { - iclog->ic_header.h_len = - cpu_to_be32(iclog->ic_offset + roundoff); - } else { - iclog->ic_header.h_len = - cpu_to_be32(iclog->ic_offset); - } + size = iclog->ic_offset; + if (v2) + size += roundoff; + iclog->ic_header.h_len = cpu_to_be32(size); bp = iclog->ic_bp; XFS_BUF_SET_ADDR(bp, BLOCK_LSN(be64_to_cpu(iclog->ic_header.h_lsn))); @@ -1603,12 +1679,36 @@ xlog_sync( /* Do we need to split this write into 2 parts? */ if (XFS_BUF_ADDR(bp) + BTOBB(count) > log->l_logBBsize) { + char *dptr; + split = count - (BBTOB(log->l_logBBsize - XFS_BUF_ADDR(bp))); count = BBTOB(log->l_logBBsize - XFS_BUF_ADDR(bp)); - iclog->ic_bwritecnt = 2; /* split into 2 writes */ + iclog->ic_bwritecnt = 2; + + /* + * Bump the cycle numbers at the start of each block in the + * part of the iclog that ends up in the buffer that gets + * written to the start of the log. + * + * Watch out for the header magic number case, though. + */ + dptr = (char *)&iclog->ic_header + count; + for (i = 0; i < split; i += BBSIZE) { + __uint32_t cycle = be32_to_cpu(*(__be32 *)dptr); + if (++cycle == XLOG_HEADER_MAGIC_NUM) + cycle++; + *(__be32 *)dptr = cpu_to_be32(cycle); + + dptr += BBSIZE; + } } else { iclog->ic_bwritecnt = 1; } + + /* calculate the checksum */ + iclog->ic_header.h_crc = xlog_cksum(log, &iclog->ic_header, + iclog->ic_datap, size); + bp->b_io_length = BTOBB(count); bp->b_fspriv = iclog; XFS_BUF_ZEROFLAGS(bp); @@ -1662,19 +1762,6 @@ xlog_sync( bp->b_flags |= XBF_SYNCIO; if (log->l_mp->m_flags & XFS_MOUNT_BARRIER) bp->b_flags |= XBF_FUA; - dptr = bp->b_addr; - /* - * Bump the cycle numbers at the start of each block - * since this part of the buffer is at the start of - * a new cycle. Watch out for the header magic number - * case, though. 
- */ - for (i = 0; i < split; i += BBSIZE) { - be32_add_cpu((__be32 *)dptr, 1); - if (be32_to_cpu(*(__be32 *)dptr) == XLOG_HEADER_MAGIC_NUM) - be32_add_cpu((__be32 *)dptr, 1); - dptr += BBSIZE; - } ASSERT(XFS_BUF_ADDR(bp) <= log->l_logBBsize-1); ASSERT(XFS_BUF_ADDR(bp) + BTOBB(count) <= log->l_logBBsize); @@ -1691,7 +1778,6 @@ xlog_sync( return 0; } /* xlog_sync */ - /* * Deallocate a log structure */ diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h index 9a4e0e5ec32..dc3498bf17c 100644 --- a/fs/xfs/xfs_log_priv.h +++ b/fs/xfs/xfs_log_priv.h @@ -139,7 +139,6 @@ static inline uint xlog_get_client_id(__be32 i) /* * Flags for log structure */ -#define XLOG_CHKSUM_MISMATCH 0x1 /* used only during recovery */ #define XLOG_ACTIVE_RECOVERY 0x2 /* in the middle of recovery */ #define XLOG_RECOVERY_NEEDED 0x4 /* log was recovered */ #define XLOG_IO_ERROR 0x8 /* log hit an I/O error, and being @@ -291,7 +290,7 @@ typedef struct xlog_rec_header { __be32 h_len; /* len in bytes; should be 64-bit aligned: 4 */ __be64 h_lsn; /* lsn of this LR : 8 */ __be64 h_tail_lsn; /* lsn of 1st LR w/ buffers not committed: 8 */ - __be32 h_chksum; /* may not be used; non-zero if used : 4 */ + __le32 h_crc; /* crc of log record : 4 */ __be32 h_prev_block; /* block number to previous LR : 4 */ __be32 h_num_logops; /* number of log operations in this LR : 4 */ __be32 h_cycle_data[XLOG_HEADER_CYCLE_SIZE / BBSIZE]; @@ -555,11 +554,9 @@ xlog_recover( extern int xlog_recover_finish( struct xlog *log); -extern void -xlog_pack_data( - struct xlog *log, - struct xlog_in_core *iclog, - int); + +extern __be32 xlog_cksum(struct xlog *log, struct xlog_rec_header *rhead, + char *dp, int size); extern kmem_zone_t *xfs_log_ticket_zone; struct xlog_ticket * diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 931e8e23f19..9c3651c9e75 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -41,6 +41,7 @@ #include "xfs_trans_priv.h" #include "xfs_quota.h" #include "xfs_utils.h" +#include "xfs_cksum.h" #include "xfs_trace.h" #include "xfs_icache.h" @@ -3216,80 +3217,58 @@ xlog_recover_process_iunlinks( mp->m_dmevmask = mp_dmevmask; } - -#ifdef DEBUG -STATIC void -xlog_pack_data_checksum( - struct xlog *log, - struct xlog_in_core *iclog, - int size) -{ - int i; - __be32 *up; - uint chksum = 0; - - up = (__be32 *)iclog->ic_datap; - /* divide length by 4 to get # words */ - for (i = 0; i < (size >> 2); i++) { - chksum ^= be32_to_cpu(*up); - up++; - } - iclog->ic_header.h_chksum = cpu_to_be32(chksum); -} -#else -#define xlog_pack_data_checksum(log, iclog, size) -#endif - /* - * Stamp cycle number in every block + * Unpack the log buffer data and crc check it. If the check fails, issue a + * warning if and only if the CRC in the header is non-zero. This makes the + * check an advisory warning, and the zero CRC check will prevent failure + * warnings from being emitted when upgrading the kernel from one that does not + * add CRCs by default. 
+ * + * When filesystems are CRC enabled, this CRC mismatch becomes a fatal log + * corruption failure */ -void -xlog_pack_data( - struct xlog *log, - struct xlog_in_core *iclog, - int roundoff) +STATIC int +xlog_unpack_data_crc( + struct xlog_rec_header *rhead, + xfs_caddr_t dp, + struct xlog *log) { - int i, j, k; - int size = iclog->ic_offset + roundoff; - __be32 cycle_lsn; - xfs_caddr_t dp; - - xlog_pack_data_checksum(log, iclog, size); - - cycle_lsn = CYCLE_LSN_DISK(iclog->ic_header.h_lsn); - - dp = iclog->ic_datap; - for (i = 0; i < BTOBB(size) && - i < (XLOG_HEADER_CYCLE_SIZE / BBSIZE); i++) { - iclog->ic_header.h_cycle_data[i] = *(__be32 *)dp; - *(__be32 *)dp = cycle_lsn; - dp += BBSIZE; - } - - if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) { - xlog_in_core_2_t *xhdr = iclog->ic_data; - - for ( ; i < BTOBB(size); i++) { - j = i / (XLOG_HEADER_CYCLE_SIZE / BBSIZE); - k = i % (XLOG_HEADER_CYCLE_SIZE / BBSIZE); - xhdr[j].hic_xheader.xh_cycle_data[k] = *(__be32 *)dp; - *(__be32 *)dp = cycle_lsn; - dp += BBSIZE; + __be32 crc; + + crc = xlog_cksum(log, rhead, dp, be32_to_cpu(rhead->h_len)); + if (crc != rhead->h_crc) { + if (rhead->h_crc || xfs_sb_version_hascrc(&log->l_mp->m_sb)) { + xfs_alert(log->l_mp, + "log record CRC mismatch: found 0x%x, expected 0x%x.\n", + be32_to_cpu(rhead->h_crc), + be32_to_cpu(crc)); + xfs_hex_dump(dp, 32); } - for (i = 1; i < log->l_iclog_heads; i++) { - xhdr[i].hic_xheader.xh_cycle = cycle_lsn; - } + /* + * If we've detected a log record corruption, then we can't + * recover past this point. Abort recovery if we are enforcing + * CRC protection by punting an error back up the stack. + */ + if (xfs_sb_version_hascrc(&log->l_mp->m_sb)) + return EFSCORRUPTED; } + + return 0; } -STATIC void +STATIC int xlog_unpack_data( struct xlog_rec_header *rhead, xfs_caddr_t dp, struct xlog *log) { int i, j, k; + int error; + + error = xlog_unpack_data_crc(rhead, dp, log); + if (error) + return error; for (i = 0; i < BTOBB(be32_to_cpu(rhead->h_len)) && i < (XLOG_HEADER_CYCLE_SIZE / BBSIZE); i++) { @@ -3306,6 +3285,8 @@ xlog_unpack_data( dp += BBSIZE; } } + + return 0; } STATIC int @@ -3437,9 +3418,13 @@ xlog_do_recovery_pass( if (error) goto bread_err2; - xlog_unpack_data(rhead, offset, log); - if ((error = xlog_recover_process_data(log, - rhash, rhead, offset, pass))) + error = xlog_unpack_data(rhead, offset, log); + if (error) + goto bread_err2; + + error = xlog_recover_process_data(log, + rhash, rhead, offset, pass); + if (error) goto bread_err2; blk_no += bblks + hblks; } @@ -3549,9 +3534,14 @@ xlog_do_recovery_pass( if (error) goto bread_err2; } - xlog_unpack_data(rhead, offset, log); - if ((error = xlog_recover_process_data(log, rhash, - rhead, offset, pass))) + + error = xlog_unpack_data(rhead, offset, log); + if (error) + goto bread_err2; + + error = xlog_recover_process_data(log, rhash, + rhead, offset, pass); + if (error) goto bread_err2; blk_no += bblks; } @@ -3576,9 +3566,13 @@ xlog_do_recovery_pass( if (error) goto bread_err2; - xlog_unpack_data(rhead, offset, log); - if ((error = xlog_recover_process_data(log, rhash, - rhead, offset, pass))) + error = xlog_unpack_data(rhead, offset, log); + if (error) + goto bread_err2; + + error = xlog_recover_process_data(log, rhash, + rhead, offset, pass); + if (error) goto bread_err2; blk_no += bblks + hblks; } -- cgit v1.2.3-70-g09d2 From 437a255aa23766666aec78af63be4c253faa8d57 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Wed, 28 Nov 2012 13:01:00 +1100 Subject: xfs: fix direct IO nested transaction deadlock. 
The direct IO path can do a nested transaction reservation when writing past the EOF. The first transaction is the append transaction for setting the filesize at IO completion, but we can also need a transaction for allocation of blocks. If the log is low on space due to reservations and a small log, the append transaction can be granted after waiting for space as the only active transaction in the system. This then attempts a reservation for an allocation, which there isn't space in the log for, and the reservation sleeps. The result is that there is nothing left in the system to wake up all the processes waiting for log space to come free. The stack trace that shows this deadlock is relatively innocuous: xlog_grant_head_wait xlog_grant_head_check xfs_log_reserve xfs_trans_reserve xfs_iomap_write_direct __xfs_get_blocks xfs_get_blocks_direct do_blockdev_direct_IO __blockdev_direct_IO xfs_vm_direct_IO generic_file_direct_write xfs_file_dio_aio_write xfs_file_aio_write do_sync_write vfs_write This was discovered on a filesystem with a log of only 10MB, and a log stripe unit of 256k which increased the base reservations by 512k. Hence an allocation transaction requires 1.2MB of log space to be available instead of only 260k, and so greatly increased the chance that there wouldn't be enough log space available for the nested transaction to succeed. The key to reproducing it is this mkfs command: mkfs.xfs -f -d agcount=16,su=256k,sw=12 -l su=256k,size=2560b $SCRATCH_DEV The test case was 1000 fsstress processes running with random freezes and unfreezes every few seconds. Thanks to Eryu Guan (eguan@redhat.com) for writing the test that found this on a system with a somewhat unique default configuration.... cc: Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Reviewed-by: Andrew Dahl Signed-off-by: Ben Myers --- fs/xfs/xfs_aops.c | 81 ++++++++++++++++++++----------------------------------- fs/xfs/xfs_log.c | 3 ++- 2 files changed, 31 insertions(+), 53 deletions(-) (limited to 'fs/xfs/xfs_log.c') diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 71361da1f77..4111a40ebe1 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -124,7 +124,7 @@ xfs_setfilesize_trans_alloc( ioend->io_append_trans = tp; /* - * We will pass freeze protection with a transaction. So tell lockdep + * We may pass freeze protection with a transaction. So tell lockdep * we released it. */ rwsem_release(&ioend->io_inode->i_sb->s_writers.lock_map[SB_FREEZE_FS-1], @@ -149,11 +149,13 @@ xfs_setfilesize( xfs_fsize_t isize; /* - * The transaction was allocated in the I/O submission thread, - * thus we need to mark ourselves as beeing in a transaction - * manually. + * The transaction may have been allocated in the I/O submission thread, + * thus we need to mark ourselves as being in a transaction manually. + * Similarly for freeze protection. 
 	 */
 	current_set_flags_nested(&tp->t_pflags, PF_FSTRANS);
+	rwsem_acquire_read(&VFS_I(ip)->i_sb->s_writers.lock_map[SB_FREEZE_FS-1],
+			   0, 1, _THIS_IP_);
 
 	xfs_ilock(ip, XFS_ILOCK_EXCL);
 	isize = xfs_new_eof(ip, ioend->io_offset + ioend->io_size);
@@ -187,7 +189,8 @@ xfs_finish_ioend(
 
 		if (ioend->io_type == XFS_IO_UNWRITTEN)
 			queue_work(mp->m_unwritten_workqueue, &ioend->io_work);
-		else if (ioend->io_append_trans)
+		else if (ioend->io_append_trans ||
+			 (ioend->io_isdirect && xfs_ioend_is_append(ioend)))
 			queue_work(mp->m_data_workqueue, &ioend->io_work);
 		else
 			xfs_destroy_ioend(ioend);
@@ -205,15 +208,6 @@ xfs_end_io(
 	struct xfs_inode *ip = XFS_I(ioend->io_inode);
 	int		error = 0;
 
-	if (ioend->io_append_trans) {
-		/*
-		 * We've got freeze protection passed with the transaction.
-		 * Tell lockdep about it.
-		 */
-		rwsem_acquire_read(
-			&ioend->io_inode->i_sb->s_writers.lock_map[SB_FREEZE_FS-1],
-			0, 1, _THIS_IP_);
-	}
 	if (XFS_FORCED_SHUTDOWN(ip->i_mount)) {
 		ioend->io_error = -EIO;
 		goto done;
@@ -226,35 +220,31 @@ xfs_end_io(
 	 * range to normal written extens after the data I/O has finished.
 	 */
 	if (ioend->io_type == XFS_IO_UNWRITTEN) {
+		error = xfs_iomap_write_unwritten(ip, ioend->io_offset,
+						  ioend->io_size);
+	} else if (ioend->io_isdirect && xfs_ioend_is_append(ioend)) {
 		/*
-		 * For buffered I/O we never preallocate a transaction when
-		 * doing the unwritten extent conversion, but for direct I/O
-		 * we do not know if we are converting an unwritten extent
-		 * or not at the point where we preallocate the transaction.
+		 * For direct I/O we do not know if we need to allocate blocks
+		 * or not so we can't preallocate an append transaction as that
+		 * results in nested reservations and log space deadlocks. Hence
+		 * allocate the transaction here. While this is sub-optimal and
+		 * can block IO completion for some time, we're stuck with doing
+		 * it this way until we can pass the ioend to the direct IO
+		 * allocation callbacks and avoid nesting that way.
 		 */
-		if (ioend->io_append_trans) {
-			ASSERT(ioend->io_isdirect);
-
-			current_set_flags_nested(
-				&ioend->io_append_trans->t_pflags, PF_FSTRANS);
-			xfs_trans_cancel(ioend->io_append_trans, 0);
-		}
-
-		error = xfs_iomap_write_unwritten(ip, ioend->io_offset,
-						  ioend->io_size);
-		if (error) {
-			ioend->io_error = -error;
+		error = xfs_setfilesize_trans_alloc(ioend);
+		if (error)
 			goto done;
-		}
+		error = xfs_setfilesize(ioend);
 	} else if (ioend->io_append_trans) {
 		error = xfs_setfilesize(ioend);
-		if (error)
-			ioend->io_error = -error;
 	} else {
 		ASSERT(!xfs_ioend_is_append(ioend));
 	}
 
 done:
+	if (error)
+		ioend->io_error = -error;
 	xfs_destroy_ioend(ioend);
 }
 
@@ -1432,25 +1422,21 @@ xfs_vm_direct_IO(
 		size_t size = iov_length(iov, nr_segs);
 
 		/*
-		 * We need to preallocate a transaction for a size update
-		 * here. In the case that this write both updates the size
-		 * and converts at least on unwritten extent we will cancel
-		 * the still clean transaction after the I/O has finished.
+		 * We cannot preallocate a size update transaction here as we
+		 * don't know whether allocation is necessary or not. Hence we
+		 * can only tell IO completion that one is necessary if we are
+		 * not doing unwritten extent conversion.
 		 */
 		iocb->private = ioend = xfs_alloc_ioend(inode, XFS_IO_DIRECT);
 
-		if (offset + size > XFS_I(inode)->i_d.di_size) {
-			ret = xfs_setfilesize_trans_alloc(ioend);
-			if (ret)
-				goto out_destroy_ioend;
+		if (offset + size > XFS_I(inode)->i_d.di_size)
 			ioend->io_isdirect = 1;
-		}
 
 		ret = __blockdev_direct_IO(rw, iocb, inode, bdev, iov,
 					    offset, nr_segs,
 					    xfs_get_blocks_direct,
 					    xfs_end_io_direct_write, NULL, 0);
 		if (ret != -EIOCBQUEUED && iocb->private)
-			goto out_trans_cancel;
+			goto out_destroy_ioend;
 	} else {
 		ret = __blockdev_direct_IO(rw, iocb, inode, bdev, iov,
 					    offset, nr_segs,
@@ -1460,15 +1446,6 @@ xfs_vm_direct_IO(
 
 	return ret;
 
-out_trans_cancel:
-	if (ioend->io_append_trans) {
-		current_set_flags_nested(&ioend->io_append_trans->t_pflags,
-					 PF_FSTRANS);
-		rwsem_acquire_read(
-			&inode->i_sb->s_writers.lock_map[SB_FREEZE_FS-1],
-			0, 1, _THIS_IP_);
-		xfs_trans_cancel(ioend->io_append_trans, 0);
-	}
 out_destroy_ioend:
 	xfs_destroy_ioend(ioend);
 	return ret;
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index c6d6e136ba7..c49e2c12dba 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -460,7 +460,8 @@ xfs_log_reserve(
 	tic->t_trans_type = t_type;
 	*ticp = tic;
 
-	xlog_grant_push_ail(log, tic->t_unit_res * tic->t_cnt);
+	xlog_grant_push_ail(log, tic->t_cnt ? tic->t_unit_res * tic->t_cnt
+					    : tic->t_unit_res);
 
 	trace_xfs_log_reserve(log, tic);
--
cgit v1.2.3-70-g09d2


From f9668a09e32ac6d2aa22f44cc310e430a8f4a40f Mon Sep 17 00:00:00 2001
From: Dave Chinner
Date: Wed, 28 Nov 2012 13:01:03 +1100
Subject: xfs: fix sparse reported log CRC endian issue

Not a bug as such, just warning noise from xlog_cksum() returning a
__be32 type when it should be returning a __le32 type.

On Wed, Nov 28, 2012 at 08:30:59AM -0500, Christoph Hellwig wrote:
> But why are we storing the crc field little endian while all other on
> disk formats are big endian? (And yes I realize it might as well have
> been me who did that back in the day, but I still have no idea why)

Because the CRC calculation always returns LE format, even on BE
systems. So rather than always having to byte swap it everywhere and
have all the force casts and annotations for sparse, it seems simpler
to just make it a __le32 everywhere....

Signed-off-by: Dave Chinner
Reviewed-by: Ben Myers
Reviewed-by: Christoph Hellwig
Reviewed-by: Mark Tinguely
Signed-off-by: Ben Myers
---
 fs/xfs/xfs_log.c         | 2 +-
 fs/xfs/xfs_log_priv.h    | 2 +-
 fs/xfs/xfs_log_recover.c | 6 +++---
 3 files changed, 5 insertions(+), 5 deletions(-)

(limited to 'fs/xfs/xfs_log.c')

diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index c49e2c12dba..46bd9d52ab5 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -1538,7 +1538,7 @@ xlog_pack_data(
  * This is a little more complicated than it should be because the various
  * headers and the actual data are non-contiguous.
  */
-__be32
+__le32
 xlog_cksum(
 	struct xlog		*log,
 	struct xlog_rec_header	*rhead,
diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h
index dc3498bf17c..16d8d12ea3b 100644
--- a/fs/xfs/xfs_log_priv.h
+++ b/fs/xfs/xfs_log_priv.h
@@ -555,7 +555,7 @@ extern int
 xlog_recover_finish(
 	struct xlog *log);
 
-extern __be32 xlog_cksum(struct xlog *log, struct xlog_rec_header *rhead,
+extern __le32 xlog_cksum(struct xlog *log, struct xlog_rec_header *rhead,
 			char *dp, int size);
 
 extern kmem_zone_t *xfs_log_ticket_zone;
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 9c3651c9e75..96fcbb85ff8 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -3233,15 +3233,15 @@ xlog_unpack_data_crc(
 	xfs_caddr_t		dp,
 	struct xlog		*log)
 {
-	__be32			crc;
+	__le32			crc;
 
 	crc = xlog_cksum(log, rhead, dp, be32_to_cpu(rhead->h_len));
 	if (crc != rhead->h_crc) {
 		if (rhead->h_crc || xfs_sb_version_hascrc(&log->l_mp->m_sb)) {
 			xfs_alert(log->l_mp,
 		"log record CRC mismatch: found 0x%x, expected 0x%x.\n",
-				be32_to_cpu(rhead->h_crc),
-				be32_to_cpu(crc));
+				le32_to_cpu(rhead->h_crc),
+				le32_to_cpu(crc));
 			xfs_hex_dump(dp, 32);
 		}
--
cgit v1.2.3-70-g09d2
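
The CRC scheme these patches introduce is small enough to demonstrate
outside the kernel. Below is a minimal userspace sketch of the
write-side stamping and the xlog_unpack_data_crc() verify-on-read
logic, not the kernel implementation: demo_rec_header, demo_cksum and
the bitwise crc32c() here are made-up stand-ins for struct
xlog_rec_header, xlog_cksum() and the kernel's crc32c(), and the __le32
on-disk representation and be32_to_cpu() conversions are deliberately
elided.

#include <stdint.h>
#include <stdio.h>

/* Bitwise CRC32c (Castagnoli), reflected polynomial 0x82F63B78. */
static uint32_t crc32c(uint32_t crc, const void *buf, size_t len)
{
	const uint8_t *p = buf;

	crc = ~crc;
	while (len--) {
		crc ^= *p++;
		for (int i = 0; i < 8; i++)
			crc = (crc >> 1) ^ (0x82F63B78 & -(crc & 1));
	}
	return ~crc;
}

/* Cut-down stand-in for struct xlog_rec_header; not the on-disk layout. */
struct demo_rec_header {
	uint32_t	h_len;	/* payload length */
	uint32_t	h_crc;	/* checksum of header + payload */
};

/* Like xlog_cksum(): checksum the header with h_crc zeroed, then the data. */
static uint32_t demo_cksum(const struct demo_rec_header *rhead,
			   const char *dp, uint32_t size)
{
	struct demo_rec_header hdr = *rhead;

	hdr.h_crc = 0;		/* the CRC field never covers itself */
	return crc32c(crc32c(0, &hdr, sizeof(hdr)), dp, size);
}

int main(void)
{
	char data[] = "log record payload";
	struct demo_rec_header rhead = { .h_len = sizeof(data) };
	int crc_enforced = 1;	/* stands in for xfs_sb_version_hascrc() */

	/* Write side: stamp the record before it goes to disk. */
	rhead.h_crc = demo_cksum(&rhead, data, rhead.h_len);

	data[3] ^= 0xff;	/* simulate on-disk corruption */

	/* Recovery side: the xlog_unpack_data_crc() decision. */
	uint32_t crc = demo_cksum(&rhead, data, rhead.h_len);
	if (crc != rhead.h_crc) {
		if (rhead.h_crc || crc_enforced)
			fprintf(stderr,
				"log record CRC mismatch: found 0x%x, expected 0x%x\n",
				(unsigned)rhead.h_crc, (unsigned)crc);
		if (crc_enforced)
			return 1;	/* EFSCORRUPTED: abort recovery */
	}
	return 0;
}

As in xlog_unpack_data_crc(), a mismatch is reported whenever the
stored CRC is non-zero or the CRC feature bit is set, but recovery is
aborted with EFSCORRUPTED only in the latter, enforcing case.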
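
Stepping back to the deadlock fix, the one-line change to
xfs_log_reserve() is easy to misread, so here is how the new
xlog_grant_push_ail() target evaluates. Again a hedged sketch:
demo_ticket is a made-up stand-in for the two struct xlog_ticket fields
involved, and the reading of t_cnt as the reservation unit count is an
inference from the surrounding code, not something the commit message
states.

#include <stdio.h>

/* Made-up stand-in for the two struct xlog_ticket fields used here. */
struct demo_ticket {
	int	t_unit_res;	/* log space needed per transaction unit */
	int	t_cnt;		/* reservation unit count (assumed meaning) */
};

/* Mirrors the fixed expression passed to xlog_grant_push_ail(). */
static int push_target(const struct demo_ticket *tic)
{
	return tic->t_cnt ? tic->t_unit_res * tic->t_cnt : tic->t_unit_res;
}

int main(void)
{
	struct demo_ticket perm = { .t_unit_res = 100 * 1024, .t_cnt = 2 };
	struct demo_ticket zero = { .t_unit_res = 100 * 1024, .t_cnt = 0 };

	/* 204800 bytes: push for the full multi-unit reservation */
	printf("t_cnt=2: push target %d bytes\n", push_target(&perm));
	/* 102400 bytes instead of the old 0: never push for nothing */
	printf("t_cnt=0: push target %d bytes\n", push_target(&zero));
	return 0;
}

The point of the ternary is simply that the push target can no longer
collapse to zero: a ticket with a zero count still pushes the AIL for
one unit's worth of reservation rather than t_unit_res * 0.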