diff options
Diffstat (limited to 'fs/xfs/linux-2.6/xfs_sync.c')
-rw-r--r-- | fs/xfs/linux-2.6/xfs_sync.c | 339 |
1 files changed, 206 insertions, 133 deletions
diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c index 6fed97a8cd3..05cd85317f6 100644 --- a/fs/xfs/linux-2.6/xfs_sync.c +++ b/fs/xfs/linux-2.6/xfs_sync.c @@ -65,7 +65,6 @@ xfs_inode_ag_lookup( * as the tree is sparse and a gang lookup walks to find * the number of objects requested. */ - read_lock(&pag->pag_ici_lock); if (tag == XFS_ICI_NO_TAG) { nr_found = radix_tree_gang_lookup(&pag->pag_ici_root, (void **)&ip, *first_index, 1); @@ -74,7 +73,7 @@ xfs_inode_ag_lookup( (void **)&ip, *first_index, 1, tag); } if (!nr_found) - goto unlock; + return NULL; /* * Update the index for the next lookup. Catch overflows @@ -84,25 +83,20 @@ xfs_inode_ag_lookup( */ *first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1); if (*first_index < XFS_INO_TO_AGINO(mp, ip->i_ino)) - goto unlock; - + return NULL; return ip; - -unlock: - read_unlock(&pag->pag_ici_lock); - return NULL; } STATIC int xfs_inode_ag_walk( struct xfs_mount *mp, - xfs_agnumber_t ag, + struct xfs_perag *pag, int (*execute)(struct xfs_inode *ip, struct xfs_perag *pag, int flags), int flags, - int tag) + int tag, + int exclusive) { - struct xfs_perag *pag = &mp->m_perag[ag]; uint32_t first_index; int last_error = 0; int skipped; @@ -114,10 +108,20 @@ restart: int error = 0; xfs_inode_t *ip; + if (exclusive) + write_lock(&pag->pag_ici_lock); + else + read_lock(&pag->pag_ici_lock); ip = xfs_inode_ag_lookup(mp, pag, &first_index, tag); - if (!ip) + if (!ip) { + if (exclusive) + write_unlock(&pag->pag_ici_lock); + else + read_unlock(&pag->pag_ici_lock); break; + } + /* execute releases pag->pag_ici_lock */ error = execute(ip, pag, flags); if (error == EAGAIN) { skipped++; @@ -125,9 +129,8 @@ restart: } if (error) last_error = error; - /* - * bail out if the filesystem is corrupted. - */ + + /* bail out if the filesystem is corrupted. */ if (error == EFSCORRUPTED) break; @@ -137,8 +140,6 @@ restart: delay(1); goto restart; } - - xfs_put_perag(mp, pag); return last_error; } @@ -148,16 +149,24 @@ xfs_inode_ag_iterator( int (*execute)(struct xfs_inode *ip, struct xfs_perag *pag, int flags), int flags, - int tag) + int tag, + int exclusive) { int error = 0; int last_error = 0; xfs_agnumber_t ag; for (ag = 0; ag < mp->m_sb.sb_agcount; ag++) { - if (!mp->m_perag[ag].pag_ici_init) + struct xfs_perag *pag; + + pag = xfs_perag_get(mp, ag); + if (!pag->pag_ici_init) { + xfs_perag_put(pag); continue; - error = xfs_inode_ag_walk(mp, ag, execute, flags, tag); + } + error = xfs_inode_ag_walk(mp, pag, execute, flags, tag, + exclusive); + xfs_perag_put(pag); if (error) { last_error = error; if (error == EFSCORRUPTED) @@ -174,30 +183,31 @@ xfs_sync_inode_valid( struct xfs_perag *pag) { struct inode *inode = VFS_I(ip); + int error = EFSCORRUPTED; /* nothing to sync during shutdown */ - if (XFS_FORCED_SHUTDOWN(ip->i_mount)) { - read_unlock(&pag->pag_ici_lock); - return EFSCORRUPTED; - } + if (XFS_FORCED_SHUTDOWN(ip->i_mount)) + goto out_unlock; - /* - * If we can't get a reference on the inode, it must be in reclaim. - * Leave it for the reclaim code to flush. Also avoid inodes that - * haven't been fully initialised. - */ - if (!igrab(inode)) { - read_unlock(&pag->pag_ici_lock); - return ENOENT; - } - read_unlock(&pag->pag_ici_lock); + /* avoid new or reclaimable inodes. Leave for reclaim code to flush */ + error = ENOENT; + if (xfs_iflags_test(ip, XFS_INEW | XFS_IRECLAIMABLE | XFS_IRECLAIM)) + goto out_unlock; + + /* If we can't grab the inode, it must on it's way to reclaim. */ + if (!igrab(inode)) + goto out_unlock; - if (is_bad_inode(inode) || xfs_iflags_test(ip, XFS_INEW)) { + if (is_bad_inode(inode)) { IRELE(ip); - return ENOENT; + goto out_unlock; } - return 0; + /* inode is valid */ + error = 0; +out_unlock: + read_unlock(&pag->pag_ici_lock); + return error; } STATIC int @@ -224,7 +234,7 @@ xfs_sync_inode_data( } error = xfs_flush_pages(ip, 0, -1, (flags & SYNC_WAIT) ? - 0 : XFS_B_ASYNC, FI_NONE); + 0 : XBF_ASYNC, FI_NONE); xfs_iunlock(ip, XFS_IOLOCK_SHARED); out_wait: @@ -260,8 +270,7 @@ xfs_sync_inode_attr( goto out_unlock; } - error = xfs_iflush(ip, (flags & SYNC_WAIT) ? - XFS_IFLUSH_SYNC : XFS_IFLUSH_DELWRI); + error = xfs_iflush(ip, flags); out_unlock: xfs_iunlock(ip, XFS_ILOCK_SHARED); @@ -282,14 +291,11 @@ xfs_sync_data( ASSERT((flags & ~(SYNC_TRYLOCK|SYNC_WAIT)) == 0); error = xfs_inode_ag_iterator(mp, xfs_sync_inode_data, flags, - XFS_ICI_NO_TAG); + XFS_ICI_NO_TAG, 0); if (error) return XFS_ERROR(error); - xfs_log_force(mp, 0, - (flags & SYNC_WAIT) ? - XFS_LOG_FORCE | XFS_LOG_SYNC : - XFS_LOG_FORCE); + xfs_log_force(mp, (flags & SYNC_WAIT) ? XFS_LOG_SYNC : 0); return 0; } @@ -304,7 +310,7 @@ xfs_sync_attr( ASSERT((flags & ~SYNC_WAIT) == 0); return xfs_inode_ag_iterator(mp, xfs_sync_inode_attr, flags, - XFS_ICI_NO_TAG); + XFS_ICI_NO_TAG, 0); } STATIC int @@ -315,10 +321,6 @@ xfs_commit_dummy_trans( struct xfs_inode *ip = mp->m_rootip; struct xfs_trans *tp; int error; - int log_flags = XFS_LOG_FORCE; - - if (flags & SYNC_WAIT) - log_flags |= XFS_LOG_SYNC; /* * Put a dummy transaction in the log to tell recovery @@ -340,11 +342,11 @@ xfs_commit_dummy_trans( xfs_iunlock(ip, XFS_ILOCK_EXCL); /* the log force ensures this transaction is pushed to disk */ - xfs_log_force(mp, 0, log_flags); + xfs_log_force(mp, (flags & SYNC_WAIT) ? XFS_LOG_SYNC : 0); return error; } -int +STATIC int xfs_sync_fsdata( struct xfs_mount *mp, int flags) @@ -360,7 +362,7 @@ xfs_sync_fsdata( if (flags & SYNC_TRYLOCK) { ASSERT(!(flags & SYNC_WAIT)); - bp = xfs_getsb(mp, XFS_BUF_TRYLOCK); + bp = xfs_getsb(mp, XBF_TRYLOCK); if (!bp) goto out; @@ -380,7 +382,7 @@ xfs_sync_fsdata( * become pinned in between there and here. */ if (XFS_BUF_ISPINNED(bp)) - xfs_log_force(mp, 0, XFS_LOG_FORCE); + xfs_log_force(mp, 0); } @@ -441,9 +443,6 @@ xfs_quiesce_data( xfs_sync_data(mp, SYNC_WAIT); xfs_qm_sync(mp, SYNC_WAIT); - /* drop inode references pinned by filestreams */ - xfs_filestream_flush(mp); - /* write superblock and hoover up shutdown errors */ error = xfs_sync_fsdata(mp, SYNC_WAIT); @@ -460,16 +459,18 @@ xfs_quiesce_fs( { int count = 0, pincount; + xfs_reclaim_inodes(mp, 0); xfs_flush_buftarg(mp->m_ddev_targp, 0); - xfs_reclaim_inodes(mp, XFS_IFLUSH_DELWRI_ELSE_ASYNC); /* * This loop must run at least twice. The first instance of the loop * will flush most meta data but that will generate more meta data * (typically directory updates). Which then must be flushed and - * logged before we can write the unmount record. + * logged before we can write the unmount record. We also so sync + * reclaim of inodes to catch any that the above delwri flush skipped. */ do { + xfs_reclaim_inodes(mp, SYNC_WAIT); xfs_sync_attr(mp, SYNC_WAIT); pincount = xfs_flush_buftarg(mp->m_ddev_targp, 1); if (!pincount) { @@ -568,7 +569,7 @@ xfs_flush_inodes( igrab(inode); xfs_syncd_queue_work(ip->i_mount, inode, xfs_flush_inodes_work, &completion); wait_for_completion(&completion); - xfs_log_force(ip->i_mount, (xfs_lsn_t)0, XFS_LOG_FORCE|XFS_LOG_SYNC); + xfs_log_force(ip->i_mount, XFS_LOG_SYNC); } /* @@ -584,8 +585,8 @@ xfs_sync_worker( int error; if (!(mp->m_flags & XFS_MOUNT_RDONLY)) { - xfs_log_force(mp, (xfs_lsn_t)0, XFS_LOG_FORCE); - xfs_reclaim_inodes(mp, XFS_IFLUSH_DELWRI_ELSE_ASYNC); + xfs_log_force(mp, 0); + xfs_reclaim_inodes(mp, 0); /* dgc: errors ignored here */ error = xfs_qm_sync(mp, SYNC_TRYLOCK); error = xfs_sync_fsdata(mp, SYNC_TRYLOCK); @@ -606,7 +607,8 @@ xfssyncd( set_freezable(); timeleft = xfs_syncd_centisecs * msecs_to_jiffies(10); for (;;) { - timeleft = schedule_timeout_interruptible(timeleft); + if (list_empty(&mp->m_sync_list)) + timeleft = schedule_timeout_interruptible(timeleft); /* swsusp */ try_to_freeze(); if (kthread_should_stop() && list_empty(&mp->m_sync_list)) @@ -626,8 +628,7 @@ xfssyncd( list_add_tail(&mp->m_sync_work.w_list, &mp->m_sync_list); } - list_for_each_entry_safe(work, n, &mp->m_sync_list, w_list) - list_move(&work->w_list, &tmp); + list_splice_init(&mp->m_sync_list, &tmp); spin_unlock(&mp->m_sync_lock); list_for_each_entry_safe(work, n, &tmp, w_list) { @@ -664,60 +665,6 @@ xfs_syncd_stop( kthread_stop(mp->m_sync_task); } -STATIC int -xfs_reclaim_inode( - xfs_inode_t *ip, - int sync_mode) -{ - xfs_perag_t *pag = xfs_get_perag(ip->i_mount, ip->i_ino); - - /* The hash lock here protects a thread in xfs_iget_core from - * racing with us on linking the inode back with a vnode. - * Once we have the XFS_IRECLAIM flag set it will not touch - * us. - */ - write_lock(&pag->pag_ici_lock); - spin_lock(&ip->i_flags_lock); - if (__xfs_iflags_test(ip, XFS_IRECLAIM) || - !__xfs_iflags_test(ip, XFS_IRECLAIMABLE)) { - spin_unlock(&ip->i_flags_lock); - write_unlock(&pag->pag_ici_lock); - return -EAGAIN; - } - __xfs_iflags_set(ip, XFS_IRECLAIM); - spin_unlock(&ip->i_flags_lock); - write_unlock(&pag->pag_ici_lock); - xfs_put_perag(ip->i_mount, pag); - - /* - * If the inode is still dirty, then flush it out. If the inode - * is not in the AIL, then it will be OK to flush it delwri as - * long as xfs_iflush() does not keep any references to the inode. - * We leave that decision up to xfs_iflush() since it has the - * knowledge of whether it's OK to simply do a delwri flush of - * the inode or whether we need to wait until the inode is - * pulled from the AIL. - * We get the flush lock regardless, though, just to make sure - * we don't free it while it is being flushed. - */ - xfs_ilock(ip, XFS_ILOCK_EXCL); - xfs_iflock(ip); - - /* - * In the case of a forced shutdown we rely on xfs_iflush() to - * wait for the inode to be unpinned before returning an error. - */ - if (!is_bad_inode(VFS_I(ip)) && xfs_iflush(ip, sync_mode) == 0) { - /* synchronize with xfs_iflush_done */ - xfs_iflock(ip); - xfs_ifunlock(ip); - } - - xfs_iunlock(ip, XFS_ILOCK_EXCL); - xfs_ireclaim(ip); - return 0; -} - void __xfs_inode_set_reclaim_tag( struct xfs_perag *pag, @@ -737,16 +684,17 @@ void xfs_inode_set_reclaim_tag( xfs_inode_t *ip) { - xfs_mount_t *mp = ip->i_mount; - xfs_perag_t *pag = xfs_get_perag(mp, ip->i_ino); + struct xfs_mount *mp = ip->i_mount; + struct xfs_perag *pag; - read_lock(&pag->pag_ici_lock); + pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino)); + write_lock(&pag->pag_ici_lock); spin_lock(&ip->i_flags_lock); __xfs_inode_set_reclaim_tag(pag, ip); __xfs_iflags_set(ip, XFS_IRECLAIMABLE); spin_unlock(&ip->i_flags_lock); - read_unlock(&pag->pag_ici_lock); - xfs_put_perag(mp, pag); + write_unlock(&pag->pag_ici_lock); + xfs_perag_put(pag); } void @@ -759,20 +707,145 @@ __xfs_inode_clear_reclaim_tag( XFS_INO_TO_AGINO(mp, ip->i_ino), XFS_ICI_RECLAIM_TAG); } +/* + * Inodes in different states need to be treated differently, and the return + * value of xfs_iflush is not sufficient to get this right. The following table + * lists the inode states and the reclaim actions necessary for non-blocking + * reclaim: + * + * + * inode state iflush ret required action + * --------------- ---------- --------------- + * bad - reclaim + * shutdown EIO unpin and reclaim + * clean, unpinned 0 reclaim + * stale, unpinned 0 reclaim + * clean, pinned(*) 0 requeue + * stale, pinned EAGAIN requeue + * dirty, delwri ok 0 requeue + * dirty, delwri blocked EAGAIN requeue + * dirty, sync flush 0 reclaim + * + * (*) dgc: I don't think the clean, pinned state is possible but it gets + * handled anyway given the order of checks implemented. + * + * As can be seen from the table, the return value of xfs_iflush() is not + * sufficient to correctly decide the reclaim action here. The checks in + * xfs_iflush() might look like duplicates, but they are not. + * + * Also, because we get the flush lock first, we know that any inode that has + * been flushed delwri has had the flush completed by the time we check that + * the inode is clean. The clean inode check needs to be done before flushing + * the inode delwri otherwise we would loop forever requeuing clean inodes as + * we cannot tell apart a successful delwri flush and a clean inode from the + * return value of xfs_iflush(). + * + * Note that because the inode is flushed delayed write by background + * writeback, the flush lock may already be held here and waiting on it can + * result in very long latencies. Hence for sync reclaims, where we wait on the + * flush lock, the caller should push out delayed write inodes first before + * trying to reclaim them to minimise the amount of time spent waiting. For + * background relaim, we just requeue the inode for the next pass. + * + * Hence the order of actions after gaining the locks should be: + * bad => reclaim + * shutdown => unpin and reclaim + * pinned, delwri => requeue + * pinned, sync => unpin + * stale => reclaim + * clean => reclaim + * dirty, delwri => flush and requeue + * dirty, sync => flush, wait and reclaim + */ STATIC int -xfs_reclaim_inode_now( +xfs_reclaim_inode( struct xfs_inode *ip, struct xfs_perag *pag, - int flags) + int sync_mode) { - /* ignore if already under reclaim */ - if (xfs_iflags_test(ip, XFS_IRECLAIM)) { - read_unlock(&pag->pag_ici_lock); + int error = 0; + + /* + * The radix tree lock here protects a thread in xfs_iget from racing + * with us starting reclaim on the inode. Once we have the + * XFS_IRECLAIM flag set it will not touch us. + */ + spin_lock(&ip->i_flags_lock); + ASSERT_ALWAYS(__xfs_iflags_test(ip, XFS_IRECLAIMABLE)); + if (__xfs_iflags_test(ip, XFS_IRECLAIM)) { + /* ignore as it is already under reclaim */ + spin_unlock(&ip->i_flags_lock); + write_unlock(&pag->pag_ici_lock); return 0; } - read_unlock(&pag->pag_ici_lock); + __xfs_iflags_set(ip, XFS_IRECLAIM); + spin_unlock(&ip->i_flags_lock); + write_unlock(&pag->pag_ici_lock); + + xfs_ilock(ip, XFS_ILOCK_EXCL); + if (!xfs_iflock_nowait(ip)) { + if (!(sync_mode & SYNC_WAIT)) + goto out; + xfs_iflock(ip); + } + + if (is_bad_inode(VFS_I(ip))) + goto reclaim; + if (XFS_FORCED_SHUTDOWN(ip->i_mount)) { + xfs_iunpin_wait(ip); + goto reclaim; + } + if (xfs_ipincount(ip)) { + if (!(sync_mode & SYNC_WAIT)) { + xfs_ifunlock(ip); + goto out; + } + xfs_iunpin_wait(ip); + } + if (xfs_iflags_test(ip, XFS_ISTALE)) + goto reclaim; + if (xfs_inode_clean(ip)) + goto reclaim; + + /* Now we have an inode that needs flushing */ + error = xfs_iflush(ip, sync_mode); + if (sync_mode & SYNC_WAIT) { + xfs_iflock(ip); + goto reclaim; + } + + /* + * When we have to flush an inode but don't have SYNC_WAIT set, we + * flush the inode out using a delwri buffer and wait for the next + * call into reclaim to find it in a clean state instead of waiting for + * it now. We also don't return errors here - if the error is transient + * then the next reclaim pass will flush the inode, and if the error + * is permanent then the next sync reclaim will relcaim the inode and + * pass on the error. + */ + if (error && !XFS_FORCED_SHUTDOWN(ip->i_mount)) { + xfs_fs_cmn_err(CE_WARN, ip->i_mount, + "inode 0x%llx background reclaim flush failed with %d", + (long long)ip->i_ino, error); + } +out: + xfs_iflags_clear(ip, XFS_IRECLAIM); + xfs_iunlock(ip, XFS_ILOCK_EXCL); + /* + * We could return EAGAIN here to make reclaim rescan the inode tree in + * a short while. However, this just burns CPU time scanning the tree + * waiting for IO to complete and xfssyncd never goes back to the idle + * state. Instead, return 0 to let the next scheduled background reclaim + * attempt to reclaim the inode again. + */ + return 0; + +reclaim: + xfs_ifunlock(ip); + xfs_iunlock(ip, XFS_ILOCK_EXCL); + xfs_ireclaim(ip); + return error; - return xfs_reclaim_inode(ip, flags); } int @@ -780,6 +853,6 @@ xfs_reclaim_inodes( xfs_mount_t *mp, int mode) { - return xfs_inode_ag_iterator(mp, xfs_reclaim_inode_now, mode, - XFS_ICI_RECLAIM_TAG); + return xfs_inode_ag_iterator(mp, xfs_reclaim_inode, mode, + XFS_ICI_RECLAIM_TAG, 1); } |