diff options
Diffstat (limited to 'fs/xfs')
41 files changed, 822 insertions, 1911 deletions
diff --git a/fs/xfs/xfs_acl.c b/fs/xfs/xfs_acl.c index 76e4266d2e7..ac702a6eab9 100644 --- a/fs/xfs/xfs_acl.c +++ b/fs/xfs/xfs_acl.c @@ -39,7 +39,7 @@ xfs_acl_from_disk(struct xfs_acl *aclp) struct posix_acl_entry *acl_e; struct posix_acl *acl; struct xfs_acl_entry *ace; - int count, i; + unsigned int count, i; count = be32_to_cpu(aclp->acl_cnt); if (count > XFS_ACL_MAX_ENTRIES) diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 574d4ee9b62..74b9baf36ac 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -111,8 +111,7 @@ xfs_ioend_new_eof( xfs_fsize_t bsize; bsize = ioend->io_offset + ioend->io_size; - isize = MAX(ip->i_size, ip->i_new_size); - isize = MIN(isize, bsize); + isize = MIN(i_size_read(VFS_I(ip)), bsize); return isize > ip->i_d.di_size ? isize : 0; } @@ -126,11 +125,7 @@ static inline bool xfs_ioend_is_append(struct xfs_ioend *ioend) } /* - * Update on-disk file size now that data has been written to disk. The - * current in-memory file size is i_size. If a write is beyond eof i_new_size - * will be the intended file size until i_size is updated. If this write does - * not extend all the way to the valid file size then restrict this update to - * the end of the write. + * Update on-disk file size now that data has been written to disk. * * This function does not block as blocking on the inode lock in IO completion * can lead to IO completion order dependency deadlocks.. If it can't get the @@ -1279,6 +1274,15 @@ xfs_end_io_direct_write( struct xfs_ioend *ioend = iocb->private; /* + * While the generic direct I/O code updates the inode size, it does + * so only after the end_io handler is called, which means our + * end_io handler thinks the on-disk size is outside the in-core + * size. To prevent this just update it a little bit earlier here. + */ + if (offset + size > i_size_read(ioend->io_inode)) + i_size_write(ioend->io_inode, offset + size); + + /* * blockdev_direct_IO can return an error even after the I/O * completion handler was called. Thus we need to protect * against double-freeing. @@ -1340,12 +1344,11 @@ xfs_vm_write_failed( if (to > inode->i_size) { /* - * punch out the delalloc blocks we have already allocated. We - * don't call xfs_setattr() to do this as we may be in the - * middle of a multi-iovec write and so the vfs inode->i_size - * will not match the xfs ip->i_size and so it will zero too - * much. Hence we jus truncate the page cache to zero what is - * necessary and punch the delalloc blocks directly. + * Punch out the delalloc blocks we have already allocated. + * + * Don't bother with xfs_setattr given that nothing can have + * made it to disk yet as the page is still locked at this + * point. */ struct xfs_inode *ip = XFS_I(inode); xfs_fileoff_t start_fsb; diff --git a/fs/xfs/xfs_attr.c b/fs/xfs/xfs_attr.c index 1e5d97f86ea..08b9ac644c3 100644 --- a/fs/xfs/xfs_attr.c +++ b/fs/xfs/xfs_attr.c @@ -827,10 +827,6 @@ xfs_attr_inactive(xfs_inode_t *dp) if (error) goto out; - /* - * Commit the last in the sequence of transactions. - */ - xfs_trans_log_inode(trans, dp, XFS_ILOG_CORE); error = xfs_trans_commit(trans, XFS_TRANS_RELEASE_LOG_RES); xfs_iunlock(dp, XFS_ILOCK_EXCL); diff --git a/fs/xfs/xfs_attr_leaf.c b/fs/xfs/xfs_attr_leaf.c index c1b55e59655..d25eafd4d28 100644 --- a/fs/xfs/xfs_attr_leaf.c +++ b/fs/xfs/xfs_attr_leaf.c @@ -271,10 +271,6 @@ xfs_attr_shortform_add(xfs_da_args_t *args, int forkoff) dp = args->dp; mp = dp->i_mount; dp->i_d.di_forkoff = forkoff; - dp->i_df.if_ext_max = - XFS_IFORK_DSIZE(dp) / (uint)sizeof(xfs_bmbt_rec_t); - dp->i_afp->if_ext_max = - XFS_IFORK_ASIZE(dp) / (uint)sizeof(xfs_bmbt_rec_t); ifp = dp->i_afp; ASSERT(ifp->if_flags & XFS_IFINLINE); @@ -326,7 +322,6 @@ xfs_attr_fork_reset( ASSERT(ip->i_d.di_anextents == 0); ASSERT(ip->i_afp == NULL); - ip->i_df.if_ext_max = XFS_IFORK_DSIZE(ip) / sizeof(xfs_bmbt_rec_t); xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); } @@ -389,10 +384,6 @@ xfs_attr_shortform_remove(xfs_da_args_t *args) (args->op_flags & XFS_DA_OP_ADDNAME) || !(mp->m_flags & XFS_MOUNT_ATTR2) || dp->i_d.di_format == XFS_DINODE_FMT_BTREE); - dp->i_afp->if_ext_max = - XFS_IFORK_ASIZE(dp) / (uint)sizeof(xfs_bmbt_rec_t); - dp->i_df.if_ext_max = - XFS_IFORK_DSIZE(dp) / (uint)sizeof(xfs_bmbt_rec_t); xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_ADATA); } diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c index d0ab7883705..188ef2fbd62 100644 --- a/fs/xfs/xfs_bmap.c +++ b/fs/xfs/xfs_bmap.c @@ -249,7 +249,27 @@ xfs_bmbt_lookup_ge( } /* -* Update the record referred to by cur to the value given + * Check if the inode needs to be converted to btree format. + */ +static inline bool xfs_bmap_needs_btree(struct xfs_inode *ip, int whichfork) +{ + return XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS && + XFS_IFORK_NEXTENTS(ip, whichfork) > + XFS_IFORK_MAXEXT(ip, whichfork); +} + +/* + * Check if the inode should be converted to extent format. + */ +static inline bool xfs_bmap_wants_extents(struct xfs_inode *ip, int whichfork) +{ + return XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE && + XFS_IFORK_NEXTENTS(ip, whichfork) <= + XFS_IFORK_MAXEXT(ip, whichfork); +} + +/* + * Update the record referred to by cur to the value given * by [off, bno, len, state]. * This either works (return 0) or gets an EFSCORRUPTED error. */ @@ -683,8 +703,8 @@ xfs_bmap_add_extent_delay_real( goto done; XFS_WANT_CORRUPTED_GOTO(i == 1, done); } - if (bma->ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS && - bma->ip->i_d.di_nextents > bma->ip->i_df.if_ext_max) { + + if (xfs_bmap_needs_btree(bma->ip, XFS_DATA_FORK)) { error = xfs_bmap_extents_to_btree(bma->tp, bma->ip, bma->firstblock, bma->flist, &bma->cur, 1, &tmp_rval, XFS_DATA_FORK); @@ -767,8 +787,8 @@ xfs_bmap_add_extent_delay_real( goto done; XFS_WANT_CORRUPTED_GOTO(i == 1, done); } - if (bma->ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS && - bma->ip->i_d.di_nextents > bma->ip->i_df.if_ext_max) { + + if (xfs_bmap_needs_btree(bma->ip, XFS_DATA_FORK)) { error = xfs_bmap_extents_to_btree(bma->tp, bma->ip, bma->firstblock, bma->flist, &bma->cur, 1, &tmp_rval, XFS_DATA_FORK); @@ -836,8 +856,8 @@ xfs_bmap_add_extent_delay_real( goto done; XFS_WANT_CORRUPTED_GOTO(i == 1, done); } - if (bma->ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS && - bma->ip->i_d.di_nextents > bma->ip->i_df.if_ext_max) { + + if (xfs_bmap_needs_btree(bma->ip, XFS_DATA_FORK)) { error = xfs_bmap_extents_to_btree(bma->tp, bma->ip, bma->firstblock, bma->flist, &bma->cur, 1, &tmp_rval, XFS_DATA_FORK); @@ -884,8 +904,7 @@ xfs_bmap_add_extent_delay_real( } /* convert to a btree if necessary */ - if (XFS_IFORK_FORMAT(bma->ip, XFS_DATA_FORK) == XFS_DINODE_FMT_EXTENTS && - XFS_IFORK_NEXTENTS(bma->ip, XFS_DATA_FORK) > ifp->if_ext_max) { + if (xfs_bmap_needs_btree(bma->ip, XFS_DATA_FORK)) { int tmp_logflags; /* partial log flag return val */ ASSERT(bma->cur == NULL); @@ -1421,8 +1440,7 @@ xfs_bmap_add_extent_unwritten_real( } /* convert to a btree if necessary */ - if (XFS_IFORK_FORMAT(ip, XFS_DATA_FORK) == XFS_DINODE_FMT_EXTENTS && - XFS_IFORK_NEXTENTS(ip, XFS_DATA_FORK) > ifp->if_ext_max) { + if (xfs_bmap_needs_btree(ip, XFS_DATA_FORK)) { int tmp_logflags; /* partial log flag return val */ ASSERT(cur == NULL); @@ -1812,8 +1830,7 @@ xfs_bmap_add_extent_hole_real( } /* convert to a btree if necessary */ - if (XFS_IFORK_FORMAT(bma->ip, whichfork) == XFS_DINODE_FMT_EXTENTS && - XFS_IFORK_NEXTENTS(bma->ip, whichfork) > ifp->if_ext_max) { + if (xfs_bmap_needs_btree(bma->ip, whichfork)) { int tmp_logflags; /* partial log flag return val */ ASSERT(bma->cur == NULL); @@ -3037,8 +3054,7 @@ xfs_bmap_extents_to_btree( ifp = XFS_IFORK_PTR(ip, whichfork); ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS); - ASSERT(ifp->if_ext_max == - XFS_IFORK_SIZE(ip, whichfork) / (uint)sizeof(xfs_bmbt_rec_t)); + /* * Make space in the inode incore. */ @@ -3184,13 +3200,8 @@ xfs_bmap_forkoff_reset( ip->i_d.di_format != XFS_DINODE_FMT_BTREE) { uint dfl_forkoff = xfs_default_attroffset(ip) >> 3; - if (dfl_forkoff > ip->i_d.di_forkoff) { + if (dfl_forkoff > ip->i_d.di_forkoff) ip->i_d.di_forkoff = dfl_forkoff; - ip->i_df.if_ext_max = - XFS_IFORK_DSIZE(ip) / sizeof(xfs_bmbt_rec_t); - ip->i_afp->if_ext_max = - XFS_IFORK_ASIZE(ip) / sizeof(xfs_bmbt_rec_t); - } } } @@ -3430,8 +3441,6 @@ xfs_bmap_add_attrfork( int error; /* error return value */ ASSERT(XFS_IFORK_Q(ip) == 0); - ASSERT(ip->i_df.if_ext_max == - XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t)); mp = ip->i_mount; ASSERT(!XFS_NOT_DQATTACHED(mp, ip)); @@ -3486,12 +3495,9 @@ xfs_bmap_add_attrfork( error = XFS_ERROR(EINVAL); goto error1; } - ip->i_df.if_ext_max = - XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t); + ASSERT(ip->i_afp == NULL); ip->i_afp = kmem_zone_zalloc(xfs_ifork_zone, KM_SLEEP); - ip->i_afp->if_ext_max = - XFS_IFORK_ASIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t); ip->i_afp->if_flags = XFS_IFEXTENTS; logflags = 0; xfs_bmap_init(&flist, &firstblock); @@ -3535,20 +3541,17 @@ xfs_bmap_add_attrfork( } else spin_unlock(&mp->m_sb_lock); } - if ((error = xfs_bmap_finish(&tp, &flist, &committed))) + + error = xfs_bmap_finish(&tp, &flist, &committed); + if (error) goto error2; - error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); - ASSERT(ip->i_df.if_ext_max == - XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t)); - return error; + return xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); error2: xfs_bmap_cancel(&flist); error1: xfs_iunlock(ip, XFS_ILOCK_EXCL); error0: xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT); - ASSERT(ip->i_df.if_ext_max == - XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t)); return error; } @@ -3994,11 +3997,8 @@ xfs_bmap_one_block( xfs_bmbt_irec_t s; /* internal version of extent */ #ifndef DEBUG - if (whichfork == XFS_DATA_FORK) { - return S_ISREG(ip->i_d.di_mode) ? - (ip->i_size == ip->i_mount->m_sb.sb_blocksize) : - (ip->i_d.di_size == ip->i_mount->m_sb.sb_blocksize); - } + if (whichfork == XFS_DATA_FORK) + return XFS_ISIZE(ip) == ip->i_mount->m_sb.sb_blocksize; #endif /* !DEBUG */ if (XFS_IFORK_NEXTENTS(ip, whichfork) != 1) return 0; @@ -4010,7 +4010,7 @@ xfs_bmap_one_block( xfs_bmbt_get_all(ep, &s); rval = s.br_startoff == 0 && s.br_blockcount == 1; if (rval && whichfork == XFS_DATA_FORK) - ASSERT(ip->i_size == ip->i_mount->m_sb.sb_blocksize); + ASSERT(XFS_ISIZE(ip) == ip->i_mount->m_sb.sb_blocksize); return rval; } @@ -4379,8 +4379,6 @@ xfs_bmapi_read( XFS_STATS_INC(xs_blk_mapr); ifp = XFS_IFORK_PTR(ip, whichfork); - ASSERT(ifp->if_ext_max == - XFS_IFORK_SIZE(ip, whichfork) / (uint)sizeof(xfs_bmbt_rec_t)); if (!(ifp->if_flags & XFS_IFEXTENTS)) { error = xfs_iread_extents(NULL, ip, whichfork); @@ -4871,8 +4869,6 @@ xfs_bmapi_write( return XFS_ERROR(EIO); ifp = XFS_IFORK_PTR(ip, whichfork); - ASSERT(ifp->if_ext_max == - XFS_IFORK_SIZE(ip, whichfork) / (uint)sizeof(xfs_bmbt_rec_t)); XFS_STATS_INC(xs_blk_mapw); @@ -4981,8 +4977,7 @@ xfs_bmapi_write( /* * Transform from btree to extents, give it cur. */ - if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE && - XFS_IFORK_NEXTENTS(ip, whichfork) <= ifp->if_ext_max) { + if (xfs_bmap_wants_extents(ip, whichfork)) { int tmp_logflags = 0; ASSERT(bma.cur); @@ -4992,10 +4987,10 @@ xfs_bmapi_write( if (error) goto error0; } - ASSERT(ifp->if_ext_max == - XFS_IFORK_SIZE(ip, whichfork) / (uint)sizeof(xfs_bmbt_rec_t)); + ASSERT(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE || - XFS_IFORK_NEXTENTS(ip, whichfork) > ifp->if_ext_max); + XFS_IFORK_NEXTENTS(ip, whichfork) > + XFS_IFORK_MAXEXT(ip, whichfork)); error = 0; error0: /* @@ -5095,8 +5090,7 @@ xfs_bunmapi( ASSERT(len > 0); ASSERT(nexts >= 0); - ASSERT(ifp->if_ext_max == - XFS_IFORK_SIZE(ip, whichfork) / (uint)sizeof(xfs_bmbt_rec_t)); + if (!(ifp->if_flags & XFS_IFEXTENTS) && (error = xfs_iread_extents(tp, ip, whichfork))) return error; @@ -5322,7 +5316,8 @@ xfs_bunmapi( */ if (!wasdel && xfs_trans_get_block_res(tp) == 0 && XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS && - XFS_IFORK_NEXTENTS(ip, whichfork) >= ifp->if_ext_max && + XFS_IFORK_NEXTENTS(ip, whichfork) >= /* Note the >= */ + XFS_IFORK_MAXEXT(ip, whichfork) && del.br_startoff > got.br_startoff && del.br_startoff + del.br_blockcount < got.br_startoff + got.br_blockcount) { @@ -5353,13 +5348,11 @@ nodelete: } } *done = bno == (xfs_fileoff_t)-1 || bno < start || lastx < 0; - ASSERT(ifp->if_ext_max == - XFS_IFORK_SIZE(ip, whichfork) / (uint)sizeof(xfs_bmbt_rec_t)); + /* * Convert to a btree if necessary. */ - if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS && - XFS_IFORK_NEXTENTS(ip, whichfork) > ifp->if_ext_max) { + if (xfs_bmap_needs_btree(ip, whichfork)) { ASSERT(cur == NULL); error = xfs_bmap_extents_to_btree(tp, ip, firstblock, flist, &cur, 0, &tmp_logflags, whichfork); @@ -5370,8 +5363,7 @@ nodelete: /* * transform from btree to extents, give it cur */ - else if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE && - XFS_IFORK_NEXTENTS(ip, whichfork) <= ifp->if_ext_max) { + else if (xfs_bmap_wants_extents(ip, whichfork)) { ASSERT(cur != NULL); error = xfs_bmap_btree_to_extents(tp, ip, cur, &tmp_logflags, whichfork); @@ -5382,8 +5374,6 @@ nodelete: /* * transform from extents to local? */ - ASSERT(ifp->if_ext_max == - XFS_IFORK_SIZE(ip, whichfork) / (uint)sizeof(xfs_bmbt_rec_t)); error = 0; error0: /* @@ -5434,7 +5424,7 @@ xfs_getbmapx_fix_eof_hole( if (startblock == HOLESTARTBLOCK) { mp = ip->i_mount; out->bmv_block = -1; - fixlen = XFS_FSB_TO_BB(mp, XFS_B_TO_FSB(mp, ip->i_size)); + fixlen = XFS_FSB_TO_BB(mp, XFS_B_TO_FSB(mp, XFS_ISIZE(ip))); fixlen -= out->bmv_offset; if (prealloced && out->bmv_offset + out->bmv_length == end) { /* Came to hole at EOF. Trim it. */ @@ -5522,7 +5512,7 @@ xfs_getbmap( fixlen = XFS_MAXIOFFSET(mp); } else { prealloced = 0; - fixlen = ip->i_size; + fixlen = XFS_ISIZE(ip); } } @@ -5551,7 +5541,7 @@ xfs_getbmap( xfs_ilock(ip, XFS_IOLOCK_SHARED); if (whichfork == XFS_DATA_FORK && !(iflags & BMV_IF_DELALLOC)) { - if (ip->i_delayed_blks || ip->i_size > ip->i_d.di_size) { + if (ip->i_delayed_blks || XFS_ISIZE(ip) > ip->i_d.di_size) { error = xfs_flush_pages(ip, 0, -1, 0, FI_REMAPF); if (error) goto out_unlock_iolock; diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index cf0ac056815..4dff85c7d7e 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -1370,7 +1370,7 @@ restart: goto restart; } /* - * clear the LRU reference count so the bufer doesn't get + * clear the LRU reference count so the buffer doesn't get * ignored in xfs_buf_rele(). */ atomic_set(&bp->b_lru_ref, 0); @@ -1701,12 +1701,8 @@ xfsbufd( struct list_head tmp; struct blk_plug plug; - if (unlikely(freezing(current))) { - set_bit(XBT_FORCE_SLEEP, &target->bt_flags); - refrigerator(); - } else { - clear_bit(XBT_FORCE_SLEEP, &target->bt_flags); - } + if (unlikely(freezing(current))) + try_to_freeze(); /* sleep for a long time if there is nothing to do. */ if (list_empty(&target->bt_delwri_queue)) diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h index 5bab046e859..df7ffb0affe 100644 --- a/fs/xfs/xfs_buf.h +++ b/fs/xfs/xfs_buf.h @@ -90,8 +90,7 @@ typedef unsigned int xfs_buf_flags_t; { _XBF_DELWRI_Q, "DELWRI_Q" } typedef enum { - XBT_FORCE_SLEEP = 0, - XBT_FORCE_FLUSH = 1, + XBT_FORCE_FLUSH = 0, } xfs_buftarg_flags_t; typedef struct xfs_buftarg { diff --git a/fs/xfs/xfs_dfrag.c b/fs/xfs/xfs_dfrag.c index 654dc6f05ba..dd974a55c77 100644 --- a/fs/xfs/xfs_dfrag.c +++ b/fs/xfs/xfs_dfrag.c @@ -163,12 +163,14 @@ xfs_swap_extents_check_format( /* Check temp in extent form to max in target */ if (tip->i_d.di_format == XFS_DINODE_FMT_EXTENTS && - XFS_IFORK_NEXTENTS(tip, XFS_DATA_FORK) > ip->i_df.if_ext_max) + XFS_IFORK_NEXTENTS(tip, XFS_DATA_FORK) > + XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK)) return EINVAL; /* Check target in extent form to max in temp */ if (ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS && - XFS_IFORK_NEXTENTS(ip, XFS_DATA_FORK) > tip->i_df.if_ext_max) + XFS_IFORK_NEXTENTS(ip, XFS_DATA_FORK) > + XFS_IFORK_MAXEXT(tip, XFS_DATA_FORK)) return EINVAL; /* @@ -180,18 +182,25 @@ xfs_swap_extents_check_format( * (a common defrag case) which will occur when the temp inode is in * extent format... */ - if (tip->i_d.di_format == XFS_DINODE_FMT_BTREE && - ((XFS_IFORK_BOFF(ip) && - tip->i_df.if_broot_bytes > XFS_IFORK_BOFF(ip)) || - XFS_IFORK_NEXTENTS(tip, XFS_DATA_FORK) <= ip->i_df.if_ext_max)) - return EINVAL; + if (tip->i_d.di_format == XFS_DINODE_FMT_BTREE) { + if (XFS_IFORK_BOFF(ip) && + tip->i_df.if_broot_bytes > XFS_IFORK_BOFF(ip)) + return EINVAL; + if (XFS_IFORK_NEXTENTS(tip, XFS_DATA_FORK) <= + XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK)) + return EINVAL; + } /* Reciprocal target->temp btree format checks */ - if (ip->i_d.di_format == XFS_DINODE_FMT_BTREE && - ((XFS_IFORK_BOFF(tip) && - ip->i_df.if_broot_bytes > XFS_IFORK_BOFF(tip)) || - XFS_IFORK_NEXTENTS(ip, XFS_DATA_FORK) <= tip->i_df.if_ext_max)) - return EINVAL; + if (ip->i_d.di_format == XFS_DINODE_FMT_BTREE) { + if (XFS_IFORK_BOFF(tip) && + ip->i_df.if_broot_bytes > XFS_IFORK_BOFF(tip)) + return EINVAL; + + if (XFS_IFORK_NEXTENTS(ip, XFS_DATA_FORK) <= + XFS_IFORK_MAXEXT(tip, XFS_DATA_FORK)) + return EINVAL; + } return 0; } @@ -349,16 +358,6 @@ xfs_swap_extents( *tifp = *tempifp; /* struct copy */ /* - * Fix the in-memory data fork values that are dependent on the fork - * offset in the inode. We can't assume they remain the same as attr2 - * has dynamic fork offsets. - */ - ifp->if_ext_max = XFS_IFORK_SIZE(ip, XFS_DATA_FORK) / - (uint)sizeof(xfs_bmbt_rec_t); - tifp->if_ext_max = XFS_IFORK_SIZE(tip, XFS_DATA_FORK) / - (uint)sizeof(xfs_bmbt_rec_t); - - /* * Fix the on-disk inode values */ tmp = (__uint64_t)ip->i_d.di_nblocks; diff --git a/fs/xfs/xfs_discard.c b/fs/xfs/xfs_discard.c index 8a24f0c6c86..286a051f12c 100644 --- a/fs/xfs/xfs_discard.c +++ b/fs/xfs/xfs_discard.c @@ -68,7 +68,7 @@ xfs_trim_extents( * Look up the longest btree in the AGF and start with it. */ error = xfs_alloc_lookup_le(cur, 0, - XFS_BUF_TO_AGF(agbp)->agf_longest, &i); + be32_to_cpu(XFS_BUF_TO_AGF(agbp)->agf_longest), &i); if (error) goto out_del_cursor; @@ -84,7 +84,7 @@ xfs_trim_extents( if (error) goto out_del_cursor; XFS_WANT_CORRUPTED_GOTO(i == 1, out_del_cursor); - ASSERT(flen <= XFS_BUF_TO_AGF(agbp)->agf_longest); + ASSERT(flen <= be32_to_cpu(XFS_BUF_TO_AGF(agbp)->agf_longest)); /* * Too small? Give up. diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c index 25d7280e9f6..b4ff40b5f91 100644 --- a/fs/xfs/xfs_dquot.c +++ b/fs/xfs/xfs_dquot.c @@ -39,20 +39,19 @@ #include "xfs_qm.h" #include "xfs_trace.h" - /* - LOCK ORDER - - inode lock (ilock) - dquot hash-chain lock (hashlock) - xqm dquot freelist lock (freelistlock - mount's dquot list lock (mplistlock) - user dquot lock - lock ordering among dquots is based on the uid or gid - group dquot lock - similar to udquots. Between the two dquots, the udquot - has to be locked first. - pin lock - the dquot lock must be held to take this lock. - flush lock - ditto. -*/ + * Lock order: + * + * ip->i_lock + * qh->qh_lock + * qi->qi_dqlist_lock + * dquot->q_qlock (xfs_dqlock() and friends) + * dquot->q_flush (xfs_dqflock() and friends) + * xfs_Gqm->qm_dqfrlist_lock + * + * If two dquots need to be locked the order is user before group/project, + * otherwise by the lowest id first, see xfs_dqlock2. + */ #ifdef DEBUG xfs_buftarg_t *xfs_dqerror_target; @@ -155,24 +154,6 @@ xfs_qm_dqdestroy( } /* - * This is what a 'fresh' dquot inside a dquot chunk looks like on disk. - */ -STATIC void -xfs_qm_dqinit_core( - xfs_dqid_t id, - uint type, - xfs_dqblk_t *d) -{ - /* - * Caller has zero'd the entire dquot 'chunk' already. - */ - d->dd_diskdq.d_magic = cpu_to_be16(XFS_DQUOT_MAGIC); - d->dd_diskdq.d_version = XFS_DQUOT_VERSION; - d->dd_diskdq.d_id = cpu_to_be32(id); - d->dd_diskdq.d_flags = type; -} - -/* * If default limits are in force, push them into the dquot now. * We overwrite the dquot limits only if they are zero and this * is not the root dquot. @@ -328,8 +309,13 @@ xfs_qm_init_dquot_blk( curid = id - (id % q->qi_dqperchunk); ASSERT(curid >= 0); memset(d, 0, BBTOB(q->qi_dqchunklen)); - for (i = 0; i < q->qi_dqperchunk; i++, d++, curid++) - xfs_qm_dqinit_core(curid, type, d); + for (i = 0; i < q->qi_dqperchunk; i++, d++, curid++) { + d->dd_diskdq.d_magic = cpu_to_be16(XFS_DQUOT_MAGIC); + d->dd_diskdq.d_version = XFS_DQUOT_VERSION; + d->dd_diskdq.d_id = cpu_to_be32(curid); + d->dd_diskdq.d_flags = type; + } + xfs_trans_dquot_buf(tp, bp, (type & XFS_DQ_USER ? XFS_BLF_UDQUOT_BUF : ((type & XFS_DQ_PROJ) ? XFS_BLF_PDQUOT_BUF : @@ -564,36 +550,62 @@ xfs_qm_dqtobp( * Read in the ondisk dquot using dqtobp() then copy it to an incore version, * and release the buffer immediately. * + * If XFS_QMOPT_DQALLOC is set, allocate a dquot on disk if it needed. */ -/* ARGSUSED */ -STATIC int +int xfs_qm_dqread( - xfs_trans_t **tpp, - xfs_dqid_t id, - xfs_dquot_t *dqp, /* dquot to get filled in */ - uint flags) + struct xfs_mount *mp, + xfs_dqid_t id, + uint type, + uint flags, + struct xfs_dquot **O_dqpp) { - xfs_disk_dquot_t *ddqp; - xfs_buf_t *bp; - int error; - xfs_trans_t *tp; + struct xfs_dquot *dqp; + struct xfs_disk_dquot *ddqp; + struct xfs_buf *bp; + struct xfs_trans *tp = NULL; + int error; + int cancelflags = 0; - ASSERT(tpp); + dqp = xfs_qm_dqinit(mp, id, type); trace_xfs_dqread(dqp); + if (flags & XFS_QMOPT_DQALLOC) { + tp = xfs_trans_alloc(mp, XFS_TRANS_QM_DQALLOC); + error = xfs_trans_reserve(tp, XFS_QM_DQALLOC_SPACE_RES(mp), + XFS_WRITE_LOG_RES(mp) + + /* + * Round the chunklen up to the next multiple + * of 128 (buf log item chunk size)). + */ + BBTOB(mp->m_quotainfo->qi_dqchunklen) - 1 + 128, + 0, + XFS_TRANS_PERM_LOG_RES, + XFS_WRITE_LOG_COUNT); + if (error) + goto error1; + cancelflags = XFS_TRANS_RELEASE_LOG_RES; + } + /* * get a pointer to the on-disk dquot and the buffer containing it * dqp already knows its own type (GROUP/USER). */ - if ((error = xfs_qm_dqtobp(tpp, dqp, &ddqp, &bp, flags))) { - return (error); + error = xfs_qm_dqtobp(&tp, dqp, &ddqp, &bp, flags); + if (error) { + /* + * This can happen if quotas got turned off (ESRCH), + * or if the dquot didn't exist on disk and we ask to + * allocate (ENOENT). + */ + trace_xfs_dqread_fail(dqp); + cancelflags |= XFS_TRANS_ABORT; + goto error1; } - tp = *tpp; /* copy everything from disk dquot to the incore dquot */ memcpy(&dqp->q_core, ddqp, sizeof(xfs_disk_dquot_t)); - ASSERT(be32_to_cpu(dqp->q_core.d_id) == id); xfs_qm_dquot_logitem_init(dqp); /* @@ -622,77 +634,22 @@ xfs_qm_dqread( ASSERT(xfs_buf_islocked(bp)); xfs_trans_brelse(tp, bp); - return (error); -} - - -/* - * allocate an incore dquot from the kernel heap, - * and fill its core with quota information kept on disk. - * If XFS_QMOPT_DQALLOC is set, it'll allocate a dquot on disk - * if it wasn't already allocated. - */ -STATIC int -xfs_qm_idtodq( - xfs_mount_t *mp, - xfs_dqid_t id, /* gid or uid, depending on type */ - uint type, /* UDQUOT or GDQUOT */ - uint flags, /* DQALLOC, DQREPAIR */ - xfs_dquot_t **O_dqpp)/* OUT : incore dquot, not locked */ -{ - xfs_dquot_t *dqp; - int error; - xfs_trans_t *tp; - int cancelflags=0; - - dqp = xfs_qm_dqinit(mp, id, type); - tp = NULL; - if (flags & XFS_QMOPT_DQALLOC) { - tp = xfs_trans_alloc(mp, XFS_TRANS_QM_DQALLOC); - error = xfs_trans_reserve(tp, XFS_QM_DQALLOC_SPACE_RES(mp), - XFS_WRITE_LOG_RES(mp) + - BBTOB(mp->m_quotainfo->qi_dqchunklen) - 1 + - 128, - 0, - XFS_TRANS_PERM_LOG_RES, - XFS_WRITE_LOG_COUNT); - if (error) { - cancelflags = 0; - goto error0; - } - cancelflags = XFS_TRANS_RELEASE_LOG_RES; - } - - /* - * Read it from disk; xfs_dqread() takes care of - * all the necessary initialization of dquot's fields (locks, etc) - */ - if ((error = xfs_qm_dqread(&tp, id, dqp, flags))) { - /* - * This can happen if quotas got turned off (ESRCH), - * or if the dquot didn't exist on disk and we ask to - * allocate (ENOENT). - */ - trace_xfs_dqread_fail(dqp); - cancelflags |= XFS_TRANS_ABORT; - goto error0; - } if (tp) { - if ((error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES))) - goto error1; + error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); + if (error) + goto error0; } *O_dqpp = dqp; - return (0); + return error; - error0: - ASSERT(error); +error1: if (tp) xfs_trans_cancel(tp, cancelflags); - error1: +error0: xfs_qm_dqdestroy(dqp); *O_dqpp = NULL; - return (error); + return error; } /* @@ -710,12 +667,9 @@ xfs_qm_dqlookup( xfs_dquot_t **O_dqpp) { xfs_dquot_t *dqp; - uint flist_locked; ASSERT(mutex_is_locked(&qh->qh_lock)); - flist_locked = B_FALSE; - /* * Traverse the hashchain looking for a match */ @@ -725,70 +679,31 @@ xfs_qm_dqlookup( * dqlock to look at the id field of the dquot, since the * id can't be modified without the hashlock anyway. */ - if (be32_to_cpu(dqp->q_core.d_id) == id && dqp->q_mount == mp) { - trace_xfs_dqlookup_found(dqp); + if (be32_to_cpu(dqp->q_core.d_id) != id || dqp->q_mount != mp) + continue; - /* - * All in core dquots must be on the dqlist of mp - */ - ASSERT(!list_empty(&dqp->q_mplist)); - - xfs_dqlock(dqp); - if (dqp->q_nrefs == 0) { - ASSERT(!list_empty(&dqp->q_freelist)); - if (!mutex_trylock(&xfs_Gqm->qm_dqfrlist_lock)) { - trace_xfs_dqlookup_want(dqp); - - /* - * We may have raced with dqreclaim_one() - * (and lost). So, flag that we don't - * want the dquot to be reclaimed. - */ - dqp->dq_flags |= XFS_DQ_WANT; - xfs_dqunlock(dqp); - mutex_lock(&xfs_Gqm->qm_dqfrlist_lock); - xfs_dqlock(dqp); - dqp->dq_flags &= ~(XFS_DQ_WANT); - } - flist_locked = B_TRUE; - } + trace_xfs_dqlookup_found(dqp); - /* - * id couldn't have changed; we had the hashlock all - * along - */ - ASSERT(be32_to_cpu(dqp->q_core.d_id) == id); - - if (flist_locked) { - if (dqp->q_nrefs != 0) { - mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock); - flist_locked = B_FALSE; - } else { - /* take it off the freelist */ - trace_xfs_dqlookup_freelist(dqp); - list_del_init(&dqp->q_freelist); - xfs_Gqm->qm_dqfrlist_cnt--; - } - } + xfs_dqlock(dqp); + if (dqp->dq_flags & XFS_DQ_FREEING) { + *O_dqpp = NULL; + xfs_dqunlock(dqp); + return -1; + } - XFS_DQHOLD(dqp); + dqp->q_nrefs++; - if (flist_locked) - mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock); - /* - * move the dquot to the front of the hashchain - */ - ASSERT(mutex_is_locked(&qh->qh_lock)); - list_move(&dqp->q_hashlist, &qh->qh_list); - trace_xfs_dqlookup_done(dqp); - *O_dqpp = dqp; - return 0; - } + /* + * move the dquot to the front of the hashchain + */ + list_move(&dqp->q_hashlist, &qh->qh_list); + trace_xfs_dqlookup_done(dqp); + *O_dqpp = dqp; + return 0; } *O_dqpp = NULL; - ASSERT(mutex_is_locked(&qh->qh_lock)); - return (1); + return 1; } /* @@ -829,11 +744,7 @@ xfs_qm_dqget( return (EIO); } } -#endif - - again: -#ifdef DEBUG ASSERT(type == XFS_DQ_USER || type == XFS_DQ_PROJ || type == XFS_DQ_GROUP); @@ -845,13 +756,21 @@ xfs_qm_dqget( ASSERT(ip->i_gdquot == NULL); } #endif + +restart: mutex_lock(&h->qh_lock); /* * Look in the cache (hashtable). * The chain is kept locked during lookup. */ - if (xfs_qm_dqlookup(mp, id, h, O_dqpp) == 0) { + switch (xfs_qm_dqlookup(mp, id, h, O_dqpp)) { + case -1: + XQM_STATS_INC(xqmstats.xs_qm_dquot_dups); + mutex_unlock(&h->qh_lock); + delay(1); + goto restart; + case 0: XQM_STATS_INC(xqmstats.xs_qm_dqcachehits); /* * The dquot was found, moved to the front of the chain, @@ -862,9 +781,11 @@ xfs_qm_dqget( ASSERT(XFS_DQ_IS_LOCKED(*O_dqpp)); mutex_unlock(&h->qh_lock); trace_xfs_dqget_hit(*O_dqpp); - return (0); /* success */ + return 0; /* success */ + default: + XQM_STATS_INC(xqmstats.xs_qm_dqcachemisses); + break; } - XQM_STATS_INC(xqmstats.xs_qm_dqcachemisses); /* * Dquot cache miss. We don't want to keep the inode lock across @@ -882,41 +803,18 @@ xfs_qm_dqget( version = h->qh_version; mutex_unlock(&h->qh_lock); - /* - * Allocate the dquot on the kernel heap, and read the ondisk - * portion off the disk. Also, do all the necessary initialization - * This can return ENOENT if dquot didn't exist on disk and we didn't - * ask it to allocate; ESRCH if quotas got turned off suddenly. - */ - if ((error = xfs_qm_idtodq(mp, id, type, - flags & (XFS_QMOPT_DQALLOC|XFS_QMOPT_DQREPAIR| - XFS_QMOPT_DOWARN), - &dqp))) { - if (ip) - xfs_ilock(ip, XFS_ILOCK_EXCL); - return (error); - } + error = xfs_qm_dqread(mp, id, type, flags, &dqp); - /* - * See if this is mount code calling to look at the overall quota limits - * which are stored in the id == 0 user or group's dquot. - * Since we may not have done a quotacheck by this point, just return - * the dquot without attaching it to any hashtables, lists, etc, or even - * taking a reference. - * The caller must dqdestroy this once done. - */ - if (flags & XFS_QMOPT_DQSUSER) { - ASSERT(id == 0); - ASSERT(! ip); - goto dqret; - } + if (ip) + xfs_ilock(ip, XFS_ILOCK_EXCL); + + if (error) + return error; /* * Dquot lock comes after hashlock in the lock ordering */ if (ip) { - xfs_ilock(ip, XFS_ILOCK_EXCL); - /* * A dquot could be attached to this inode by now, since * we had dropped the ilock. @@ -961,16 +859,21 @@ xfs_qm_dqget( * lock order between the two dquots here since dqp isn't * on any findable lists yet. */ - if (xfs_qm_dqlookup(mp, id, h, &tmpdqp) == 0) { + switch (xfs_qm_dqlookup(mp, id, h, &tmpdqp)) { + case 0: + case -1: /* - * Duplicate found. Just throw away the new dquot - * and start over. + * Duplicate found, either in cache or on its way out. + * Just throw away the new dquot and start over. */ - xfs_qm_dqput(tmpdqp); + if (tmpdqp) + xfs_qm_dqput(tmpdqp); mutex_unlock(&h->qh_lock); xfs_qm_dqdestroy(dqp); XQM_STATS_INC(xqmstats.xs_qm_dquot_dups); - goto again; + goto restart; + default: + break; } } @@ -1015,67 +918,49 @@ xfs_qm_dqget( */ void xfs_qm_dqput( - xfs_dquot_t *dqp) + struct xfs_dquot *dqp) { - xfs_dquot_t *gdqp; + struct xfs_dquot *gdqp; ASSERT(dqp->q_nrefs > 0); ASSERT(XFS_DQ_IS_LOCKED(dqp)); trace_xfs_dqput(dqp); - if (dqp->q_nrefs != 1) { - dqp->q_nrefs--; +recurse: + if (--dqp->q_nrefs > 0) { xfs_dqunlock(dqp); return; } + trace_xfs_dqput_free(dqp); + + mutex_lock(&xfs_Gqm->qm_dqfrlist_lock); + if (list_empty(&dqp->q_freelist)) { + list_add_tail(&dqp->q_freelist, &xfs_Gqm->qm_dqfrlist); + xfs_Gqm->qm_dqfrlist_cnt++; + } + mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock); + /* - * drop the dqlock and acquire the freelist and dqlock - * in the right order; but try to get it out-of-order first + * If we just added a udquot to the freelist, then we want to release + * the gdquot reference that it (probably) has. Otherwise it'll keep + * the gdquot from getting reclaimed. */ - if (!mutex_trylock(&xfs_Gqm->qm_dqfrlist_lock)) { - trace_xfs_dqput_wait(dqp); - xfs_dqunlock(dqp); - mutex_lock(&xfs_Gqm->qm_dqfrlist_lock); - xfs_dqlock(dqp); + gdqp = dqp->q_gdquot; + if (gdqp) { + xfs_dqlock(gdqp); + dqp->q_gdquot = NULL; } + xfs_dqunlock(dqp); - while (1) { - gdqp = NULL; - - /* We can't depend on nrefs being == 1 here */ - if (--dqp->q_nrefs == 0) { - trace_xfs_dqput_free(dqp); - - list_add_tail(&dqp->q_freelist, &xfs_Gqm->qm_dqfrlist); - xfs_Gqm->qm_dqfrlist_cnt++; - - /* - * If we just added a udquot to the freelist, then - * we want to release the gdquot reference that - * it (probably) has. Otherwise it'll keep the - * gdquot from getting reclaimed. - */ - if ((gdqp = dqp->q_gdquot)) { - /* - * Avoid a recursive dqput call - */ - xfs_dqlock(gdqp); - dqp->q_gdquot = NULL; - } - } - xfs_dqunlock(dqp); - - /* - * If we had a group quota inside the user quota as a hint, - * release it now. - */ - if (! gdqp) - break; + /* + * If we had a group quota hint, release it now. + */ + if (gdqp) { dqp = gdqp; + goto recurse; } - mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock); } /* @@ -1169,7 +1054,7 @@ xfs_qm_dqflush( * If not dirty, or it's pinned and we are not supposed to block, nada. */ if (!XFS_DQ_IS_DIRTY(dqp) || - (!(flags & SYNC_WAIT) && atomic_read(&dqp->q_pincount) > 0)) { + ((flags & SYNC_TRYLOCK) && atomic_read(&dqp->q_pincount) > 0)) { xfs_dqfunlock(dqp); return 0; } @@ -1257,40 +1142,17 @@ xfs_qm_dqflush( } -int -xfs_qm_dqlock_nowait( - xfs_dquot_t *dqp) -{ - return mutex_trylock(&dqp->q_qlock); -} - -void -xfs_dqlock( - xfs_dquot_t *dqp) -{ - mutex_lock(&dqp->q_qlock); -} - void xfs_dqunlock( xfs_dquot_t *dqp) { - mutex_unlock(&(dqp->q_qlock)); + xfs_dqunlock_nonotify(dqp); if (dqp->q_logitem.qli_dquot == dqp) { - /* Once was dqp->q_mount, but might just have been cleared */ xfs_trans_unlocked_item(dqp->q_logitem.qli_item.li_ailp, - (xfs_log_item_t*)&(dqp->q_logitem)); + &dqp->q_logitem.qli_item); } } - -void -xfs_dqunlock_nonotify( - xfs_dquot_t *dqp) -{ - mutex_unlock(&(dqp->q_qlock)); -} - /* * Lock two xfs_dquot structures. * @@ -1319,43 +1181,18 @@ xfs_dqlock2( } } - /* - * Take a dquot out of the mount's dqlist as well as the hashlist. - * This is called via unmount as well as quotaoff, and the purge - * will always succeed unless there are soft (temp) references - * outstanding. - * - * This returns 0 if it was purged, 1 if it wasn't. It's not an error code - * that we're returning! XXXsup - not cool. + * Take a dquot out of the mount's dqlist as well as the hashlist. This is + * called via unmount as well as quotaoff, and the purge will always succeed. */ -/* ARGSUSED */ -int +void xfs_qm_dqpurge( - xfs_dquot_t *dqp) + struct xfs_dquot *dqp) { - xfs_dqhash_t *qh = dqp->q_hash; - xfs_mount_t *mp = dqp->q_mount; - - ASSERT(mutex_is_locked(&mp->m_quotainfo->qi_dqlist_lock)); - ASSERT(mutex_is_locked(&dqp->q_hash->qh_lock)); + struct xfs_mount *mp = dqp->q_mount; + struct xfs_dqhash *qh = dqp->q_hash; xfs_dqlock(dqp); - /* - * We really can't afford to purge a dquot that is - * referenced, because these are hard refs. - * It shouldn't happen in general because we went thru _all_ inodes in - * dqrele_all_inodes before calling this and didn't let the mountlock go. - * However it is possible that we have dquots with temporary - * references that are not attached to an inode. e.g. see xfs_setattr(). - */ - if (dqp->q_nrefs != 0) { - xfs_dqunlock(dqp); - mutex_unlock(&dqp->q_hash->qh_lock); - return (1); - } - - ASSERT(!list_empty(&dqp->q_freelist)); /* * If we're turning off quotas, we have to make sure that, for @@ -1370,23 +1207,18 @@ xfs_qm_dqpurge( * Block on the flush lock after nudging dquot buffer, * if it is incore. */ - xfs_qm_dqflock_pushbuf_wait(dqp); + xfs_dqflock_pushbuf_wait(dqp); } /* - * XXXIf we're turning this type of quotas off, we don't care + * If we are turning this type of quotas off, we don't care * about the dirty metadata sitting in this dquot. OTOH, if * we're unmounting, we do care, so we flush it and wait. */ if (XFS_DQ_IS_DIRTY(dqp)) { int error; - /* dqflush unlocks dqflock */ /* - * Given that dqpurge is a very rare occurrence, it is OK - * that we're holding the hashlist and mplist locks - * across the disk write. But, ... XXXsup - * * We don't care about getting disk errors here. We need * to purge this dquot anyway, so we go ahead regardless. */ @@ -1396,38 +1228,44 @@ xfs_qm_dqpurge( __func__, dqp); xfs_dqflock(dqp); } + ASSERT(atomic_read(&dqp->q_pincount) == 0); ASSERT(XFS_FORCED_SHUTDOWN(mp) || !(dqp->q_logitem.qli_item.li_flags & XFS_LI_IN_AIL)); + xfs_dqfunlock(dqp); + xfs_dqunlock(dqp); + + mutex_lock(&qh->qh_lock); list_del_init(&dqp->q_hashlist); qh->qh_version++; + mutex_unlock(&qh->qh_lock); + + mutex_lock(&mp->m_quotainfo->qi_dqlist_lock); list_del_init(&dqp->q_mplist); mp->m_quotainfo->qi_dqreclaims++; mp->m_quotainfo->qi_dquots--; + mutex_unlock(&mp->m_quotainfo->qi_dqlist_lock); + /* - * XXX Move this to the front of the freelist, if we can get the - * freelist lock. + * We move dquots to the freelist as soon as their reference count + * hits zero, so it really should be on the freelist here. */ + mutex_lock(&xfs_Gqm->qm_dqfrlist_lock); ASSERT(!list_empty(&dqp->q_freelist)); + list_del_init(&dqp->q_freelist); + xfs_Gqm->qm_dqfrlist_cnt--; + mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock); - dqp->q_mount = NULL; - dqp->q_hash = NULL; - dqp->dq_flags = XFS_DQ_INACTIVE; - memset(&dqp->q_core, 0, sizeof(dqp->q_core)); - xfs_dqfunlock(dqp); - xfs_dqunlock(dqp); - mutex_unlock(&qh->qh_lock); - return (0); + xfs_qm_dqdestroy(dqp); } - /* * Give the buffer a little push if it is incore and * wait on the flush lock. */ void -xfs_qm_dqflock_pushbuf_wait( +xfs_dqflock_pushbuf_wait( xfs_dquot_t *dqp) { xfs_mount_t *mp = dqp->q_mount; diff --git a/fs/xfs/xfs_dquot.h b/fs/xfs/xfs_dquot.h index 34b7e945dbf..a1d91d8f180 100644 --- a/fs/xfs/xfs_dquot.h +++ b/fs/xfs/xfs_dquot.h @@ -80,8 +80,6 @@ enum { XFS_QLOCK_NESTED, }; -#define XFS_DQHOLD(dqp) ((dqp)->q_nrefs++) - /* * Manage the q_flush completion queue embedded in the dquot. This completion * queue synchronizes processes attempting to flush the in-core dquot back to @@ -102,6 +100,21 @@ static inline void xfs_dqfunlock(xfs_dquot_t *dqp) complete(&dqp->q_flush); } +static inline int xfs_dqlock_nowait(struct xfs_dquot *dqp) +{ + return mutex_trylock(&dqp->q_qlock); +} + +static inline void xfs_dqlock(struct xfs_dquot *dqp) +{ + mutex_lock(&dqp->q_qlock); +} + +static inline void xfs_dqunlock_nonotify(struct xfs_dquot *dqp) +{ + mutex_unlock(&dqp->q_qlock); +} + #define XFS_DQ_IS_LOCKED(dqp) (mutex_is_locked(&((dqp)->q_qlock))) #define XFS_DQ_IS_DIRTY(dqp) ((dqp)->dq_flags & XFS_DQ_DIRTY) #define XFS_QM_ISUDQ(dqp) ((dqp)->dq_flags & XFS_DQ_USER) @@ -116,12 +129,12 @@ static inline void xfs_dqfunlock(xfs_dquot_t *dqp) (XFS_IS_UQUOTA_ON((d)->q_mount)) : \ (XFS_IS_OQUOTA_ON((d)->q_mount)))) +extern int xfs_qm_dqread(struct xfs_mount *, xfs_dqid_t, uint, + uint, struct xfs_dquot **); extern void xfs_qm_dqdestroy(xfs_dquot_t *); extern int xfs_qm_dqflush(xfs_dquot_t *, uint); -extern int xfs_qm_dqpurge(xfs_dquot_t *); +extern void xfs_qm_dqpurge(xfs_dquot_t *); extern void xfs_qm_dqunpin_wait(xfs_dquot_t *); -extern int xfs_qm_dqlock_nowait(xfs_dquot_t *); -extern void xfs_qm_dqflock_pushbuf_wait(xfs_dquot_t *dqp); extern void xfs_qm_adjust_dqtimers(xfs_mount_t *, xfs_disk_dquot_t *); extern void xfs_qm_adjust_dqlimits(xfs_mount_t *, @@ -129,9 +142,17 @@ extern void xfs_qm_adjust_dqlimits(xfs_mount_t *, extern int xfs_qm_dqget(xfs_mount_t *, xfs_inode_t *, xfs_dqid_t, uint, uint, xfs_dquot_t **); extern void xfs_qm_dqput(xfs_dquot_t *); -extern void xfs_dqlock(xfs_dquot_t *); -extern void xfs_dqlock2(xfs_dquot_t *, xfs_dquot_t *); -extern void xfs_dqunlock(xfs_dquot_t *); -extern void xfs_dqunlock_nonotify(xfs_dquot_t *); + +extern void xfs_dqlock2(struct xfs_dquot *, struct xfs_dquot *); +extern void xfs_dqunlock(struct xfs_dquot *); +extern void xfs_dqflock_pushbuf_wait(struct xfs_dquot *dqp); + +static inline struct xfs_dquot *xfs_qm_dqhold(struct xfs_dquot *dqp) +{ + xfs_dqlock(dqp); + dqp->q_nrefs++; + xfs_dqunlock(dqp); + return dqp; +} #endif /* __XFS_DQUOT_H__ */ diff --git a/fs/xfs/xfs_dquot_item.c b/fs/xfs/xfs_dquot_item.c index 0dee0b71029..34baeae4526 100644 --- a/fs/xfs/xfs_dquot_item.c +++ b/fs/xfs/xfs_dquot_item.c @@ -73,7 +73,6 @@ xfs_qm_dquot_logitem_format( logvec->i_len = sizeof(xfs_disk_dquot_t); logvec->i_type = XLOG_REG_TYPE_DQUOT; - ASSERT(2 == lip->li_desc->lid_size); qlip->qli_format.qlf_size = 2; } @@ -134,7 +133,7 @@ xfs_qm_dquot_logitem_push( * lock without sleeping, then there must not have been * anyone in the process of flushing the dquot. */ - error = xfs_qm_dqflush(dqp, 0); + error = xfs_qm_dqflush(dqp, SYNC_TRYLOCK); if (error) xfs_warn(dqp->q_mount, "%s: push error %d on dqp %p", __func__, error, dqp); @@ -237,7 +236,7 @@ xfs_qm_dquot_logitem_trylock( if (atomic_read(&dqp->q_pincount) > 0) return XFS_ITEM_PINNED; - if (!xfs_qm_dqlock_nowait(dqp)) + if (!xfs_dqlock_nowait(dqp)) return XFS_ITEM_LOCKED; if (!xfs_dqflock_nowait(dqp)) { diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 753ed9b5c70..7e5bc872f2b 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -209,10 +209,10 @@ xfs_file_fsync( /* * First check if the VFS inode is marked dirty. All the dirtying - * of non-transactional updates no goes through mark_inode_dirty*, - * which allows us to distinguish beteeen pure timestamp updates + * of non-transactional updates do not go through mark_inode_dirty*, + * which allows us to distinguish between pure timestamp updates * and i_size updates which need to be caught for fdatasync. - * After that also theck for the dirty state in the XFS inode, which + * After that also check for the dirty state in the XFS inode, which * might gets cleared when the inode gets written out via the AIL * or xfs_iflush_cluster. */ @@ -327,7 +327,7 @@ xfs_file_aio_read( mp->m_rtdev_targp : mp->m_ddev_targp; if ((iocb->ki_pos & target->bt_smask) || (size & target->bt_smask)) { - if (iocb->ki_pos == ip->i_size) + if (iocb->ki_pos == i_size_read(inode)) return 0; return -XFS_ERROR(EINVAL); } @@ -412,51 +412,6 @@ xfs_file_splice_read( return ret; } -STATIC void -xfs_aio_write_isize_update( - struct inode *inode, - loff_t *ppos, - ssize_t bytes_written) -{ - struct xfs_inode *ip = XFS_I(inode); - xfs_fsize_t isize = i_size_read(inode); - - if (bytes_written > 0) - XFS_STATS_ADD(xs_write_bytes, bytes_written); - - if (unlikely(bytes_written < 0 && bytes_written != -EFAULT && - *ppos > isize)) - *ppos = isize; - - if (*ppos > ip->i_size) { - xfs_rw_ilock(ip, XFS_ILOCK_EXCL); - if (*ppos > ip->i_size) - ip->i_size = *ppos; - xfs_rw_iunlock(ip, XFS_ILOCK_EXCL); - } -} - -/* - * If this was a direct or synchronous I/O that failed (such as ENOSPC) then - * part of the I/O may have been written to disk before the error occurred. In - * this case the on-disk file size may have been adjusted beyond the in-memory - * file size and now needs to be truncated back. - */ -STATIC void -xfs_aio_write_newsize_update( - struct xfs_inode *ip, - xfs_fsize_t new_size) -{ - if (new_size == ip->i_new_size) { - xfs_rw_ilock(ip, XFS_ILOCK_EXCL); - if (new_size == ip->i_new_size) - ip->i_new_size = 0; - if (ip->i_d.di_size > ip->i_size) - ip->i_d.di_size = ip->i_size; - xfs_rw_iunlock(ip, XFS_ILOCK_EXCL); - } -} - /* * xfs_file_splice_write() does not use xfs_rw_ilock() because * generic_file_splice_write() takes the i_mutex itself. This, in theory, @@ -475,7 +430,6 @@ xfs_file_splice_write( { struct inode *inode = outfilp->f_mapping->host; struct xfs_inode *ip = XFS_I(inode); - xfs_fsize_t new_size; int ioflags = 0; ssize_t ret; @@ -489,19 +443,12 @@ xfs_file_splice_write( xfs_ilock(ip, XFS_IOLOCK_EXCL); - new_size = *ppos + count; - - xfs_ilock(ip, XFS_ILOCK_EXCL); - if (new_size > ip->i_size) - ip->i_new_size = new_size; - xfs_iunlock(ip, XFS_ILOCK_EXCL); - trace_xfs_file_splice_write(ip, count, *ppos, ioflags); ret = generic_file_splice_write(pipe, outfilp, ppos, count, flags); + if (ret > 0) + XFS_STATS_ADD(xs_write_bytes, ret); - xfs_aio_write_isize_update(inode, ppos, ret); - xfs_aio_write_newsize_update(ip, new_size); xfs_iunlock(ip, XFS_IOLOCK_EXCL); return ret; } @@ -689,28 +636,26 @@ out_lock: /* * Common pre-write limit and setup checks. * - * Returns with iolock held according to @iolock. + * Called with the iolocked held either shared and exclusive according to + * @iolock, and returns with it held. Might upgrade the iolock to exclusive + * if called for a direct write beyond i_size. */ STATIC ssize_t xfs_file_aio_write_checks( struct file *file, loff_t *pos, size_t *count, - xfs_fsize_t *new_sizep, int *iolock) { struct inode *inode = file->f_mapping->host; struct xfs_inode *ip = XFS_I(inode); - xfs_fsize_t new_size; int error = 0; xfs_rw_ilock(ip, XFS_ILOCK_EXCL); - *new_sizep = 0; restart: error = generic_write_checks(file, pos, count, S_ISBLK(inode->i_mode)); if (error) { - xfs_rw_iunlock(ip, XFS_ILOCK_EXCL | *iolock); - *iolock = 0; + xfs_rw_iunlock(ip, XFS_ILOCK_EXCL); return error; } @@ -720,36 +665,21 @@ restart: /* * If the offset is beyond the size of the file, we need to zero any * blocks that fall between the existing EOF and the start of this - * write. There is no need to issue zeroing if another in-flght IO ends - * at or before this one If zeronig is needed and we are currently - * holding the iolock shared, we need to update it to exclusive which - * involves dropping all locks and relocking to maintain correct locking - * order. If we do this, restart the function to ensure all checks and - * values are still valid. + * write. If zeroing is needed and we are currently holding the + * iolock shared, we need to update it to exclusive which involves + * dropping all locks and relocking to maintain correct locking order. + * If we do this, restart the function to ensure all checks and values + * are still valid. */ - if ((ip->i_new_size && *pos > ip->i_new_size) || - (!ip->i_new_size && *pos > ip->i_size)) { + if (*pos > i_size_read(inode)) { if (*iolock == XFS_IOLOCK_SHARED) { xfs_rw_iunlock(ip, XFS_ILOCK_EXCL | *iolock); *iolock = XFS_IOLOCK_EXCL; xfs_rw_ilock(ip, XFS_ILOCK_EXCL | *iolock); goto restart; } - error = -xfs_zero_eof(ip, *pos, ip->i_size); + error = -xfs_zero_eof(ip, *pos, i_size_read(inode)); } - - /* - * If this IO extends beyond EOF, we may need to update ip->i_new_size. - * We have already zeroed space beyond EOF (if necessary). Only update - * ip->i_new_size if this IO ends beyond any other in-flight writes. - */ - new_size = *pos + *count; - if (new_size > ip->i_size) { - if (new_size > ip->i_new_size) - ip->i_new_size = new_size; - *new_sizep = new_size; - } - xfs_rw_iunlock(ip, XFS_ILOCK_EXCL); if (error) return error; @@ -794,9 +724,7 @@ xfs_file_dio_aio_write( const struct iovec *iovp, unsigned long nr_segs, loff_t pos, - size_t ocount, - xfs_fsize_t *new_size, - int *iolock) + size_t ocount) { struct file *file = iocb->ki_filp; struct address_space *mapping = file->f_mapping; @@ -806,10 +734,10 @@ xfs_file_dio_aio_write( ssize_t ret = 0; size_t count = ocount; int unaligned_io = 0; + int iolock; struct xfs_buftarg *target = XFS_IS_REALTIME_INODE(ip) ? mp->m_rtdev_targp : mp->m_ddev_targp; - *iolock = 0; if ((pos & target->bt_smask) || (count & target->bt_smask)) return -XFS_ERROR(EINVAL); @@ -824,31 +752,31 @@ xfs_file_dio_aio_write( * EOF zeroing cases and fill out the new inode size as appropriate. */ if (unaligned_io || mapping->nrpages) - *iolock = XFS_IOLOCK_EXCL; + iolock = XFS_IOLOCK_EXCL; else - *iolock = XFS_IOLOCK_SHARED; - xfs_rw_ilock(ip, *iolock); + iolock = XFS_IOLOCK_SHARED; + xfs_rw_ilock(ip, iolock); /* * Recheck if there are cached pages that need invalidate after we got * the iolock to protect against other threads adding new pages while * we were waiting for the iolock. */ - if (mapping->nrpages && *iolock == XFS_IOLOCK_SHARED) { - xfs_rw_iunlock(ip, *iolock); - *iolock = XFS_IOLOCK_EXCL; - xfs_rw_ilock(ip, *iolock); + if (mapping->nrpages && iolock == XFS_IOLOCK_SHARED) { + xfs_rw_iunlock(ip, iolock); + iolock = XFS_IOLOCK_EXCL; + xfs_rw_ilock(ip, iolock); } - ret = xfs_file_aio_write_checks(file, &pos, &count, new_size, iolock); + ret = xfs_file_aio_write_checks(file, &pos, &count, &iolock); if (ret) - return ret; + goto out; if (mapping->nrpages) { ret = -xfs_flushinval_pages(ip, (pos & PAGE_CACHE_MASK), -1, FI_REMAPF_LOCKED); if (ret) - return ret; + goto out; } /* @@ -857,15 +785,18 @@ xfs_file_dio_aio_write( */ if (unaligned_io) inode_dio_wait(inode); - else if (*iolock == XFS_IOLOCK_EXCL) { + else if (iolock == XFS_IOLOCK_EXCL) { xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL); - *iolock = XFS_IOLOCK_SHARED; + iolock = XFS_IOLOCK_SHARED; } trace_xfs_file_direct_write(ip, count, iocb->ki_pos, 0); ret = generic_file_direct_write(iocb, iovp, &nr_segs, pos, &iocb->ki_pos, count, ocount); +out: + xfs_rw_iunlock(ip, iolock); + /* No fallback to buffered IO on errors for XFS. */ ASSERT(ret < 0 || ret == count); return ret; @@ -877,9 +808,7 @@ xfs_file_buffered_aio_write( const struct iovec *iovp, unsigned long nr_segs, loff_t pos, - size_t ocount, - xfs_fsize_t *new_size, - int *iolock) + size_t ocount) { struct file *file = iocb->ki_filp; struct address_space *mapping = file->f_mapping; @@ -887,14 +816,14 @@ xfs_file_buffered_aio_write( struct xfs_inode *ip = XFS_I(inode); ssize_t ret; int enospc = 0; + int iolock = XFS_IOLOCK_EXCL; size_t count = ocount; - *iolock = XFS_IOLOCK_EXCL; - xfs_rw_ilock(ip, *iolock); + xfs_rw_ilock(ip, iolock); - ret = xfs_file_aio_write_checks(file, &pos, &count, new_size, iolock); + ret = xfs_file_aio_write_checks(file, &pos, &count, &iolock); if (ret) - return ret; + goto out; /* We can write back this queue in page reclaim */ current->backing_dev_info = mapping->backing_dev_info; @@ -908,13 +837,15 @@ write_retry: * page locks and retry *once* */ if (ret == -ENOSPC && !enospc) { - ret = -xfs_flush_pages(ip, 0, -1, 0, FI_NONE); - if (ret) - return ret; enospc = 1; - goto write_retry; + ret = -xfs_flush_pages(ip, 0, -1, 0, FI_NONE); + if (!ret) + goto write_retry; } + current->backing_dev_info = NULL; +out: + xfs_rw_iunlock(ip, iolock); return ret; } @@ -930,9 +861,7 @@ xfs_file_aio_write( struct inode *inode = mapping->host; struct xfs_inode *ip = XFS_I(inode); ssize_t ret; - int iolock; size_t ocount = 0; - xfs_fsize_t new_size = 0; XFS_STATS_INC(xs_write_calls); @@ -951,33 +880,22 @@ xfs_file_aio_write( return -EIO; if (unlikely(file->f_flags & O_DIRECT)) - ret = xfs_file_dio_aio_write(iocb, iovp, nr_segs, pos, - ocount, &new_size, &iolock); + ret = xfs_file_dio_aio_write(iocb, iovp, nr_segs, pos, ocount); else ret = xfs_file_buffered_aio_write(iocb, iovp, nr_segs, pos, - ocount, &new_size, &iolock); - - xfs_aio_write_isize_update(inode, &iocb->ki_pos, ret); + ocount); - if (ret <= 0) - goto out_unlock; + if (ret > 0) { + ssize_t err; - /* Handle various SYNC-type writes */ - if ((file->f_flags & O_DSYNC) || IS_SYNC(inode)) { - loff_t end = pos + ret - 1; - int error; + XFS_STATS_ADD(xs_write_bytes, ret); - xfs_rw_iunlock(ip, iolock); - error = xfs_file_fsync(file, pos, end, - (file->f_flags & __O_SYNC) ? 0 : 1); - xfs_rw_ilock(ip, iolock); - if (error) - ret = error; + /* Handle various SYNC-type writes */ + err = generic_write_sync(file, pos, ret); + if (err < 0) + ret = err; } -out_unlock: - xfs_aio_write_newsize_update(ip, new_size); - xfs_rw_iunlock(ip, iolock); return ret; } diff --git a/fs/xfs/xfs_fs_subr.c b/fs/xfs/xfs_fs_subr.c index ed88ed16811..652b875a9d4 100644 --- a/fs/xfs/xfs_fs_subr.c +++ b/fs/xfs/xfs_fs_subr.c @@ -90,7 +90,7 @@ xfs_wait_on_pages( if (mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK)) { return -filemap_fdatawait_range(mapping, first, - last == -1 ? ip->i_size - 1 : last); + last == -1 ? XFS_ISIZE(ip) - 1 : last); } return 0; } diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c index 169380e6605..dad1a31aa4f 100644 --- a/fs/xfs/xfs_ialloc.c +++ b/fs/xfs/xfs_ialloc.c @@ -447,7 +447,7 @@ STATIC xfs_buf_t * /* allocation group buffer */ xfs_ialloc_ag_select( xfs_trans_t *tp, /* transaction pointer */ xfs_ino_t parent, /* parent directory inode number */ - mode_t mode, /* bits set to indicate file type */ + umode_t mode, /* bits set to indicate file type */ int okalloc) /* ok to allocate more space */ { xfs_buf_t *agbp; /* allocation group header buffer */ @@ -640,7 +640,7 @@ int xfs_dialloc( xfs_trans_t *tp, /* transaction pointer */ xfs_ino_t parent, /* parent inode (directory) */ - mode_t mode, /* mode bits for new inode */ + umode_t mode, /* mode bits for new inode */ int okalloc, /* ok to allocate more space */ xfs_buf_t **IO_agbp, /* in/out ag header's buffer */ boolean_t *alloc_done, /* true if we needed to replenish diff --git a/fs/xfs/xfs_ialloc.h b/fs/xfs/xfs_ialloc.h index bb5385475e1..666a037398d 100644 --- a/fs/xfs/xfs_ialloc.h +++ b/fs/xfs/xfs_ialloc.h @@ -81,7 +81,7 @@ int /* error */ xfs_dialloc( struct xfs_trans *tp, /* transaction pointer */ xfs_ino_t parent, /* parent inode (directory) */ - mode_t mode, /* mode bits for new inode */ + umode_t mode, /* mode bits for new inode */ int okalloc, /* ok to allocate more space */ struct xfs_buf **agbp, /* buf for a.g. inode header */ boolean_t *alloc_done, /* an allocation was done to replenish diff --git a/fs/xfs/xfs_iget.c b/fs/xfs/xfs_iget.c index 0fa98b1c70e..8c3e46394d4 100644 --- a/fs/xfs/xfs_iget.c +++ b/fs/xfs/xfs_iget.c @@ -77,7 +77,7 @@ xfs_inode_alloc( ASSERT(atomic_read(&ip->i_pincount) == 0); ASSERT(!spin_is_locked(&ip->i_flags_lock)); - ASSERT(completion_done(&ip->i_flush)); + ASSERT(!xfs_isiflocked(ip)); ASSERT(ip->i_ino == 0); mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino); @@ -94,8 +94,6 @@ xfs_inode_alloc( ip->i_update_core = 0; ip->i_delayed_blks = 0; memset(&ip->i_d, 0, sizeof(xfs_icdinode_t)); - ip->i_size = 0; - ip->i_new_size = 0; return ip; } @@ -107,7 +105,6 @@ xfs_inode_free_callback( struct inode *inode = container_of(head, struct inode, i_rcu); struct xfs_inode *ip = XFS_I(inode); - INIT_LIST_HEAD(&inode->i_dentry); kmem_zone_free(xfs_inode_zone, ip); } @@ -151,7 +148,7 @@ xfs_inode_free( /* asserts to verify all state is correct here */ ASSERT(atomic_read(&ip->i_pincount) == 0); ASSERT(!spin_is_locked(&ip->i_flags_lock)); - ASSERT(completion_done(&ip->i_flush)); + ASSERT(!xfs_isiflocked(ip)); /* * Because we use RCU freeing we need to ensure the inode always @@ -451,8 +448,6 @@ again: *ipp = ip; - ASSERT(ip->i_df.if_ext_max == - XFS_IFORK_DSIZE(ip) / sizeof(xfs_bmbt_rec_t)); /* * If we have a real type for an on-disk inode, we can set ops(&unlock) * now. If it's a new inode being created, xfs_ialloc will handle it. @@ -716,3 +711,19 @@ xfs_isilocked( return 0; } #endif + +void +__xfs_iflock( + struct xfs_inode *ip) +{ + wait_queue_head_t *wq = bit_waitqueue(&ip->i_flags, __XFS_IFLOCK_BIT); + DEFINE_WAIT_BIT(wait, &ip->i_flags, __XFS_IFLOCK_BIT); + + do { + prepare_to_wait_exclusive(wq, &wait.wait, TASK_UNINTERRUPTIBLE); + if (xfs_isiflocked(ip)) + io_schedule(); + } while (!xfs_iflock_nowait(ip)); + + finish_wait(wq, &wait.wait); +} diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 755ee816488..b21022499c2 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -299,11 +299,8 @@ xfs_iformat( { xfs_attr_shortform_t *atp; int size; - int error; + int error = 0; xfs_fsize_t di_size; - ip->i_df.if_ext_max = - XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t); - error = 0; if (unlikely(be32_to_cpu(dip->di_nextents) + be16_to_cpu(dip->di_anextents) > @@ -350,7 +347,6 @@ xfs_iformat( return XFS_ERROR(EFSCORRUPTED); } ip->i_d.di_size = 0; - ip->i_size = 0; ip->i_df.if_u2.if_rdev = xfs_dinode_get_rdev(dip); break; @@ -409,10 +405,10 @@ xfs_iformat( } if (!XFS_DFORK_Q(dip)) return 0; + ASSERT(ip->i_afp == NULL); ip->i_afp = kmem_zone_zalloc(xfs_ifork_zone, KM_SLEEP | KM_NOFS); - ip->i_afp->if_ext_max = - XFS_IFORK_ASIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t); + switch (dip->di_aformat) { case XFS_DINODE_FMT_LOCAL: atp = (xfs_attr_shortform_t *)XFS_DFORK_APTR(dip); @@ -604,10 +600,11 @@ xfs_iformat_btree( * or the number of extents is greater than the number of * blocks. */ - if (unlikely(XFS_IFORK_NEXTENTS(ip, whichfork) <= ifp->if_ext_max - || XFS_BMDR_SPACE_CALC(nrecs) > - XFS_DFORK_SIZE(dip, ip->i_mount, whichfork) - || XFS_IFORK_NEXTENTS(ip, whichfork) > ip->i_d.di_nblocks)) { + if (unlikely(XFS_IFORK_NEXTENTS(ip, whichfork) <= + XFS_IFORK_MAXEXT(ip, whichfork) || + XFS_BMDR_SPACE_CALC(nrecs) > + XFS_DFORK_SIZE(dip, ip->i_mount, whichfork) || + XFS_IFORK_NEXTENTS(ip, whichfork) > ip->i_d.di_nblocks)) { xfs_warn(ip->i_mount, "corrupt inode %Lu (btree).", (unsigned long long) ip->i_ino); XFS_CORRUPTION_ERROR("xfs_iformat_btree", XFS_ERRLEVEL_LOW, @@ -835,12 +832,6 @@ xfs_iread( * with the uninitialized part of it. */ ip->i_d.di_mode = 0; - /* - * Initialize the per-fork minima and maxima for a new - * inode here. xfs_iformat will do it for old inodes. - */ - ip->i_df.if_ext_max = - XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t); } /* @@ -861,7 +852,6 @@ xfs_iread( } ip->i_delayed_blks = 0; - ip->i_size = ip->i_d.di_size; /* * Mark the buffer containing the inode as something to keep @@ -961,7 +951,7 @@ int xfs_ialloc( xfs_trans_t *tp, xfs_inode_t *pip, - mode_t mode, + umode_t mode, xfs_nlink_t nlink, xfs_dev_t rdev, prid_t prid, @@ -1002,7 +992,7 @@ xfs_ialloc( return error; ASSERT(ip != NULL); - ip->i_d.di_mode = (__uint16_t)mode; + ip->i_d.di_mode = mode; ip->i_d.di_onlink = 0; ip->i_d.di_nlink = nlink; ASSERT(ip->i_d.di_nlink == nlink); @@ -1051,7 +1041,6 @@ xfs_ialloc( } ip->i_d.di_size = 0; - ip->i_size = 0; ip->i_d.di_nextents = 0; ASSERT(ip->i_d.di_nblocks == 0); @@ -1166,52 +1155,6 @@ xfs_ialloc( } /* - * Check to make sure that there are no blocks allocated to the - * file beyond the size of the file. We don't check this for - * files with fixed size extents or real time extents, but we - * at least do it for regular files. - */ -#ifdef DEBUG -STATIC void -xfs_isize_check( - struct xfs_inode *ip, - xfs_fsize_t isize) -{ - struct xfs_mount *mp = ip->i_mount; - xfs_fileoff_t map_first; - int nimaps; - xfs_bmbt_irec_t imaps[2]; - int error; - - if (!S_ISREG(ip->i_d.di_mode)) - return; - - if (XFS_IS_REALTIME_INODE(ip)) - return; - - if (ip->i_d.di_flags & XFS_DIFLAG_EXTSIZE) - return; - - nimaps = 2; - map_first = XFS_B_TO_FSB(mp, (xfs_ufsize_t)isize); - /* - * The filesystem could be shutting down, so bmapi may return - * an error. - */ - error = xfs_bmapi_read(ip, map_first, - (XFS_B_TO_FSB(mp, - (xfs_ufsize_t)XFS_MAXIOFFSET(mp)) - map_first), - imaps, &nimaps, XFS_BMAPI_ENTIRE); - if (error) - return; - ASSERT(nimaps == 1); - ASSERT(imaps[0].br_startblock == HOLESTARTBLOCK); -} -#else /* DEBUG */ -#define xfs_isize_check(ip, isize) -#endif /* DEBUG */ - -/* * Free up the underlying blocks past new_size. The new size must be smaller * than the current size. This routine can be used both for the attribute and * data fork, and does not modify the inode size, which is left to the caller. @@ -1252,12 +1195,14 @@ xfs_itruncate_extents( int done = 0; ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_IOLOCK_EXCL)); - ASSERT(new_size <= ip->i_size); + ASSERT(new_size <= XFS_ISIZE(ip)); ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES); ASSERT(ip->i_itemp != NULL); ASSERT(ip->i_itemp->ili_lock_flags == 0); ASSERT(!XFS_NOT_DQATTACHED(mp, ip)); + trace_xfs_itruncate_extents_start(ip, new_size); + /* * Since it is possible for space to become allocated beyond * the end of the file (in a crash where the space is allocated @@ -1325,6 +1270,14 @@ xfs_itruncate_extents( goto out; } + /* + * Always re-log the inode so that our permanent transaction can keep + * on rolling it forward in the log. + */ + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); + + trace_xfs_itruncate_extents_end(ip, new_size); + out: *tpp = tp; return error; @@ -1338,74 +1291,6 @@ out_bmap_cancel: goto out; } -int -xfs_itruncate_data( - struct xfs_trans **tpp, - struct xfs_inode *ip, - xfs_fsize_t new_size) -{ - int error; - - trace_xfs_itruncate_data_start(ip, new_size); - - /* - * The first thing we do is set the size to new_size permanently on - * disk. This way we don't have to worry about anyone ever being able - * to look at the data being freed even in the face of a crash. - * What we're getting around here is the case where we free a block, it - * is allocated to another file, it is written to, and then we crash. - * If the new data gets written to the file but the log buffers - * containing the free and reallocation don't, then we'd end up with - * garbage in the blocks being freed. As long as we make the new_size - * permanent before actually freeing any blocks it doesn't matter if - * they get written to. - */ - if (ip->i_d.di_nextents > 0) { - /* - * If we are not changing the file size then do not update - * the on-disk file size - we may be called from - * xfs_inactive_free_eofblocks(). If we update the on-disk - * file size and then the system crashes before the contents - * of the file are flushed to disk then the files may be - * full of holes (ie NULL files bug). - */ - if (ip->i_size != new_size) { - ip->i_d.di_size = new_size; - ip->i_size = new_size; - xfs_trans_log_inode(*tpp, ip, XFS_ILOG_CORE); - } - } - - error = xfs_itruncate_extents(tpp, ip, XFS_DATA_FORK, new_size); - if (error) - return error; - - /* - * If we are not changing the file size then do not update the on-disk - * file size - we may be called from xfs_inactive_free_eofblocks(). - * If we update the on-disk file size and then the system crashes - * before the contents of the file are flushed to disk then the files - * may be full of holes (ie NULL files bug). - */ - xfs_isize_check(ip, new_size); - if (ip->i_size != new_size) { - ip->i_d.di_size = new_size; - ip->i_size = new_size; - } - - ASSERT(new_size != 0 || ip->i_delayed_blks == 0); - ASSERT(new_size != 0 || ip->i_d.di_nextents == 0); - - /* - * Always re-log the inode so that our permanent transaction can keep - * on rolling it forward in the log. - */ - xfs_trans_log_inode(*tpp, ip, XFS_ILOG_CORE); - - trace_xfs_itruncate_data_end(ip, new_size); - return 0; -} - /* * This is called when the inode's link count goes to 0. * We place the on-disk inode on a list in the AGI. It @@ -1824,8 +1709,7 @@ xfs_ifree( ASSERT(ip->i_d.di_nlink == 0); ASSERT(ip->i_d.di_nextents == 0); ASSERT(ip->i_d.di_anextents == 0); - ASSERT((ip->i_d.di_size == 0 && ip->i_size == 0) || - (!S_ISREG(ip->i_d.di_mode))); + ASSERT(ip->i_d.di_size == 0 || !S_ISREG(ip->i_d.di_mode)); ASSERT(ip->i_d.di_nblocks == 0); /* @@ -1844,8 +1728,6 @@ xfs_ifree( ip->i_d.di_flags = 0; ip->i_d.di_dmevmask = 0; ip->i_d.di_forkoff = 0; /* mark the attr fork not in use */ - ip->i_df.if_ext_max = - XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t); ip->i_d.di_format = XFS_DINODE_FMT_EXTENTS; ip->i_d.di_aformat = XFS_DINODE_FMT_EXTENTS; /* @@ -2151,7 +2033,7 @@ xfs_idestroy_fork( * once someone is waiting for it to be unpinned. */ static void -xfs_iunpin_nowait( +xfs_iunpin( struct xfs_inode *ip) { ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)); @@ -2163,14 +2045,29 @@ xfs_iunpin_nowait( } +static void +__xfs_iunpin_wait( + struct xfs_inode *ip) +{ + wait_queue_head_t *wq = bit_waitqueue(&ip->i_flags, __XFS_IPINNED_BIT); + DEFINE_WAIT_BIT(wait, &ip->i_flags, __XFS_IPINNED_BIT); + + xfs_iunpin(ip); + + do { + prepare_to_wait(wq, &wait.wait, TASK_UNINTERRUPTIBLE); + if (xfs_ipincount(ip)) + io_schedule(); + } while (xfs_ipincount(ip)); + finish_wait(wq, &wait.wait); +} + void xfs_iunpin_wait( struct xfs_inode *ip) { - if (xfs_ipincount(ip)) { - xfs_iunpin_nowait(ip); - wait_event(ip->i_ipin_wait, (xfs_ipincount(ip) == 0)); - } + if (xfs_ipincount(ip)) + __xfs_iunpin_wait(ip); } /* @@ -2510,9 +2407,9 @@ xfs_iflush( XFS_STATS_INC(xs_iflush_count); ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)); - ASSERT(!completion_done(&ip->i_flush)); + ASSERT(xfs_isiflocked(ip)); ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE || - ip->i_d.di_nextents > ip->i_df.if_ext_max); + ip->i_d.di_nextents > XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK)); iip = ip->i_itemp; mp = ip->i_mount; @@ -2529,7 +2426,7 @@ xfs_iflush( * out for us if they occur after the log force completes. */ if (!(flags & SYNC_WAIT) && xfs_ipincount(ip)) { - xfs_iunpin_nowait(ip); + xfs_iunpin(ip); xfs_ifunlock(ip); return EAGAIN; } @@ -2626,9 +2523,9 @@ xfs_iflush_int( #endif ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)); - ASSERT(!completion_done(&ip->i_flush)); + ASSERT(xfs_isiflocked(ip)); ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE || - ip->i_d.di_nextents > ip->i_df.if_ext_max); + ip->i_d.di_nextents > XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK)); iip = ip->i_itemp; mp = ip->i_mount; diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index b4cd4739f98..2f27b745408 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -66,7 +66,6 @@ typedef struct xfs_ifork { struct xfs_btree_block *if_broot; /* file's incore btree root */ short if_broot_bytes; /* bytes allocated for root */ unsigned char if_flags; /* per-fork flags */ - unsigned char if_ext_max; /* max # of extent records */ union { xfs_bmbt_rec_host_t *if_extents;/* linear map file exts */ xfs_ext_irec_t *if_ext_irec; /* irec map file exts */ @@ -206,12 +205,12 @@ typedef struct xfs_icdinode { ((w) == XFS_DATA_FORK ? \ ((ip)->i_d.di_nextents = (n)) : \ ((ip)->i_d.di_anextents = (n))) - +#define XFS_IFORK_MAXEXT(ip, w) \ + (XFS_IFORK_SIZE(ip, w) / sizeof(xfs_bmbt_rec_t)) #ifdef __KERNEL__ -struct bhv_desc; struct xfs_buf; struct xfs_bmap_free; struct xfs_bmbt_irec; @@ -220,12 +219,6 @@ struct xfs_mount; struct xfs_trans; struct xfs_dquot; -typedef struct dm_attrs_s { - __uint32_t da_dmevmask; /* DMIG event mask */ - __uint16_t da_dmstate; /* DMIG state info */ - __uint16_t da_pad; /* DMIG extra padding */ -} dm_attrs_t; - typedef struct xfs_inode { /* Inode linking and identification information. */ struct xfs_mount *i_mount; /* fs mount struct ptr */ @@ -244,27 +237,19 @@ typedef struct xfs_inode { struct xfs_inode_log_item *i_itemp; /* logging information */ mrlock_t i_lock; /* inode lock */ mrlock_t i_iolock; /* inode IO lock */ - struct completion i_flush; /* inode flush completion q */ atomic_t i_pincount; /* inode pin count */ - wait_queue_head_t i_ipin_wait; /* inode pinning wait queue */ spinlock_t i_flags_lock; /* inode i_flags lock */ /* Miscellaneous state. */ - unsigned short i_flags; /* see defined flags below */ + unsigned long i_flags; /* see defined flags below */ unsigned char i_update_core; /* timestamps/size is dirty */ unsigned int i_delayed_blks; /* count of delay alloc blks */ xfs_icdinode_t i_d; /* most of ondisk inode */ - xfs_fsize_t i_size; /* in-memory size */ - xfs_fsize_t i_new_size; /* size when write completes */ - /* VFS inode */ struct inode i_vnode; /* embedded VFS inode */ } xfs_inode_t; -#define XFS_ISIZE(ip) S_ISREG((ip)->i_d.di_mode) ? \ - (ip)->i_size : (ip)->i_d.di_size; - /* Convert from vfs inode to xfs inode */ static inline struct xfs_inode *XFS_I(struct inode *inode) { @@ -278,6 +263,18 @@ static inline struct inode *VFS_I(struct xfs_inode *ip) } /* + * For regular files we only update the on-disk filesize when actually + * writing data back to disk. Until then only the copy in the VFS inode + * is uptodate. + */ +static inline xfs_fsize_t XFS_ISIZE(struct xfs_inode *ip) +{ + if (S_ISREG(ip->i_d.di_mode)) + return i_size_read(VFS_I(ip)); + return ip->i_d.di_size; +} + +/* * i_flags helper functions */ static inline void @@ -331,6 +328,19 @@ xfs_iflags_test_and_clear(xfs_inode_t *ip, unsigned short flags) return ret; } +static inline int +xfs_iflags_test_and_set(xfs_inode_t *ip, unsigned short flags) +{ + int ret; + + spin_lock(&ip->i_flags_lock); + ret = ip->i_flags & flags; + if (!ret) + ip->i_flags |= flags; + spin_unlock(&ip->i_flags_lock); + return ret; +} + /* * Project quota id helpers (previously projid was 16bit only * and using two 16bit values to hold new 32bit projid was chosen @@ -351,35 +361,19 @@ xfs_set_projid(struct xfs_inode *ip, } /* - * Manage the i_flush queue embedded in the inode. This completion - * queue synchronizes processes attempting to flush the in-core - * inode back to disk. - */ -static inline void xfs_iflock(xfs_inode_t *ip) -{ - wait_for_completion(&ip->i_flush); -} - -static inline int xfs_iflock_nowait(xfs_inode_t *ip) -{ - return try_wait_for_completion(&ip->i_flush); -} - -static inline void xfs_ifunlock(xfs_inode_t *ip) -{ - complete(&ip->i_flush); -} - -/* * In-core inode flags. */ -#define XFS_IRECLAIM 0x0001 /* started reclaiming this inode */ -#define XFS_ISTALE 0x0002 /* inode has been staled */ -#define XFS_IRECLAIMABLE 0x0004 /* inode can be reclaimed */ -#define XFS_INEW 0x0008 /* inode has just been allocated */ -#define XFS_IFILESTREAM 0x0010 /* inode is in a filestream directory */ -#define XFS_ITRUNCATED 0x0020 /* truncated down so flush-on-close */ -#define XFS_IDIRTY_RELEASE 0x0040 /* dirty release already seen */ +#define XFS_IRECLAIM (1 << 0) /* started reclaiming this inode */ +#define XFS_ISTALE (1 << 1) /* inode has been staled */ +#define XFS_IRECLAIMABLE (1 << 2) /* inode can be reclaimed */ +#define XFS_INEW (1 << 3) /* inode has just been allocated */ +#define XFS_IFILESTREAM (1 << 4) /* inode is in a filestream dir. */ +#define XFS_ITRUNCATED (1 << 5) /* truncated down so flush-on-close */ +#define XFS_IDIRTY_RELEASE (1 << 6) /* dirty release already seen */ +#define __XFS_IFLOCK_BIT 7 /* inode is being flushed right now */ +#define XFS_IFLOCK (1 << __XFS_IFLOCK_BIT) +#define __XFS_IPINNED_BIT 8 /* wakeup key for zero pin count */ +#define XFS_IPINNED (1 << __XFS_IPINNED_BIT) /* * Per-lifetime flags need to be reset when re-using a reclaimable inode during @@ -392,6 +386,34 @@ static inline void xfs_ifunlock(xfs_inode_t *ip) XFS_IFILESTREAM); /* + * Synchronize processes attempting to flush the in-core inode back to disk. + */ + +extern void __xfs_iflock(struct xfs_inode *ip); + +static inline int xfs_iflock_nowait(struct xfs_inode *ip) +{ + return !xfs_iflags_test_and_set(ip, XFS_IFLOCK); +} + +static inline void xfs_iflock(struct xfs_inode *ip) +{ + if (!xfs_iflock_nowait(ip)) + __xfs_iflock(ip); +} + +static inline void xfs_ifunlock(struct xfs_inode *ip) +{ + xfs_iflags_clear(ip, XFS_IFLOCK); + wake_up_bit(&ip->i_flags, __XFS_IFLOCK_BIT); +} + +static inline int xfs_isiflocked(struct xfs_inode *ip) +{ + return xfs_iflags_test(ip, XFS_IFLOCK); +} + +/* * Flags for inode locking. * Bit ranges: 1<<1 - 1<<16-1 -- iolock/ilock modes (bitfield) * 1<<16 - 1<<32-1 -- lockdep annotation (integers) @@ -481,7 +503,7 @@ void xfs_inode_free(struct xfs_inode *ip); /* * xfs_inode.c prototypes. */ -int xfs_ialloc(struct xfs_trans *, xfs_inode_t *, mode_t, +int xfs_ialloc(struct xfs_trans *, xfs_inode_t *, umode_t, xfs_nlink_t, xfs_dev_t, prid_t, int, struct xfs_buf **, boolean_t *, xfs_inode_t **); @@ -491,8 +513,6 @@ int xfs_ifree(struct xfs_trans *, xfs_inode_t *, struct xfs_bmap_free *); int xfs_itruncate_extents(struct xfs_trans **, struct xfs_inode *, int, xfs_fsize_t); -int xfs_itruncate_data(struct xfs_trans **, struct xfs_inode *, - xfs_fsize_t); int xfs_iunlink(struct xfs_trans *, xfs_inode_t *); void xfs_iext_realloc(xfs_inode_t *, int, int); diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index abaafdbb3e6..91d71dcd485 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c @@ -79,8 +79,6 @@ xfs_inode_item_size( break; case XFS_DINODE_FMT_BTREE: - ASSERT(ip->i_df.if_ext_max == - XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t)); iip->ili_format.ilf_fields &= ~(XFS_ILOG_DDATA | XFS_ILOG_DEXT | XFS_ILOG_DEV | XFS_ILOG_UUID); @@ -437,7 +435,6 @@ xfs_inode_item_format( * Assert that no attribute-related log flags are set. */ if (!XFS_IFORK_Q(ip)) { - ASSERT(nvecs == lip->li_desc->lid_size); iip->ili_format.ilf_size = nvecs; ASSERT(!(iip->ili_format.ilf_fields & (XFS_ILOG_ADATA | XFS_ILOG_ABROOT | XFS_ILOG_AEXT))); @@ -521,7 +518,6 @@ xfs_inode_item_format( break; } - ASSERT(nvecs == lip->li_desc->lid_size); iip->ili_format.ilf_size = nvecs; } @@ -559,7 +555,7 @@ xfs_inode_item_unpin( trace_xfs_inode_unpin(ip, _RET_IP_); ASSERT(atomic_read(&ip->i_pincount) > 0); if (atomic_dec_and_test(&ip->i_pincount)) - wake_up(&ip->i_ipin_wait); + wake_up_bit(&ip->i_flags, __XFS_IPINNED_BIT); } /* @@ -721,7 +717,7 @@ xfs_inode_item_pushbuf( * If a flush is not in progress anymore, chances are that the * inode was taken off the AIL. So, just get out. */ - if (completion_done(&ip->i_flush) || + if (!xfs_isiflocked(ip) || !(lip->li_flags & XFS_LI_IN_AIL)) { xfs_iunlock(ip, XFS_ILOCK_SHARED); return true; @@ -754,7 +750,7 @@ xfs_inode_item_push( struct xfs_inode *ip = iip->ili_inode; ASSERT(xfs_isilocked(ip, XFS_ILOCK_SHARED)); - ASSERT(!completion_done(&ip->i_flush)); + ASSERT(xfs_isiflocked(ip)); /* * Since we were able to lock the inode's flush lock and diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index d99a9051890..76f3ca5cfc3 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -559,23 +559,23 @@ xfs_attrmulti_by_handle( ops[i].am_flags); break; case ATTR_OP_SET: - ops[i].am_error = mnt_want_write(parfilp->f_path.mnt); + ops[i].am_error = mnt_want_write_file(parfilp); if (ops[i].am_error) break; ops[i].am_error = xfs_attrmulti_attr_set( dentry->d_inode, attr_name, ops[i].am_attrvalue, ops[i].am_length, ops[i].am_flags); - mnt_drop_write(parfilp->f_path.mnt); + mnt_drop_write_file(parfilp); break; case ATTR_OP_REMOVE: - ops[i].am_error = mnt_want_write(parfilp->f_path.mnt); + ops[i].am_error = mnt_want_write_file(parfilp); if (ops[i].am_error) break; ops[i].am_error = xfs_attrmulti_attr_remove( dentry->d_inode, attr_name, ops[i].am_flags); - mnt_drop_write(parfilp->f_path.mnt); + mnt_drop_write_file(parfilp); break; default: ops[i].am_error = EINVAL; diff --git a/fs/xfs/xfs_ioctl32.c b/fs/xfs/xfs_ioctl32.c index 54e623bfbb8..f9ccb7b7c04 100644 --- a/fs/xfs/xfs_ioctl32.c +++ b/fs/xfs/xfs_ioctl32.c @@ -454,23 +454,23 @@ xfs_compat_attrmulti_by_handle( &ops[i].am_length, ops[i].am_flags); break; case ATTR_OP_SET: - ops[i].am_error = mnt_want_write(parfilp->f_path.mnt); + ops[i].am_error = mnt_want_write_file(parfilp); if (ops[i].am_error) break; ops[i].am_error = xfs_attrmulti_attr_set( dentry->d_inode, attr_name, compat_ptr(ops[i].am_attrvalue), ops[i].am_length, ops[i].am_flags); - mnt_drop_write(parfilp->f_path.mnt); + mnt_drop_write_file(parfilp); break; case ATTR_OP_REMOVE: - ops[i].am_error = mnt_want_write(parfilp->f_path.mnt); + ops[i].am_error = mnt_want_write_file(parfilp); if (ops[i].am_error) break; ops[i].am_error = xfs_attrmulti_attr_remove( dentry->d_inode, attr_name, ops[i].am_flags); - mnt_drop_write(parfilp->f_path.mnt); + mnt_drop_write_file(parfilp); break; default: ops[i].am_error = EINVAL; diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 9afa282aa93..246c7d57c6f 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -57,26 +57,26 @@ xfs_iomap_eof_align_last_fsb( xfs_fileoff_t *last_fsb) { xfs_fileoff_t new_last_fsb = 0; - xfs_extlen_t align; + xfs_extlen_t align = 0; int eof, error; - if (XFS_IS_REALTIME_INODE(ip)) - ; - /* - * If mounted with the "-o swalloc" option, roundup the allocation - * request to a stripe width boundary if the file size is >= - * stripe width and we are allocating past the allocation eof. - */ - else if (mp->m_swidth && (mp->m_flags & XFS_MOUNT_SWALLOC) && - (ip->i_size >= XFS_FSB_TO_B(mp, mp->m_swidth))) - new_last_fsb = roundup_64(*last_fsb, mp->m_swidth); - /* - * Roundup the allocation request to a stripe unit (m_dalign) boundary - * if the file size is >= stripe unit size, and we are allocating past - * the allocation eof. - */ - else if (mp->m_dalign && (ip->i_size >= XFS_FSB_TO_B(mp, mp->m_dalign))) - new_last_fsb = roundup_64(*last_fsb, mp->m_dalign); + if (!XFS_IS_REALTIME_INODE(ip)) { + /* + * Round up the allocation request to a stripe unit + * (m_dalign) boundary if the file size is >= stripe unit + * size, and we are allocating past the allocation eof. + * + * If mounted with the "-o swalloc" option the alignment is + * increased from the strip unit size to the stripe width. + */ + if (mp->m_swidth && (mp->m_flags & XFS_MOUNT_SWALLOC)) + align = mp->m_swidth; + else if (mp->m_dalign) + align = mp->m_dalign; + + if (align && XFS_ISIZE(ip) >= XFS_FSB_TO_B(mp, align)) + new_last_fsb = roundup_64(*last_fsb, align); + } /* * Always round up the allocation request to an extent boundary @@ -154,7 +154,7 @@ xfs_iomap_write_direct( offset_fsb = XFS_B_TO_FSBT(mp, offset); last_fsb = XFS_B_TO_FSB(mp, ((xfs_ufsize_t)(offset + count))); - if ((offset + count) > ip->i_size) { + if ((offset + count) > XFS_ISIZE(ip)) { error = xfs_iomap_eof_align_last_fsb(mp, ip, extsz, &last_fsb); if (error) goto error_out; @@ -211,7 +211,7 @@ xfs_iomap_write_direct( xfs_trans_ijoin(tp, ip, 0); bmapi_flag = 0; - if (offset < ip->i_size || extsz) + if (offset < XFS_ISIZE(ip) || extsz) bmapi_flag |= XFS_BMAPI_PREALLOC; /* @@ -286,7 +286,7 @@ xfs_iomap_eof_want_preallocate( int found_delalloc = 0; *prealloc = 0; - if ((offset + count) <= ip->i_size) + if (offset + count <= XFS_ISIZE(ip)) return 0; /* @@ -340,7 +340,7 @@ xfs_iomap_prealloc_size( * if we pass in alloc_blocks = 0. Hence the "+ 1" to * ensure we always pass in a non-zero value. */ - alloc_blocks = XFS_B_TO_FSB(mp, ip->i_size) + 1; + alloc_blocks = XFS_B_TO_FSB(mp, XFS_ISIZE(ip)) + 1; alloc_blocks = XFS_FILEOFF_MIN(MAXEXTLEN, rounddown_pow_of_two(alloc_blocks)); @@ -564,7 +564,7 @@ xfs_iomap_write_allocate( * back.... */ nimaps = 1; - end_fsb = XFS_B_TO_FSB(mp, ip->i_size); + end_fsb = XFS_B_TO_FSB(mp, XFS_ISIZE(ip)); error = xfs_bmap_last_offset(NULL, ip, &last_block, XFS_DATA_FORK); if (error) diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index 23ce927973a..ab302539e5b 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -168,7 +168,7 @@ STATIC int xfs_vn_mknod( struct inode *dir, struct dentry *dentry, - int mode, + umode_t mode, dev_t rdev) { struct inode *inode; @@ -231,7 +231,7 @@ STATIC int xfs_vn_create( struct inode *dir, struct dentry *dentry, - int mode, + umode_t mode, struct nameidata *nd) { return xfs_vn_mknod(dir, dentry, mode, 0); @@ -241,7 +241,7 @@ STATIC int xfs_vn_mkdir( struct inode *dir, struct dentry *dentry, - int mode) + umode_t mode) { return xfs_vn_mknod(dir, dentry, mode|S_IFDIR, 0); } @@ -366,7 +366,7 @@ xfs_vn_symlink( struct xfs_inode *cip = NULL; struct xfs_name name; int error; - mode_t mode; + umode_t mode; mode = S_IFLNK | (irix_symlink_mode ? 0777 & ~current_umask() : S_IRWXUGO); @@ -750,6 +750,7 @@ xfs_setattr_size( struct xfs_mount *mp = ip->i_mount; struct inode *inode = VFS_I(ip); int mask = iattr->ia_valid; + xfs_off_t oldsize, newsize; struct xfs_trans *tp; int error; uint lock_flags; @@ -777,11 +778,13 @@ xfs_setattr_size( lock_flags |= XFS_IOLOCK_EXCL; xfs_ilock(ip, lock_flags); + oldsize = inode->i_size; + newsize = iattr->ia_size; + /* * Short circuit the truncate case for zero length files. */ - if (iattr->ia_size == 0 && - ip->i_size == 0 && ip->i_d.di_nextents == 0) { + if (newsize == 0 && oldsize == 0 && ip->i_d.di_nextents == 0) { if (!(mask & (ATTR_CTIME|ATTR_MTIME))) goto out_unlock; @@ -807,14 +810,14 @@ xfs_setattr_size( * the inode to the transaction, because the inode cannot be unlocked * once it is a part of the transaction. */ - if (iattr->ia_size > ip->i_size) { + if (newsize > oldsize) { /* * Do the first part of growing a file: zero any data in the * last block that is beyond the old EOF. We need to do this * before the inode is joined to the transaction to modify * i_size. */ - error = xfs_zero_eof(ip, iattr->ia_size, ip->i_size); + error = xfs_zero_eof(ip, newsize, oldsize); if (error) goto out_unlock; } @@ -833,8 +836,8 @@ xfs_setattr_size( * here and prevents waiting for other data not within the range we * care about here. */ - if (ip->i_size != ip->i_d.di_size && iattr->ia_size > ip->i_d.di_size) { - error = xfs_flush_pages(ip, ip->i_d.di_size, iattr->ia_size, 0, + if (oldsize != ip->i_d.di_size && newsize > ip->i_d.di_size) { + error = xfs_flush_pages(ip, ip->i_d.di_size, newsize, 0, FI_NONE); if (error) goto out_unlock; @@ -845,8 +848,7 @@ xfs_setattr_size( */ inode_dio_wait(inode); - error = -block_truncate_page(inode->i_mapping, iattr->ia_size, - xfs_get_blocks); + error = -block_truncate_page(inode->i_mapping, newsize, xfs_get_blocks); if (error) goto out_unlock; @@ -857,7 +859,7 @@ xfs_setattr_size( if (error) goto out_trans_cancel; - truncate_setsize(inode, iattr->ia_size); + truncate_setsize(inode, newsize); commit_flags = XFS_TRANS_RELEASE_LOG_RES; lock_flags |= XFS_ILOCK_EXCL; @@ -876,19 +878,29 @@ xfs_setattr_size( * these flags set. For all other operations the VFS set these flags * explicitly if it wants a timestamp update. */ - if (iattr->ia_size != ip->i_size && - (!(mask & (ATTR_CTIME | ATTR_MTIME)))) { + if (newsize != oldsize && (!(mask & (ATTR_CTIME | ATTR_MTIME)))) { iattr->ia_ctime = iattr->ia_mtime = current_fs_time(inode->i_sb); mask |= ATTR_CTIME | ATTR_MTIME; } - if (iattr->ia_size > ip->i_size) { - ip->i_d.di_size = iattr->ia_size; - ip->i_size = iattr->ia_size; - } else if (iattr->ia_size <= ip->i_size || - (iattr->ia_size == 0 && ip->i_d.di_nextents)) { - error = xfs_itruncate_data(&tp, ip, iattr->ia_size); + /* + * The first thing we do is set the size to new_size permanently on + * disk. This way we don't have to worry about anyone ever being able + * to look at the data being freed even in the face of a crash. + * What we're getting around here is the case where we free a block, it + * is allocated to another file, it is written to, and then we crash. + * If the new data gets written to the file but the log buffers + * containing the free and reallocation don't, then we'd end up with + * garbage in the blocks being freed. As long as we make the new size + * permanent before actually freeing any blocks it doesn't matter if + * they get written to. + */ + ip->i_d.di_size = newsize; + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); + + if (newsize <= oldsize) { + error = xfs_itruncate_extents(&tp, ip, XFS_DATA_FORK, newsize); if (error) goto out_trans_abort; diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 34817adf4b9..e2cc3568c29 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -760,38 +760,6 @@ xfs_log_item_init( INIT_LIST_HEAD(&item->li_cil); } -/* - * Write region vectors to log. The write happens using the space reservation - * of the ticket (tic). It is not a requirement that all writes for a given - * transaction occur with one call to xfs_log_write(). However, it is important - * to note that the transaction reservation code makes an assumption about the - * number of log headers a transaction requires that may be violated if you - * don't pass all the transaction vectors in one call.... - */ -int -xfs_log_write( - struct xfs_mount *mp, - struct xfs_log_iovec reg[], - int nentries, - struct xlog_ticket *tic, - xfs_lsn_t *start_lsn) -{ - struct log *log = mp->m_log; - int error; - struct xfs_log_vec vec = { - .lv_niovecs = nentries, - .lv_iovecp = reg, - }; - - if (XLOG_FORCED_SHUTDOWN(log)) - return XFS_ERROR(EIO); - - error = xlog_write(log, &vec, tic, start_lsn, NULL, 0); - if (error) - xfs_force_shutdown(mp, SHUTDOWN_LOG_IO_ERROR); - return error; -} - void xfs_log_move_tail(xfs_mount_t *mp, xfs_lsn_t tail_lsn) @@ -1685,7 +1653,7 @@ xlog_print_tic_res( }; xfs_warn(mp, - "xfs_log_write: reservation summary:\n" + "xlog_write: reservation summary:\n" " trans type = %s (%u)\n" " unit res = %d bytes\n" " current res = %d bytes\n" @@ -1714,7 +1682,7 @@ xlog_print_tic_res( } xfs_alert_tag(mp, XFS_PTAG_LOGRES, - "xfs_log_write: reservation ran out. Need to up reservation"); + "xlog_write: reservation ran out. Need to up reservation"); xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); } @@ -1968,23 +1936,21 @@ xlog_write( *start_lsn = 0; len = xlog_write_calc_vec_length(ticket, log_vector); - if (log->l_cilp) { - /* - * Region headers and bytes are already accounted for. - * We only need to take into account start records and - * split regions in this function. - */ - if (ticket->t_flags & XLOG_TIC_INITED) - ticket->t_curr_res -= sizeof(xlog_op_header_t); - /* - * Commit record headers need to be accounted for. These - * come in as separate writes so are easy to detect. - */ - if (flags & (XLOG_COMMIT_TRANS | XLOG_UNMOUNT_TRANS)) - ticket->t_curr_res -= sizeof(xlog_op_header_t); - } else - ticket->t_curr_res -= len; + /* + * Region headers and bytes are already accounted for. + * We only need to take into account start records and + * split regions in this function. + */ + if (ticket->t_flags & XLOG_TIC_INITED) + ticket->t_curr_res -= sizeof(xlog_op_header_t); + + /* + * Commit record headers need to be accounted for. These + * come in as separate writes so are easy to detect. + */ + if (flags & (XLOG_COMMIT_TRANS | XLOG_UNMOUNT_TRANS)) + ticket->t_curr_res -= sizeof(xlog_op_header_t); if (ticket->t_curr_res < 0) xlog_print_tic_res(log->l_mp, ticket); @@ -2931,8 +2897,7 @@ _xfs_log_force( XFS_STATS_INC(xs_log_force); - if (log->l_cilp) - xlog_cil_force(log); + xlog_cil_force(log); spin_lock(&log->l_icloglock); @@ -3081,11 +3046,9 @@ _xfs_log_force_lsn( XFS_STATS_INC(xs_log_force); - if (log->l_cilp) { - lsn = xlog_cil_force_lsn(log, lsn); - if (lsn == NULLCOMMITLSN) - return 0; - } + lsn = xlog_cil_force_lsn(log, lsn); + if (lsn == NULLCOMMITLSN) + return 0; try_again: spin_lock(&log->l_icloglock); @@ -3653,7 +3616,7 @@ xfs_log_force_umount( * completed transactions are flushed to disk with the xfs_log_force() * call below. */ - if (!logerror && (mp->m_flags & XFS_MOUNT_DELAYLOG)) + if (!logerror) xlog_cil_force(log); /* diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h index 3f7bf451c03..2aee3b22d29 100644 --- a/fs/xfs/xfs_log.h +++ b/fs/xfs/xfs_log.h @@ -174,11 +174,6 @@ int xfs_log_reserve(struct xfs_mount *mp, __uint8_t clientid, uint flags, uint t_type); -int xfs_log_write(struct xfs_mount *mp, - xfs_log_iovec_t region[], - int nentries, - struct xlog_ticket *ticket, - xfs_lsn_t *start_lsn); int xfs_log_unmount_write(struct xfs_mount *mp); void xfs_log_unmount(struct xfs_mount *mp); int xfs_log_force_umount(struct xfs_mount *mp, int logerror); @@ -189,8 +184,7 @@ void xlog_iodone(struct xfs_buf *); struct xlog_ticket *xfs_log_ticket_get(struct xlog_ticket *ticket); void xfs_log_ticket_put(struct xlog_ticket *ticket); -void xfs_log_commit_cil(struct xfs_mount *mp, struct xfs_trans *tp, - struct xfs_log_vec *log_vector, +int xfs_log_commit_cil(struct xfs_mount *mp, struct xfs_trans *tp, xfs_lsn_t *commit_lsn, int flags); bool xfs_log_item_in_current_chkpt(struct xfs_log_item *lip); diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c index c7755d5a5fb..d4fadbe8ac9 100644 --- a/fs/xfs/xfs_log_cil.c +++ b/fs/xfs/xfs_log_cil.c @@ -32,10 +32,7 @@ #include "xfs_discard.h" /* - * Perform initial CIL structure initialisation. If the CIL is not - * enabled in this filesystem, ensure the log->l_cilp is null so - * we can check this conditional to determine if we are doing delayed - * logging or not. + * Perform initial CIL structure initialisation. */ int xlog_cil_init( @@ -44,10 +41,6 @@ xlog_cil_init( struct xfs_cil *cil; struct xfs_cil_ctx *ctx; - log->l_cilp = NULL; - if (!(log->l_mp->m_flags & XFS_MOUNT_DELAYLOG)) - return 0; - cil = kmem_zalloc(sizeof(*cil), KM_SLEEP|KM_MAYFAIL); if (!cil) return ENOMEM; @@ -80,9 +73,6 @@ void xlog_cil_destroy( struct log *log) { - if (!log->l_cilp) - return; - if (log->l_cilp->xc_ctx) { if (log->l_cilp->xc_ctx->ticket) xfs_log_ticket_put(log->l_cilp->xc_ctx->ticket); @@ -137,9 +127,6 @@ void xlog_cil_init_post_recovery( struct log *log) { - if (!log->l_cilp) - return; - log->l_cilp->xc_ctx->ticket = xlog_cil_ticket_alloc(log); log->l_cilp->xc_ctx->sequence = 1; log->l_cilp->xc_ctx->commit_lsn = xlog_assign_lsn(log->l_curr_cycle, @@ -172,37 +159,73 @@ xlog_cil_init_post_recovery( * format the regions into the iclog as though they are being formatted * directly out of the objects themselves. */ -static void -xlog_cil_format_items( - struct log *log, - struct xfs_log_vec *log_vector) +static struct xfs_log_vec * +xlog_cil_prepare_log_vecs( + struct xfs_trans *tp) { - struct xfs_log_vec *lv; + struct xfs_log_item_desc *lidp; + struct xfs_log_vec *lv = NULL; + struct xfs_log_vec *ret_lv = NULL; - ASSERT(log_vector); - for (lv = log_vector; lv; lv = lv->lv_next) { + + /* Bail out if we didn't find a log item. */ + if (list_empty(&tp->t_items)) { + ASSERT(0); + return NULL; + } + + list_for_each_entry(lidp, &tp->t_items, lid_trans) { + struct xfs_log_vec *new_lv; void *ptr; int index; int len = 0; + uint niovecs; + + /* Skip items which aren't dirty in this transaction. */ + if (!(lidp->lid_flags & XFS_LID_DIRTY)) + continue; + + /* Skip items that do not have any vectors for writing */ + niovecs = IOP_SIZE(lidp->lid_item); + if (!niovecs) + continue; + + new_lv = kmem_zalloc(sizeof(*new_lv) + + niovecs * sizeof(struct xfs_log_iovec), + KM_SLEEP); + + /* The allocated iovec region lies beyond the log vector. */ + new_lv->lv_iovecp = (struct xfs_log_iovec *)&new_lv[1]; + new_lv->lv_niovecs = niovecs; + new_lv->lv_item = lidp->lid_item; /* build the vector array and calculate it's length */ - IOP_FORMAT(lv->lv_item, lv->lv_iovecp); - for (index = 0; index < lv->lv_niovecs; index++) - len += lv->lv_iovecp[index].i_len; + IOP_FORMAT(new_lv->lv_item, new_lv->lv_iovecp); + for (index = 0; index < new_lv->lv_niovecs; index++) + len += new_lv->lv_iovecp[index].i_len; - lv->lv_buf_len = len; - lv->lv_buf = kmem_alloc(lv->lv_buf_len, KM_SLEEP|KM_NOFS); - ptr = lv->lv_buf; + new_lv->lv_buf_len = len; + new_lv->lv_buf = kmem_alloc(new_lv->lv_buf_len, + KM_SLEEP|KM_NOFS); + ptr = new_lv->lv_buf; - for (index = 0; index < lv->lv_niovecs; index++) { - struct xfs_log_iovec *vec = &lv->lv_iovecp[index]; + for (index = 0; index < new_lv->lv_niovecs; index++) { + struct xfs_log_iovec *vec = &new_lv->lv_iovecp[index]; memcpy(ptr, vec->i_addr, vec->i_len); vec->i_addr = ptr; ptr += vec->i_len; } - ASSERT(ptr == lv->lv_buf + lv->lv_buf_len); + ASSERT(ptr == new_lv->lv_buf + new_lv->lv_buf_len); + + if (!ret_lv) + ret_lv = new_lv; + else + lv->lv_next = new_lv; + lv = new_lv; } + + return ret_lv; } /* @@ -256,7 +279,7 @@ xfs_cil_prepare_item( * Insert the log items into the CIL and calculate the difference in space * consumed by the item. Add the space to the checkpoint ticket and calculate * if the change requires additional log metadata. If it does, take that space - * as well. Remove the amount of space we addded to the checkpoint ticket from + * as well. Remove the amount of space we added to the checkpoint ticket from * the current transaction ticket so that the accounting works out correctly. */ static void @@ -635,28 +658,30 @@ out_abort: * background commit, returns without it held once background commits are * allowed again. */ -void +int xfs_log_commit_cil( struct xfs_mount *mp, struct xfs_trans *tp, - struct xfs_log_vec *log_vector, xfs_lsn_t *commit_lsn, int flags) { struct log *log = mp->m_log; int log_flags = 0; int push = 0; + struct xfs_log_vec *log_vector; if (flags & XFS_TRANS_RELEASE_LOG_RES) log_flags = XFS_LOG_REL_PERM_RESERV; /* - * do all the hard work of formatting items (including memory + * Do all the hard work of formatting items (including memory * allocation) outside the CIL context lock. This prevents stalling CIL * pushes when we are low on memory and a transaction commit spends a * lot of time in memory reclaim. */ - xlog_cil_format_items(log, log_vector); + log_vector = xlog_cil_prepare_log_vecs(tp); + if (!log_vector) + return ENOMEM; /* lock out background commit */ down_read(&log->l_cilp->xc_ctx_lock); @@ -709,6 +734,7 @@ xfs_log_commit_cil( */ if (push) xlog_cil_push(log, 0); + return 0; } /* @@ -786,8 +812,6 @@ xfs_log_item_in_current_chkpt( { struct xfs_cil_ctx *ctx; - if (!(lip->li_mountp->m_flags & XFS_MOUNT_DELAYLOG)) - return false; if (list_empty(&lip->li_cil)) return false; diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index bb24dac42a2..19f69e23250 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -219,7 +219,6 @@ typedef struct xfs_mount { #define XFS_MOUNT_WSYNC (1ULL << 0) /* for nfs - all metadata ops must be synchronous except for space allocations */ -#define XFS_MOUNT_DELAYLOG (1ULL << 1) /* delayed logging is enabled */ #define XFS_MOUNT_WAS_CLEAN (1ULL << 3) #define XFS_MOUNT_FS_SHUTDOWN (1ULL << 4) /* atomic stop of all filesystem operations, typically for diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index 0bbb1a41998..671f37eae1c 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c @@ -154,12 +154,17 @@ STATIC void xfs_qm_destroy( struct xfs_qm *xqm) { - struct xfs_dquot *dqp, *n; int hsize, i; ASSERT(xqm != NULL); ASSERT(xqm->qm_nrefs == 0); + unregister_shrinker(&xfs_qm_shaker); + + mutex_lock(&xqm->qm_dqfrlist_lock); + ASSERT(list_empty(&xqm->qm_dqfrlist)); + mutex_unlock(&xqm->qm_dqfrlist_lock); + hsize = xqm->qm_dqhashmask + 1; for (i = 0; i < hsize; i++) { xfs_qm_list_destroy(&(xqm->qm_usr_dqhtable[i])); @@ -171,17 +176,6 @@ xfs_qm_destroy( xqm->qm_grp_dqhtable = NULL; xqm->qm_dqhashmask = 0; - /* frlist cleanup */ - mutex_lock(&xqm->qm_dqfrlist_lock); - list_for_each_entry_safe(dqp, n, &xqm->qm_dqfrlist, q_freelist) { - xfs_dqlock(dqp); - list_del_init(&dqp->q_freelist); - xfs_Gqm->qm_dqfrlist_cnt--; - xfs_dqunlock(dqp); - xfs_qm_dqdestroy(dqp); - } - mutex_unlock(&xqm->qm_dqfrlist_lock); - mutex_destroy(&xqm->qm_dqfrlist_lock); kmem_free(xqm); } @@ -232,34 +226,10 @@ STATIC void xfs_qm_rele_quotafs_ref( struct xfs_mount *mp) { - xfs_dquot_t *dqp, *n; - ASSERT(xfs_Gqm); ASSERT(xfs_Gqm->qm_nrefs > 0); /* - * Go thru the freelist and destroy all inactive dquots. - */ - mutex_lock(&xfs_Gqm->qm_dqfrlist_lock); - - list_for_each_entry_safe(dqp, n, &xfs_Gqm->qm_dqfrlist, q_freelist) { - xfs_dqlock(dqp); - if (dqp->dq_flags & XFS_DQ_INACTIVE) { - ASSERT(dqp->q_mount == NULL); - ASSERT(! XFS_DQ_IS_DIRTY(dqp)); - ASSERT(list_empty(&dqp->q_hashlist)); - ASSERT(list_empty(&dqp->q_mplist)); - list_del_init(&dqp->q_freelist); - xfs_Gqm->qm_dqfrlist_cnt--; - xfs_dqunlock(dqp); - xfs_qm_dqdestroy(dqp); - } else { - xfs_dqunlock(dqp); - } - } - mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock); - - /* * Destroy the entire XQM. If somebody mounts with quotaon, this'll * be restarted. */ @@ -415,8 +385,7 @@ xfs_qm_unmount_quotas( */ STATIC int xfs_qm_dqflush_all( - struct xfs_mount *mp, - int sync_mode) + struct xfs_mount *mp) { struct xfs_quotainfo *q = mp->m_quotainfo; int recl; @@ -429,7 +398,8 @@ again: mutex_lock(&q->qi_dqlist_lock); list_for_each_entry(dqp, &q->qi_dqlist, q_mplist) { xfs_dqlock(dqp); - if (! XFS_DQ_IS_DIRTY(dqp)) { + if ((dqp->dq_flags & XFS_DQ_FREEING) || + !XFS_DQ_IS_DIRTY(dqp)) { xfs_dqunlock(dqp); continue; } @@ -444,14 +414,14 @@ again: * out immediately. We'll be able to acquire * the flush lock when the I/O completes. */ - xfs_qm_dqflock_pushbuf_wait(dqp); + xfs_dqflock_pushbuf_wait(dqp); } /* * Let go of the mplist lock. We don't want to hold it * across a disk write. */ mutex_unlock(&q->qi_dqlist_lock); - error = xfs_qm_dqflush(dqp, sync_mode); + error = xfs_qm_dqflush(dqp, 0); xfs_dqunlock(dqp); if (error) return error; @@ -468,6 +438,7 @@ again: /* return ! busy */ return 0; } + /* * Release the group dquot pointers the user dquots may be * carrying around as a hint. mplist is locked on entry and exit. @@ -478,31 +449,26 @@ xfs_qm_detach_gdquots( { struct xfs_quotainfo *q = mp->m_quotainfo; struct xfs_dquot *dqp, *gdqp; - int nrecl; again: ASSERT(mutex_is_locked(&q->qi_dqlist_lock)); list_for_each_entry(dqp, &q->qi_dqlist, q_mplist) { xfs_dqlock(dqp); - if ((gdqp = dqp->q_gdquot)) { - xfs_dqlock(gdqp); - dqp->q_gdquot = NULL; - } - xfs_dqunlock(dqp); - - if (gdqp) { - /* - * Can't hold the mplist lock across a dqput. - * XXXmust convert to marker based iterations here. - */ - nrecl = q->qi_dqreclaims; + if (dqp->dq_flags & XFS_DQ_FREEING) { + xfs_dqunlock(dqp); mutex_unlock(&q->qi_dqlist_lock); - xfs_qm_dqput(gdqp); - + delay(1); mutex_lock(&q->qi_dqlist_lock); - if (nrecl != q->qi_dqreclaims) - goto again; + goto again; } + + gdqp = dqp->q_gdquot; + if (gdqp) + dqp->q_gdquot = NULL; + xfs_dqunlock(dqp); + + if (gdqp) + xfs_qm_dqrele(gdqp); } } @@ -520,8 +486,8 @@ xfs_qm_dqpurge_int( struct xfs_quotainfo *q = mp->m_quotainfo; struct xfs_dquot *dqp, *n; uint dqtype; - int nrecl; - int nmisses; + int nmisses = 0; + LIST_HEAD (dispose_list); if (!q) return 0; @@ -540,47 +506,26 @@ xfs_qm_dqpurge_int( */ xfs_qm_detach_gdquots(mp); - again: - nmisses = 0; - ASSERT(mutex_is_locked(&q->qi_dqlist_lock)); /* - * Try to get rid of all of the unwanted dquots. The idea is to - * get them off mplist and hashlist, but leave them on freelist. + * Try to get rid of all of the unwanted dquots. */ list_for_each_entry_safe(dqp, n, &q->qi_dqlist, q_mplist) { - /* - * It's OK to look at the type without taking dqlock here. - * We're holding the mplist lock here, and that's needed for - * a dqreclaim. - */ - if ((dqp->dq_flags & dqtype) == 0) - continue; - - if (!mutex_trylock(&dqp->q_hash->qh_lock)) { - nrecl = q->qi_dqreclaims; - mutex_unlock(&q->qi_dqlist_lock); - mutex_lock(&dqp->q_hash->qh_lock); - mutex_lock(&q->qi_dqlist_lock); - - /* - * XXXTheoretically, we can get into a very long - * ping pong game here. - * No one can be adding dquots to the mplist at - * this point, but somebody might be taking things off. - */ - if (nrecl != q->qi_dqreclaims) { - mutex_unlock(&dqp->q_hash->qh_lock); - goto again; - } + xfs_dqlock(dqp); + if ((dqp->dq_flags & dqtype) != 0 && + !(dqp->dq_flags & XFS_DQ_FREEING)) { + if (dqp->q_nrefs == 0) { + dqp->dq_flags |= XFS_DQ_FREEING; + list_move_tail(&dqp->q_mplist, &dispose_list); + } else + nmisses++; } - - /* - * Take the dquot off the mplist and hashlist. It may remain on - * freelist in INACTIVE state. - */ - nmisses += xfs_qm_dqpurge(dqp); + xfs_dqunlock(dqp); } mutex_unlock(&q->qi_dqlist_lock); + + list_for_each_entry_safe(dqp, n, &dispose_list, q_mplist) + xfs_qm_dqpurge(dqp); + return nmisses; } @@ -648,12 +593,9 @@ xfs_qm_dqattach_one( */ dqp = udqhint->q_gdquot; if (dqp && be32_to_cpu(dqp->q_core.d_id) == id) { - xfs_dqlock(dqp); - XFS_DQHOLD(dqp); ASSERT(*IO_idqpp == NULL); - *IO_idqpp = dqp; - xfs_dqunlock(dqp); + *IO_idqpp = xfs_qm_dqhold(dqp); xfs_dqunlock(udqhint); return 0; } @@ -693,11 +635,7 @@ xfs_qm_dqattach_one( /* * Given a udquot and gdquot, attach a ptr to the group dquot in the - * udquot as a hint for future lookups. The idea sounds simple, but the - * execution isn't, because the udquot might have a group dquot attached - * already and getting rid of that gets us into lock ordering constraints. - * The process is complicated more by the fact that the dquots may or may not - * be locked on entry. + * udquot as a hint for future lookups. */ STATIC void xfs_qm_dqattach_grouphint( @@ -708,45 +646,17 @@ xfs_qm_dqattach_grouphint( xfs_dqlock(udq); - if ((tmp = udq->q_gdquot)) { - if (tmp == gdq) { - xfs_dqunlock(udq); - return; - } + tmp = udq->q_gdquot; + if (tmp) { + if (tmp == gdq) + goto done; udq->q_gdquot = NULL; - /* - * We can't keep any dqlocks when calling dqrele, - * because the freelist lock comes before dqlocks. - */ - xfs_dqunlock(udq); - /* - * we took a hard reference once upon a time in dqget, - * so give it back when the udquot no longer points at it - * dqput() does the unlocking of the dquot. - */ xfs_qm_dqrele(tmp); - - xfs_dqlock(udq); - xfs_dqlock(gdq); - - } else { - ASSERT(XFS_DQ_IS_LOCKED(udq)); - xfs_dqlock(gdq); - } - - ASSERT(XFS_DQ_IS_LOCKED(udq)); - ASSERT(XFS_DQ_IS_LOCKED(gdq)); - /* - * Somebody could have attached a gdquot here, - * when we dropped the uqlock. If so, just do nothing. - */ - if (udq->q_gdquot == NULL) { - XFS_DQHOLD(gdq); - udq->q_gdquot = gdq; } - xfs_dqunlock(gdq); + udq->q_gdquot = xfs_qm_dqhold(gdq); +done: xfs_dqunlock(udq); } @@ -813,17 +723,13 @@ xfs_qm_dqattach_locked( ASSERT(ip->i_gdquot); /* - * We may or may not have the i_udquot locked at this point, - * but this check is OK since we don't depend on the i_gdquot to - * be accurate 100% all the time. It is just a hint, and this - * will succeed in general. - */ - if (ip->i_udquot->q_gdquot == ip->i_gdquot) - goto done; - /* - * Attach i_gdquot to the gdquot hint inside the i_udquot. + * We do not have i_udquot locked at this point, but this check + * is OK since we don't depend on the i_gdquot to be accurate + * 100% all the time. It is just a hint, and this will + * succeed in general. */ - xfs_qm_dqattach_grouphint(ip->i_udquot, ip->i_gdquot); + if (ip->i_udquot->q_gdquot != ip->i_gdquot) + xfs_qm_dqattach_grouphint(ip->i_udquot, ip->i_gdquot); } done: @@ -879,100 +785,6 @@ xfs_qm_dqdetach( } } -int -xfs_qm_sync( - struct xfs_mount *mp, - int flags) -{ - struct xfs_quotainfo *q = mp->m_quotainfo; - int recl, restarts; - struct xfs_dquot *dqp; - int error; - - if (!XFS_IS_QUOTA_RUNNING(mp) || !XFS_IS_QUOTA_ON(mp)) - return 0; - - restarts = 0; - - again: - mutex_lock(&q->qi_dqlist_lock); - /* - * dqpurge_all() also takes the mplist lock and iterate thru all dquots - * in quotaoff. However, if the QUOTA_ACTIVE bits are not cleared - * when we have the mplist lock, we know that dquots will be consistent - * as long as we have it locked. - */ - if (!XFS_IS_QUOTA_ON(mp)) { - mutex_unlock(&q->qi_dqlist_lock); - return 0; - } - ASSERT(mutex_is_locked(&q->qi_dqlist_lock)); - list_for_each_entry(dqp, &q->qi_dqlist, q_mplist) { - /* - * If this is vfs_sync calling, then skip the dquots that - * don't 'seem' to be dirty. ie. don't acquire dqlock. - * This is very similar to what xfs_sync does with inodes. - */ - if (flags & SYNC_TRYLOCK) { - if (!XFS_DQ_IS_DIRTY(dqp)) - continue; - if (!xfs_qm_dqlock_nowait(dqp)) - continue; - } else { - xfs_dqlock(dqp); - } - - /* - * Now, find out for sure if this dquot is dirty or not. - */ - if (! XFS_DQ_IS_DIRTY(dqp)) { - xfs_dqunlock(dqp); - continue; - } - - /* XXX a sentinel would be better */ - recl = q->qi_dqreclaims; - if (!xfs_dqflock_nowait(dqp)) { - if (flags & SYNC_TRYLOCK) { - xfs_dqunlock(dqp); - continue; - } - /* - * If we can't grab the flush lock then if the caller - * really wanted us to give this our best shot, so - * see if we can give a push to the buffer before we wait - * on the flush lock. At this point, we know that - * even though the dquot is being flushed, - * it has (new) dirty data. - */ - xfs_qm_dqflock_pushbuf_wait(dqp); - } - /* - * Let go of the mplist lock. We don't want to hold it - * across a disk write - */ - mutex_unlock(&q->qi_dqlist_lock); - error = xfs_qm_dqflush(dqp, flags); - xfs_dqunlock(dqp); - if (error && XFS_FORCED_SHUTDOWN(mp)) - return 0; /* Need to prevent umount failure */ - else if (error) - return error; - - mutex_lock(&q->qi_dqlist_lock); - if (recl != q->qi_dqreclaims) { - if (++restarts >= XFS_QM_SYNC_MAX_RESTARTS) - break; - - mutex_unlock(&q->qi_dqlist_lock); - goto again; - } - } - - mutex_unlock(&q->qi_dqlist_lock); - return 0; -} - /* * The hash chains and the mplist use the same xfs_dqhash structure as * their list head, but we can take the mplist qh_lock and one of the @@ -1034,18 +846,21 @@ xfs_qm_init_quotainfo( /* * We try to get the limits from the superuser's limits fields. * This is quite hacky, but it is standard quota practice. + * * We look at the USR dquot with id == 0 first, but if user quotas * are not enabled we goto the GRP dquot with id == 0. * We don't really care to keep separate default limits for user * and group quotas, at least not at this point. + * + * Since we may not have done a quotacheck by this point, just read + * the dquot without attaching it to any hashtables or lists. */ - error = xfs_qm_dqget(mp, NULL, (xfs_dqid_t)0, - XFS_IS_UQUOTA_RUNNING(mp) ? XFS_DQ_USER : - (XFS_IS_GQUOTA_RUNNING(mp) ? XFS_DQ_GROUP : - XFS_DQ_PROJ), - XFS_QMOPT_DQSUSER|XFS_QMOPT_DOWARN, - &dqp); - if (! error) { + error = xfs_qm_dqread(mp, 0, + XFS_IS_UQUOTA_RUNNING(mp) ? XFS_DQ_USER : + (XFS_IS_GQUOTA_RUNNING(mp) ? XFS_DQ_GROUP : + XFS_DQ_PROJ), + XFS_QMOPT_DOWARN, &dqp); + if (!error) { xfs_disk_dquot_t *ddqp = &dqp->q_core; /* @@ -1072,11 +887,6 @@ xfs_qm_init_quotainfo( qinf->qi_rtbhardlimit = be64_to_cpu(ddqp->d_rtb_hardlimit); qinf->qi_rtbsoftlimit = be64_to_cpu(ddqp->d_rtb_softlimit); - /* - * We sent the XFS_QMOPT_DQSUSER flag to dqget because - * we don't want this dquot cached. We haven't done a - * quotacheck yet, and quotacheck doesn't like incore dquots. - */ xfs_qm_dqdestroy(dqp); } else { qinf->qi_btimelimit = XFS_QM_BTIMELIMIT; @@ -1661,7 +1471,7 @@ xfs_qm_quotacheck( * successfully. */ if (!error) - error = xfs_qm_dqflush_all(mp, 0); + error = xfs_qm_dqflush_all(mp); /* * We can get this error if we couldn't do a dquot allocation inside @@ -1793,59 +1603,33 @@ xfs_qm_init_quotainos( /* - * Just pop the least recently used dquot off the freelist and - * recycle it. The returned dquot is locked. + * Pop the least recently used dquot off the freelist and recycle it. */ -STATIC xfs_dquot_t * +STATIC struct xfs_dquot * xfs_qm_dqreclaim_one(void) { - xfs_dquot_t *dqpout; - xfs_dquot_t *dqp; - int restarts; - int startagain; - - restarts = 0; - dqpout = NULL; + struct xfs_dquot *dqp; + int restarts = 0; - /* lockorder: hashchainlock, freelistlock, mplistlock, dqlock, dqflock */ -again: - startagain = 0; mutex_lock(&xfs_Gqm->qm_dqfrlist_lock); - +restart: list_for_each_entry(dqp, &xfs_Gqm->qm_dqfrlist, q_freelist) { struct xfs_mount *mp = dqp->q_mount; - xfs_dqlock(dqp); + + if (!xfs_dqlock_nowait(dqp)) + continue; /* - * We are racing with dqlookup here. Naturally we don't - * want to reclaim a dquot that lookup wants. We release the - * freelist lock and start over, so that lookup will grab - * both the dquot and the freelistlock. + * This dquot has already been grabbed by dqlookup. + * Remove it from the freelist and try again. */ - if (dqp->dq_flags & XFS_DQ_WANT) { - ASSERT(! (dqp->dq_flags & XFS_DQ_INACTIVE)); - + if (dqp->q_nrefs) { trace_xfs_dqreclaim_want(dqp); XQM_STATS_INC(xqmstats.xs_qm_dqwants); - restarts++; - startagain = 1; - goto dqunlock; - } - /* - * If the dquot is inactive, we are assured that it is - * not on the mplist or the hashlist, and that makes our - * life easier. - */ - if (dqp->dq_flags & XFS_DQ_INACTIVE) { - ASSERT(mp == NULL); - ASSERT(! XFS_DQ_IS_DIRTY(dqp)); - ASSERT(list_empty(&dqp->q_hashlist)); - ASSERT(list_empty(&dqp->q_mplist)); list_del_init(&dqp->q_freelist); xfs_Gqm->qm_dqfrlist_cnt--; - dqpout = dqp; - XQM_STATS_INC(xqmstats.xs_qm_dqinact_reclaims); + restarts++; goto dqunlock; } @@ -1874,64 +1658,49 @@ again: * We flush it delayed write, so don't bother * releasing the freelist lock. */ - error = xfs_qm_dqflush(dqp, 0); + error = xfs_qm_dqflush(dqp, SYNC_TRYLOCK); if (error) { xfs_warn(mp, "%s: dquot %p flush failed", __func__, dqp); } goto dqunlock; } + xfs_dqfunlock(dqp); /* - * We're trying to get the hashlock out of order. This races - * with dqlookup; so, we giveup and goto the next dquot if - * we couldn't get the hashlock. This way, we won't starve - * a dqlookup process that holds the hashlock that is - * waiting for the freelist lock. + * Prevent lookup now that we are going to reclaim the dquot. + * Once XFS_DQ_FREEING is set lookup won't touch the dquot, + * thus we can drop the lock now. */ - if (!mutex_trylock(&dqp->q_hash->qh_lock)) { - restarts++; - goto dqfunlock; - } + dqp->dq_flags |= XFS_DQ_FREEING; + xfs_dqunlock(dqp); - /* - * This races with dquot allocation code as well as dqflush_all - * and reclaim code. So, if we failed to grab the mplist lock, - * giveup everything and start over. - */ - if (!mutex_trylock(&mp->m_quotainfo->qi_dqlist_lock)) { - restarts++; - startagain = 1; - goto qhunlock; - } + mutex_lock(&dqp->q_hash->qh_lock); + list_del_init(&dqp->q_hashlist); + dqp->q_hash->qh_version++; + mutex_unlock(&dqp->q_hash->qh_lock); - ASSERT(dqp->q_nrefs == 0); + mutex_lock(&mp->m_quotainfo->qi_dqlist_lock); list_del_init(&dqp->q_mplist); mp->m_quotainfo->qi_dquots--; mp->m_quotainfo->qi_dqreclaims++; - list_del_init(&dqp->q_hashlist); - dqp->q_hash->qh_version++; + mutex_unlock(&mp->m_quotainfo->qi_dqlist_lock); + + ASSERT(dqp->q_nrefs == 0); list_del_init(&dqp->q_freelist); xfs_Gqm->qm_dqfrlist_cnt--; - dqpout = dqp; - mutex_unlock(&mp->m_quotainfo->qi_dqlist_lock); -qhunlock: - mutex_unlock(&dqp->q_hash->qh_lock); -dqfunlock: - xfs_dqfunlock(dqp); + + mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock); + return dqp; dqunlock: xfs_dqunlock(dqp); - if (dqpout) - break; if (restarts >= XFS_QM_RECLAIM_MAX_RESTARTS) break; - if (startagain) { - mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock); - goto again; - } + goto restart; } + mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock); - return dqpout; + return NULL; } /* @@ -2151,10 +1920,7 @@ xfs_qm_vop_dqalloc( * this to caller */ ASSERT(ip->i_udquot); - uq = ip->i_udquot; - xfs_dqlock(uq); - XFS_DQHOLD(uq); - xfs_dqunlock(uq); + uq = xfs_qm_dqhold(ip->i_udquot); } } if ((flags & XFS_QMOPT_GQUOTA) && XFS_IS_GQUOTA_ON(mp)) { @@ -2175,10 +1941,7 @@ xfs_qm_vop_dqalloc( xfs_ilock(ip, lockflags); } else { ASSERT(ip->i_gdquot); - gq = ip->i_gdquot; - xfs_dqlock(gq); - XFS_DQHOLD(gq); - xfs_dqunlock(gq); + gq = xfs_qm_dqhold(ip->i_gdquot); } } else if ((flags & XFS_QMOPT_PQUOTA) && XFS_IS_PQUOTA_ON(mp)) { if (xfs_get_projid(ip) != prid) { @@ -2198,10 +1961,7 @@ xfs_qm_vop_dqalloc( xfs_ilock(ip, lockflags); } else { ASSERT(ip->i_gdquot); - gq = ip->i_gdquot; - xfs_dqlock(gq); - XFS_DQHOLD(gq); - xfs_dqunlock(gq); + gq = xfs_qm_dqhold(ip->i_gdquot); } } if (uq) @@ -2251,14 +2011,10 @@ xfs_qm_vop_chown( xfs_trans_mod_dquot(tp, newdq, XFS_TRANS_DQ_ICOUNT, 1); /* - * Take an extra reference, because the inode - * is going to keep this dquot pointer even - * after the trans_commit. + * Take an extra reference, because the inode is going to keep + * this dquot pointer even after the trans_commit. */ - xfs_dqlock(newdq); - XFS_DQHOLD(newdq); - xfs_dqunlock(newdq); - *IO_olddq = newdq; + *IO_olddq = xfs_qm_dqhold(newdq); return prevdq; } @@ -2390,25 +2146,21 @@ xfs_qm_vop_create_dqattach( ASSERT(XFS_IS_QUOTA_RUNNING(mp)); if (udqp) { - xfs_dqlock(udqp); - XFS_DQHOLD(udqp); - xfs_dqunlock(udqp); ASSERT(ip->i_udquot == NULL); - ip->i_udquot = udqp; ASSERT(XFS_IS_UQUOTA_ON(mp)); ASSERT(ip->i_d.di_uid == be32_to_cpu(udqp->q_core.d_id)); + + ip->i_udquot = xfs_qm_dqhold(udqp); xfs_trans_mod_dquot(tp, udqp, XFS_TRANS_DQ_ICOUNT, 1); } if (gdqp) { - xfs_dqlock(gdqp); - XFS_DQHOLD(gdqp); - xfs_dqunlock(gdqp); ASSERT(ip->i_gdquot == NULL); - ip->i_gdquot = gdqp; ASSERT(XFS_IS_OQUOTA_ON(mp)); ASSERT((XFS_IS_GQUOTA_ON(mp) ? ip->i_d.di_gid : xfs_get_projid(ip)) == be32_to_cpu(gdqp->q_core.d_id)); + + ip->i_gdquot = xfs_qm_dqhold(gdqp); xfs_trans_mod_dquot(tp, gdqp, XFS_TRANS_DQ_ICOUNT, 1); } } diff --git a/fs/xfs/xfs_qm.h b/fs/xfs/xfs_qm.h index 43b9abe1052..9b4f3adefbc 100644 --- a/fs/xfs/xfs_qm.h +++ b/fs/xfs/xfs_qm.h @@ -33,12 +33,6 @@ extern kmem_zone_t *qm_dqzone; extern kmem_zone_t *qm_dqtrxzone; /* - * Used in xfs_qm_sync called by xfs_sync to count the max times that it can - * iterate over the mountpt's dquot list in one call. - */ -#define XFS_QM_SYNC_MAX_RESTARTS 7 - -/* * Ditto, for xfs_qm_dqreclaim_one. */ #define XFS_QM_RECLAIM_MAX_RESTARTS 4 diff --git a/fs/xfs/xfs_qm_syscalls.c b/fs/xfs/xfs_qm_syscalls.c index 5cc3dde1bc9..eafbcff81f3 100644 --- a/fs/xfs/xfs_qm_syscalls.c +++ b/fs/xfs/xfs_qm_syscalls.c @@ -31,6 +31,7 @@ #include "xfs_mount.h" #include "xfs_bmap_btree.h" #include "xfs_inode.h" +#include "xfs_inode_item.h" #include "xfs_itable.h" #include "xfs_bmap.h" #include "xfs_rtalloc.h" @@ -263,13 +264,18 @@ xfs_qm_scall_trunc_qfile( xfs_ilock(ip, XFS_ILOCK_EXCL); xfs_trans_ijoin(tp, ip, 0); - error = xfs_itruncate_data(&tp, ip, 0); + ip->i_d.di_size = 0; + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); + + error = xfs_itruncate_extents(&tp, ip, XFS_DATA_FORK, 0); if (error) { xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT); goto out_unlock; } + ASSERT(ip->i_d.di_nextents == 0); + xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); diff --git a/fs/xfs/xfs_quota.h b/fs/xfs/xfs_quota.h index a595f29567f..8a0807e0f97 100644 --- a/fs/xfs/xfs_quota.h +++ b/fs/xfs/xfs_quota.h @@ -87,8 +87,7 @@ typedef struct xfs_dqblk { #define XFS_DQ_PROJ 0x0002 /* project quota */ #define XFS_DQ_GROUP 0x0004 /* a group quota */ #define XFS_DQ_DIRTY 0x0008 /* dquot is dirty */ -#define XFS_DQ_WANT 0x0010 /* for lookup/reclaim race */ -#define XFS_DQ_INACTIVE 0x0020 /* dq off mplist & hashlist */ +#define XFS_DQ_FREEING 0x0010 /* dquot is beeing torn down */ #define XFS_DQ_ALLTYPES (XFS_DQ_USER|XFS_DQ_PROJ|XFS_DQ_GROUP) @@ -97,8 +96,7 @@ typedef struct xfs_dqblk { { XFS_DQ_PROJ, "PROJ" }, \ { XFS_DQ_GROUP, "GROUP" }, \ { XFS_DQ_DIRTY, "DIRTY" }, \ - { XFS_DQ_WANT, "WANT" }, \ - { XFS_DQ_INACTIVE, "INACTIVE" } + { XFS_DQ_FREEING, "FREEING" } /* * In the worst case, when both user and group quotas are on, @@ -199,7 +197,6 @@ typedef struct xfs_qoff_logformat { #define XFS_QMOPT_UQUOTA 0x0000004 /* user dquot requested */ #define XFS_QMOPT_PQUOTA 0x0000008 /* project dquot requested */ #define XFS_QMOPT_FORCE_RES 0x0000010 /* ignore quota limits */ -#define XFS_QMOPT_DQSUSER 0x0000020 /* don't cache super users dquot */ #define XFS_QMOPT_SBVERSION 0x0000040 /* change superblock version num */ #define XFS_QMOPT_DOWARN 0x0000400 /* increase warning cnt if needed */ #define XFS_QMOPT_DQREPAIR 0x0001000 /* repair dquot if damaged */ @@ -326,7 +323,6 @@ extern int xfs_qm_dqattach_locked(struct xfs_inode *, uint); extern void xfs_qm_dqdetach(struct xfs_inode *); extern void xfs_qm_dqrele(struct xfs_dquot *); extern void xfs_qm_statvfs(struct xfs_inode *, struct kstatfs *); -extern int xfs_qm_sync(struct xfs_mount *, int); extern int xfs_qm_newmount(struct xfs_mount *, uint *, uint *); extern void xfs_qm_mount_quotas(struct xfs_mount *); extern void xfs_qm_unmount(struct xfs_mount *); @@ -366,10 +362,6 @@ static inline int xfs_trans_reserve_quota_bydquots(struct xfs_trans *tp, #define xfs_qm_dqdetach(ip) #define xfs_qm_dqrele(d) #define xfs_qm_statvfs(ip, s) -static inline int xfs_qm_sync(struct xfs_mount *mp, int flags) -{ - return 0; -} #define xfs_qm_newmount(mp, a, b) (0) #define xfs_qm_mount_quotas(mp) #define xfs_qm_unmount(mp) diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 8a899496fd5..ee5b695c99a 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -199,7 +199,6 @@ xfs_parseargs( mp->m_flags |= XFS_MOUNT_BARRIER; mp->m_flags |= XFS_MOUNT_COMPAT_IOSIZE; mp->m_flags |= XFS_MOUNT_SMALL_INUMS; - mp->m_flags |= XFS_MOUNT_DELAYLOG; /* * These can be overridden by the mount option parsing. @@ -353,11 +352,11 @@ xfs_parseargs( mp->m_qflags |= (XFS_GQUOTA_ACCT | XFS_GQUOTA_ACTIVE); mp->m_qflags &= ~XFS_OQUOTA_ENFD; } else if (!strcmp(this_char, MNTOPT_DELAYLOG)) { - mp->m_flags |= XFS_MOUNT_DELAYLOG; + xfs_warn(mp, + "delaylog is the default now, option is deprecated."); } else if (!strcmp(this_char, MNTOPT_NODELAYLOG)) { - mp->m_flags &= ~XFS_MOUNT_DELAYLOG; xfs_warn(mp, - "nodelaylog is deprecated and will be removed in Linux 3.3"); + "nodelaylog support has been removed, option is deprecated."); } else if (!strcmp(this_char, MNTOPT_DISCARD)) { mp->m_flags |= XFS_MOUNT_DISCARD; } else if (!strcmp(this_char, MNTOPT_NODISCARD)) { @@ -395,13 +394,6 @@ xfs_parseargs( return EINVAL; } - if ((mp->m_flags & XFS_MOUNT_DISCARD) && - !(mp->m_flags & XFS_MOUNT_DELAYLOG)) { - xfs_warn(mp, - "the discard option is incompatible with the nodelaylog option"); - return EINVAL; - } - #ifndef CONFIG_XFS_QUOTA if (XFS_IS_QUOTA_RUNNING(mp)) { xfs_warn(mp, "quota support not available in this kernel."); @@ -501,7 +493,6 @@ xfs_showargs( { XFS_MOUNT_ATTR2, "," MNTOPT_ATTR2 }, { XFS_MOUNT_FILESTREAMS, "," MNTOPT_FILESTREAM }, { XFS_MOUNT_GRPID, "," MNTOPT_GRPID }, - { XFS_MOUNT_DELAYLOG, "," MNTOPT_DELAYLOG }, { XFS_MOUNT_DISCARD, "," MNTOPT_DISCARD }, { 0, NULL } }; @@ -837,14 +828,6 @@ xfs_fs_inode_init_once( /* xfs inode */ atomic_set(&ip->i_pincount, 0); spin_lock_init(&ip->i_flags_lock); - init_waitqueue_head(&ip->i_ipin_wait); - /* - * Because we want to use a counting completion, complete - * the flush completion once to allow a single access to - * the flush completion without blocking. - */ - init_completion(&ip->i_flush); - complete(&ip->i_flush); mrlock_init(&ip->i_lock, MRLOCK_ALLOW_EQUAL_PRI|MRLOCK_BARRIER, "xfsino", ip->i_ino); @@ -1014,17 +997,10 @@ xfs_fs_sync_fs( int error; /* - * Not much we can do for the first async pass. Writing out the - * superblock would be counter-productive as we are going to redirty - * when writing out other data and metadata (and writing out a single - * block is quite fast anyway). - * - * Try to asynchronously kick off quota syncing at least. + * Doing anything during the async pass would be counterproductive. */ - if (!wait) { - xfs_qm_sync(mp, SYNC_TRYLOCK); + if (!wait) return 0; - } error = xfs_quiesce_data(mp); if (error) @@ -1238,9 +1214,9 @@ xfs_fs_unfreeze( STATIC int xfs_fs_show_options( struct seq_file *m, - struct vfsmount *mnt) + struct dentry *root) { - return -xfs_showargs(XFS_M(mnt->mnt_sb), m); + return -xfs_showargs(XFS_M(root->d_sb), m); } /* @@ -1621,12 +1597,12 @@ STATIC int __init xfs_init_workqueues(void) { /* - * max_active is set to 8 to give enough concurency to allow - * multiple work operations on each CPU to run. This allows multiple - * filesystems to be running sync work concurrently, and scales with - * the number of CPUs in the system. + * We never want to the same work item to run twice, reclaiming inodes + * or idling the log is not going to get any faster by multiple CPUs + * competing for ressources. Use the default large max_active value + * so that even lots of filesystems can perform these task in parallel. */ - xfs_syncd_wq = alloc_workqueue("xfssyncd", WQ_CPU_INTENSIVE, 8); + xfs_syncd_wq = alloc_workqueue("xfssyncd", WQ_NON_REENTRANT, 0); if (!xfs_syncd_wq) return -ENOMEM; return 0; diff --git a/fs/xfs/xfs_sync.c b/fs/xfs/xfs_sync.c index f0994aedcd1..40b75eecd2b 100644 --- a/fs/xfs/xfs_sync.c +++ b/fs/xfs/xfs_sync.c @@ -395,10 +395,7 @@ xfs_quiesce_data( */ xfs_inode_ag_iterator(mp, xfs_log_dirty_inode, 0); - xfs_qm_sync(mp, SYNC_TRYLOCK); - xfs_qm_sync(mp, SYNC_WAIT); - - /* force out the newly dirtied log buffers */ + /* force out the log */ xfs_log_force(mp, XFS_LOG_SYNC); /* write superblock and hoover up shutdown errors */ @@ -506,7 +503,6 @@ xfs_sync_worker( error = xfs_fs_log_dummy(mp); else xfs_log_force(mp, 0); - error = xfs_qm_sync(mp, SYNC_TRYLOCK); /* start pushing all the metadata that is currently dirty */ xfs_ail_push_all(mp->m_ail); @@ -711,14 +707,13 @@ xfs_reclaim_inode_grab( return 1; /* - * do some unlocked checks first to avoid unnecessary lock traffic. - * The first is a flush lock check, the second is a already in reclaim - * check. Only do these checks if we are not going to block on locks. + * If we are asked for non-blocking operation, do unlocked checks to + * see if the inode already is being flushed or in reclaim to avoid + * lock traffic. */ if ((flags & SYNC_TRYLOCK) && - (!ip->i_flush.done || __xfs_iflags_test(ip, XFS_IRECLAIM))) { + __xfs_iflags_test(ip, XFS_IFLOCK | XFS_IRECLAIM)) return 1; - } /* * The radix tree lock here protects a thread in xfs_iget from racing diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index 49403579887..6b6df5802e9 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -743,8 +743,6 @@ DEFINE_DQUOT_EVENT(xfs_dqtobp_read); DEFINE_DQUOT_EVENT(xfs_dqread); DEFINE_DQUOT_EVENT(xfs_dqread_fail); DEFINE_DQUOT_EVENT(xfs_dqlookup_found); -DEFINE_DQUOT_EVENT(xfs_dqlookup_want); -DEFINE_DQUOT_EVENT(xfs_dqlookup_freelist); DEFINE_DQUOT_EVENT(xfs_dqlookup_done); DEFINE_DQUOT_EVENT(xfs_dqget_hit); DEFINE_DQUOT_EVENT(xfs_dqget_miss); @@ -893,7 +891,6 @@ DECLARE_EVENT_CLASS(xfs_file_class, __field(dev_t, dev) __field(xfs_ino_t, ino) __field(xfs_fsize_t, size) - __field(xfs_fsize_t, new_size) __field(loff_t, offset) __field(size_t, count) __field(int, flags) @@ -902,17 +899,15 @@ DECLARE_EVENT_CLASS(xfs_file_class, __entry->dev = VFS_I(ip)->i_sb->s_dev; __entry->ino = ip->i_ino; __entry->size = ip->i_d.di_size; - __entry->new_size = ip->i_new_size; __entry->offset = offset; __entry->count = count; __entry->flags = flags; ), - TP_printk("dev %d:%d ino 0x%llx size 0x%llx new_size 0x%llx " + TP_printk("dev %d:%d ino 0x%llx size 0x%llx " "offset 0x%llx count 0x%zx ioflags %s", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __entry->size, - __entry->new_size, __entry->offset, __entry->count, __print_flags(__entry->flags, "|", XFS_IO_FLAGS)) @@ -980,7 +975,6 @@ DECLARE_EVENT_CLASS(xfs_imap_class, __field(dev_t, dev) __field(xfs_ino_t, ino) __field(loff_t, size) - __field(loff_t, new_size) __field(loff_t, offset) __field(size_t, count) __field(int, type) @@ -992,7 +986,6 @@ DECLARE_EVENT_CLASS(xfs_imap_class, __entry->dev = VFS_I(ip)->i_sb->s_dev; __entry->ino = ip->i_ino; __entry->size = ip->i_d.di_size; - __entry->new_size = ip->i_new_size; __entry->offset = offset; __entry->count = count; __entry->type = type; @@ -1000,13 +993,11 @@ DECLARE_EVENT_CLASS(xfs_imap_class, __entry->startblock = irec ? irec->br_startblock : 0; __entry->blockcount = irec ? irec->br_blockcount : 0; ), - TP_printk("dev %d:%d ino 0x%llx size 0x%llx new_size 0x%llx " - "offset 0x%llx count %zd type %s " - "startoff 0x%llx startblock %lld blockcount 0x%llx", + TP_printk("dev %d:%d ino 0x%llx size 0x%llx offset 0x%llx count %zd " + "type %s startoff 0x%llx startblock %lld blockcount 0x%llx", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __entry->size, - __entry->new_size, __entry->offset, __entry->count, __print_symbolic(__entry->type, XFS_IO_TYPES), @@ -1033,26 +1024,23 @@ DECLARE_EVENT_CLASS(xfs_simple_io_class, __field(xfs_ino_t, ino) __field(loff_t, isize) __field(loff_t, disize) - __field(loff_t, new_size) __field(loff_t, offset) __field(size_t, count) ), TP_fast_assign( __entry->dev = VFS_I(ip)->i_sb->s_dev; __entry->ino = ip->i_ino; - __entry->isize = ip->i_size; + __entry->isize = VFS_I(ip)->i_size; __entry->disize = ip->i_d.di_size; - __entry->new_size = ip->i_new_size; __entry->offset = offset; __entry->count = count; ), - TP_printk("dev %d:%d ino 0x%llx isize 0x%llx disize 0x%llx new_size 0x%llx " + TP_printk("dev %d:%d ino 0x%llx isize 0x%llx disize 0x%llx " "offset 0x%llx count %zd", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __entry->isize, __entry->disize, - __entry->new_size, __entry->offset, __entry->count) ); @@ -1092,8 +1080,8 @@ DECLARE_EVENT_CLASS(xfs_itrunc_class, DEFINE_EVENT(xfs_itrunc_class, name, \ TP_PROTO(struct xfs_inode *ip, xfs_fsize_t new_size), \ TP_ARGS(ip, new_size)) -DEFINE_ITRUNC_EVENT(xfs_itruncate_data_start); -DEFINE_ITRUNC_EVENT(xfs_itruncate_data_end); +DEFINE_ITRUNC_EVENT(xfs_itruncate_extents_start); +DEFINE_ITRUNC_EVENT(xfs_itruncate_extents_end); TRACE_EVENT(xfs_pagecache_inval, TP_PROTO(struct xfs_inode *ip, xfs_off_t start, xfs_off_t finish), @@ -1570,7 +1558,6 @@ DECLARE_EVENT_CLASS(xfs_swap_extent_class, __field(xfs_ino_t, ino) __field(int, format) __field(int, nex) - __field(int, max_nex) __field(int, broot_size) __field(int, fork_off) ), @@ -1580,18 +1567,16 @@ DECLARE_EVENT_CLASS(xfs_swap_extent_class, __entry->ino = ip->i_ino; __entry->format = ip->i_d.di_format; __entry->nex = ip->i_d.di_nextents; - __entry->max_nex = ip->i_df.if_ext_max; __entry->broot_size = ip->i_df.if_broot_bytes; __entry->fork_off = XFS_IFORK_BOFF(ip); ), TP_printk("dev %d:%d ino 0x%llx (%s), %s format, num_extents %d, " - "Max in-fork extents %d, broot size %d, fork offset %d", + "broot size %d, fork offset %d", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __print_symbolic(__entry->which, XFS_SWAPEXT_INODES), __print_symbolic(__entry->format, XFS_INODE_FORMAT_STR), __entry->nex, - __entry->max_nex, __entry->broot_size, __entry->fork_off) ) diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c index 1f35b2feca9..329b06aba1c 100644 --- a/fs/xfs/xfs_trans.c +++ b/fs/xfs/xfs_trans.c @@ -1158,7 +1158,6 @@ xfs_trans_add_item( lidp->lid_item = lip; lidp->lid_flags = 0; - lidp->lid_size = 0; list_add_tail(&lidp->lid_trans, &tp->t_items); lip->li_desc = lidp; @@ -1210,219 +1209,6 @@ xfs_trans_free_items( } } -/* - * Unlock the items associated with a transaction. - * - * Items which were not logged should be freed. Those which were logged must - * still be tracked so they can be unpinned when the transaction commits. - */ -STATIC void -xfs_trans_unlock_items( - struct xfs_trans *tp, - xfs_lsn_t commit_lsn) -{ - struct xfs_log_item_desc *lidp, *next; - - list_for_each_entry_safe(lidp, next, &tp->t_items, lid_trans) { - struct xfs_log_item *lip = lidp->lid_item; - - lip->li_desc = NULL; - - if (commit_lsn != NULLCOMMITLSN) - IOP_COMMITTING(lip, commit_lsn); - IOP_UNLOCK(lip); - - /* - * Free the descriptor if the item is not dirty - * within this transaction. - */ - if (!(lidp->lid_flags & XFS_LID_DIRTY)) - xfs_trans_free_item_desc(lidp); - } -} - -/* - * Total up the number of log iovecs needed to commit this - * transaction. The transaction itself needs one for the - * transaction header. Ask each dirty item in turn how many - * it needs to get the total. - */ -static uint -xfs_trans_count_vecs( - struct xfs_trans *tp) -{ - int nvecs; - struct xfs_log_item_desc *lidp; - - nvecs = 1; - - /* In the non-debug case we need to start bailing out if we - * didn't find a log_item here, return zero and let trans_commit - * deal with it. - */ - if (list_empty(&tp->t_items)) { - ASSERT(0); - return 0; - } - - list_for_each_entry(lidp, &tp->t_items, lid_trans) { - /* - * Skip items which aren't dirty in this transaction. - */ - if (!(lidp->lid_flags & XFS_LID_DIRTY)) - continue; - lidp->lid_size = IOP_SIZE(lidp->lid_item); - nvecs += lidp->lid_size; - } - - return nvecs; -} - -/* - * Fill in the vector with pointers to data to be logged - * by this transaction. The transaction header takes - * the first vector, and then each dirty item takes the - * number of vectors it indicated it needed in xfs_trans_count_vecs(). - * - * As each item fills in the entries it needs, also pin the item - * so that it cannot be flushed out until the log write completes. - */ -static void -xfs_trans_fill_vecs( - struct xfs_trans *tp, - struct xfs_log_iovec *log_vector) -{ - struct xfs_log_item_desc *lidp; - struct xfs_log_iovec *vecp; - uint nitems; - - /* - * Skip over the entry for the transaction header, we'll - * fill that in at the end. - */ - vecp = log_vector + 1; - - nitems = 0; - ASSERT(!list_empty(&tp->t_items)); - list_for_each_entry(lidp, &tp->t_items, lid_trans) { - /* Skip items which aren't dirty in this transaction. */ - if (!(lidp->lid_flags & XFS_LID_DIRTY)) - continue; - - /* - * The item may be marked dirty but not log anything. This can - * be used to get called when a transaction is committed. - */ - if (lidp->lid_size) - nitems++; - IOP_FORMAT(lidp->lid_item, vecp); - vecp += lidp->lid_size; - IOP_PIN(lidp->lid_item); - } - - /* - * Now that we've counted the number of items in this transaction, fill - * in the transaction header. Note that the transaction header does not - * have a log item. - */ - tp->t_header.th_magic = XFS_TRANS_HEADER_MAGIC; - tp->t_header.th_type = tp->t_type; - tp->t_header.th_num_items = nitems; - log_vector->i_addr = (xfs_caddr_t)&tp->t_header; - log_vector->i_len = sizeof(xfs_trans_header_t); - log_vector->i_type = XLOG_REG_TYPE_TRANSHDR; -} - -/* - * The committed item processing consists of calling the committed routine of - * each logged item, updating the item's position in the AIL if necessary, and - * unpinning each item. If the committed routine returns -1, then do nothing - * further with the item because it may have been freed. - * - * Since items are unlocked when they are copied to the incore log, it is - * possible for two transactions to be completing and manipulating the same - * item simultaneously. The AIL lock will protect the lsn field of each item. - * The value of this field can never go backwards. - * - * We unpin the items after repositioning them in the AIL, because otherwise - * they could be immediately flushed and we'd have to race with the flusher - * trying to pull the item from the AIL as we add it. - */ -static void -xfs_trans_item_committed( - struct xfs_log_item *lip, - xfs_lsn_t commit_lsn, - int aborted) -{ - xfs_lsn_t item_lsn; - struct xfs_ail *ailp; - - if (aborted) - lip->li_flags |= XFS_LI_ABORTED; - item_lsn = IOP_COMMITTED(lip, commit_lsn); - - /* item_lsn of -1 means the item needs no further processing */ - if (XFS_LSN_CMP(item_lsn, (xfs_lsn_t)-1) == 0) - return; - - /* - * If the returned lsn is greater than what it contained before, update - * the location of the item in the AIL. If it is not, then do nothing. - * Items can never move backwards in the AIL. - * - * While the new lsn should usually be greater, it is possible that a - * later transaction completing simultaneously with an earlier one - * using the same item could complete first with a higher lsn. This - * would cause the earlier transaction to fail the test below. - */ - ailp = lip->li_ailp; - spin_lock(&ailp->xa_lock); - if (XFS_LSN_CMP(item_lsn, lip->li_lsn) > 0) { - /* - * This will set the item's lsn to item_lsn and update the - * position of the item in the AIL. - * - * xfs_trans_ail_update() drops the AIL lock. - */ - xfs_trans_ail_update(ailp, lip, item_lsn); - } else { - spin_unlock(&ailp->xa_lock); - } - - /* - * Now that we've repositioned the item in the AIL, unpin it so it can - * be flushed. Pass information about buffer stale state down from the - * log item flags, if anyone else stales the buffer we do not want to - * pay any attention to it. - */ - IOP_UNPIN(lip, 0); -} - -/* - * This is typically called by the LM when a transaction has been fully - * committed to disk. It needs to unpin the items which have - * been logged by the transaction and update their positions - * in the AIL if necessary. - * - * This also gets called when the transactions didn't get written out - * because of an I/O error. Abortflag & XFS_LI_ABORTED is set then. - */ -STATIC void -xfs_trans_committed( - void *arg, - int abortflag) -{ - struct xfs_trans *tp = arg; - struct xfs_log_item_desc *lidp, *next; - - list_for_each_entry_safe(lidp, next, &tp->t_items, lid_trans) { - xfs_trans_item_committed(lidp->lid_item, tp->t_lsn, abortflag); - xfs_trans_free_item_desc(lidp); - } - - xfs_trans_free(tp); -} - static inline void xfs_log_item_batch_insert( struct xfs_ail *ailp, @@ -1538,258 +1324,6 @@ xfs_trans_committed_bulk( } /* - * Called from the trans_commit code when we notice that the filesystem is in - * the middle of a forced shutdown. - * - * When we are called here, we have already pinned all the items in the - * transaction. However, neither IOP_COMMITTING or IOP_UNLOCK has been called - * so we can simply walk the items in the transaction, unpin them with an abort - * flag and then free the items. Note that unpinning the items can result in - * them being freed immediately, so we need to use a safe list traversal method - * here. - */ -STATIC void -xfs_trans_uncommit( - struct xfs_trans *tp, - uint flags) -{ - struct xfs_log_item_desc *lidp, *n; - - list_for_each_entry_safe(lidp, n, &tp->t_items, lid_trans) { - if (lidp->lid_flags & XFS_LID_DIRTY) - IOP_UNPIN(lidp->lid_item, 1); - } - - xfs_trans_unreserve_and_mod_sb(tp); - xfs_trans_unreserve_and_mod_dquots(tp); - - xfs_trans_free_items(tp, NULLCOMMITLSN, flags); - xfs_trans_free(tp); -} - -/* - * Format the transaction direct to the iclog. This isolates the physical - * transaction commit operation from the logical operation and hence allows - * other methods to be introduced without affecting the existing commit path. - */ -static int -xfs_trans_commit_iclog( - struct xfs_mount *mp, - struct xfs_trans *tp, - xfs_lsn_t *commit_lsn, - int flags) -{ - int shutdown; - int error; - int log_flags = 0; - struct xlog_in_core *commit_iclog; -#define XFS_TRANS_LOGVEC_COUNT 16 - struct xfs_log_iovec log_vector_fast[XFS_TRANS_LOGVEC_COUNT]; - struct xfs_log_iovec *log_vector; - uint nvec; - - - /* - * Ask each log item how many log_vector entries it will - * need so we can figure out how many to allocate. - * Try to avoid the kmem_alloc() call in the common case - * by using a vector from the stack when it fits. - */ - nvec = xfs_trans_count_vecs(tp); - if (nvec == 0) { - return ENOMEM; /* triggers a shutdown! */ - } else if (nvec <= XFS_TRANS_LOGVEC_COUNT) { - log_vector = log_vector_fast; - } else { - log_vector = (xfs_log_iovec_t *)kmem_alloc(nvec * - sizeof(xfs_log_iovec_t), - KM_SLEEP); - } - - /* - * Fill in the log_vector and pin the logged items, and - * then write the transaction to the log. - */ - xfs_trans_fill_vecs(tp, log_vector); - - if (flags & XFS_TRANS_RELEASE_LOG_RES) - log_flags = XFS_LOG_REL_PERM_RESERV; - - error = xfs_log_write(mp, log_vector, nvec, tp->t_ticket, &(tp->t_lsn)); - - /* - * The transaction is committed incore here, and can go out to disk - * at any time after this call. However, all the items associated - * with the transaction are still locked and pinned in memory. - */ - *commit_lsn = xfs_log_done(mp, tp->t_ticket, &commit_iclog, log_flags); - - tp->t_commit_lsn = *commit_lsn; - trace_xfs_trans_commit_lsn(tp); - - if (nvec > XFS_TRANS_LOGVEC_COUNT) - kmem_free(log_vector); - - /* - * If we got a log write error. Unpin the logitems that we - * had pinned, clean up, free trans structure, and return error. - */ - if (error || *commit_lsn == -1) { - current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS); - xfs_trans_uncommit(tp, flags|XFS_TRANS_ABORT); - return XFS_ERROR(EIO); - } - - /* - * Once the transaction has committed, unused - * reservations need to be released and changes to - * the superblock need to be reflected in the in-core - * version. Do that now. - */ - xfs_trans_unreserve_and_mod_sb(tp); - - /* - * Tell the LM to call the transaction completion routine - * when the log write with LSN commit_lsn completes (e.g. - * when the transaction commit really hits the on-disk log). - * After this call we cannot reference tp, because the call - * can happen at any time and the call will free the transaction - * structure pointed to by tp. The only case where we call - * the completion routine (xfs_trans_committed) directly is - * if the log is turned off on a debug kernel or we're - * running in simulation mode (the log is explicitly turned - * off). - */ - tp->t_logcb.cb_func = xfs_trans_committed; - tp->t_logcb.cb_arg = tp; - - /* - * We need to pass the iclog buffer which was used for the - * transaction commit record into this function, and attach - * the callback to it. The callback must be attached before - * the items are unlocked to avoid racing with other threads - * waiting for an item to unlock. - */ - shutdown = xfs_log_notify(mp, commit_iclog, &(tp->t_logcb)); - - /* - * Mark this thread as no longer being in a transaction - */ - current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS); - - /* - * Once all the items of the transaction have been copied - * to the in core log and the callback is attached, the - * items can be unlocked. - * - * This will free descriptors pointing to items which were - * not logged since there is nothing more to do with them. - * For items which were logged, we will keep pointers to them - * so they can be unpinned after the transaction commits to disk. - * This will also stamp each modified meta-data item with - * the commit lsn of this transaction for dependency tracking - * purposes. - */ - xfs_trans_unlock_items(tp, *commit_lsn); - - /* - * If we detected a log error earlier, finish committing - * the transaction now (unpin log items, etc). - * - * Order is critical here, to avoid using the transaction - * pointer after its been freed (by xfs_trans_committed - * either here now, or as a callback). We cannot do this - * step inside xfs_log_notify as was done earlier because - * of this issue. - */ - if (shutdown) - xfs_trans_committed(tp, XFS_LI_ABORTED); - - /* - * Now that the xfs_trans_committed callback has been attached, - * and the items are released we can finally allow the iclog to - * go to disk. - */ - return xfs_log_release_iclog(mp, commit_iclog); -} - -/* - * Walk the log items and allocate log vector structures for - * each item large enough to fit all the vectors they require. - * Note that this format differs from the old log vector format in - * that there is no transaction header in these log vectors. - */ -STATIC struct xfs_log_vec * -xfs_trans_alloc_log_vecs( - xfs_trans_t *tp) -{ - struct xfs_log_item_desc *lidp; - struct xfs_log_vec *lv = NULL; - struct xfs_log_vec *ret_lv = NULL; - - - /* Bail out if we didn't find a log item. */ - if (list_empty(&tp->t_items)) { - ASSERT(0); - return NULL; - } - - list_for_each_entry(lidp, &tp->t_items, lid_trans) { - struct xfs_log_vec *new_lv; - - /* Skip items which aren't dirty in this transaction. */ - if (!(lidp->lid_flags & XFS_LID_DIRTY)) - continue; - - /* Skip items that do not have any vectors for writing */ - lidp->lid_size = IOP_SIZE(lidp->lid_item); - if (!lidp->lid_size) - continue; - - new_lv = kmem_zalloc(sizeof(*new_lv) + - lidp->lid_size * sizeof(struct xfs_log_iovec), - KM_SLEEP); - - /* The allocated iovec region lies beyond the log vector. */ - new_lv->lv_iovecp = (struct xfs_log_iovec *)&new_lv[1]; - new_lv->lv_niovecs = lidp->lid_size; - new_lv->lv_item = lidp->lid_item; - if (!ret_lv) - ret_lv = new_lv; - else - lv->lv_next = new_lv; - lv = new_lv; - } - - return ret_lv; -} - -static int -xfs_trans_commit_cil( - struct xfs_mount *mp, - struct xfs_trans *tp, - xfs_lsn_t *commit_lsn, - int flags) -{ - struct xfs_log_vec *log_vector; - - /* - * Get each log item to allocate a vector structure for - * the log item to to pass to the log write code. The - * CIL commit code will format the vector and save it away. - */ - log_vector = xfs_trans_alloc_log_vecs(tp); - if (!log_vector) - return ENOMEM; - - xfs_log_commit_cil(mp, tp, log_vector, commit_lsn, flags); - - current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS); - xfs_trans_free(tp); - return 0; -} - -/* * Commit the given transaction to the log. * * XFS disk error handling mechanism is not based on a typical @@ -1845,17 +1379,16 @@ xfs_trans_commit( xfs_trans_apply_sb_deltas(tp); xfs_trans_apply_dquot_deltas(tp); - if (mp->m_flags & XFS_MOUNT_DELAYLOG) - error = xfs_trans_commit_cil(mp, tp, &commit_lsn, flags); - else - error = xfs_trans_commit_iclog(mp, tp, &commit_lsn, flags); - + error = xfs_log_commit_cil(mp, tp, &commit_lsn, flags); if (error == ENOMEM) { xfs_force_shutdown(mp, SHUTDOWN_LOG_IO_ERROR); error = XFS_ERROR(EIO); goto out_unreserve; } + current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS); + xfs_trans_free(tp); + /* * If the transaction needs to be synchronous, then force the * log out now and wait for it. diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h index 3ae713c0abd..f6118703f20 100644 --- a/fs/xfs/xfs_trans.h +++ b/fs/xfs/xfs_trans.h @@ -163,9 +163,8 @@ typedef struct xfs_trans_header { */ struct xfs_log_item_desc { struct xfs_log_item *lid_item; - ushort lid_size; - unsigned char lid_flags; struct list_head lid_trans; + unsigned char lid_flags; }; #define XFS_LID_DIRTY 0x1 diff --git a/fs/xfs/xfs_utils.c b/fs/xfs/xfs_utils.c index 8b32d1a4c5a..89dbb4a5087 100644 --- a/fs/xfs/xfs_utils.c +++ b/fs/xfs/xfs_utils.c @@ -53,7 +53,7 @@ xfs_dir_ialloc( output: may be a new transaction. */ xfs_inode_t *dp, /* directory within whose allocate the inode. */ - mode_t mode, + umode_t mode, xfs_nlink_t nlink, xfs_dev_t rdev, prid_t prid, /* project id */ diff --git a/fs/xfs/xfs_utils.h b/fs/xfs/xfs_utils.h index 456fca31493..5eeab4690cf 100644 --- a/fs/xfs/xfs_utils.h +++ b/fs/xfs/xfs_utils.h @@ -18,7 +18,7 @@ #ifndef __XFS_UTILS_H__ #define __XFS_UTILS_H__ -extern int xfs_dir_ialloc(xfs_trans_t **, xfs_inode_t *, mode_t, xfs_nlink_t, +extern int xfs_dir_ialloc(xfs_trans_t **, xfs_inode_t *, umode_t, xfs_nlink_t, xfs_dev_t, prid_t, int, xfs_inode_t **, int *); extern int xfs_droplink(xfs_trans_t *, xfs_inode_t *); extern int xfs_bumplink(xfs_trans_t *, xfs_inode_t *); diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c index ce9268a2f56..ebdb88840a4 100644 --- a/fs/xfs/xfs_vnodeops.c +++ b/fs/xfs/xfs_vnodeops.c @@ -131,7 +131,8 @@ xfs_readlink( __func__, (unsigned long long) ip->i_ino, (long long) pathlen); ASSERT(0); - return XFS_ERROR(EFSCORRUPTED); + error = XFS_ERROR(EFSCORRUPTED); + goto out; } @@ -175,7 +176,7 @@ xfs_free_eofblocks( * Figure out if there are any blocks beyond the end * of the file. If not, then there is nothing to do. */ - end_fsb = XFS_B_TO_FSB(mp, ((xfs_ufsize_t)ip->i_size)); + end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)XFS_ISIZE(ip)); last_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)XFS_MAXIOFFSET(mp)); if (last_fsb <= end_fsb) return 0; @@ -226,7 +227,14 @@ xfs_free_eofblocks( xfs_ilock(ip, XFS_ILOCK_EXCL); xfs_trans_ijoin(tp, ip, 0); - error = xfs_itruncate_data(&tp, ip, ip->i_size); + /* + * Do not update the on-disk file size. If we update the + * on-disk file size and then the system crashes before the + * contents of the file are flushed to disk then the files + * may be full of holes (ie NULL files bug). + */ + error = xfs_itruncate_extents(&tp, ip, XFS_DATA_FORK, + XFS_ISIZE(ip)); if (error) { /* * If we get an error at this point we simply don't @@ -540,8 +548,8 @@ xfs_release( return 0; if ((S_ISREG(ip->i_d.di_mode) && - ((ip->i_size > 0) || (VN_CACHED(VFS_I(ip)) > 0 || - ip->i_delayed_blks > 0)) && + (VFS_I(ip)->i_size > 0 || + (VN_CACHED(VFS_I(ip)) > 0 || ip->i_delayed_blks > 0)) && (ip->i_df.if_flags & XFS_IFEXTENTS)) && (!(ip->i_d.di_flags & (XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND)))) { @@ -618,7 +626,7 @@ xfs_inactive( * only one with a reference to the inode. */ truncate = ((ip->i_d.di_nlink == 0) && - ((ip->i_d.di_size != 0) || (ip->i_size != 0) || + ((ip->i_d.di_size != 0) || XFS_ISIZE(ip) != 0 || (ip->i_d.di_nextents > 0) || (ip->i_delayed_blks > 0)) && S_ISREG(ip->i_d.di_mode)); @@ -632,12 +640,12 @@ xfs_inactive( if (ip->i_d.di_nlink != 0) { if ((S_ISREG(ip->i_d.di_mode) && - ((ip->i_size > 0) || (VN_CACHED(VFS_I(ip)) > 0 || - ip->i_delayed_blks > 0)) && - (ip->i_df.if_flags & XFS_IFEXTENTS) && - (!(ip->i_d.di_flags & + (VFS_I(ip)->i_size > 0 || + (VN_CACHED(VFS_I(ip)) > 0 || ip->i_delayed_blks > 0)) && + (ip->i_df.if_flags & XFS_IFEXTENTS) && + (!(ip->i_d.di_flags & (XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND)) || - (ip->i_delayed_blks != 0)))) { + ip->i_delayed_blks != 0))) { error = xfs_free_eofblocks(mp, ip, 0); if (error) return VN_INACTIVE_CACHE; @@ -670,13 +678,18 @@ xfs_inactive( xfs_ilock(ip, XFS_ILOCK_EXCL); xfs_trans_ijoin(tp, ip, 0); - error = xfs_itruncate_data(&tp, ip, 0); + ip->i_d.di_size = 0; + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); + + error = xfs_itruncate_extents(&tp, ip, XFS_DATA_FORK, 0); if (error) { xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT); xfs_iunlock(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL); return VN_INACTIVE_CACHE; } + + ASSERT(ip->i_d.di_nextents == 0); } else if (S_ISLNK(ip->i_d.di_mode)) { /* @@ -822,7 +835,7 @@ int xfs_create( xfs_inode_t *dp, struct xfs_name *name, - mode_t mode, + umode_t mode, xfs_dev_t rdev, xfs_inode_t **ipp) { @@ -1481,7 +1494,7 @@ xfs_symlink( xfs_inode_t *dp, struct xfs_name *link_name, const char *target_path, - mode_t mode, + umode_t mode, xfs_inode_t **ipp) { xfs_mount_t *mp = dp->i_mount; @@ -1961,11 +1974,11 @@ xfs_zero_remaining_bytes( * since nothing can read beyond eof. The space will * be zeroed when the file is extended anyway. */ - if (startoff >= ip->i_size) + if (startoff >= XFS_ISIZE(ip)) return 0; - if (endoff > ip->i_size) - endoff = ip->i_size; + if (endoff > XFS_ISIZE(ip)) + endoff = XFS_ISIZE(ip); bp = xfs_buf_get_uncached(XFS_IS_REALTIME_INODE(ip) ? mp->m_rtdev_targp : mp->m_ddev_targp, @@ -2260,7 +2273,7 @@ xfs_change_file_space( bf->l_start += offset; break; case 2: /*SEEK_END*/ - bf->l_start += ip->i_size; + bf->l_start += XFS_ISIZE(ip); break; default: return XFS_ERROR(EINVAL); @@ -2277,7 +2290,7 @@ xfs_change_file_space( bf->l_whence = 0; startoffset = bf->l_start; - fsize = ip->i_size; + fsize = XFS_ISIZE(ip); /* * XFS_IOC_RESVSP and XFS_IOC_UNRESVSP will reserve or unreserve diff --git a/fs/xfs/xfs_vnodeops.h b/fs/xfs/xfs_vnodeops.h index 35d3d513e1e..0c877cbde14 100644 --- a/fs/xfs/xfs_vnodeops.h +++ b/fs/xfs/xfs_vnodeops.h @@ -26,7 +26,7 @@ int xfs_release(struct xfs_inode *ip); int xfs_inactive(struct xfs_inode *ip); int xfs_lookup(struct xfs_inode *dp, struct xfs_name *name, struct xfs_inode **ipp, struct xfs_name *ci_name); -int xfs_create(struct xfs_inode *dp, struct xfs_name *name, mode_t mode, +int xfs_create(struct xfs_inode *dp, struct xfs_name *name, umode_t mode, xfs_dev_t rdev, struct xfs_inode **ipp); int xfs_remove(struct xfs_inode *dp, struct xfs_name *name, struct xfs_inode *ip); @@ -35,7 +35,7 @@ int xfs_link(struct xfs_inode *tdp, struct xfs_inode *sip, int xfs_readdir(struct xfs_inode *dp, void *dirent, size_t bufsize, xfs_off_t *offset, filldir_t filldir); int xfs_symlink(struct xfs_inode *dp, struct xfs_name *link_name, - const char *target_path, mode_t mode, struct xfs_inode **ipp); + const char *target_path, umode_t mode, struct xfs_inode **ipp); int xfs_set_dmattrs(struct xfs_inode *ip, u_int evmask, u_int16_t state); int xfs_change_file_space(struct xfs_inode *ip, int cmd, xfs_flock64_t *bf, xfs_off_t offset, int attr_flags); |