From ee1a47ab0e77600fcbdf1c87d461bd8f3f63150d Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 21 Apr 2013 14:53:46 -0500 Subject: xfs: add support for large btree blocks Add support for larger btree blocks that contains a CRC32C checksum, a filesystem uuid and block number for detecting filesystem consistency and out of place writes. [dchinner@redhat.com] Also include an owner field to allow reverse mappings to be implemented for improved repairability and a LSN field to so that log recovery can easily determine the last modification that made it to disk for each buffer. [dchinner@redhat.com] Add buffer log format flags to indicate the type of buffer to recovery so that we don't have to do blind magic number tests to determine what the buffer is. [dchinner@redhat.com] Modified to fit into the verifier structure. Signed-off-by: Christoph Hellwig Signed-off-by: Dave Chinner Reviewed-by: Ben Myers Signed-off-by: Ben Myers --- fs/xfs/xfs_inode.c | 33 +++++++++++++++++---------------- 1 file changed, 17 insertions(+), 16 deletions(-) (limited to 'fs/xfs/xfs_inode.c') diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 4f201656d2d..202ce37e66c 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -786,6 +786,7 @@ xfs_iformat_btree( xfs_dinode_t *dip, int whichfork) { + struct xfs_mount *mp = ip->i_mount; xfs_bmdr_block_t *dfp; xfs_ifork_t *ifp; /* REFERENCED */ @@ -794,7 +795,7 @@ xfs_iformat_btree( ifp = XFS_IFORK_PTR(ip, whichfork); dfp = (xfs_bmdr_block_t *)XFS_DFORK_PTR(dip, whichfork); - size = XFS_BMAP_BROOT_SPACE(dfp); + size = XFS_BMAP_BROOT_SPACE(mp, dfp); nrecs = be16_to_cpu(dfp->bb_numrecs); /* @@ -805,14 +806,14 @@ xfs_iformat_btree( * blocks. */ if (unlikely(XFS_IFORK_NEXTENTS(ip, whichfork) <= - XFS_IFORK_MAXEXT(ip, whichfork) || + XFS_IFORK_MAXEXT(ip, whichfork) || XFS_BMDR_SPACE_CALC(nrecs) > - XFS_DFORK_SIZE(dip, ip->i_mount, whichfork) || + XFS_DFORK_SIZE(dip, mp, whichfork) || XFS_IFORK_NEXTENTS(ip, whichfork) > ip->i_d.di_nblocks)) { - xfs_warn(ip->i_mount, "corrupt inode %Lu (btree).", - (unsigned long long) ip->i_ino); + xfs_warn(mp, "corrupt inode %Lu (btree).", + (unsigned long long) ip->i_ino); XFS_CORRUPTION_ERROR("xfs_iformat_btree", XFS_ERRLEVEL_LOW, - ip->i_mount, dip); + mp, dip); return XFS_ERROR(EFSCORRUPTED); } @@ -823,8 +824,7 @@ xfs_iformat_btree( * Copy and convert from the on-disk structure * to the in-memory structure. */ - xfs_bmdr_to_bmbt(ip->i_mount, dfp, - XFS_DFORK_SIZE(dip, ip->i_mount, whichfork), + xfs_bmdr_to_bmbt(ip, dfp, XFS_DFORK_SIZE(dip, ip->i_mount, whichfork), ifp->if_broot, size); ifp->if_flags &= ~XFS_IFEXTENTS; ifp->if_flags |= XFS_IFBROOT; @@ -2037,7 +2037,7 @@ xfs_iroot_realloc( * allocate it now and get out. */ if (ifp->if_broot_bytes == 0) { - new_size = (size_t)XFS_BMAP_BROOT_SPACE_CALC(rec_diff); + new_size = XFS_BMAP_BROOT_SPACE_CALC(mp, rec_diff); ifp->if_broot = kmem_alloc(new_size, KM_SLEEP | KM_NOFS); ifp->if_broot_bytes = (int)new_size; return; @@ -2051,9 +2051,9 @@ xfs_iroot_realloc( */ cur_max = xfs_bmbt_maxrecs(mp, ifp->if_broot_bytes, 0); new_max = cur_max + rec_diff; - new_size = (size_t)XFS_BMAP_BROOT_SPACE_CALC(new_max); + new_size = XFS_BMAP_BROOT_SPACE_CALC(mp, new_max); ifp->if_broot = kmem_realloc(ifp->if_broot, new_size, - (size_t)XFS_BMAP_BROOT_SPACE_CALC(cur_max), /* old size */ + XFS_BMAP_BROOT_SPACE_CALC(mp, cur_max), KM_SLEEP | KM_NOFS); op = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, ifp->if_broot, 1, ifp->if_broot_bytes); @@ -2061,7 +2061,7 @@ xfs_iroot_realloc( (int)new_size); ifp->if_broot_bytes = (int)new_size; ASSERT(ifp->if_broot_bytes <= - XFS_IFORK_SIZE(ip, whichfork) + XFS_BROOT_SIZE_ADJ); + XFS_IFORK_SIZE(ip, whichfork) + XFS_BROOT_SIZE_ADJ(ip)); memmove(np, op, cur_max * (uint)sizeof(xfs_dfsbno_t)); return; } @@ -2076,7 +2076,7 @@ xfs_iroot_realloc( new_max = cur_max + rec_diff; ASSERT(new_max >= 0); if (new_max > 0) - new_size = (size_t)XFS_BMAP_BROOT_SPACE_CALC(new_max); + new_size = XFS_BMAP_BROOT_SPACE_CALC(mp, new_max); else new_size = 0; if (new_size > 0) { @@ -2084,7 +2084,8 @@ xfs_iroot_realloc( /* * First copy over the btree block header. */ - memcpy(new_broot, ifp->if_broot, XFS_BTREE_LBLOCK_LEN); + memcpy(new_broot, ifp->if_broot, + XFS_BMBT_BLOCK_LEN(ip->i_mount)); } else { new_broot = NULL; ifp->if_flags &= ~XFS_IFBROOT; @@ -2114,7 +2115,7 @@ xfs_iroot_realloc( ifp->if_broot = new_broot; ifp->if_broot_bytes = (int)new_size; ASSERT(ifp->if_broot_bytes <= - XFS_IFORK_SIZE(ip, whichfork) + XFS_BROOT_SIZE_ADJ); + XFS_IFORK_SIZE(ip, whichfork) + XFS_BROOT_SIZE_ADJ(ip)); return; } @@ -2427,7 +2428,7 @@ xfs_iflush_fork( ASSERT(ifp->if_broot != NULL); ASSERT(ifp->if_broot_bytes <= (XFS_IFORK_SIZE(ip, whichfork) + - XFS_BROOT_SIZE_ADJ)); + XFS_BROOT_SIZE_ADJ(ip))); xfs_bmbt_to_bmdr(mp, ifp->if_broot, ifp->if_broot_bytes, (xfs_bmdr_block_t *)cp, XFS_DFORK_SIZE(dip, mp, whichfork)); -- cgit v1.2.3-70-g09d2 From 93848a999cf9b9e4f4f77dba843a48c393f33c59 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 3 Apr 2013 16:11:17 +1100 Subject: xfs: add version 3 inode format with CRCs Add a new inode version with a larger core. The primary objective is to allow for a crc of the inode, and location information (uuid and ino) to verify it was written in the right place. We also extend it by: a creation time (for Samba); a changecount (for NFSv4); a flush sequence (in LSN format for recovery); an additional inode flags field; and some additional padding. These additional fields are not implemented yet, but already laid out in the structure. [dchinner@redhat.com] Added LSN and flags field, some factoring and rework to capture all the necessary information in the crc calculation. Signed-off-by: Christoph Hellwig Signed-off-by: Dave Chinner Reviewed-by: Ben Myers Signed-off-by: Ben Myers --- fs/xfs/xfs_buf_item.h | 4 +- fs/xfs/xfs_dinode.h | 33 +++++++-- fs/xfs/xfs_ialloc.c | 50 ++++++++++--- fs/xfs/xfs_inode.c | 179 +++++++++++++++++++++++++++++++++-------------- fs/xfs/xfs_inode.h | 26 +++++++ fs/xfs/xfs_inode_item.c | 2 +- fs/xfs/xfs_log_recover.c | 32 +++++++-- fs/xfs/xfs_trans_buf.c | 5 +- 8 files changed, 254 insertions(+), 77 deletions(-) (limited to 'fs/xfs/xfs_inode.c') diff --git a/fs/xfs/xfs_buf_item.h b/fs/xfs/xfs_buf_item.h index c25660691e0..abae8c8c4ec 100644 --- a/fs/xfs/xfs_buf_item.h +++ b/fs/xfs/xfs_buf_item.h @@ -48,6 +48,7 @@ extern kmem_zone_t *xfs_buf_item_zone; #define XFS_BLF_AGF_BUF (1<<6) #define XFS_BLF_AGFL_BUF (1<<7) #define XFS_BLF_AGI_BUF (1<<8) +#define XFS_BLF_DINO_BUF (1<<9) #define XFS_BLF_TYPE_MASK \ (XFS_BLF_UDQUOT_BUF | \ @@ -56,7 +57,8 @@ extern kmem_zone_t *xfs_buf_item_zone; XFS_BLF_BTREE_BUF | \ XFS_BLF_AGF_BUF | \ XFS_BLF_AGFL_BUF | \ - XFS_BLF_AGI_BUF) + XFS_BLF_AGI_BUF | \ + XFS_BLF_DINO_BUF) #define XFS_BLF_CHUNK 128 #define XFS_BLF_SHIFT 7 diff --git a/fs/xfs/xfs_dinode.h b/fs/xfs/xfs_dinode.h index 6b5bd1745db..f7a0e95d197 100644 --- a/fs/xfs/xfs_dinode.h +++ b/fs/xfs/xfs_dinode.h @@ -19,7 +19,7 @@ #define __XFS_DINODE_H__ #define XFS_DINODE_MAGIC 0x494e /* 'IN' */ -#define XFS_DINODE_GOOD_VERSION(v) (((v) == 1 || (v) == 2)) +#define XFS_DINODE_GOOD_VERSION(v) ((v) >= 1 && (v) <= 3) typedef struct xfs_timestamp { __be32 t_sec; /* timestamp seconds */ @@ -70,10 +70,35 @@ typedef struct xfs_dinode { /* di_next_unlinked is the only non-core field in the old dinode */ __be32 di_next_unlinked;/* agi unlinked list ptr */ -} __attribute__((packed)) xfs_dinode_t; + + /* start of the extended dinode, writable fields */ + __le32 di_crc; /* CRC of the inode */ + __be64 di_changecount; /* number of attribute changes */ + __be64 di_lsn; /* flush sequence */ + __be64 di_flags2; /* more random flags */ + __u8 di_pad2[16]; /* more padding for future expansion */ + + /* fields only written to during inode creation */ + xfs_timestamp_t di_crtime; /* time created */ + __be64 di_ino; /* inode number */ + uuid_t di_uuid; /* UUID of the filesystem */ + + /* structure must be padded to 64 bit alignment */ +} xfs_dinode_t; #define DI_MAX_FLUSH 0xffff +/* + * Size of the core inode on disk. Version 1 and 2 inodes have + * the same size, but version 3 has grown a few additional fields. + */ +static inline uint xfs_dinode_size(int version) +{ + if (version == 3) + return sizeof(struct xfs_dinode); + return offsetof(struct xfs_dinode, di_crc); +} + /* * The 32 bit link count in the inode theoretically maxes out at UINT_MAX. * Since the pathconf interface is signed, we use 2^31 - 1 instead. @@ -105,7 +130,7 @@ typedef enum xfs_dinode_fmt { * Inode size for given fs. */ #define XFS_LITINO(mp, version) \ - ((int)(((mp)->m_sb.sb_inodesize) - sizeof(struct xfs_dinode))) + ((int)(((mp)->m_sb.sb_inodesize) - xfs_dinode_size(version))) #define XFS_BROOT_SIZE_ADJ(ip) \ (XFS_BMBT_BLOCK_LEN((ip)->i_mount) - sizeof(xfs_bmdr_block_t)) @@ -133,7 +158,7 @@ typedef enum xfs_dinode_fmt { * Return pointers to the data or attribute forks. */ #define XFS_DFORK_DPTR(dip) \ - ((char *)(dip) + sizeof(struct xfs_dinode)) + ((char *)dip + xfs_dinode_size(dip->di_version)) #define XFS_DFORK_APTR(dip) \ (XFS_DFORK_DPTR(dip) + XFS_DFORK_BOFF(dip)) #define XFS_DFORK_PTR(dip,w) \ diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c index 6d0a4954aa8..3039f829c96 100644 --- a/fs/xfs/xfs_ialloc.c +++ b/fs/xfs/xfs_ialloc.c @@ -167,6 +167,7 @@ xfs_ialloc_inode_init( int version; int i, j; xfs_daddr_t d; + xfs_ino_t ino = 0; /* * Loop over the new block(s), filling in the inodes. @@ -185,13 +186,29 @@ xfs_ialloc_inode_init( } /* - * Figure out what version number to use in the inodes we create. - * If the superblock version has caught up to the one that supports - * the new inode format, then use the new inode version. Otherwise - * use the old version so that old kernels will continue to be - * able to use the file system. + * Figure out what version number to use in the inodes we create. If + * the superblock version has caught up to the one that supports the new + * inode format, then use the new inode version. Otherwise use the old + * version so that old kernels will continue to be able to use the file + * system. + * + * For v3 inodes, we also need to write the inode number into the inode, + * so calculate the first inode number of the chunk here as + * XFS_OFFBNO_TO_AGINO() only works within a filesystem block, not + * across multiple filesystem blocks (such as a cluster) and so cannot + * be used in the cluster buffer loop below. + * + * Further, because we are writing the inode directly into the buffer + * and calculating a CRC on the entire inode, we have ot log the entire + * inode so that the entire range the CRC covers is present in the log. + * That means for v3 inode we log the entire buffer rather than just the + * inode cores. */ - if (xfs_sb_version_hasnlink(&mp->m_sb)) + if (xfs_sb_version_hascrc(&mp->m_sb)) { + version = 3; + ino = XFS_AGINO_TO_INO(mp, agno, + XFS_OFFBNO_TO_AGINO(mp, agbno, 0)); + } else if (xfs_sb_version_hasnlink(&mp->m_sb)) version = 2; else version = 1; @@ -214,17 +231,32 @@ xfs_ialloc_inode_init( * individual transactions causing a lot of log traffic. */ fbuf->b_ops = &xfs_inode_buf_ops; - xfs_buf_zero(fbuf, 0, ninodes << mp->m_sb.sb_inodelog); + xfs_buf_zero(fbuf, 0, BBTOB(fbuf->b_length)); for (i = 0; i < ninodes; i++) { int ioffset = i << mp->m_sb.sb_inodelog; - uint isize = sizeof(struct xfs_dinode); + uint isize = xfs_dinode_size(version); free = xfs_make_iptr(mp, fbuf, i); free->di_magic = cpu_to_be16(XFS_DINODE_MAGIC); free->di_version = version; free->di_gen = cpu_to_be32(gen); free->di_next_unlinked = cpu_to_be32(NULLAGINO); - xfs_trans_log_buf(tp, fbuf, ioffset, ioffset + isize - 1); + + if (version == 3) { + free->di_ino = cpu_to_be64(ino); + ino++; + uuid_copy(&free->di_uuid, &mp->m_sb.sb_uuid); + xfs_dinode_calc_crc(mp, free); + } else { + /* just log the inode core */ + xfs_trans_log_buf(tp, fbuf, ioffset, + ioffset + isize - 1); + } + } + if (version == 3) { + /* need to log the entire buffer */ + xfs_trans_log_buf(tp, fbuf, 0, + BBTOB(fbuf->b_length) - 1); } xfs_trans_inode_alloc_buf(tp, fbuf); } diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 202ce37e66c..558ef494720 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -44,6 +44,7 @@ #include "xfs_quota.h" #include "xfs_filestream.h" #include "xfs_vnodeops.h" +#include "xfs_cksum.h" #include "xfs_trace.h" #include "xfs_icache.h" @@ -866,6 +867,17 @@ xfs_dinode_from_disk( to->di_dmstate = be16_to_cpu(from->di_dmstate); to->di_flags = be16_to_cpu(from->di_flags); to->di_gen = be32_to_cpu(from->di_gen); + + if (to->di_version == 3) { + to->di_changecount = be64_to_cpu(from->di_changecount); + to->di_crtime.t_sec = be32_to_cpu(from->di_crtime.t_sec); + to->di_crtime.t_nsec = be32_to_cpu(from->di_crtime.t_nsec); + to->di_flags2 = be64_to_cpu(from->di_flags2); + to->di_ino = be64_to_cpu(from->di_ino); + to->di_lsn = be64_to_cpu(from->di_lsn); + memcpy(to->di_pad2, from->di_pad2, sizeof(to->di_pad2)); + uuid_copy(&to->di_uuid, &from->di_uuid); + } } void @@ -902,6 +914,17 @@ xfs_dinode_to_disk( to->di_dmstate = cpu_to_be16(from->di_dmstate); to->di_flags = cpu_to_be16(from->di_flags); to->di_gen = cpu_to_be32(from->di_gen); + + if (from->di_version == 3) { + to->di_changecount = cpu_to_be64(from->di_changecount); + to->di_crtime.t_sec = cpu_to_be32(from->di_crtime.t_sec); + to->di_crtime.t_nsec = cpu_to_be32(from->di_crtime.t_nsec); + to->di_flags2 = cpu_to_be64(from->di_flags2); + to->di_ino = cpu_to_be64(from->di_ino); + to->di_lsn = cpu_to_be64(from->di_lsn); + memcpy(to->di_pad2, from->di_pad2, sizeof(to->di_pad2)); + uuid_copy(&to->di_uuid, &from->di_uuid); + } } STATIC uint @@ -962,6 +985,47 @@ xfs_dic2xflags( (XFS_DFORK_Q(dip) ? XFS_XFLAG_HASATTR : 0); } +static bool +xfs_dinode_verify( + struct xfs_mount *mp, + struct xfs_inode *ip, + struct xfs_dinode *dip) +{ + if (dip->di_magic != cpu_to_be16(XFS_DINODE_MAGIC)) + return false; + + /* only version 3 or greater inodes are extensively verified here */ + if (dip->di_version < 3) + return true; + + if (!xfs_sb_version_hascrc(&mp->m_sb)) + return false; + if (!xfs_verify_cksum((char *)dip, mp->m_sb.sb_inodesize, + offsetof(struct xfs_dinode, di_crc))) + return false; + if (be64_to_cpu(dip->di_ino) != ip->i_ino) + return false; + if (!uuid_equal(&dip->di_uuid, &mp->m_sb.sb_uuid)) + return false; + return true; +} + +void +xfs_dinode_calc_crc( + struct xfs_mount *mp, + struct xfs_dinode *dip) +{ + __uint32_t crc; + + if (dip->di_version < 3) + return; + + ASSERT(xfs_sb_version_hascrc(&mp->m_sb)); + crc = xfs_start_cksum((char *)dip, mp->m_sb.sb_inodesize, + offsetof(struct xfs_dinode, di_crc)); + dip->di_crc = xfs_end_cksum(crc); +} + /* * Read the disk inode attributes into the in-core inode structure. */ @@ -990,17 +1054,13 @@ xfs_iread( if (error) return error; - /* - * If we got something that isn't an inode it means someone - * (nfs or dmi) has a stale handle. - */ - if (dip->di_magic != cpu_to_be16(XFS_DINODE_MAGIC)) { -#ifdef DEBUG - xfs_alert(mp, - "%s: dip->di_magic (0x%x) != XFS_DINODE_MAGIC (0x%x)", - __func__, be16_to_cpu(dip->di_magic), XFS_DINODE_MAGIC); -#endif /* DEBUG */ - error = XFS_ERROR(EINVAL); + /* even unallocated inodes are verified */ + if (!xfs_dinode_verify(mp, ip, dip)) { + xfs_alert(mp, "%s: validation failed for inode %lld failed", + __func__, ip->i_ino); + + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, dip); + error = XFS_ERROR(EFSCORRUPTED); goto out_brelse; } @@ -1022,10 +1082,20 @@ xfs_iread( goto out_brelse; } } else { + /* + * Partial initialisation of the in-core inode. Just the bits + * that xfs_ialloc won't overwrite or relies on being correct. + */ ip->i_d.di_magic = be16_to_cpu(dip->di_magic); ip->i_d.di_version = dip->di_version; ip->i_d.di_gen = be32_to_cpu(dip->di_gen); ip->i_d.di_flushiter = be16_to_cpu(dip->di_flushiter); + + if (dip->di_version == 3) { + ip->i_d.di_ino = be64_to_cpu(dip->di_ino); + uuid_copy(&ip->i_d.di_uuid, &dip->di_uuid); + } + /* * Make sure to pull in the mode here as well in * case the inode is released without being used. @@ -1161,6 +1231,7 @@ xfs_ialloc( xfs_buf_t **ialloc_context, xfs_inode_t **ipp) { + struct xfs_mount *mp = tp->t_mountp; xfs_ino_t ino; xfs_inode_t *ip; uint flags; @@ -1187,7 +1258,7 @@ xfs_ialloc( * This is because we're setting fields here we need * to prevent others from looking at until we're done. */ - error = xfs_iget(tp->t_mountp, tp, ino, XFS_IGET_CREATE, + error = xfs_iget(mp, tp, ino, XFS_IGET_CREATE, XFS_ILOCK_EXCL, &ip); if (error) return error; @@ -1208,7 +1279,7 @@ xfs_ialloc( * the inode version number now. This way we only do the conversion * here rather than here and in the flush/logging code. */ - if (xfs_sb_version_hasnlink(&tp->t_mountp->m_sb) && + if (xfs_sb_version_hasnlink(&mp->m_sb) && ip->i_d.di_version == 1) { ip->i_d.di_version = 2; /* @@ -1258,6 +1329,19 @@ xfs_ialloc( ip->i_d.di_dmevmask = 0; ip->i_d.di_dmstate = 0; ip->i_d.di_flags = 0; + + if (ip->i_d.di_version == 3) { + ASSERT(ip->i_d.di_ino == ino); + ASSERT(uuid_equal(&ip->i_d.di_uuid, &mp->m_sb.sb_uuid)); + ip->i_d.di_crc = 0; + ip->i_d.di_changecount = 1; + ip->i_d.di_lsn = 0; + ip->i_d.di_flags2 = 0; + memset(&(ip->i_d.di_pad2[0]), 0, sizeof(ip->i_d.di_pad2)); + ip->i_d.di_crtime = ip->i_d.di_mtime; + } + + flags = XFS_ILOG_CORE; switch (mode & S_IFMT) { case S_IFIFO: @@ -2716,20 +2800,18 @@ abort_out: STATIC int xfs_iflush_int( - xfs_inode_t *ip, - xfs_buf_t *bp) + struct xfs_inode *ip, + struct xfs_buf *bp) { - xfs_inode_log_item_t *iip; - xfs_dinode_t *dip; - xfs_mount_t *mp; + struct xfs_inode_log_item *iip = ip->i_itemp; + struct xfs_dinode *dip; + struct xfs_mount *mp = ip->i_mount; ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)); ASSERT(xfs_isiflocked(ip)); ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE || ip->i_d.di_nextents > XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK)); - - iip = ip->i_itemp; - mp = ip->i_mount; + ASSERT(iip != NULL && iip->ili_fields != 0); /* set *dip = inode's place in the buffer */ dip = (xfs_dinode_t *)xfs_buf_offset(bp, ip->i_imap.im_boffset); @@ -2790,9 +2872,9 @@ xfs_iflush_int( } /* * bump the flush iteration count, used to detect flushes which - * postdate a log record during recovery. + * postdate a log record during recovery. This is redundant as we now + * log every change and hence this can't happen. Still, it doesn't hurt. */ - ip->i_d.di_flushiter++; /* @@ -2868,41 +2950,30 @@ xfs_iflush_int( * need the AIL lock, because it is a 64 bit value that cannot be read * atomically. */ - if (iip != NULL && iip->ili_fields != 0) { - iip->ili_last_fields = iip->ili_fields; - iip->ili_fields = 0; - iip->ili_logged = 1; + iip->ili_last_fields = iip->ili_fields; + iip->ili_fields = 0; + iip->ili_logged = 1; - xfs_trans_ail_copy_lsn(mp->m_ail, &iip->ili_flush_lsn, - &iip->ili_item.li_lsn); + xfs_trans_ail_copy_lsn(mp->m_ail, &iip->ili_flush_lsn, + &iip->ili_item.li_lsn); - /* - * Attach the function xfs_iflush_done to the inode's - * buffer. This will remove the inode from the AIL - * and unlock the inode's flush lock when the inode is - * completely written to disk. - */ - xfs_buf_attach_iodone(bp, xfs_iflush_done, &iip->ili_item); + /* + * Attach the function xfs_iflush_done to the inode's + * buffer. This will remove the inode from the AIL + * and unlock the inode's flush lock when the inode is + * completely written to disk. + */ + xfs_buf_attach_iodone(bp, xfs_iflush_done, &iip->ili_item); - ASSERT(bp->b_fspriv != NULL); - ASSERT(bp->b_iodone != NULL); - } else { - /* - * We're flushing an inode which is not in the AIL and has - * not been logged. For this case we can immediately drop - * the inode flush lock because we can avoid the whole - * AIL state thing. It's OK to drop the flush lock now, - * because we've already locked the buffer and to do anything - * you really need both. - */ - if (iip != NULL) { - ASSERT(iip->ili_logged == 0); - ASSERT(iip->ili_last_fields == 0); - ASSERT((iip->ili_item.li_flags & XFS_LI_IN_AIL) == 0); - } - xfs_ifunlock(ip); - } + /* update the lsn in the on disk inode if required */ + if (ip->i_d.di_version == 3) + dip->di_lsn = cpu_to_be64(iip->ili_item.li_lsn); + + /* generate the checksum. */ + xfs_dinode_calc_crc(mp, dip); + ASSERT(bp->b_fspriv != NULL); + ASSERT(bp->b_iodone != NULL); return 0; corrupt_out: diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index b8520b5c3a1..91129794aae 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -150,13 +150,38 @@ typedef struct xfs_icdinode { __uint16_t di_dmstate; /* DMIG state info */ __uint16_t di_flags; /* random flags, XFS_DIFLAG_... */ __uint32_t di_gen; /* generation number */ + + /* di_next_unlinked is the only non-core field in the old dinode */ + xfs_agino_t di_next_unlinked;/* agi unlinked list ptr */ + + /* start of the extended dinode, writable fields */ + __uint32_t di_crc; /* CRC of the inode */ + __uint64_t di_changecount; /* number of attribute changes */ + xfs_lsn_t di_lsn; /* flush sequence */ + __uint64_t di_flags2; /* more random flags */ + __uint8_t di_pad2[16]; /* more padding for future expansion */ + + /* fields only written to during inode creation */ + xfs_ictimestamp_t di_crtime; /* time created */ + xfs_ino_t di_ino; /* inode number */ + uuid_t di_uuid; /* UUID of the filesystem */ + + /* structure must be padded to 64 bit alignment */ } xfs_icdinode_t; +static inline uint xfs_icdinode_size(int version) +{ + if (version == 3) + return sizeof(struct xfs_icdinode); + return offsetof(struct xfs_icdinode, di_next_unlinked); +} + /* * Flags for xfs_ichgtime(). */ #define XFS_ICHGTIME_MOD 0x1 /* data fork modification timestamp */ #define XFS_ICHGTIME_CHG 0x2 /* inode field change timestamp */ +#define XFS_ICHGTIME_CREATE 0x4 /* inode create timestamp */ /* * Per-fork incore inode flags. @@ -556,6 +581,7 @@ int xfs_imap_to_bp(struct xfs_mount *, struct xfs_trans *, struct xfs_buf **, uint, uint); int xfs_iread(struct xfs_mount *, struct xfs_trans *, struct xfs_inode *, uint); +void xfs_dinode_calc_crc(struct xfs_mount *, struct xfs_dinode *); void xfs_dinode_to_disk(struct xfs_dinode *, struct xfs_icdinode *); void xfs_idestroy_fork(struct xfs_inode *, int); diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index f034bd1652f..f76ff52e43c 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c @@ -179,7 +179,7 @@ xfs_inode_item_format( nvecs = 1; vecp->i_addr = &ip->i_d; - vecp->i_len = sizeof(struct xfs_icdinode); + vecp->i_len = xfs_icdinode_size(ip->i_d.di_version); vecp->i_type = XLOG_REG_TYPE_ICORE; vecp++; nvecs++; diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 27b3ec214a6..287878219af 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -1786,6 +1786,7 @@ xlog_recover_do_inode_buffer( xfs_agino_t *buffer_nextp; trace_xfs_log_recover_buf_inode_buf(mp->m_log, buf_f); + bp->b_ops = &xfs_inode_buf_ops; inodes_per_buf = BBTOB(bp->b_io_length) >> mp->m_sb.sb_inodelog; for (i = 0; i < inodes_per_buf; i++) { @@ -1989,6 +1990,18 @@ xlog_recover_do_reg_buffer( } bp->b_ops = &xfs_dquot_buf_ops; break; + case XFS_BLF_DINO_BUF: + /* + * we get here with inode allocation buffers, not buffers that + * track unlinked list changes. + */ + if (*(__be16 *)bp->b_addr != cpu_to_be16(XFS_DINODE_MAGIC)) { + xfs_warn(mp, "Bad INODE block magic!"); + ASSERT(0); + break; + } + bp->b_ops = &xfs_inode_buf_ops; + break; default: break; } @@ -2277,6 +2290,7 @@ xlog_recover_inode_pass2( int attr_index; uint fields; xfs_icdinode_t *dicp; + uint isize; int need_free = 0; if (item->ri_buf[0].i_len == sizeof(xfs_inode_log_format_t)) { @@ -2302,7 +2316,7 @@ xlog_recover_inode_pass2( trace_xfs_log_recover_inode_recover(log, in_f); bp = xfs_buf_read(mp->m_ddev_targp, in_f->ilf_blkno, in_f->ilf_len, 0, - NULL); + &xfs_inode_buf_ops); if (!bp) { error = ENOMEM; goto error; @@ -2413,7 +2427,8 @@ xlog_recover_inode_pass2( error = EFSCORRUPTED; goto error; } - if (unlikely(item->ri_buf[1].i_len > sizeof(struct xfs_icdinode))) { + isize = xfs_icdinode_size(dicp->di_version); + if (unlikely(item->ri_buf[1].i_len > isize)) { XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(7)", XFS_ERRLEVEL_LOW, mp, dicp); xfs_buf_relse(bp); @@ -2425,13 +2440,13 @@ xlog_recover_inode_pass2( } /* The core is in in-core format */ - xfs_dinode_to_disk(dip, item->ri_buf[1].i_addr); + xfs_dinode_to_disk(dip, dicp); /* the rest is in on-disk format */ - if (item->ri_buf[1].i_len > sizeof(struct xfs_icdinode)) { - memcpy((xfs_caddr_t) dip + sizeof(struct xfs_icdinode), - item->ri_buf[1].i_addr + sizeof(struct xfs_icdinode), - item->ri_buf[1].i_len - sizeof(struct xfs_icdinode)); + if (item->ri_buf[1].i_len > isize) { + memcpy((char *)dip + isize, + item->ri_buf[1].i_addr + isize, + item->ri_buf[1].i_len - isize); } fields = in_f->ilf_fields; @@ -2515,6 +2530,9 @@ xlog_recover_inode_pass2( } write_inode_buffer: + /* re-generate the checksum. */ + xfs_dinode_calc_crc(log->l_mp, dip); + ASSERT(bp->b_target->bt_mount == mp); bp->b_iodone = xlog_recover_iodone; xfs_buf_delwri_queue(bp, buffer_list); diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c index f950edd0d53..8a0f6af5120 100644 --- a/fs/xfs/xfs_trans_buf.c +++ b/fs/xfs/xfs_trans_buf.c @@ -704,12 +704,13 @@ xfs_trans_inode_buf( ASSERT(atomic_read(&bip->bli_refcount) > 0); bip->bli_flags |= XFS_BLI_INODE_BUF; + xfs_trans_buf_set_type(tp, bp, XFS_BLF_DINO_BUF); } /* * This call is used to indicate that the buffer is going to * be staled and was an inode buffer. This means it gets - * special processing during unpin - where any inodes + * special processing during unpin - where any inodes * associated with the buffer should be removed from ail. * There is also special processing during recovery, * any replay of the inodes in the buffer needs to be @@ -728,6 +729,7 @@ xfs_trans_stale_inode_buf( bip->bli_flags |= XFS_BLI_STALE_INODE; bip->bli_item.li_cb = xfs_buf_iodone; + xfs_trans_buf_set_type(tp, bp, XFS_BLF_DINO_BUF); } /* @@ -751,6 +753,7 @@ xfs_trans_inode_alloc_buf( ASSERT(atomic_read(&bip->bli_refcount) > 0); bip->bli_flags |= XFS_BLI_INODE_ALLOC_BUF; + xfs_trans_buf_set_type(tp, bp, XFS_BLF_DINO_BUF); } /* -- cgit v1.2.3-70-g09d2 From 742ae1e35b038ed65ddd86182723441ea74db765 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Tue, 30 Apr 2013 21:39:34 +1000 Subject: xfs: introduce CONFIG_XFS_WARN Running a CONFIG_XFS_DEBUG kernel in production environments is not the best idea as it introduces significant overhead, can change the behaviour of algorithms (such as allocation) to improve test coverage, and (most importantly) panic the machine on non-fatal errors. There are many cases where all we want to do is run a kernel with more bounds checking enabled, such as is provided by the ASSERT() statements throughout the code, but without all the potential overhead and drawbacks. This patch converts all the ASSERT statements to evaluate as WARN_ON(1) statements and hence if they fail dump a warning and a stack trace to the log. This has minimal overhead and does not change any algorithms, and will allow us to find strange "out of bounds" problems more easily on production machines. There are a few places where assert statements contain debug only code. These are converted to be debug-or-warn only code so that we still get all the assert checks in the code. Signed-off-by: Dave Chinner Reviewed-by: Brian Foster Signed-off-by: Ben Myers --- fs/xfs/Kconfig | 13 +++++++++++++ fs/xfs/mrlock.h | 12 ++++++------ fs/xfs/xfs.h | 5 +++++ fs/xfs/xfs_alloc_btree.c | 4 ++-- fs/xfs/xfs_bmap_btree.c | 4 ++-- fs/xfs/xfs_btree.h | 2 +- fs/xfs/xfs_dir2_node.c | 4 ++-- fs/xfs/xfs_ialloc_btree.c | 4 ++-- fs/xfs/xfs_inode.c | 2 +- fs/xfs/xfs_linux.h | 24 ++++++++++++++++++------ fs/xfs/xfs_message.c | 8 ++++++++ fs/xfs/xfs_message.h | 1 + fs/xfs/xfs_trans.h | 4 ++-- 13 files changed, 63 insertions(+), 24 deletions(-) (limited to 'fs/xfs/xfs_inode.c') diff --git a/fs/xfs/Kconfig b/fs/xfs/Kconfig index cc33aaf219f..399e8cec6e6 100644 --- a/fs/xfs/Kconfig +++ b/fs/xfs/Kconfig @@ -69,6 +69,19 @@ config XFS_RT If unsure, say N. +config XFS_WARN + bool "XFS Verbose Warnings" + depends on XFS_FS && !XFS_DEBUG + help + Say Y here to get an XFS build with many additional warnings. + It converts ASSERT checks to WARN, so will log any out-of-bounds + conditions that occur that would otherwise be missed. It is much + lighter weight than XFS_DEBUG and does not modify algorithms and will + not cause the kernel to panic on non-fatal errors. + + However, similar to XFS_DEBUG, it is only advisable to use this if you + are debugging a particular problem. + config XFS_DEBUG bool "XFS Debugging support" depends on XFS_FS diff --git a/fs/xfs/mrlock.h b/fs/xfs/mrlock.h index ff6a19873e5..e3c92d19e54 100644 --- a/fs/xfs/mrlock.h +++ b/fs/xfs/mrlock.h @@ -22,12 +22,12 @@ typedef struct { struct rw_semaphore mr_lock; -#ifdef DEBUG +#if defined(DEBUG) || defined(XFS_WARN) int mr_writer; #endif } mrlock_t; -#ifdef DEBUG +#if defined(DEBUG) || defined(XFS_WARN) #define mrinit(mrp, name) \ do { (mrp)->mr_writer = 0; init_rwsem(&(mrp)->mr_lock); } while (0) #else @@ -46,7 +46,7 @@ static inline void mraccess_nested(mrlock_t *mrp, int subclass) static inline void mrupdate_nested(mrlock_t *mrp, int subclass) { down_write_nested(&mrp->mr_lock, subclass); -#ifdef DEBUG +#if defined(DEBUG) || defined(XFS_WARN) mrp->mr_writer = 1; #endif } @@ -60,7 +60,7 @@ static inline int mrtryupdate(mrlock_t *mrp) { if (!down_write_trylock(&mrp->mr_lock)) return 0; -#ifdef DEBUG +#if defined(DEBUG) || defined(XFS_WARN) mrp->mr_writer = 1; #endif return 1; @@ -68,7 +68,7 @@ static inline int mrtryupdate(mrlock_t *mrp) static inline void mrunlock_excl(mrlock_t *mrp) { -#ifdef DEBUG +#if defined(DEBUG) || defined(XFS_WARN) mrp->mr_writer = 0; #endif up_write(&mrp->mr_lock); @@ -81,7 +81,7 @@ static inline void mrunlock_shared(mrlock_t *mrp) static inline void mrdemote(mrlock_t *mrp) { -#ifdef DEBUG +#if defined(DEBUG) || defined(XFS_WARN) mrp->mr_writer = 0; #endif downgrade_write(&mrp->mr_lock); diff --git a/fs/xfs/xfs.h b/fs/xfs/xfs.h index d8b11b7f94a..a742c47f7d5 100644 --- a/fs/xfs/xfs.h +++ b/fs/xfs/xfs.h @@ -24,6 +24,11 @@ #define XFS_BUF_LOCK_TRACKING 1 #endif +#ifdef CONFIG_XFS_WARN +#define XFS_WARN 1 +#endif + + #include "xfs_linux.h" #endif /* __XFS_H__ */ diff --git a/fs/xfs/xfs_alloc_btree.c b/fs/xfs/xfs_alloc_btree.c index 30c4c1434fa..cafc90251d1 100644 --- a/fs/xfs/xfs_alloc_btree.c +++ b/fs/xfs/xfs_alloc_btree.c @@ -386,7 +386,7 @@ const struct xfs_buf_ops xfs_allocbt_buf_ops = { }; -#ifdef DEBUG +#if defined(DEBUG) || defined(XFS_WARN) STATIC int xfs_allocbt_keys_inorder( struct xfs_btree_cur *cur, @@ -442,7 +442,7 @@ static const struct xfs_btree_ops xfs_allocbt_ops = { .init_ptr_from_cur = xfs_allocbt_init_ptr_from_cur, .key_diff = xfs_allocbt_key_diff, .buf_ops = &xfs_allocbt_buf_ops, -#ifdef DEBUG +#if defined(DEBUG) || defined(XFS_WARN) .keys_inorder = xfs_allocbt_keys_inorder, .recs_inorder = xfs_allocbt_recs_inorder, #endif diff --git a/fs/xfs/xfs_bmap_btree.c b/fs/xfs/xfs_bmap_btree.c index 3a86c3fa6de..0c61a22be6f 100644 --- a/fs/xfs/xfs_bmap_btree.c +++ b/fs/xfs/xfs_bmap_btree.c @@ -813,7 +813,7 @@ const struct xfs_buf_ops xfs_bmbt_buf_ops = { }; -#ifdef DEBUG +#if defined(DEBUG) || defined(XFS_WARN) STATIC int xfs_bmbt_keys_inorder( struct xfs_btree_cur *cur, @@ -853,7 +853,7 @@ static const struct xfs_btree_ops xfs_bmbt_ops = { .init_ptr_from_cur = xfs_bmbt_init_ptr_from_cur, .key_diff = xfs_bmbt_key_diff, .buf_ops = &xfs_bmbt_buf_ops, -#ifdef DEBUG +#if defined(DEBUG) || defined(XFS_WARN) .keys_inorder = xfs_bmbt_keys_inorder, .recs_inorder = xfs_bmbt_recs_inorder, #endif diff --git a/fs/xfs/xfs_btree.h b/fs/xfs/xfs_btree.h index 6e6c915673f..55e3c7cc3c3 100644 --- a/fs/xfs/xfs_btree.h +++ b/fs/xfs/xfs_btree.h @@ -215,7 +215,7 @@ struct xfs_btree_ops { const struct xfs_buf_ops *buf_ops; -#ifdef DEBUG +#if defined(DEBUG) || defined(XFS_WARN) /* check that k1 is lower than k2 */ int (*keys_inorder)(struct xfs_btree_cur *cur, union xfs_btree_key *k1, diff --git a/fs/xfs/xfs_dir2_node.c b/fs/xfs/xfs_dir2_node.c index ecc6c661064..5246de4912d 100644 --- a/fs/xfs/xfs_dir2_node.c +++ b/fs/xfs/xfs_dir2_node.c @@ -993,7 +993,7 @@ xfs_dir2_leafn_rebalance( xfs_dir2_leaf_t *leaf1; /* first leaf structure */ xfs_dir2_leaf_t *leaf2; /* second leaf structure */ int mid; /* midpoint leaf index */ -#ifdef DEBUG +#if defined(DEBUG) || defined(XFS_WARN) int oldstale; /* old count of stale leaves */ #endif int oldsum; /* old total leaf count */ @@ -1022,7 +1022,7 @@ xfs_dir2_leafn_rebalance( ents2 = xfs_dir3_leaf_ents_p(leaf2); oldsum = hdr1.count + hdr2.count; -#ifdef DEBUG +#if defined(DEBUG) || defined(XFS_WARN) oldstale = hdr1.stale + hdr2.stale; #endif mid = oldsum >> 1; diff --git a/fs/xfs/xfs_ialloc_btree.c b/fs/xfs/xfs_ialloc_btree.c index c82ac886742..5448eb6b8c1 100644 --- a/fs/xfs/xfs_ialloc_btree.c +++ b/fs/xfs/xfs_ialloc_btree.c @@ -272,7 +272,7 @@ const struct xfs_buf_ops xfs_inobt_buf_ops = { .verify_write = xfs_inobt_write_verify, }; -#ifdef DEBUG +#if defined(DEBUG) || defined(XFS_WARN) STATIC int xfs_inobt_keys_inorder( struct xfs_btree_cur *cur, @@ -310,7 +310,7 @@ static const struct xfs_btree_ops xfs_inobt_ops = { .init_ptr_from_cur = xfs_inobt_init_ptr_from_cur, .key_diff = xfs_inobt_key_diff, .buf_ops = &xfs_inobt_buf_ops, -#ifdef DEBUG +#if defined(DEBUG) || defined(XFS_WARN) .keys_inorder = xfs_inobt_keys_inorder, .recs_inorder = xfs_inobt_recs_inorder, #endif diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 558ef494720..efbe1accb6c 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -287,7 +287,7 @@ xfs_ilock_demote( trace_xfs_ilock_demote(ip, lock_flags, _RET_IP_); } -#ifdef DEBUG +#if defined(DEBUG) || defined(XFS_WARN) int xfs_isilocked( xfs_inode_t *ip, diff --git a/fs/xfs/xfs_linux.h b/fs/xfs/xfs_linux.h index 14e59d953b7..800f896a6cc 100644 --- a/fs/xfs/xfs_linux.h +++ b/fs/xfs/xfs_linux.h @@ -293,22 +293,34 @@ static inline __uint64_t howmany_64(__uint64_t x, __uint32_t y) #define ASSERT_ALWAYS(expr) \ (unlikely(expr) ? (void)0 : assfail(#expr, __FILE__, __LINE__)) -#ifndef DEBUG -#define ASSERT(expr) ((void)0) +#ifdef DEBUG +#define ASSERT(expr) \ + (unlikely(expr) ? (void)0 : assfail(#expr, __FILE__, __LINE__)) #ifndef STATIC -# define STATIC static noinline +# define STATIC noinline #endif -#else /* DEBUG */ +#else /* !DEBUG */ + +#ifdef XFS_WARN #define ASSERT(expr) \ - (unlikely(expr) ? (void)0 : assfail(#expr, __FILE__, __LINE__)) + (unlikely(expr) ? (void)0 : asswarn(#expr, __FILE__, __LINE__)) #ifndef STATIC -# define STATIC noinline +# define STATIC static noinline +#endif + +#else /* !DEBUG && !XFS_WARN */ + +#define ASSERT(expr) ((void)0) + +#ifndef STATIC +# define STATIC static noinline #endif +#endif /* XFS_WARN */ #endif /* DEBUG */ #endif /* __XFS_LINUX__ */ diff --git a/fs/xfs/xfs_message.c b/fs/xfs/xfs_message.c index 331cd9f83a7..9163dc14053 100644 --- a/fs/xfs/xfs_message.c +++ b/fs/xfs/xfs_message.c @@ -92,6 +92,14 @@ xfs_alert_tag( BUG_ON(do_panic); } +void +asswarn(char *expr, char *file, int line) +{ + xfs_warn(NULL, "Assertion failed: %s, file: %s, line: %d", + expr, file, line); + WARN_ON(1); +} + void assfail(char *expr, char *file, int line) { diff --git a/fs/xfs/xfs_message.h b/fs/xfs/xfs_message.h index 76c81982f96..85401155750 100644 --- a/fs/xfs/xfs_message.h +++ b/fs/xfs/xfs_message.h @@ -57,6 +57,7 @@ do { \ xfs_printk_ratelimited(xfs_debug, dev, fmt, ##__VA_ARGS__) extern void assfail(char *expr, char *f, int l); +extern void asswarn(char *expr, char *f, int l); extern void xfs_hex_dump(void *p, int length); diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h index cd29f617102..a44dba5b2cd 100644 --- a/fs/xfs/xfs_trans.h +++ b/fs/xfs/xfs_trans.h @@ -405,7 +405,7 @@ typedef struct xfs_trans { int64_t t_res_fdblocks_delta; /* on-disk only chg */ int64_t t_frextents_delta;/* superblock freextents chg*/ int64_t t_res_frextents_delta; /* on-disk only chg */ -#ifdef DEBUG +#if defined(DEBUG) || defined(XFS_WARN) int64_t t_ag_freeblks_delta; /* debugging counter */ int64_t t_ag_flist_delta; /* debugging counter */ int64_t t_ag_btree_delta; /* debugging counter */ @@ -433,7 +433,7 @@ typedef struct xfs_trans { #define xfs_trans_get_block_res(tp) ((tp)->t_blk_res) #define xfs_trans_set_sync(tp) ((tp)->t_flags |= XFS_TRANS_SYNC) -#ifdef DEBUG +#if defined(DEBUG) || defined(XFS_WARN) #define xfs_trans_agblocks_delta(tp, d) ((tp)->t_ag_freeblks_delta += (int64_t)d) #define xfs_trans_agflist_delta(tp, d) ((tp)->t_ag_flist_delta += (int64_t)d) #define xfs_trans_agbtree_delta(tp, d) ((tp)->t_ag_btree_delta += (int64_t)d) -- cgit v1.2.3-70-g09d2