From 527851124d10f9c50b1c578e0a56fcd49922422d Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 16 Feb 2015 11:49:23 +1100 Subject: xfs: implement pNFS export operations Add operations to export pNFS block layouts from an XFS filesystem. See the previous commit adding the operations for an explanation of them. Signed-off-by: Christoph Hellwig Reviewed-by: Dave Chinner Signed-off-by: Dave Chinner --- fs/xfs/Makefile | 1 + fs/xfs/xfs_export.c | 6 ++ fs/xfs/xfs_fsops.c | 6 ++ fs/xfs/xfs_iops.c | 2 +- fs/xfs/xfs_iops.h | 1 + fs/xfs/xfs_mount.h | 11 ++ fs/xfs/xfs_pnfs.c | 292 ++++++++++++++++++++++++++++++++++++++++++++++++++++ fs/xfs/xfs_pnfs.h | 11 ++ 8 files changed, 329 insertions(+), 1 deletion(-) create mode 100644 fs/xfs/xfs_pnfs.c create mode 100644 fs/xfs/xfs_pnfs.h diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile index d6179994958..df6828570e8 100644 --- a/fs/xfs/Makefile +++ b/fs/xfs/Makefile @@ -121,3 +121,4 @@ xfs-$(CONFIG_XFS_POSIX_ACL) += xfs_acl.o xfs-$(CONFIG_PROC_FS) += xfs_stats.o xfs-$(CONFIG_SYSCTL) += xfs_sysctl.o xfs-$(CONFIG_COMPAT) += xfs_ioctl32.o +xfs-$(CONFIG_NFSD_PNFS) += xfs_pnfs.o diff --git a/fs/xfs/xfs_export.c b/fs/xfs/xfs_export.c index 5eb4a14e0a0..b97359ba264 100644 --- a/fs/xfs/xfs_export.c +++ b/fs/xfs/xfs_export.c @@ -30,6 +30,7 @@ #include "xfs_trace.h" #include "xfs_icache.h" #include "xfs_log.h" +#include "xfs_pnfs.h" /* * Note that we only accept fileids which are long enough rather than allow @@ -245,4 +246,9 @@ const struct export_operations xfs_export_operations = { .fh_to_parent = xfs_fs_fh_to_parent, .get_parent = xfs_fs_get_parent, .commit_metadata = xfs_fs_nfs_commit_metadata, +#ifdef CONFIG_NFSD_PNFS + .get_uuid = xfs_fs_get_uuid, + .map_blocks = xfs_fs_map_blocks, + .commit_blocks = xfs_fs_commit_blocks, +#endif }; diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index fba6532efba..74efe5b760d 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c @@ -602,6 +602,12 @@ xfs_growfs_data( if (!mutex_trylock(&mp->m_growlock)) return -EWOULDBLOCK; error = xfs_growfs_data_private(mp, in); + /* + * Increment the generation unconditionally, the error could be from + * updating the secondary superblocks, in which case the new size + * is live already. + */ + mp->m_generation++; mutex_unlock(&mp->m_growlock); return error; } diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index ce80eeb8faa..e5e2ea0d0b2 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -505,7 +505,7 @@ xfs_setattr_mode( inode->i_mode |= mode & ~S_IFMT; } -static void +void xfs_setattr_time( struct xfs_inode *ip, struct iattr *iattr) diff --git a/fs/xfs/xfs_iops.h b/fs/xfs/xfs_iops.h index 1c34e433592..ea7a98e9cb7 100644 --- a/fs/xfs/xfs_iops.h +++ b/fs/xfs/xfs_iops.h @@ -32,6 +32,7 @@ extern void xfs_setup_inode(struct xfs_inode *); */ #define XFS_ATTR_NOACL 0x01 /* Don't call posix_acl_chmod */ +extern void xfs_setattr_time(struct xfs_inode *ip, struct iattr *iattr); extern int xfs_setattr_nonsize(struct xfs_inode *ip, struct iattr *vap, int flags); extern int xfs_setattr_size(struct xfs_inode *ip, struct iattr *vap); diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index a5b2ff82265..0d8abd6364d 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -174,6 +174,17 @@ typedef struct xfs_mount { struct workqueue_struct *m_reclaim_workqueue; struct workqueue_struct *m_log_workqueue; struct workqueue_struct *m_eofblocks_workqueue; + + /* + * Generation of the filesysyem layout. This is incremented by each + * growfs, and used by the pNFS server to ensure the client updates + * its view of the block device once it gets a layout that might + * reference the newly added blocks. Does not need to be persistent + * as long as we only allow file system size increments, but if we + * ever support shrinks it would have to be persisted in addition + * to various other kinds of pain inflicted on the pNFS server. + */ + __uint32_t m_generation; } xfs_mount_t; /* diff --git a/fs/xfs/xfs_pnfs.c b/fs/xfs/xfs_pnfs.c new file mode 100644 index 00000000000..89912b34f18 --- /dev/null +++ b/fs/xfs/xfs_pnfs.c @@ -0,0 +1,292 @@ +/* + * Copyright (c) 2014 Christoph Hellwig. + */ +#include "xfs.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_sb.h" +#include "xfs_mount.h" +#include "xfs_inode.h" +#include "xfs_trans.h" +#include "xfs_log.h" +#include "xfs_bmap.h" +#include "xfs_bmap_util.h" +#include "xfs_error.h" +#include "xfs_iomap.h" +#include "xfs_shared.h" +#include "xfs_bit.h" +#include "xfs_pnfs.h" + +/* + * Get a unique ID including its location so that the client can identify + * the exported device. + */ +int +xfs_fs_get_uuid( + struct super_block *sb, + u8 *buf, + u32 *len, + u64 *offset) +{ + struct xfs_mount *mp = XFS_M(sb); + + printk_once(KERN_NOTICE +"XFS (%s): using experimental pNFS feature, use at your own risk!\n", + mp->m_fsname); + + if (*len < sizeof(uuid_t)) + return -EINVAL; + + memcpy(buf, &mp->m_sb.sb_uuid, sizeof(uuid_t)); + *len = sizeof(uuid_t); + *offset = offsetof(struct xfs_dsb, sb_uuid); + return 0; +} + +static void +xfs_bmbt_to_iomap( + struct xfs_inode *ip, + struct iomap *iomap, + struct xfs_bmbt_irec *imap) +{ + struct xfs_mount *mp = ip->i_mount; + + if (imap->br_startblock == HOLESTARTBLOCK) { + iomap->blkno = IOMAP_NULL_BLOCK; + iomap->type = IOMAP_HOLE; + } else if (imap->br_startblock == DELAYSTARTBLOCK) { + iomap->blkno = IOMAP_NULL_BLOCK; + iomap->type = IOMAP_DELALLOC; + } else { + iomap->blkno = + XFS_FSB_TO_DADDR(ip->i_mount, imap->br_startblock); + if (imap->br_state == XFS_EXT_UNWRITTEN) + iomap->type = IOMAP_UNWRITTEN; + else + iomap->type = IOMAP_MAPPED; + } + iomap->offset = XFS_FSB_TO_B(mp, imap->br_startoff); + iomap->length = XFS_FSB_TO_B(mp, imap->br_blockcount); +} + +/* + * Get a layout for the pNFS client. + */ +int +xfs_fs_map_blocks( + struct inode *inode, + loff_t offset, + u64 length, + struct iomap *iomap, + bool write, + u32 *device_generation) +{ + struct xfs_inode *ip = XFS_I(inode); + struct xfs_mount *mp = ip->i_mount; + struct xfs_bmbt_irec imap; + xfs_fileoff_t offset_fsb, end_fsb; + loff_t limit; + int bmapi_flags = XFS_BMAPI_ENTIRE; + int nimaps = 1; + uint lock_flags; + int error = 0; + + if (XFS_FORCED_SHUTDOWN(mp)) + return -EIO; + + /* + * We can't export inodes residing on the realtime device. The realtime + * device doesn't have a UUID to identify it, so the client has no way + * to find it. + */ + if (XFS_IS_REALTIME_INODE(ip)) + return -ENXIO; + + /* + * Lock out any other I/O before we flush and invalidate the pagecache, + * and then hand out a layout to the remote system. This is very + * similar to direct I/O, except that the synchronization is much more + * complicated. See the comment near xfs_break_layouts for a detailed + * explanation. + */ + xfs_ilock(ip, XFS_IOLOCK_EXCL); + + error = -EINVAL; + limit = mp->m_super->s_maxbytes; + if (!write) + limit = max(limit, round_up(i_size_read(inode), + inode->i_sb->s_blocksize)); + if (offset > limit) + goto out_unlock; + if (offset > limit - length) + length = limit - offset; + + error = filemap_write_and_wait(inode->i_mapping); + if (error) + goto out_unlock; + error = invalidate_inode_pages2(inode->i_mapping); + if (WARN_ON_ONCE(error)) + return error; + + end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + length); + offset_fsb = XFS_B_TO_FSBT(mp, offset); + + lock_flags = xfs_ilock_data_map_shared(ip); + error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb, + &imap, &nimaps, bmapi_flags); + xfs_iunlock(ip, lock_flags); + + if (error) + goto out_unlock; + + if (write) { + enum xfs_prealloc_flags flags = 0; + + ASSERT(imap.br_startblock != DELAYSTARTBLOCK); + + if (!nimaps || imap.br_startblock == HOLESTARTBLOCK) { + error = xfs_iomap_write_direct(ip, offset, length, + &imap, nimaps); + if (error) + goto out_unlock; + + /* + * Ensure the next transaction is committed + * synchronously so that the blocks allocated and + * handed out to the client are guaranteed to be + * present even after a server crash. + */ + flags |= XFS_PREALLOC_SET | XFS_PREALLOC_SYNC; + } + + error = xfs_update_prealloc_flags(ip, flags); + if (error) + goto out_unlock; + } + xfs_iunlock(ip, XFS_IOLOCK_EXCL); + + xfs_bmbt_to_iomap(ip, iomap, &imap); + *device_generation = mp->m_generation; + return error; +out_unlock: + xfs_iunlock(ip, XFS_IOLOCK_EXCL); + return error; +} + +/* + * Ensure the size update falls into a valid allocated block. + */ +static int +xfs_pnfs_validate_isize( + struct xfs_inode *ip, + xfs_off_t isize) +{ + struct xfs_bmbt_irec imap; + int nimaps = 1; + int error = 0; + + xfs_ilock(ip, XFS_ILOCK_SHARED); + error = xfs_bmapi_read(ip, XFS_B_TO_FSBT(ip->i_mount, isize - 1), 1, + &imap, &nimaps, 0); + xfs_iunlock(ip, XFS_ILOCK_SHARED); + if (error) + return error; + + if (imap.br_startblock == HOLESTARTBLOCK || + imap.br_startblock == DELAYSTARTBLOCK || + imap.br_state == XFS_EXT_UNWRITTEN) + return -EIO; + return 0; +} + +/* + * Make sure the blocks described by maps are stable on disk. This includes + * converting any unwritten extents, flushing the disk cache and updating the + * time stamps. + * + * Note that we rely on the caller to always send us a timestamp update so that + * we always commit a transaction here. If that stops being true we will have + * to manually flush the cache here similar to what the fsync code path does + * for datasyncs on files that have no dirty metadata. + */ +int +xfs_fs_commit_blocks( + struct inode *inode, + struct iomap *maps, + int nr_maps, + struct iattr *iattr) +{ + struct xfs_inode *ip = XFS_I(inode); + struct xfs_mount *mp = ip->i_mount; + struct xfs_trans *tp; + bool update_isize = false; + int error, i; + loff_t size; + + ASSERT(iattr->ia_valid & (ATTR_ATIME|ATTR_CTIME|ATTR_MTIME)); + + xfs_ilock(ip, XFS_IOLOCK_EXCL); + + size = i_size_read(inode); + if ((iattr->ia_valid & ATTR_SIZE) && iattr->ia_size > size) { + update_isize = true; + size = iattr->ia_size; + } + + for (i = 0; i < nr_maps; i++) { + u64 start, length, end; + + start = maps[i].offset; + if (start > size) + continue; + + end = start + maps[i].length; + if (end > size) + end = size; + + length = end - start; + if (!length) + continue; + + /* + * Make sure reads through the pagecache see the new data. + */ + error = invalidate_inode_pages2_range(inode->i_mapping, + start >> PAGE_CACHE_SHIFT, + (end - 1) >> PAGE_CACHE_SHIFT); + WARN_ON_ONCE(error); + + error = xfs_iomap_write_unwritten(ip, start, length); + if (error) + goto out_drop_iolock; + } + + if (update_isize) { + error = xfs_pnfs_validate_isize(ip, size); + if (error) + goto out_drop_iolock; + } + + tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_NOT_SIZE); + error = xfs_trans_reserve(tp, &M_RES(mp)->tr_ichange, 0, 0); + if (error) + goto out_drop_iolock; + + xfs_ilock(ip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); + + xfs_setattr_time(ip, iattr); + if (update_isize) { + i_size_write(inode, iattr->ia_size); + ip->i_d.di_size = iattr->ia_size; + } + + xfs_trans_set_sync(tp); + error = xfs_trans_commit(tp, 0); + +out_drop_iolock: + xfs_iunlock(ip, XFS_IOLOCK_EXCL); + return error; +} diff --git a/fs/xfs/xfs_pnfs.h b/fs/xfs/xfs_pnfs.h new file mode 100644 index 00000000000..0d91255a89a --- /dev/null +++ b/fs/xfs/xfs_pnfs.h @@ -0,0 +1,11 @@ +#ifndef _XFS_PNFS_H +#define _XFS_PNFS_H 1 + +#ifdef CONFIG_NFSD_PNFS +int xfs_fs_get_uuid(struct super_block *sb, u8 *buf, u32 *len, u64 *offset); +int xfs_fs_map_blocks(struct inode *inode, loff_t offset, u64 length, + struct iomap *iomap, bool write, u32 *device_generation); +int xfs_fs_commit_blocks(struct inode *inode, struct iomap *maps, int nr_maps, + struct iattr *iattr); +#endif /* CONFIG_NFSD_PNFS */ +#endif /* _XFS_PNFS_H */ -- cgit v1.2.3-70-g09d2 From 781355c6e5ae87908de27dec3380a34918c33eee Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 16 Feb 2015 11:59:50 +1100 Subject: xfs: recall pNFS layouts on conflicting access Recall all outstanding pNFS layouts and truncates, writes and similar extent list modifying operations. Signed-off-by: Christoph Hellwig Reviewed-by: Dave Chinner Signed-off-by: Dave Chinner --- fs/xfs/xfs_file.c | 14 ++++++++++++-- fs/xfs/xfs_ioctl.c | 9 +++++++-- fs/xfs/xfs_iops.c | 11 ++++++++--- fs/xfs/xfs_pnfs.c | 30 ++++++++++++++++++++++++++++++ fs/xfs/xfs_pnfs.h | 7 +++++++ 5 files changed, 64 insertions(+), 7 deletions(-) diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 712d312d8e3..56dcfce8d7d 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -36,6 +36,7 @@ #include "xfs_trace.h" #include "xfs_log.h" #include "xfs_icache.h" +#include "xfs_pnfs.h" #include #include @@ -554,6 +555,10 @@ restart: if (error) return error; + error = xfs_break_layouts(inode, iolock); + if (error) + return error; + /* * If the offset is beyond the size of the file, we need to zero any * blocks that fall between the existing EOF and the start of this @@ -822,6 +827,7 @@ xfs_file_fallocate( struct xfs_inode *ip = XFS_I(inode); long error; enum xfs_prealloc_flags flags = 0; + uint iolock = XFS_IOLOCK_EXCL; loff_t new_size = 0; if (!S_ISREG(inode->i_mode)) @@ -830,7 +836,11 @@ xfs_file_fallocate( FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE)) return -EOPNOTSUPP; - xfs_ilock(ip, XFS_IOLOCK_EXCL); + xfs_ilock(ip, iolock); + error = xfs_break_layouts(inode, &iolock); + if (error) + goto out_unlock; + if (mode & FALLOC_FL_PUNCH_HOLE) { error = xfs_free_file_space(ip, offset, len); if (error) @@ -894,7 +904,7 @@ xfs_file_fallocate( } out_unlock: - xfs_iunlock(ip, XFS_IOLOCK_EXCL); + xfs_iunlock(ip, iolock); return error; } diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index f7afb86c914..bf70a2affb0 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -39,6 +39,7 @@ #include "xfs_icache.h" #include "xfs_symlink.h" #include "xfs_trans.h" +#include "xfs_pnfs.h" #include #include @@ -608,6 +609,7 @@ xfs_ioc_space( { struct iattr iattr; enum xfs_prealloc_flags flags = 0; + uint iolock = XFS_IOLOCK_EXCL; int error; /* @@ -636,7 +638,10 @@ xfs_ioc_space( if (error) return error; - xfs_ilock(ip, XFS_IOLOCK_EXCL); + xfs_ilock(ip, iolock); + error = xfs_break_layouts(inode, &iolock); + if (error) + goto out_unlock; switch (bf->l_whence) { case 0: /*SEEK_SET*/ @@ -725,7 +730,7 @@ xfs_ioc_space( error = xfs_update_prealloc_flags(ip, flags); out_unlock: - xfs_iunlock(ip, XFS_IOLOCK_EXCL); + xfs_iunlock(ip, iolock); mnt_drop_write_file(filp); return error; } diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index e5e2ea0d0b2..d919ad7b16b 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -37,6 +37,7 @@ #include "xfs_da_btree.h" #include "xfs_dir2.h" #include "xfs_trans_space.h" +#include "xfs_pnfs.h" #include #include @@ -979,9 +980,13 @@ xfs_vn_setattr( int error; if (iattr->ia_valid & ATTR_SIZE) { - xfs_ilock(ip, XFS_IOLOCK_EXCL); - error = xfs_setattr_size(ip, iattr); - xfs_iunlock(ip, XFS_IOLOCK_EXCL); + uint iolock = XFS_IOLOCK_EXCL; + + xfs_ilock(ip, iolock); + error = xfs_break_layouts(dentry->d_inode, &iolock); + if (!error) + error = xfs_setattr_size(ip, iattr); + xfs_iunlock(ip, iolock); } else { error = xfs_setattr_nonsize(ip, iattr, 0); } diff --git a/fs/xfs/xfs_pnfs.c b/fs/xfs/xfs_pnfs.c index 89912b34f18..4b33ef11240 100644 --- a/fs/xfs/xfs_pnfs.c +++ b/fs/xfs/xfs_pnfs.c @@ -18,6 +18,36 @@ #include "xfs_bit.h" #include "xfs_pnfs.h" +/* + * Ensure that we do not have any outstanding pNFS layouts that can be used by + * clients to directly read from or write to this inode. This must be called + * before every operation that can remove blocks from the extent map. + * Additionally we call it during the write operation, where aren't concerned + * about exposing unallocated blocks but just want to provide basic + * synchronization between a local writer and pNFS clients. mmap writes would + * also benefit from this sort of synchronization, but due to the tricky locking + * rules in the page fault path we don't bother. + */ +int +xfs_break_layouts( + struct inode *inode, + uint *iolock) +{ + struct xfs_inode *ip = XFS_I(inode); + int error; + + ASSERT(xfs_isilocked(ip, XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL)); + + while ((error = break_layout(inode, false) == -EWOULDBLOCK)) { + xfs_iunlock(ip, *iolock); + error = break_layout(inode, true); + *iolock = XFS_IOLOCK_EXCL; + xfs_ilock(ip, *iolock); + } + + return error; +} + /* * Get a unique ID including its location so that the client can identify * the exported device. diff --git a/fs/xfs/xfs_pnfs.h b/fs/xfs/xfs_pnfs.h index 0d91255a89a..b7fbfce660f 100644 --- a/fs/xfs/xfs_pnfs.h +++ b/fs/xfs/xfs_pnfs.h @@ -7,5 +7,12 @@ int xfs_fs_map_blocks(struct inode *inode, loff_t offset, u64 length, struct iomap *iomap, bool write, u32 *device_generation); int xfs_fs_commit_blocks(struct inode *inode, struct iomap *maps, int nr_maps, struct iattr *iattr); + +int xfs_break_layouts(struct inode *inode, uint *iolock); +#else +static inline int xfs_break_layouts(struct inode *inode, uint *iolock) +{ + return 0; +} #endif /* CONFIG_NFSD_PNFS */ #endif /* _XFS_PNFS_H */ -- cgit v1.2.3-70-g09d2