From 7eaceaccab5f40bbfda044629a6298616aeaed50 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 10 Mar 2011 08:52:07 +0100 Subject: block: remove per-queue plugging Code has been converted over to the new explicit on-stack plugging, and delay users have been converted to use the new API for that. So lets kill off the old plugging along with aops->sync_page(). Signed-off-by: Jens Axboe --- fs/exofs/inode.c | 1 - 1 file changed, 1 deletion(-) (limited to 'fs/exofs/inode.c') diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c index a7555238c41..82b94c8f5d2 100644 --- a/fs/exofs/inode.c +++ b/fs/exofs/inode.c @@ -795,7 +795,6 @@ const struct address_space_operations exofs_aops = { .direct_IO = NULL, /* TODO: Should be trivial to do */ /* With these NULL has special meaning or default is not exported */ - .sync_page = NULL, .get_xip_mem = NULL, .migratepage = NULL, .launder_page = NULL, -- cgit v1.2.3-70-g09d2 From a8f1418f9e9bd4c487a7b703ff26c5dd5ceb2bf3 Mon Sep 17 00:00:00 2001 From: Boaz Harrosh Date: Mon, 22 Nov 2010 18:02:45 +0200 Subject: exofs: Optimize read_4_write Don't attempt a read passed i_size, just zero the page and be done with it. Signed-off-by: Boaz Harrosh --- fs/exofs/inode.c | 25 ++++++++++++++++++++++--- 1 file changed, 22 insertions(+), 3 deletions(-) (limited to 'fs/exofs/inode.c') diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c index a7555238c41..c8f58a96e59 100644 --- a/fs/exofs/inode.c +++ b/fs/exofs/inode.c @@ -350,8 +350,10 @@ static int readpage_strip(void *data, struct page *page) if (!pcol->read_4_write) unlock_page(page); - EXOFS_DBGMSG("readpage_strip(0x%lx, 0x%lx) empty page," - " splitting\n", inode->i_ino, page->index); + EXOFS_DBGMSG("readpage_strip(0x%lx) empty page len=%zx " + "read_4_write=%d index=0x%lx end_index=0x%lx " + "splitting\n", inode->i_ino, len, + pcol->read_4_write, page->index, end_index); return read_exec(pcol); } @@ -722,11 +724,28 @@ int exofs_write_begin(struct file *file, struct address_space *mapping, /* read modify write */ if (!PageUptodate(page) && (len != PAGE_CACHE_SIZE)) { + loff_t i_size = i_size_read(mapping->host); + pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT; + size_t rlen; + + if (page->index < end_index) + rlen = PAGE_CACHE_SIZE; + else if (page->index == end_index) + rlen = i_size & ~PAGE_CACHE_MASK; + else + rlen = 0; + + if (!rlen) { + clear_highpage(page); + SetPageUptodate(page); + goto out; + } + ret = _readpage(page, true); if (ret) { /*SetPageError was done by _readpage. Is it ok?*/ unlock_page(page); - EXOFS_DBGMSG("__readpage_filler failed\n"); + EXOFS_DBGMSG("__readpage failed\n"); } } out: -- cgit v1.2.3-70-g09d2 From 97178b7b6c84bd14660b89474d27931a1ea65c66 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Thu, 25 Nov 2010 12:47:15 +0200 Subject: exofs: simple fsync race fix It is incorrect to test inode dirty bits without participating in the inode writeback protocol. Inode writeback sets I_SYNC and clears I_DIRTY_?, then writes out the particular bits, then clears I_SYNC when it is done. BTW. it may not completely write all pages out, so I_DIRTY_PAGES would get set again. This is a standard pattern used throughout the kernel's writeback caches (I_SYNC ~= I_WRITEBACK, if that makes it clearer). And so it is not possible to determine an inode's dirty status just by checking I_DIRTY bits. Especially not for the purpose of data integrity syncs. Missing the check for these bits means that fsync can complete while writeback to the inode is underway. Inode writeback functions get this right, so call into them rather than try to shortcut things by testing dirty state improperly. Signed-off-by: Nick Piggin Signed-off-by: Boaz Harrosh --- fs/exofs/file.c | 5 ----- fs/exofs/inode.c | 3 ++- 2 files changed, 2 insertions(+), 6 deletions(-) (limited to 'fs/exofs/inode.c') diff --git a/fs/exofs/file.c b/fs/exofs/file.c index b905c79b4f0..4c0d6bac914 100644 --- a/fs/exofs/file.c +++ b/fs/exofs/file.c @@ -48,11 +48,6 @@ static int exofs_file_fsync(struct file *filp, int datasync) struct inode *inode = filp->f_mapping->host; struct super_block *sb; - if (!(inode->i_state & I_DIRTY)) - return 0; - if (datasync && !(inode->i_state & I_DIRTY_DATASYNC)) - return 0; - ret = sync_inode_metadata(inode, 1); /* This is a good place to write the sb */ diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c index c8f58a96e59..fb9d3805610 100644 --- a/fs/exofs/inode.c +++ b/fs/exofs/inode.c @@ -1290,7 +1290,8 @@ out: int exofs_write_inode(struct inode *inode, struct writeback_control *wbc) { - return exofs_update_inode(inode, wbc->sync_mode == WB_SYNC_ALL); + /* FIXME: fix fsync and use wbc->sync_mode == WB_SYNC_ALL */ + return exofs_update_inode(inode, 1); } /* -- cgit v1.2.3-70-g09d2 From 66cd6cad4919f980dd21307d0150ff251762a264 Mon Sep 17 00:00:00 2001 From: "bharrosh@panasas.com" Date: Thu, 7 Oct 2010 14:28:18 -0400 Subject: exofs: Override read-ahead to align on stripe_size * Set all inode->i_mapping->backing_dev_info to point to the per super-block sb->s_bdi. * Calculating a read_ahead that is: - preferable 2 stripes long (Future patch will add a mount option to override this) - Minimum 128K aligned up to stripe-size - Caped to maximum-IO-sizes round down to stripe_size. (Max sizes are governed by max bio-size that fits in a page times number-of-devices) CC: Marc Dionne Signed-off-by: Boaz Harrosh --- fs/exofs/exofs.h | 2 ++ fs/exofs/inode.c | 19 +++++++++++++++---- fs/exofs/super.c | 18 ++++++++++++++++++ 3 files changed, 35 insertions(+), 4 deletions(-) (limited to 'fs/exofs/inode.c') diff --git a/fs/exofs/exofs.h b/fs/exofs/exofs.h index 2dc925fa101..99fcb9126a9 100644 --- a/fs/exofs/exofs.h +++ b/fs/exofs/exofs.h @@ -256,6 +256,8 @@ static inline int exofs_oi_read(struct exofs_i_info *oi, } /* inode.c */ +unsigned exofs_max_io_pages(struct exofs_layout *layout, + unsigned expected_pages); int exofs_setattr(struct dentry *, struct iattr *); int exofs_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned flags, diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c index fb9d3805610..681b3cb9b4d 100644 --- a/fs/exofs/inode.c +++ b/fs/exofs/inode.c @@ -43,6 +43,17 @@ enum { BIO_MAX_PAGES_KMALLOC = PAGE_SIZE / sizeof(struct page *), }; +unsigned exofs_max_io_pages(struct exofs_layout *layout, + unsigned expected_pages) +{ + unsigned pages = min_t(unsigned, expected_pages, MAX_PAGES_KMALLOC); + + /* TODO: easily support bio chaining */ + pages = min_t(unsigned, pages, + layout->group_width * BIO_MAX_PAGES_KMALLOC); + return pages; +} + struct page_collect { struct exofs_sb_info *sbi; struct inode *inode; @@ -97,8 +108,7 @@ static void _pcol_reset(struct page_collect *pcol) static int pcol_try_alloc(struct page_collect *pcol) { - unsigned pages = min_t(unsigned, pcol->expected_pages, - MAX_PAGES_KMALLOC); + unsigned pages; if (!pcol->ios) { /* First time allocate io_state */ int ret = exofs_get_io_state(&pcol->sbi->layout, &pcol->ios); @@ -108,8 +118,7 @@ static int pcol_try_alloc(struct page_collect *pcol) } /* TODO: easily support bio chaining */ - pages = min_t(unsigned, pages, - pcol->sbi->layout.group_width * BIO_MAX_PAGES_KMALLOC); + pages = exofs_max_io_pages(&pcol->sbi->layout, pcol->expected_pages); for (; pages; pages >>= 1) { pcol->pages = kmalloc(pages * sizeof(struct page *), @@ -1049,6 +1058,7 @@ struct inode *exofs_iget(struct super_block *sb, unsigned long ino) memcpy(oi->i_data, fcb.i_data, sizeof(fcb.i_data)); } + inode->i_mapping->backing_dev_info = sb->s_bdi; if (S_ISREG(inode->i_mode)) { inode->i_op = &exofs_file_inode_operations; inode->i_fop = &exofs_file_operations; @@ -1149,6 +1159,7 @@ struct inode *exofs_new_inode(struct inode *dir, int mode) sbi = sb->s_fs_info; + inode->i_mapping->backing_dev_info = sb->s_bdi; sb->s_dirt = 1; inode_init_owner(inode, dir, mode); inode->i_ino = sbi->s_nextid++; diff --git a/fs/exofs/super.c b/fs/exofs/super.c index 78f5ad633d3..e87510f4749 100644 --- a/fs/exofs/super.c +++ b/fs/exofs/super.c @@ -390,6 +390,23 @@ static int _read_and_match_data_map(struct exofs_sb_info *sbi, unsigned numdevs, return 0; } +static unsigned __ra_pages(struct exofs_layout *layout) +{ + const unsigned _MIN_RA = 32; /* min 128K read-ahead */ + unsigned ra_pages = layout->group_width * layout->stripe_unit / + PAGE_SIZE; + unsigned max_io_pages = exofs_max_io_pages(layout, ~0); + + ra_pages *= 2; /* two stripes */ + if (ra_pages < _MIN_RA) + ra_pages = roundup(_MIN_RA, ra_pages / 2); + + if (ra_pages > max_io_pages) + ra_pages = max_io_pages; + + return ra_pages; +} + /* @odi is valid only as long as @fscb_dev is valid */ static int exofs_devs_2_odi(struct exofs_dt_device_info *dt_dev, struct osd_dev_info *odi) @@ -623,6 +640,7 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent) } /* set up operation vectors */ + sbi->bdi.ra_pages = __ra_pages(&sbi->layout); sb->s_bdi = &sbi->bdi; sb->s_fs_info = sbi; sb->s_op = &exofs_sops; -- cgit v1.2.3-70-g09d2 From 1cea312ad49d9cb964179a784fedb1fcfe396283 Mon Sep 17 00:00:00 2001 From: Boaz Harrosh Date: Thu, 3 Feb 2011 17:53:25 +0200 Subject: exofs: Write sbi->s_nextid as part of the Create command Before when creating a new inode, we'd set the sb->s_dirt flag, and sometime later the system would write out s_nextid as part of the sb_info. Also on inode sync we would force the sb sync as well. Define the s_nextid as a new partition attribute and set it every time we create a new object. At mount we read it from it's new place. We now never set sb->s_dirt anywhere in exofs. write_super is actually never called. The call to exofs_write_super from exofs_put_super is also removed because the VFS always calls ->sync_fs before calling ->put_super twice. To stay backward-and-forward compatible we also write the old s_nextid in the super_block object at unmount, and support zero length attribute on mount. This also fixes a BUG where in layouts when group_width was not a divisor of EXOFS_SUPER_ID (0x10000) the s_nextid was not read from the device it was written to. Because of the sliding window layout trick, and because the read was always done from the 0 device but the write was done via the raid engine that might slide the device view. Now we read and write through the raid engine. Signed-off-by: Boaz Harrosh --- fs/exofs/common.h | 18 +++++++- fs/exofs/exofs.h | 4 +- fs/exofs/file.c | 11 +---- fs/exofs/inode.c | 4 +- fs/exofs/super.c | 135 +++++++++++++++++++++++++++++++++++++++++++++++------- 5 files changed, 141 insertions(+), 31 deletions(-) (limited to 'fs/exofs/inode.c') diff --git a/fs/exofs/common.h b/fs/exofs/common.h index f0d520312d8..5e74ad3d400 100644 --- a/fs/exofs/common.h +++ b/fs/exofs/common.h @@ -53,10 +53,14 @@ #define EXOFS_ROOT_ID 0x10002 /* object ID for root directory */ /* exofs Application specific page/attribute */ +/* Inode attrs */ # define EXOFS_APAGE_FS_DATA (OSD_APAGE_APP_DEFINED_FIRST + 3) # define EXOFS_ATTR_INODE_DATA 1 # define EXOFS_ATTR_INODE_FILE_LAYOUT 2 # define EXOFS_ATTR_INODE_DIR_LAYOUT 3 +/* Partition attrs */ +# define EXOFS_APAGE_SB_DATA (0xF0000000U + 3) +# define EXOFS_ATTR_SB_STATS 1 /* * The maximum number of files we can have is limited by the size of the @@ -86,8 +90,8 @@ enum { */ enum {EXOFS_FSCB_VER = 1, EXOFS_DT_VER = 1}; struct exofs_fscb { - __le64 s_nextid; /* Highest object ID used */ - __le64 s_numfiles; /* Number of files on fs */ + __le64 s_nextid; /* Only used after mkfs */ + __le64 s_numfiles; /* Only used after mkfs */ __le32 s_version; /* == EXOFS_FSCB_VER */ __le16 s_magic; /* Magic signature */ __le16 s_newfs; /* Non-zero if this is a new fs */ @@ -97,6 +101,16 @@ struct exofs_fscb { __le64 s_dev_table_count; /* == 0 means no dev_table */ } __packed; +/* + * This struct is set on the FS partition's attributes. + * [EXOFS_APAGE_SB_DATA, EXOFS_ATTR_SB_STATS] and is written together + * with the create command, to atomically persist the sb writeable information. + */ +struct exofs_sb_stats { + __le64 s_nextid; /* Highest object ID used */ + __le64 s_numfiles; /* Number of files on fs */ +} __packed; + /* * Describes the raid used in the FS. It is part of the device table. * This here is taken from the pNFS-objects definition. In exofs we diff --git a/fs/exofs/exofs.h b/fs/exofs/exofs.h index 99fcb9126a9..c965806c282 100644 --- a/fs/exofs/exofs.h +++ b/fs/exofs/exofs.h @@ -77,7 +77,7 @@ struct exofs_layout { * our extension to the in-memory superblock */ struct exofs_sb_info { - struct exofs_fscb s_fscb; /* Written often, pre-allocate*/ + struct exofs_sb_stats s_ess; /* Written often, pre-allocate*/ int s_timeout; /* timeout for OSD operations */ uint64_t s_nextid; /* highest object ID used */ uint32_t s_numfiles; /* number of files on fs */ @@ -281,7 +281,7 @@ int exofs_set_link(struct inode *, struct exofs_dir_entry *, struct page *, struct inode *); /* super.c */ -int exofs_sync_fs(struct super_block *sb, int wait); +int exofs_sbi_write_stats(struct exofs_sb_info *sbi); /********************* * operation vectors * diff --git a/fs/exofs/file.c b/fs/exofs/file.c index 4c0d6bac914..45ca323d836 100644 --- a/fs/exofs/file.c +++ b/fs/exofs/file.c @@ -45,17 +45,8 @@ static int exofs_release_file(struct inode *inode, struct file *filp) static int exofs_file_fsync(struct file *filp, int datasync) { int ret; - struct inode *inode = filp->f_mapping->host; - struct super_block *sb; - - ret = sync_inode_metadata(inode, 1); - - /* This is a good place to write the sb */ - /* TODO: Sechedule an sb-sync on create */ - sb = inode->i_sb; - if (sb->s_dirt) - exofs_sync_fs(sb, 1); + ret = sync_inode_metadata(filp->f_mapping->host, 1); return ret; } diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c index 681b3cb9b4d..0c713cfbebf 100644 --- a/fs/exofs/inode.c +++ b/fs/exofs/inode.c @@ -1102,6 +1102,7 @@ int __exofs_wait_obj_created(struct exofs_i_info *oi) } return unlikely(is_bad_inode(&oi->vfs_inode)) ? -EIO : 0; } + /* * Callback function from exofs_new_inode(). The important thing is that we * set the obj_created flag so that other methods know that the object exists on @@ -1160,7 +1161,6 @@ struct inode *exofs_new_inode(struct inode *dir, int mode) sbi = sb->s_fs_info; inode->i_mapping->backing_dev_info = sb->s_bdi; - sb->s_dirt = 1; inode_init_owner(inode, dir, mode); inode->i_ino = sbi->s_nextid++; inode->i_blkbits = EXOFS_BLKSHIFT; @@ -1171,6 +1171,8 @@ struct inode *exofs_new_inode(struct inode *dir, int mode) spin_unlock(&sbi->s_next_gen_lock); insert_inode_hash(inode); + exofs_sbi_write_stats(sbi); /* Make sure new sbi->s_nextid is on disk */ + mark_inode_dirty(inode); ret = exofs_get_io_state(&sbi->layout, &ios); diff --git a/fs/exofs/super.c b/fs/exofs/super.c index 474989eeb7d..5eb0851e548 100644 --- a/fs/exofs/super.c +++ b/fs/exofs/super.c @@ -213,6 +213,101 @@ static void destroy_inodecache(void) static const struct super_operations exofs_sops; static const struct export_operations exofs_export_ops; +static const struct osd_attr g_attr_sb_stats = ATTR_DEF( + EXOFS_APAGE_SB_DATA, + EXOFS_ATTR_SB_STATS, + sizeof(struct exofs_sb_stats)); + +static int __sbi_read_stats(struct exofs_sb_info *sbi) +{ + struct osd_attr attrs[] = { + [0] = g_attr_sb_stats, + }; + struct exofs_io_state *ios; + int ret; + + ret = exofs_get_io_state(&sbi->layout, &ios); + if (unlikely(ret)) { + EXOFS_ERR("%s: exofs_get_io_state failed.\n", __func__); + return ret; + } + + ios->cred = sbi->s_cred; + + ios->in_attr = attrs; + ios->in_attr_len = ARRAY_SIZE(attrs); + + ret = exofs_sbi_read(ios); + if (unlikely(ret)) { + EXOFS_ERR("Error reading super_block stats => %d\n", ret); + goto out; + } + + ret = extract_attr_from_ios(ios, &attrs[0]); + if (ret) { + EXOFS_ERR("%s: extract_attr of sb_stats failed\n", __func__); + goto out; + } + if (attrs[0].len) { + struct exofs_sb_stats *ess; + + if (unlikely(attrs[0].len != sizeof(*ess))) { + EXOFS_ERR("%s: Wrong version of exofs_sb_stats " + "size(%d) != expected(%zd)\n", + __func__, attrs[0].len, sizeof(*ess)); + goto out; + } + + ess = attrs[0].val_ptr; + sbi->s_nextid = le64_to_cpu(ess->s_nextid); + sbi->s_numfiles = le32_to_cpu(ess->s_numfiles); + } + +out: + exofs_put_io_state(ios); + return ret; +} + +static void stats_done(struct exofs_io_state *ios, void *p) +{ + exofs_put_io_state(ios); + /* Good thanks nothing to do anymore */ +} + +/* Asynchronously write the stats attribute */ +int exofs_sbi_write_stats(struct exofs_sb_info *sbi) +{ + struct osd_attr attrs[] = { + [0] = g_attr_sb_stats, + }; + struct exofs_io_state *ios; + int ret; + + ret = exofs_get_io_state(&sbi->layout, &ios); + if (unlikely(ret)) { + EXOFS_ERR("%s: exofs_get_io_state failed.\n", __func__); + return ret; + } + + sbi->s_ess.s_nextid = cpu_to_le64(sbi->s_nextid); + sbi->s_ess.s_numfiles = cpu_to_le64(sbi->s_numfiles); + attrs[0].val_ptr = &sbi->s_ess; + + ios->cred = sbi->s_cred; + ios->done = stats_done; + ios->private = sbi; + ios->out_attr = attrs; + ios->out_attr_len = ARRAY_SIZE(attrs); + + ret = exofs_sbi_write(ios); + if (unlikely(ret)) { + EXOFS_ERR("%s: exofs_sbi_write failed.\n", __func__); + exofs_put_io_state(ios); + } + + return ret; +} + /* * Write the superblock to the OSD */ @@ -223,18 +318,25 @@ int exofs_sync_fs(struct super_block *sb, int wait) struct exofs_io_state *ios; int ret = -ENOMEM; - lock_super(sb); + fscb = kmalloc(sizeof(*fscb), GFP_KERNEL); + if (unlikely(!fscb)) + return -ENOMEM; + sbi = sb->s_fs_info; - fscb = &sbi->s_fscb; + /* NOTE: We no longer dirty the super_block anywhere in exofs. The + * reason we write the fscb here on unmount is so we can stay backwards + * compatible with fscb->s_version == 1. (What we are not compatible + * with is if a new version FS crashed and then we try to mount an old + * version). Otherwise the exofs_fscb is read-only from mkfs time. All + * the writeable info is set in exofs_sbi_write_stats() above. + */ ret = exofs_get_io_state(&sbi->layout, &ios); - if (ret) + if (unlikely(ret)) goto out; - /* Note: We only write the changing part of the fscb. .i.e upto the - * the fscb->s_dev_table_oid member. There is no read-modify-write - * here. - */ + lock_super(sb); + ios->length = offsetof(struct exofs_fscb, s_dev_table_oid); memset(fscb, 0, ios->length); fscb->s_nextid = cpu_to_le64(sbi->s_nextid); @@ -249,16 +351,17 @@ int exofs_sync_fs(struct super_block *sb, int wait) ios->cred = sbi->s_cred; ret = exofs_sbi_write(ios); - if (unlikely(ret)) { + if (unlikely(ret)) EXOFS_ERR("%s: exofs_sbi_write failed.\n", __func__); - goto out; - } - sb->s_dirt = 0; + else + sb->s_dirt = 0; + + unlock_super(sb); out: EXOFS_DBGMSG("s_nextid=0x%llx ret=%d\n", _LLU(sbi->s_nextid), ret); exofs_put_io_state(ios); - unlock_super(sb); + kfree(fscb); return ret; } @@ -302,9 +405,6 @@ static void exofs_put_super(struct super_block *sb) int num_pend; struct exofs_sb_info *sbi = sb->s_fs_info; - if (sb->s_dirt) - exofs_write_super(sb); - /* make sure there are no pending commands */ for (num_pend = atomic_read(&sbi->s_curr_pending); num_pend > 0; num_pend = atomic_read(&sbi->s_curr_pending)) { @@ -629,6 +729,7 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent) goto free_sbi; sb->s_magic = le16_to_cpu(fscb.s_magic); + /* NOTE: we read below to be backward compatible with old versions */ sbi->s_nextid = le64_to_cpu(fscb.s_nextid); sbi->s_numfiles = le32_to_cpu(fscb.s_numfiles); @@ -639,7 +740,7 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent) ret = -EINVAL; goto free_sbi; } - if (le32_to_cpu(fscb.s_version) != EXOFS_FSCB_VER) { + if (le32_to_cpu(fscb.s_version) > EXOFS_FSCB_VER) { EXOFS_ERR("ERROR: Bad FSCB version expected-%d got-%d\n", EXOFS_FSCB_VER, le32_to_cpu(fscb.s_version)); ret = -EINVAL; @@ -657,6 +758,8 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent) goto free_sbi; } + __sbi_read_stats(sbi); + /* set up operation vectors */ sbi->bdi.ra_pages = __ra_pages(&sbi->layout); sb->s_bdi = &sbi->bdi; -- cgit v1.2.3-70-g09d2