From 518f167a37b3c53f3cf44d27800455ca24e920f6 Mon Sep 17 00:00:00 2001 From: Boaz Harrosh Date: Thu, 21 Jan 2010 20:00:02 +0200 Subject: exofs: Micro-optimize exofs_i_info optimize the exofs_i_info struct usage by moving the embedded vfs_inode to be first. A compiler might optimize away an "add" operation with constant zero. (Which it cannot with other constants) Signed-off-by: Boaz Harrosh --- fs/exofs/exofs.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs/exofs/exofs.h') diff --git a/fs/exofs/exofs.h b/fs/exofs/exofs.h index c35fd462398..13663da2b11 100644 --- a/fs/exofs/exofs.h +++ b/fs/exofs/exofs.h @@ -78,13 +78,13 @@ struct exofs_sb_info { * our extension to the in-memory inode */ struct exofs_i_info { + struct inode vfs_inode; /* normal in-memory inode */ + wait_queue_head_t i_wq; /* wait queue for inode */ unsigned long i_flags; /* various atomic flags */ uint32_t i_data[EXOFS_IDATA];/*short symlink names and device #s*/ uint32_t i_dir_start_lookup; /* which page to start lookup */ - wait_queue_head_t i_wq; /* wait queue for inode */ uint64_t i_commit_size; /* the object's written length */ uint8_t i_cred[OSD_CAP_LEN];/* all-powerful credential */ - struct inode vfs_inode; /* normal in-memory inode */ }; static inline osd_id exofs_oi_objno(struct exofs_i_info *oi) -- cgit v1.2.3-70-g09d2 From 45d3abcb1a7388b2b97582e13bf9dd21784dcaa5 Mon Sep 17 00:00:00 2001 From: Boaz Harrosh Date: Thu, 28 Jan 2010 11:46:16 +0200 Subject: exofs: Move layout related members to a layout structure * Abstract away those members in exofs_sb_info that are related/needed by a layout into a new exofs_layout structure. Embed it in exofs_sb_info. * At exofs_io_state receive/keep a pointer to an exofs_layout. No need for an exofs_sb_info pointer, all we need is at exofs_layout. * Change any usage of above exofs_sb_info members to their new name. Signed-off-by: Boaz Harrosh --- fs/exofs/exofs.h | 24 ++++++++++++++++++------ fs/exofs/inode.c | 14 +++++++------- fs/exofs/ios.c | 36 +++++++++++++++++++----------------- fs/exofs/super.c | 51 +++++++++++++++++++++++++++------------------------ 4 files changed, 71 insertions(+), 54 deletions(-) (limited to 'fs/exofs/exofs.h') diff --git a/fs/exofs/exofs.h b/fs/exofs/exofs.h index 13663da2b11..33c68568b33 100644 --- a/fs/exofs/exofs.h +++ b/fs/exofs/exofs.h @@ -55,12 +55,18 @@ /* u64 has problems with printk this will cast it to unsigned long long */ #define _LLU(x) (unsigned long long)(x) +struct exofs_layout { + osd_id s_pid; /* partition ID of file system*/ + + unsigned s_numdevs; /* Num of devices in array */ + struct osd_dev *s_ods[0]; /* Variable length */ +}; + /* * our extension to the in-memory superblock */ struct exofs_sb_info { struct exofs_fscb s_fscb; /* Written often, pre-allocate*/ - osd_id s_pid; /* partition ID of file system*/ int s_timeout; /* timeout for OSD operations */ uint64_t s_nextid; /* highest object ID used */ uint32_t s_numfiles; /* number of files on fs */ @@ -69,9 +75,14 @@ struct exofs_sb_info { atomic_t s_curr_pending; /* number of pending commands */ uint8_t s_cred[OSD_CAP_LEN]; /* credential for the fscb */ - struct pnfs_osd_data_map data_map; /* Default raid to use */ - unsigned s_numdevs; /* Num of devices in array */ - struct osd_dev *s_ods[1]; /* Variable length, minimum 1 */ + struct pnfs_osd_data_map data_map; /* Default raid to use + * FIXME: Needed ? + */ +/* struct exofs_layout dir_layout;*/ /* Default dir layout */ + struct exofs_layout layout; /* Default files layout, + * contains the variable osd_dev + * array. Keep last */ + struct osd_dev *_min_one_dev[1]; /* Place holder for one dev */ }; /* @@ -101,7 +112,7 @@ struct exofs_io_state { void *private; exofs_io_done_fn done; - struct exofs_sb_info *sbi; + struct exofs_layout *layout; struct osd_obj_id obj; u8 *cred; @@ -189,7 +200,8 @@ void exofs_make_credential(u8 cred_a[OSD_CAP_LEN], int exofs_read_kern(struct osd_dev *od, u8 *cred, struct osd_obj_id *obj, u64 offset, void *p, unsigned length); -int exofs_get_io_state(struct exofs_sb_info *sbi, struct exofs_io_state** ios); +int exofs_get_io_state(struct exofs_layout *layout, + struct exofs_io_state **ios); void exofs_put_io_state(struct exofs_io_state *ios); int exofs_check_io(struct exofs_io_state *ios, u64 *resid); diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c index c88a0c5250c..03189a958b3 100644 --- a/fs/exofs/inode.c +++ b/fs/exofs/inode.c @@ -65,7 +65,7 @@ static void _pcol_init(struct page_collect *pcol, unsigned expected_pages, /* Create master bios on first Q, later on cloning, each clone will be * allocated on it's destination Q */ - pcol->req_q = osd_request_queue(sbi->s_ods[0]); + pcol->req_q = osd_request_queue(sbi->layout.s_ods[0]); pcol->inode = inode; pcol->expected_pages = expected_pages; @@ -99,7 +99,7 @@ static int pcol_try_alloc(struct page_collect *pcol) BIO_MAX_PAGES_KMALLOC); if (!pcol->ios) { /* First time allocate io_state */ - int ret = exofs_get_io_state(pcol->sbi, &pcol->ios); + int ret = exofs_get_io_state(&pcol->sbi->layout, &pcol->ios); if (ret) return ret; @@ -872,7 +872,7 @@ static int exofs_get_inode(struct super_block *sb, struct exofs_i_info *oi, int ret; *obj_size = ~0; - ret = exofs_get_io_state(sbi, &ios); + ret = exofs_get_io_state(&sbi->layout, &ios); if (unlikely(ret)) { EXOFS_ERR("%s: exofs_get_io_state failed.\n", __func__); return ret; @@ -1043,7 +1043,7 @@ static void create_done(struct exofs_io_state *ios, void *p) if (unlikely(ret)) { EXOFS_ERR("object=0x%llx creation faild in pid=0x%llx", - _LLU(exofs_oi_objno(oi)), _LLU(sbi->s_pid)); + _LLU(exofs_oi_objno(oi)), _LLU(sbi->layout.s_pid)); /*TODO: When FS is corrupted creation can fail, object already * exist. Get rid of this asynchronous creation, if exist * increment the obj counter and try the next object. Until we @@ -1104,7 +1104,7 @@ struct inode *exofs_new_inode(struct inode *dir, int mode) mark_inode_dirty(inode); - ret = exofs_get_io_state(sbi, &ios); + ret = exofs_get_io_state(&sbi->layout, &ios); if (unlikely(ret)) { EXOFS_ERR("exofs_new_inode: exofs_get_io_state failed\n"); return ERR_PTR(ret); @@ -1202,7 +1202,7 @@ static int exofs_update_inode(struct inode *inode, int do_sync) } else memcpy(fcb->i_data, oi->i_data, sizeof(fcb->i_data)); - ret = exofs_get_io_state(sbi, &ios); + ret = exofs_get_io_state(&sbi->layout, &ios); if (unlikely(ret)) { EXOFS_ERR("%s: exofs_get_io_state failed.\n", __func__); goto free_args; @@ -1286,7 +1286,7 @@ void exofs_delete_inode(struct inode *inode) clear_inode(inode); - ret = exofs_get_io_state(sbi, &ios); + ret = exofs_get_io_state(&sbi->layout, &ios); if (unlikely(ret)) { EXOFS_ERR("%s: exofs_get_io_state failed\n", __func__); return; diff --git a/fs/exofs/ios.c b/fs/exofs/ios.c index 439c5d097b2..83e54a77b99 100644 --- a/fs/exofs/ios.c +++ b/fs/exofs/ios.c @@ -67,23 +67,24 @@ out: return ret; } -int exofs_get_io_state(struct exofs_sb_info *sbi, struct exofs_io_state** pios) +int exofs_get_io_state(struct exofs_layout *layout, + struct exofs_io_state **pios) { struct exofs_io_state *ios; /*TODO: Maybe use kmem_cach per sbi of size - * exofs_io_state_size(sbi->s_numdevs) + * exofs_io_state_size(layout->s_numdevs) */ - ios = kzalloc(exofs_io_state_size(sbi->s_numdevs), GFP_KERNEL); + ios = kzalloc(exofs_io_state_size(layout->s_numdevs), GFP_KERNEL); if (unlikely(!ios)) { EXOFS_DBGMSG("Faild kzalloc bytes=%d\n", - exofs_io_state_size(sbi->s_numdevs)); + exofs_io_state_size(layout->s_numdevs)); *pios = NULL; return -ENOMEM; } - ios->sbi = sbi; - ios->obj.partition = sbi->s_pid; + ios->layout = layout; + ios->obj.partition = layout->s_pid; *pios = ios; return 0; } @@ -238,10 +239,10 @@ int exofs_sbi_create(struct exofs_io_state *ios) { int i, ret; - for (i = 0; i < ios->sbi->s_numdevs; i++) { + for (i = 0; i < ios->layout->s_numdevs; i++) { struct osd_request *or; - or = osd_start_request(ios->sbi->s_ods[i], GFP_KERNEL); + or = osd_start_request(ios->layout->s_ods[i], GFP_KERNEL); if (unlikely(!or)) { EXOFS_ERR("%s: osd_start_request failed\n", __func__); ret = -ENOMEM; @@ -262,10 +263,10 @@ int exofs_sbi_remove(struct exofs_io_state *ios) { int i, ret; - for (i = 0; i < ios->sbi->s_numdevs; i++) { + for (i = 0; i < ios->layout->s_numdevs; i++) { struct osd_request *or; - or = osd_start_request(ios->sbi->s_ods[i], GFP_KERNEL); + or = osd_start_request(ios->layout->s_ods[i], GFP_KERNEL); if (unlikely(!or)) { EXOFS_ERR("%s: osd_start_request failed\n", __func__); ret = -ENOMEM; @@ -286,10 +287,10 @@ int exofs_sbi_write(struct exofs_io_state *ios) { int i, ret; - for (i = 0; i < ios->sbi->s_numdevs; i++) { + for (i = 0; i < ios->layout->s_numdevs; i++) { struct osd_request *or; - or = osd_start_request(ios->sbi->s_ods[i], GFP_KERNEL); + or = osd_start_request(ios->layout->s_ods[i], GFP_KERNEL); if (unlikely(!or)) { EXOFS_ERR("%s: osd_start_request failed\n", __func__); ret = -ENOMEM; @@ -361,8 +362,9 @@ int exofs_sbi_read(struct exofs_io_state *ios) struct osd_request *or; unsigned first_dev = (unsigned)ios->obj.id; - first_dev %= ios->sbi->s_numdevs; - or = osd_start_request(ios->sbi->s_ods[first_dev], GFP_KERNEL); + first_dev %= ios->layout->s_numdevs; + or = osd_start_request(ios->layout->s_ods[first_dev], + GFP_KERNEL); if (unlikely(!or)) { EXOFS_ERR("%s: osd_start_request failed\n", __func__); ret = -ENOMEM; @@ -438,7 +440,7 @@ int exofs_oi_truncate(struct exofs_i_info *oi, u64 size) __be64 newsize; int i, ret; - if (exofs_get_io_state(sbi, &ios)) + if (exofs_get_io_state(&sbi->layout, &ios)) return -ENOMEM; ios->obj.id = exofs_oi_objno(oi); @@ -448,10 +450,10 @@ int exofs_oi_truncate(struct exofs_i_info *oi, u64 size) attr = g_attr_logical_length; attr.val_ptr = &newsize; - for (i = 0; i < sbi->s_numdevs; i++) { + for (i = 0; i < sbi->layout.s_numdevs; i++) { struct osd_request *or; - or = osd_start_request(sbi->s_ods[i], GFP_KERNEL); + or = osd_start_request(sbi->layout.s_ods[i], GFP_KERNEL); if (unlikely(!or)) { EXOFS_ERR("%s: osd_start_request failed\n", __func__); ret = -ENOMEM; diff --git a/fs/exofs/super.c b/fs/exofs/super.c index a1d1e77b12e..fc8875186ae 100644 --- a/fs/exofs/super.c +++ b/fs/exofs/super.c @@ -210,7 +210,7 @@ int exofs_sync_fs(struct super_block *sb, int wait) sbi = sb->s_fs_info; fscb = &sbi->s_fscb; - ret = exofs_get_io_state(sbi, &ios); + ret = exofs_get_io_state(&sbi->layout, &ios); if (ret) goto out; @@ -264,12 +264,12 @@ static void _exofs_print_device(const char *msg, const char *dev_path, void exofs_free_sbi(struct exofs_sb_info *sbi) { - while (sbi->s_numdevs) { - int i = --sbi->s_numdevs; - struct osd_dev *od = sbi->s_ods[i]; + while (sbi->layout.s_numdevs) { + int i = --sbi->layout.s_numdevs; + struct osd_dev *od = sbi->layout.s_ods[i]; if (od) { - sbi->s_ods[i] = NULL; + sbi->layout.s_ods[i] = NULL; osduld_put_device(od); } } @@ -298,7 +298,8 @@ static void exofs_put_super(struct super_block *sb) msecs_to_jiffies(100)); } - _exofs_print_device("Unmounting", NULL, sbi->s_ods[0], sbi->s_pid); + _exofs_print_device("Unmounting", NULL, sbi->layout.s_ods[0], + sbi->layout.s_pid); exofs_free_sbi(sbi); sb->s_fs_info = NULL; @@ -361,7 +362,7 @@ static int exofs_read_lookup_dev_table(struct exofs_sb_info **psbi, { struct exofs_sb_info *sbi = *psbi; struct osd_dev *fscb_od; - struct osd_obj_id obj = {.partition = sbi->s_pid, + struct osd_obj_id obj = {.partition = sbi->layout.s_pid, .id = EXOFS_DEVTABLE_ID}; struct exofs_device_table *dt; unsigned table_bytes = table_count * sizeof(dt->dt_dev_table[0]) + @@ -376,9 +377,9 @@ static int exofs_read_lookup_dev_table(struct exofs_sb_info **psbi, return -ENOMEM; } - fscb_od = sbi->s_ods[0]; - sbi->s_ods[0] = NULL; - sbi->s_numdevs = 0; + fscb_od = sbi->layout.s_ods[0]; + sbi->layout.s_ods[0] = NULL; + sbi->layout.s_numdevs = 0; ret = exofs_read_kern(fscb_od, sbi->s_cred, &obj, 0, dt, table_bytes); if (unlikely(ret)) { EXOFS_ERR("ERROR: reading device table\n"); @@ -397,14 +398,15 @@ static int exofs_read_lookup_dev_table(struct exofs_sb_info **psbi, goto out; if (likely(numdevs > 1)) { - unsigned size = numdevs * sizeof(sbi->s_ods[0]); + unsigned size = numdevs * sizeof(sbi->layout.s_ods[0]); sbi = krealloc(sbi, sizeof(*sbi) + size, GFP_KERNEL); if (unlikely(!sbi)) { ret = -ENOMEM; goto out; } - memset(&sbi->s_ods[1], 0, size - sizeof(sbi->s_ods[0])); + memset(&sbi->layout.s_ods[1], 0, + size - sizeof(sbi->layout.s_ods[0])); *psbi = sbi; } @@ -427,8 +429,8 @@ static int exofs_read_lookup_dev_table(struct exofs_sb_info **psbi, * line. We always keep them in device-table order. */ if (fscb_od && osduld_device_same(fscb_od, &odi)) { - sbi->s_ods[i] = fscb_od; - ++sbi->s_numdevs; + sbi->layout.s_ods[i] = fscb_od; + ++sbi->layout.s_numdevs; fscb_od = NULL; continue; } @@ -441,8 +443,8 @@ static int exofs_read_lookup_dev_table(struct exofs_sb_info **psbi, goto out; } - sbi->s_ods[i] = od; - ++sbi->s_numdevs; + sbi->layout.s_ods[i] = od; + ++sbi->layout.s_numdevs; /* Read the fscb of the other devices to make sure the FS * partition is there. @@ -499,9 +501,10 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent) goto free_sbi; } - sbi->s_ods[0] = od; - sbi->s_numdevs = 1; - sbi->s_pid = opts->pid; + /* Default layout in case we do not have a device-table */ + sbi->layout.s_ods[0] = od; + sbi->layout.s_numdevs = 1; + sbi->layout.s_pid = opts->pid; sbi->s_timeout = opts->timeout; /* fill in some other data by hand */ @@ -514,7 +517,7 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent) sb->s_bdev = NULL; sb->s_dev = 0; - obj.partition = sbi->s_pid; + obj.partition = sbi->layout.s_pid; obj.id = EXOFS_SUPER_ID; exofs_make_credential(sbi->s_cred, &obj); @@ -578,13 +581,13 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent) goto free_sbi; } - _exofs_print_device("Mounting", opts->dev_name, sbi->s_ods[0], - sbi->s_pid); + _exofs_print_device("Mounting", opts->dev_name, sbi->layout.s_ods[0], + sbi->layout.s_pid); return 0; free_sbi: EXOFS_ERR("Unable to mount exofs on %s pid=0x%llx err=%d\n", - opts->dev_name, sbi->s_pid, ret); + opts->dev_name, sbi->layout.s_pid, ret); exofs_free_sbi(sbi); return ret; } @@ -627,7 +630,7 @@ static int exofs_statfs(struct dentry *dentry, struct kstatfs *buf) uint8_t cred_a[OSD_CAP_LEN]; int ret; - ret = exofs_get_io_state(sbi, &ios); + ret = exofs_get_io_state(&sbi->layout, &ios); if (ret) { EXOFS_DBGMSG("exofs_get_io_state failed.\n"); return ret; -- cgit v1.2.3-70-g09d2 From d9c740d2253e75db8cef8f87a3125c450f3ebd82 Mon Sep 17 00:00:00 2001 From: Boaz Harrosh Date: Thu, 28 Jan 2010 11:58:08 +0200 Subject: exofs: Define on-disk per-inode optional layout attribute * Layouts describe the way a file is spread on multiple devices. The layout information is stored in the objects attribute introduced in this patch. * There can be multiple generating function for the layout. Currently defined: - No attribute present - use below moving-window on global device table, all devices. (This is the only one currently used in exofs) - an obj_id generated moving window - the obj_id is a randomizing factor in the otherwise global map layout. - An explicit layout stored, including a data_map and a device index list. - More might be defined in future ... * There are two attributes defined of the same structure: A-data-files-layout - This layout is used by data-files. If present at a directory, all files of that directory will be created with this layout. A-meta-data-layout - This layout is used by a directory and other meta-data information. Also inherited at creation of subdirectories. * At creation time inodes are created with the layout specified above. A usermode utility may change the creation layout on a give directory or file. Which in the case of directories, will also apply to newly created files/subdirectories, children of that directory. In the simple unaltered case of a newly created exofs, no layout attributes are present, and all layouts adhere to the layout specified at the device-table. * In case of a future file system loaded in an old exofs-driver. At iget(), the generating_function is inspected and if not supported will return an IO error to the application and the inode will not be loaded. So not to damage any data. Note: After this patch we do not yet support any type of layout only the RAID0 patch that enables striping at the super-block level will add support for RAID0 layouts above. This way we are past and future compatible and fully bisectable. * Access to the device table is done by an accessor since it will change according to above information. Signed-off-by: Boaz Harrosh --- fs/exofs/common.h | 39 ++++++++++++++++++++++++++++++++++++++ fs/exofs/exofs.h | 6 ++++++ fs/exofs/inode.c | 56 ++++++++++++++++++++++++++++++++++++++++++++++++++----- fs/exofs/ios.c | 23 ++++++++++++++++++----- 4 files changed, 114 insertions(+), 10 deletions(-) (limited to 'fs/exofs/exofs.h') diff --git a/fs/exofs/common.h b/fs/exofs/common.h index b1b178e6171..f0d520312d8 100644 --- a/fs/exofs/common.h +++ b/fs/exofs/common.h @@ -55,6 +55,8 @@ /* exofs Application specific page/attribute */ # define EXOFS_APAGE_FS_DATA (OSD_APAGE_APP_DEFINED_FIRST + 3) # define EXOFS_ATTR_INODE_DATA 1 +# define EXOFS_ATTR_INODE_FILE_LAYOUT 2 +# define EXOFS_ATTR_INODE_DIR_LAYOUT 3 /* * The maximum number of files we can have is limited by the size of the @@ -206,4 +208,41 @@ enum { (((name_len) + offsetof(struct exofs_dir_entry, name) + \ EXOFS_DIR_ROUND) & ~EXOFS_DIR_ROUND) +/* + * The on-disk (optional) layout structure. + * sits in an EXOFS_ATTR_INODE_FILE_LAYOUT or EXOFS_ATTR_INODE_DIR_LAYOUT + * attribute, attached to any inode, usually to a directory. + */ + +enum exofs_inode_layout_gen_functions { + LAYOUT_MOVING_WINDOW = 0, + LAYOUT_IMPLICT = 1, +}; + +struct exofs_on_disk_inode_layout { + __le16 gen_func; /* One of enum exofs_inode_layout_gen_functions */ + __le16 pad; + union { + /* gen_func == LAYOUT_MOVING_WINDOW (default) */ + struct exofs_layout_sliding_window { + __le32 num_devices; /* first n devices in global-table*/ + } sliding_window __packed; + + /* gen_func == LAYOUT_IMPLICT */ + struct exofs_layout_implict_list { + struct exofs_dt_data_map data_map; + /* Variable array of size data_map.cb_num_comps. These + * are device indexes of the devices in the global table + */ + __le32 dev_indexes[]; + } implict __packed; + }; +} __packed; + +static inline size_t exofs_on_disk_inode_layout_size(unsigned max_devs) +{ + return sizeof(struct exofs_on_disk_inode_layout) + + max_devs * sizeof(__le32); +} + #endif /*ifndef __EXOFS_COM_H__*/ diff --git a/fs/exofs/exofs.h b/fs/exofs/exofs.h index 33c68568b33..09e33193551 100644 --- a/fs/exofs/exofs.h +++ b/fs/exofs/exofs.h @@ -185,6 +185,12 @@ static inline struct exofs_i_info *exofs_i(struct inode *inode) return container_of(inode, struct exofs_i_info, vfs_inode); } +/* + * Given a layout, object_number and stripe_index return the associated global + * dev_index + */ +unsigned exofs_layout_od_id(struct exofs_layout *layout, + osd_id obj_no, unsigned layout_index); /* * Maximum count of links to a file */ diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c index 03189a958b3..0163546ba05 100644 --- a/fs/exofs/inode.c +++ b/fs/exofs/inode.c @@ -859,6 +859,15 @@ int exofs_setattr(struct dentry *dentry, struct iattr *iattr) return error; } +static const struct osd_attr g_attr_inode_file_layout = ATTR_DEF( + EXOFS_APAGE_FS_DATA, + EXOFS_ATTR_INODE_FILE_LAYOUT, + 0); +static const struct osd_attr g_attr_inode_dir_layout = ATTR_DEF( + EXOFS_APAGE_FS_DATA, + EXOFS_ATTR_INODE_DIR_LAYOUT, + 0); + /* * Read an inode from the OSD, and return it as is. We also return the size * attribute in the 'obj_size' argument. @@ -867,11 +876,16 @@ static int exofs_get_inode(struct super_block *sb, struct exofs_i_info *oi, struct exofs_fcb *inode, uint64_t *obj_size) { struct exofs_sb_info *sbi = sb->s_fs_info; - struct osd_attr attrs[2]; + struct osd_attr attrs[] = { + [0] = g_attr_inode_data, + [1] = g_attr_inode_file_layout, + [2] = g_attr_inode_dir_layout, + [3] = g_attr_logical_length, + }; struct exofs_io_state *ios; + struct exofs_on_disk_inode_layout *layout; int ret; - *obj_size = ~0; ret = exofs_get_io_state(&sbi->layout, &ios); if (unlikely(ret)) { EXOFS_ERR("%s: exofs_get_io_state failed.\n", __func__); @@ -882,8 +896,9 @@ static int exofs_get_inode(struct super_block *sb, struct exofs_i_info *oi, exofs_make_credential(oi->i_cred, &ios->obj); ios->cred = oi->i_cred; - attrs[0] = g_attr_inode_data; - attrs[1] = g_attr_logical_length; + attrs[1].len = exofs_on_disk_inode_layout_size(sbi->layout.s_numdevs); + attrs[2].len = exofs_on_disk_inode_layout_size(sbi->layout.s_numdevs); + ios->in_attr = attrs; ios->in_attr_len = ARRAY_SIZE(attrs); @@ -900,12 +915,43 @@ static int exofs_get_inode(struct super_block *sb, struct exofs_i_info *oi, memcpy(inode, attrs[0].val_ptr, EXOFS_INO_ATTR_SIZE); ret = extract_attr_from_ios(ios, &attrs[1]); + if (ret) { + EXOFS_ERR("%s: extract_attr of inode_data failed\n", __func__); + goto out; + } + if (attrs[1].len) { + layout = attrs[1].val_ptr; + if (layout->gen_func != cpu_to_le16(LAYOUT_MOVING_WINDOW)) { + EXOFS_ERR("%s: unsupported files layout %d\n", + __func__, layout->gen_func); + ret = -ENOTSUPP; + goto out; + } + } + + ret = extract_attr_from_ios(ios, &attrs[2]); + if (ret) { + EXOFS_ERR("%s: extract_attr of inode_data failed\n", __func__); + goto out; + } + if (attrs[2].len) { + layout = attrs[2].val_ptr; + if (layout->gen_func != cpu_to_le16(LAYOUT_MOVING_WINDOW)) { + EXOFS_ERR("%s: unsupported meta-data layout %d\n", + __func__, layout->gen_func); + ret = -ENOTSUPP; + goto out; + } + } + + *obj_size = ~0; + ret = extract_attr_from_ios(ios, &attrs[3]); if (ret) { EXOFS_ERR("%s: extract_attr of logical_length failed\n", __func__); goto out; } - *obj_size = get_unaligned_be64(attrs[1].val_ptr); + *obj_size = get_unaligned_be64(attrs[3].val_ptr); out: exofs_put_io_state(ios); diff --git a/fs/exofs/ios.c b/fs/exofs/ios.c index 4f679317ca5..2b81f99fd62 100644 --- a/fs/exofs/ios.c +++ b/fs/exofs/ios.c @@ -107,6 +107,19 @@ void exofs_put_io_state(struct exofs_io_state *ios) } } +unsigned exofs_layout_od_id(struct exofs_layout *layout, + osd_id obj_no, unsigned layout_index) +{ + return layout_index; +} + +static inline struct osd_dev *exofs_ios_od(struct exofs_io_state *ios, + unsigned layout_index) +{ + return ios->layout->s_ods[ + exofs_layout_od_id(ios->layout, ios->obj.id, layout_index)]; +} + static void _sync_done(struct exofs_io_state *ios, void *p) { struct completion *waiting = p; @@ -242,7 +255,7 @@ int exofs_sbi_create(struct exofs_io_state *ios) for (i = 0; i < ios->layout->s_numdevs; i++) { struct osd_request *or; - or = osd_start_request(ios->layout->s_ods[i], GFP_KERNEL); + or = osd_start_request(exofs_ios_od(ios, i), GFP_KERNEL); if (unlikely(!or)) { EXOFS_ERR("%s: osd_start_request failed\n", __func__); ret = -ENOMEM; @@ -266,7 +279,7 @@ int exofs_sbi_remove(struct exofs_io_state *ios) for (i = 0; i < ios->layout->s_numdevs; i++) { struct osd_request *or; - or = osd_start_request(ios->layout->s_ods[i], GFP_KERNEL); + or = osd_start_request(exofs_ios_od(ios, i), GFP_KERNEL); if (unlikely(!or)) { EXOFS_ERR("%s: osd_start_request failed\n", __func__); ret = -ENOMEM; @@ -290,7 +303,7 @@ int exofs_sbi_write(struct exofs_io_state *ios) for (i = 0; i < ios->layout->s_numdevs; i++) { struct osd_request *or; - or = osd_start_request(ios->layout->s_ods[i], GFP_KERNEL); + or = osd_start_request(exofs_ios_od(ios, i), GFP_KERNEL); if (unlikely(!or)) { EXOFS_ERR("%s: osd_start_request failed\n", __func__); ret = -ENOMEM; @@ -361,7 +374,7 @@ int exofs_sbi_read(struct exofs_io_state *ios) unsigned first_dev = (unsigned)ios->obj.id; first_dev %= ios->layout->s_numdevs; - or = osd_start_request(ios->layout->s_ods[first_dev], GFP_KERNEL); + or = osd_start_request(exofs_ios_od(ios, first_dev), GFP_KERNEL); if (unlikely(!or)) { EXOFS_ERR("%s: osd_start_request failed\n", __func__); return -ENOMEM; @@ -442,7 +455,7 @@ int exofs_oi_truncate(struct exofs_i_info *oi, u64 size) for (i = 0; i < sbi->layout.s_numdevs; i++) { struct osd_request *or; - or = osd_start_request(sbi->layout.s_ods[i], GFP_KERNEL); + or = osd_start_request(exofs_ios_od(ios, i), GFP_KERNEL); if (unlikely(!or)) { EXOFS_ERR("%s: osd_start_request failed\n", __func__); ret = -ENOMEM; -- cgit v1.2.3-70-g09d2 From 5d952b8391692553c31e620a92d6e09262a9a307 Mon Sep 17 00:00:00 2001 From: Boaz Harrosh Date: Mon, 1 Feb 2010 13:35:51 +0200 Subject: exofs: RAID0 support We now support striping over mirror devices. Including variable sized stripe_unit. Some limits: * stripe_unit must be a multiple of PAGE_SIZE * stripe_unit * stripe_count is maximum upto 32-bit (4Gb) Tested RAID0 over mirrors, RAID0 only, mirrors only. All check. Design notes: * I'm not using a vectored raid-engine mechanism yet. Following the pnfs-objects-layout data-map structure, "Mirror" is just a private case of "group_width" == 1, and RAID0 is a private case of "Mirrors" == 1. The performance lose of the general case over the particular special case optimization is totally negligible, also considering the extra code size. * In general I added a prepare_stripes() stage that divides the to-be-io pages to the participating devices, the previous exofs_ios_write/read, now becomes _write/read_mirrors and a new write/read upper layer loops on all devices calling _write/read_mirrors. Effectively the prepare_stripes stage is the all secret. Also truncate need fixing to accommodate for striping. * In a RAID0 arrangement, in a regular usage scenario, if all inode layouts will start at the same device, the small files fill up the first device and the later devices stay empty, the farther the device the emptier it is. To fix that, each inode will start at a different stripe_unit, according to it's obj_id modulus number-of-stripe-units. And will then span all stripe-units in the same incrementing order wrapping back to the beginning of the device table. We call it a stripe-units moving window. Special consideration was taken to keep all devices in a mirror arrangement identical. So a broken osd-device could just be cloned from one of the mirrors and no FS scrubbing is needed. (We do that by rotating stripe-unit at a time and not a single device at a time.) TODO: We no longer verify object_length == inode->i_size in exofs_iget. (since i_size is stripped on multiple objects now). I should introduce a multiple-device attribute reading, and use it in exofs_iget. Signed-off-by: Boaz Harrosh --- fs/exofs/exofs.h | 11 ++ fs/exofs/inode.c | 26 +---- fs/exofs/ios.c | 327 ++++++++++++++++++++++++++++++++++++++++++++++--------- fs/exofs/super.c | 52 +++++++-- 4 files changed, 333 insertions(+), 83 deletions(-) (limited to 'fs/exofs/exofs.h') diff --git a/fs/exofs/exofs.h b/fs/exofs/exofs.h index 09e33193551..0d8a34b21ae 100644 --- a/fs/exofs/exofs.h +++ b/fs/exofs/exofs.h @@ -58,6 +58,14 @@ struct exofs_layout { osd_id s_pid; /* partition ID of file system*/ + /* Our way of looking at the data_map */ + unsigned stripe_unit; + unsigned mirrors_p1; + + unsigned group_width; + + enum exofs_inode_layout_gen_functions lay_func; + unsigned s_numdevs; /* Num of devices in array */ struct osd_dev *s_ods[0]; /* Variable length */ }; @@ -133,6 +141,9 @@ struct exofs_io_state { struct exofs_per_dev_state { struct osd_request *or; struct bio *bio; + loff_t offset; + unsigned length; + unsigned dev; } per_dev[]; }; diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c index 0163546ba05..2b3163ea56e 100644 --- a/fs/exofs/inode.c +++ b/fs/exofs/inode.c @@ -869,18 +869,17 @@ static const struct osd_attr g_attr_inode_dir_layout = ATTR_DEF( 0); /* - * Read an inode from the OSD, and return it as is. We also return the size - * attribute in the 'obj_size' argument. + * Read the Linux inode info from the OSD, and return it as is. In exofs the + * inode info is in an application specific page/attribute of the osd-object. */ static int exofs_get_inode(struct super_block *sb, struct exofs_i_info *oi, - struct exofs_fcb *inode, uint64_t *obj_size) + struct exofs_fcb *inode) { struct exofs_sb_info *sbi = sb->s_fs_info; struct osd_attr attrs[] = { [0] = g_attr_inode_data, [1] = g_attr_inode_file_layout, [2] = g_attr_inode_dir_layout, - [3] = g_attr_logical_length, }; struct exofs_io_state *ios; struct exofs_on_disk_inode_layout *layout; @@ -944,15 +943,6 @@ static int exofs_get_inode(struct super_block *sb, struct exofs_i_info *oi, } } - *obj_size = ~0; - ret = extract_attr_from_ios(ios, &attrs[3]); - if (ret) { - EXOFS_ERR("%s: extract_attr of logical_length failed\n", - __func__); - goto out; - } - *obj_size = get_unaligned_be64(attrs[3].val_ptr); - out: exofs_put_io_state(ios); return ret; @@ -971,7 +961,6 @@ struct inode *exofs_iget(struct super_block *sb, unsigned long ino) struct exofs_i_info *oi; struct exofs_fcb fcb; struct inode *inode; - uint64_t obj_size; int ret; inode = iget_locked(sb, ino); @@ -983,7 +972,7 @@ struct inode *exofs_iget(struct super_block *sb, unsigned long ino) __oi_init(oi); /* read the inode from the osd */ - ret = exofs_get_inode(sb, oi, &fcb, &obj_size); + ret = exofs_get_inode(sb, oi, &fcb); if (ret) goto bad_inode; @@ -1004,13 +993,6 @@ struct inode *exofs_iget(struct super_block *sb, unsigned long ino) inode->i_blkbits = EXOFS_BLKSHIFT; inode->i_generation = le32_to_cpu(fcb.i_generation); - if ((inode->i_size != obj_size) && - (!exofs_inode_is_fast_symlink(inode))) { - EXOFS_ERR("WARNING: Size of inode=%llu != object=%llu\n", - inode->i_size, _LLU(obj_size)); - /* FIXME: call exofs_inode_recovery() */ - } - oi->i_dir_start_lookup = 0; if ((inode->i_nlink == 0) && (inode->i_mode == 0)) { diff --git a/fs/exofs/ios.c b/fs/exofs/ios.c index 2b81f99fd62..6e446b2670b 100644 --- a/fs/exofs/ios.c +++ b/fs/exofs/ios.c @@ -23,6 +23,7 @@ */ #include +#include #include "exofs.h" @@ -110,7 +111,17 @@ void exofs_put_io_state(struct exofs_io_state *ios) unsigned exofs_layout_od_id(struct exofs_layout *layout, osd_id obj_no, unsigned layout_index) { - return layout_index; +/* switch (layout->lay_func) { + case LAYOUT_MOVING_WINDOW: + {*/ + unsigned dev_mod = obj_no; + + return (layout_index + dev_mod * layout->mirrors_p1) % + layout->s_numdevs; +/* } + case LAYOUT_FUNC_IMPLICT: + return layout->devs[layout_index]; + }*/ } static inline struct osd_dev *exofs_ios_od(struct exofs_io_state *ios, @@ -225,8 +236,8 @@ int exofs_check_io(struct exofs_io_state *ios, u64 *resid) _clear_bio(ios->per_dev[i].bio); EXOFS_DBGMSG("start read offset passed end of file " "offset=0x%llx, length=0x%llx\n", - _LLU(ios->offset), - _LLU(ios->length)); + _LLU(ios->per_dev[i].offset), + _LLU(ios->per_dev[i].length)); continue; /* we recovered */ } @@ -248,6 +259,127 @@ int exofs_check_io(struct exofs_io_state *ios, u64 *resid) return acumulated_lin_err; } +/* REMOVEME: After review + Some quoteing from the standard + + L = logical offset into the file + W = number of data components in a stripe + S = W * stripe_unit (S is Stripe length) + N = L / S (N is the stripe Number) + C = (L-(N*S)) / stripe_unit (C is the component) + O = (N*stripe_unit)+(L%stripe_unit) (O is the object's offset) +*/ + +static void _offset_dev_unit_off(struct exofs_io_state *ios, u64 file_offset, + u64 *obj_offset, unsigned *dev, unsigned *unit_off) +{ + unsigned stripe_unit = ios->layout->stripe_unit; + unsigned stripe_length = stripe_unit * ios->layout->group_width; + u64 stripe_no = file_offset; + unsigned stripe_mod = do_div(stripe_no, stripe_length); + + *unit_off = stripe_mod % stripe_unit; + *obj_offset = stripe_no * stripe_unit + *unit_off; + *dev = stripe_mod / stripe_unit * ios->layout->mirrors_p1; +} + +static int _add_stripe_unit(struct exofs_io_state *ios, unsigned *cur_bvec, + struct exofs_per_dev_state *per_dev, int cur_len) +{ + unsigned bv = *cur_bvec; + struct request_queue *q = + osd_request_queue(exofs_ios_od(ios, per_dev->dev)); + + per_dev->length += cur_len; + + if (per_dev->bio == NULL) { + unsigned pages_in_stripe = ios->layout->group_width * + (ios->layout->stripe_unit / PAGE_SIZE); + unsigned bio_size = (ios->bio->bi_vcnt + pages_in_stripe) / + ios->layout->group_width; + + per_dev->bio = bio_kmalloc(GFP_KERNEL, bio_size); + if (unlikely(!per_dev->bio)) { + EXOFS_DBGMSG("Faild to allocate BIO size=%u\n", + bio_size); + return -ENOMEM; + } + } + + while (cur_len > 0) { + int added_len; + struct bio_vec *bvec = &ios->bio->bi_io_vec[bv]; + + BUG_ON(ios->bio->bi_vcnt <= bv); + cur_len -= bvec->bv_len; + + added_len = bio_add_pc_page(q, per_dev->bio, bvec->bv_page, + bvec->bv_len, bvec->bv_offset); + if (unlikely(bvec->bv_len != added_len)) + return -ENOMEM; + ++bv; + } + BUG_ON(cur_len); + + *cur_bvec = bv; + return 0; +} + +static int _prepare_for_striping(struct exofs_io_state *ios) +{ + u64 length = ios->length; + u64 offset = ios->offset; + unsigned stripe_unit = ios->layout->stripe_unit; + unsigned comp = 0; + unsigned stripes = 0; + unsigned cur_bvec = 0; + int ret; + + if (!ios->bio) { + if (ios->kern_buff) { + struct exofs_per_dev_state *per_dev = &ios->per_dev[0]; + unsigned unit_off; + + _offset_dev_unit_off(ios, offset, &per_dev->offset, + &per_dev->dev, &unit_off); + /* no cross device without page array */ + BUG_ON((ios->layout->group_width > 1) && + (unit_off + length > stripe_unit)); + } + ios->numdevs = ios->layout->mirrors_p1; + return 0; + } + + while (length) { + struct exofs_per_dev_state *per_dev = &ios->per_dev[comp]; + unsigned cur_len; + + if (!per_dev->length) { + unsigned unit_off; + + _offset_dev_unit_off(ios, offset, &per_dev->offset, + &per_dev->dev, &unit_off); + stripes++; + cur_len = min_t(u64, stripe_unit - unit_off, length); + offset += cur_len; + } else { + cur_len = min_t(u64, stripe_unit, length); + } + + ret = _add_stripe_unit(ios, &cur_bvec, per_dev, cur_len); + if (unlikely(ret)) + goto out; + + comp += ios->layout->mirrors_p1; + comp %= ios->layout->s_numdevs; + + length -= cur_len; + } +out: + ios->numdevs = stripes * ios->layout->mirrors_p1; + return ret; +} + int exofs_sbi_create(struct exofs_io_state *ios) { int i, ret; @@ -296,61 +428,71 @@ out: return ret; } -int exofs_sbi_write(struct exofs_io_state *ios) +static int _sbi_write_mirror(struct exofs_io_state *ios, int cur_comp) { - int i, ret; + struct exofs_per_dev_state *master_dev = &ios->per_dev[cur_comp]; + unsigned dev = ios->per_dev[cur_comp].dev; + unsigned last_comp = cur_comp + ios->layout->mirrors_p1; + int ret = 0; - for (i = 0; i < ios->layout->s_numdevs; i++) { + for (; cur_comp < last_comp; ++cur_comp, ++dev) { + struct exofs_per_dev_state *per_dev = &ios->per_dev[cur_comp]; struct osd_request *or; - or = osd_start_request(exofs_ios_od(ios, i), GFP_KERNEL); + or = osd_start_request(exofs_ios_od(ios, dev), GFP_KERNEL); if (unlikely(!or)) { EXOFS_ERR("%s: osd_start_request failed\n", __func__); ret = -ENOMEM; goto out; } - ios->per_dev[i].or = or; - ios->numdevs++; + per_dev->or = or; + per_dev->offset = master_dev->offset; if (ios->bio) { struct bio *bio; - if (i != 0) { + if (per_dev != master_dev) { bio = bio_kmalloc(GFP_KERNEL, - ios->bio->bi_max_vecs); + master_dev->bio->bi_max_vecs); if (unlikely(!bio)) { EXOFS_DBGMSG( "Faild to allocate BIO size=%u\n", - ios->bio->bi_max_vecs); + master_dev->bio->bi_max_vecs); ret = -ENOMEM; goto out; } - __bio_clone(bio, ios->bio); + __bio_clone(bio, master_dev->bio); bio->bi_bdev = NULL; bio->bi_next = NULL; - ios->per_dev[i].bio = bio; + per_dev->length = master_dev->length; + per_dev->bio = bio; + per_dev->dev = dev; } else { - bio = ios->bio; + bio = master_dev->bio; + /* FIXME: bio_set_dir() */ + bio->bi_rw |= (1 << BIO_RW); } - osd_req_write(or, &ios->obj, ios->offset, bio, - ios->length); + osd_req_write(or, &ios->obj, per_dev->offset, bio, + per_dev->length); EXOFS_DBGMSG("write(0x%llx) offset=0x%llx " "length=0x%llx dev=%d\n", - _LLU(ios->obj.id), _LLU(ios->offset), - _LLU(ios->length), i); + _LLU(ios->obj.id), _LLU(per_dev->offset), + _LLU(per_dev->length), dev); } else if (ios->kern_buff) { - osd_req_write_kern(or, &ios->obj, ios->offset, + ret = osd_req_write_kern(or, &ios->obj, per_dev->offset, ios->kern_buff, ios->length); + if (unlikely(ret)) + goto out; EXOFS_DBGMSG2("write_kern(0x%llx) offset=0x%llx " "length=0x%llx dev=%d\n", - _LLU(ios->obj.id), _LLU(ios->offset), - _LLU(ios->length), i); + _LLU(ios->obj.id), _LLU(per_dev->offset), + _LLU(ios->length), dev); } else { osd_req_set_attributes(or, &ios->obj); EXOFS_DBGMSG2("obj(0x%llx) set_attributes=%d dev=%d\n", - _LLU(ios->obj.id), ios->out_attr_len, i); + _LLU(ios->obj.id), ios->out_attr_len, dev); } if (ios->out_attr) @@ -361,40 +503,57 @@ int exofs_sbi_write(struct exofs_io_state *ios) osd_req_add_get_attr_list(or, ios->in_attr, ios->in_attr_len); } - ret = exofs_io_execute(ios); out: return ret; } -int exofs_sbi_read(struct exofs_io_state *ios) +int exofs_sbi_write(struct exofs_io_state *ios) +{ + int i; + int ret; + + ret = _prepare_for_striping(ios); + if (unlikely(ret)) + return ret; + + for (i = 0; i < ios->numdevs; i += ios->layout->mirrors_p1) { + ret = _sbi_write_mirror(ios, i); + if (unlikely(ret)) + return ret; + } + + ret = exofs_io_execute(ios); + return ret; +} + +static int _sbi_read_mirror(struct exofs_io_state *ios, unsigned cur_comp) { struct osd_request *or; - struct exofs_per_dev_state *per_dev = &ios->per_dev[0]; + struct exofs_per_dev_state *per_dev = &ios->per_dev[cur_comp]; unsigned first_dev = (unsigned)ios->obj.id; - first_dev %= ios->layout->s_numdevs; + first_dev = per_dev->dev + first_dev % ios->layout->mirrors_p1; or = osd_start_request(exofs_ios_od(ios, first_dev), GFP_KERNEL); if (unlikely(!or)) { EXOFS_ERR("%s: osd_start_request failed\n", __func__); return -ENOMEM; } per_dev->or = or; - ios->numdevs++; if (ios->bio) { - osd_req_read(or, &ios->obj, ios->offset, ios->bio, ios->length); + osd_req_read(or, &ios->obj, per_dev->offset, + per_dev->bio, per_dev->length); EXOFS_DBGMSG("read(0x%llx) offset=0x%llx length=0x%llx" " dev=%d\n", _LLU(ios->obj.id), - _LLU(ios->offset), _LLU(ios->length), + _LLU(per_dev->offset), _LLU(per_dev->length), first_dev); } else if (ios->kern_buff) { - int ret = osd_req_read_kern(or, &ios->obj, ios->offset, + int ret = osd_req_read_kern(or, &ios->obj, per_dev->offset, ios->kern_buff, ios->length); - EXOFS_DBGMSG2("read_kern(0x%llx) offset=0x%llx " "length=0x%llx dev=%d ret=>%d\n", - _LLU(ios->obj.id), _LLU(ios->offset), + _LLU(ios->obj.id), _LLU(per_dev->offset), _LLU(ios->length), first_dev, ret); if (unlikely(ret)) return ret; @@ -403,14 +562,32 @@ int exofs_sbi_read(struct exofs_io_state *ios) EXOFS_DBGMSG2("obj(0x%llx) get_attributes=%d dev=%d\n", _LLU(ios->obj.id), ios->in_attr_len, first_dev); } - if (ios->out_attr) osd_req_add_set_attr_list(or, ios->out_attr, ios->out_attr_len); if (ios->in_attr) osd_req_add_get_attr_list(or, ios->in_attr, ios->in_attr_len); - return exofs_io_execute(ios); + return 0; +} + +int exofs_sbi_read(struct exofs_io_state *ios) +{ + int i; + int ret; + + ret = _prepare_for_striping(ios); + if (unlikely(ret)) + return ret; + + for (i = 0; i < ios->numdevs; i += ios->layout->mirrors_p1) { + ret = _sbi_read_mirror(ios, i); + if (unlikely(ret)) + return ret; + } + + ret = exofs_io_execute(ios); + return ret; } int extract_attr_from_ios(struct exofs_io_state *ios, struct osd_attr *attr) @@ -434,42 +611,84 @@ int extract_attr_from_ios(struct exofs_io_state *ios, struct osd_attr *attr) return -EIO; } +static int _truncate_mirrors(struct exofs_io_state *ios, unsigned cur_comp, + struct osd_attr *attr) +{ + int last_comp = cur_comp + ios->layout->mirrors_p1; + + for (; cur_comp < last_comp; ++cur_comp) { + struct exofs_per_dev_state *per_dev = &ios->per_dev[cur_comp]; + struct osd_request *or; + + or = osd_start_request(exofs_ios_od(ios, cur_comp), GFP_KERNEL); + if (unlikely(!or)) { + EXOFS_ERR("%s: osd_start_request failed\n", __func__); + return -ENOMEM; + } + per_dev->or = or; + + osd_req_set_attributes(or, &ios->obj); + osd_req_add_set_attr_list(or, attr, 1); + } + + return 0; +} + int exofs_oi_truncate(struct exofs_i_info *oi, u64 size) { struct exofs_sb_info *sbi = oi->vfs_inode.i_sb->s_fs_info; struct exofs_io_state *ios; - struct osd_attr attr; - __be64 newsize; + struct exofs_trunc_attr { + struct osd_attr attr; + __be64 newsize; + } *size_attrs; + u64 this_obj_size; + unsigned dev; + unsigned unit_off; int i, ret; - if (exofs_get_io_state(&sbi->layout, &ios)) - return -ENOMEM; + ret = exofs_get_io_state(&sbi->layout, &ios); + if (unlikely(ret)) + return ret; + + size_attrs = kcalloc(ios->layout->group_width, sizeof(*size_attrs), + GFP_KERNEL); + if (unlikely(!size_attrs)) { + ret = -ENOMEM; + goto out; + } ios->obj.id = exofs_oi_objno(oi); ios->cred = oi->i_cred; - newsize = cpu_to_be64(size); - attr = g_attr_logical_length; - attr.val_ptr = &newsize; + ios->numdevs = ios->layout->s_numdevs; + _offset_dev_unit_off(ios, size, &this_obj_size, &dev, &unit_off); - for (i = 0; i < sbi->layout.s_numdevs; i++) { - struct osd_request *or; + for (i = 0; i < ios->layout->group_width; ++i) { + struct exofs_trunc_attr *size_attr = &size_attrs[i]; + u64 obj_size; - or = osd_start_request(exofs_ios_od(ios, i), GFP_KERNEL); - if (unlikely(!or)) { - EXOFS_ERR("%s: osd_start_request failed\n", __func__); - ret = -ENOMEM; - goto out; - } - ios->per_dev[i].or = or; - ios->numdevs++; + if (i < dev) + obj_size = this_obj_size + + ios->layout->stripe_unit - unit_off; + else if (i == dev) + obj_size = this_obj_size; + else /* i > dev */ + obj_size = this_obj_size - unit_off; - osd_req_set_attributes(or, &ios->obj); - osd_req_add_set_attr_list(or, &attr, 1); + size_attr->newsize = cpu_to_be64(obj_size); + size_attr->attr = g_attr_logical_length; + size_attr->attr.val_ptr = &size_attr->newsize; + + ret = _truncate_mirrors(ios, i * ios->layout->mirrors_p1, + &size_attr->attr); + if (unlikely(ret)) + goto out; } ret = exofs_io_execute(ios); out: + kfree(size_attrs); exofs_put_io_state(ios); return ret; } diff --git a/fs/exofs/super.c b/fs/exofs/super.c index fc8875186ae..8f4e4b37a57 100644 --- a/fs/exofs/super.c +++ b/fs/exofs/super.c @@ -308,6 +308,8 @@ static void exofs_put_super(struct super_block *sb) static int _read_and_match_data_map(struct exofs_sb_info *sbi, unsigned numdevs, struct exofs_device_table *dt) { + u64 stripe_length; + sbi->data_map.odm_num_comps = le32_to_cpu(dt->dt_data_map.cb_num_comps); sbi->data_map.odm_stripe_unit = @@ -321,14 +323,47 @@ static int _read_and_match_data_map(struct exofs_sb_info *sbi, unsigned numdevs, sbi->data_map.odm_raid_algorithm = le32_to_cpu(dt->dt_data_map.cb_raid_algorithm); -/* FIXME: Hard coded mirror only for now. if not so do not mount */ - if ((sbi->data_map.odm_num_comps != numdevs) || - (sbi->data_map.odm_stripe_unit != EXOFS_BLKSIZE) || - (sbi->data_map.odm_raid_algorithm != PNFS_OSD_RAID_0) || - (sbi->data_map.odm_mirror_cnt != (numdevs - 1))) +/* FIXME: Only raid0 !group_width/depth for now. if not so, do not mount */ + if (sbi->data_map.odm_group_width || sbi->data_map.odm_group_depth) { + EXOFS_ERR("Group width/depth not supported\n"); return -EINVAL; - else - return 0; + } + if (sbi->data_map.odm_num_comps != numdevs) { + EXOFS_ERR("odm_num_comps(%u) != numdevs(%u)\n", + sbi->data_map.odm_num_comps, numdevs); + return -EINVAL; + } + if (sbi->data_map.odm_raid_algorithm != PNFS_OSD_RAID_0) { + EXOFS_ERR("Only RAID_0 for now\n"); + return -EINVAL; + } + if (0 != (numdevs % (sbi->data_map.odm_mirror_cnt + 1))) { + EXOFS_ERR("Data Map wrong, numdevs=%d mirrors=%d\n", + numdevs, sbi->data_map.odm_mirror_cnt); + return -EINVAL; + } + + stripe_length = sbi->data_map.odm_stripe_unit * + (numdevs / (sbi->data_map.odm_mirror_cnt + 1)); + if (stripe_length >= (1ULL << 32)) { + EXOFS_ERR("Total Stripe length(0x%llx)" + " >= 32bit is not supported\n", _LLU(stripe_length)); + return -EINVAL; + } + + if (0 != (sbi->data_map.odm_stripe_unit & ~PAGE_MASK)) { + EXOFS_ERR("Stripe Unit(0x%llx)" + " must be Multples of PAGE_SIZE(0x%lx)\n", + _LLU(sbi->data_map.odm_stripe_unit), PAGE_SIZE); + return -EINVAL; + } + + sbi->layout.stripe_unit = sbi->data_map.odm_stripe_unit; + sbi->layout.mirrors_p1 = sbi->data_map.odm_mirror_cnt + 1; + sbi->layout.group_width = sbi->data_map.odm_num_comps / + sbi->layout.mirrors_p1; + + return 0; } /* @odi is valid only as long as @fscb_dev is valid */ @@ -502,6 +537,9 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent) } /* Default layout in case we do not have a device-table */ + sbi->layout.stripe_unit = PAGE_SIZE; + sbi->layout.mirrors_p1 = 1; + sbi->layout.group_width = 1; sbi->layout.s_ods[0] = od; sbi->layout.s_numdevs = 1; sbi->layout.s_pid = opts->pid; -- cgit v1.2.3-70-g09d2 From 86093aaff5be5b214613eb60553e236bdb389c84 Mon Sep 17 00:00:00 2001 From: Boaz Harrosh Date: Thu, 28 Jan 2010 18:24:06 +0200 Subject: exofs: convert io_state to use pages array instead of bio at input * inode.c operations are full-pages based, and not actually true scatter-gather * Lets us use more pages at once upto 512 (from 249) in 64 bit * Brings us much much closer to be able to use exofs's io_state engine from objlayout driver. (Once I decide where to put the common code) After RAID0 patch the outer (input) bio was never used as a bio, but was simply a page carrier into the raid engine. Even in the simple mirror/single-dev arrangement pages info was copied into a second bio. It is now easer to just pass a pages array into the io_state and prepare bio(s) once. Signed-off-by: Boaz Harrosh --- fs/exofs/exofs.h | 5 +++- fs/exofs/inode.c | 81 ++++++++++++++++++++++++++++---------------------------- fs/exofs/ios.c | 46 ++++++++++++++++++-------------- 3 files changed, 71 insertions(+), 61 deletions(-) (limited to 'fs/exofs/exofs.h') diff --git a/fs/exofs/exofs.h b/fs/exofs/exofs.h index 0d8a34b21ae..acfebd36de8 100644 --- a/fs/exofs/exofs.h +++ b/fs/exofs/exofs.h @@ -128,7 +128,10 @@ struct exofs_io_state { loff_t offset; unsigned long length; void *kern_buff; - struct bio *bio; + + struct page **pages; + unsigned nr_pages; + unsigned pgbase; /* Attributes */ unsigned in_attr_len; diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c index 2b3163ea56e..6ca0b0117f0 100644 --- a/fs/exofs/inode.c +++ b/fs/exofs/inode.c @@ -41,16 +41,18 @@ enum { BIO_MAX_PAGES_KMALLOC = (PAGE_SIZE - sizeof(struct bio)) / sizeof(struct bio_vec), + MAX_PAGES_KMALLOC = + PAGE_SIZE / sizeof(struct page *), }; struct page_collect { struct exofs_sb_info *sbi; - struct request_queue *req_q; struct inode *inode; unsigned expected_pages; struct exofs_io_state *ios; - struct bio *bio; + struct page **pages; + unsigned alloc_pages; unsigned nr_pages; unsigned long length; loff_t pg_first; /* keep 64bit also in 32-arches */ @@ -62,15 +64,12 @@ static void _pcol_init(struct page_collect *pcol, unsigned expected_pages, struct exofs_sb_info *sbi = inode->i_sb->s_fs_info; pcol->sbi = sbi; - /* Create master bios on first Q, later on cloning, each clone will be - * allocated on it's destination Q - */ - pcol->req_q = osd_request_queue(sbi->layout.s_ods[0]); pcol->inode = inode; pcol->expected_pages = expected_pages; pcol->ios = NULL; - pcol->bio = NULL; + pcol->pages = NULL; + pcol->alloc_pages = 0; pcol->nr_pages = 0; pcol->length = 0; pcol->pg_first = -1; @@ -80,7 +79,8 @@ static void _pcol_reset(struct page_collect *pcol) { pcol->expected_pages -= min(pcol->nr_pages, pcol->expected_pages); - pcol->bio = NULL; + pcol->pages = NULL; + pcol->alloc_pages = 0; pcol->nr_pages = 0; pcol->length = 0; pcol->pg_first = -1; @@ -90,13 +90,13 @@ static void _pcol_reset(struct page_collect *pcol) * it might not end here. don't be left with nothing */ if (!pcol->expected_pages) - pcol->expected_pages = BIO_MAX_PAGES_KMALLOC; + pcol->expected_pages = MAX_PAGES_KMALLOC; } static int pcol_try_alloc(struct page_collect *pcol) { - int pages = min_t(unsigned, pcol->expected_pages, - BIO_MAX_PAGES_KMALLOC); + unsigned pages = min_t(unsigned, pcol->expected_pages, + MAX_PAGES_KMALLOC); if (!pcol->ios) { /* First time allocate io_state */ int ret = exofs_get_io_state(&pcol->sbi->layout, &pcol->ios); @@ -105,23 +105,28 @@ static int pcol_try_alloc(struct page_collect *pcol) return ret; } + /* TODO: easily support bio chaining */ + pages = min_t(unsigned, pages, + pcol->sbi->layout.group_width * BIO_MAX_PAGES_KMALLOC); + for (; pages; pages >>= 1) { - pcol->bio = bio_kmalloc(GFP_KERNEL, pages); - if (likely(pcol->bio)) + pcol->pages = kmalloc(pages * sizeof(struct page *), + GFP_KERNEL); + if (likely(pcol->pages)) { + pcol->alloc_pages = pages; return 0; + } } - EXOFS_ERR("Failed to bio_kmalloc expected_pages=%u\n", + EXOFS_ERR("Failed to kmalloc expected_pages=%u\n", pcol->expected_pages); return -ENOMEM; } static void pcol_free(struct page_collect *pcol) { - if (pcol->bio) { - bio_put(pcol->bio); - pcol->bio = NULL; - } + kfree(pcol->pages); + pcol->pages = NULL; if (pcol->ios) { exofs_put_io_state(pcol->ios); @@ -132,11 +137,10 @@ static void pcol_free(struct page_collect *pcol) static int pcol_add_page(struct page_collect *pcol, struct page *page, unsigned len) { - int added_len = bio_add_pc_page(pcol->req_q, pcol->bio, page, len, 0); - if (unlikely(len != added_len)) + if (unlikely(pcol->nr_pages >= pcol->alloc_pages)) return -ENOMEM; - ++pcol->nr_pages; + pcol->pages[pcol->nr_pages++] = page; pcol->length += len; return 0; } @@ -181,7 +185,6 @@ static void update_write_page(struct page *page, int ret) */ static int __readpages_done(struct page_collect *pcol, bool do_unlock) { - struct bio_vec *bvec; int i; u64 resid; u64 good_bytes; @@ -198,8 +201,8 @@ static int __readpages_done(struct page_collect *pcol, bool do_unlock) pcol->inode->i_ino, _LLU(good_bytes), pcol->length, pcol->nr_pages); - __bio_for_each_segment(bvec, pcol->bio, i, 0) { - struct page *page = bvec->bv_page; + for (i = 0; i < pcol->nr_pages; i++) { + struct page *page = pcol->pages[i]; struct inode *inode = page->mapping->host; int page_stat; @@ -218,7 +221,7 @@ static int __readpages_done(struct page_collect *pcol, bool do_unlock) ret = update_read_page(page, page_stat); if (do_unlock) unlock_page(page); - length += bvec->bv_len; + length += PAGE_SIZE; } pcol_free(pcol); @@ -238,11 +241,10 @@ static void readpages_done(struct exofs_io_state *ios, void *p) static void _unlock_pcol_pages(struct page_collect *pcol, int ret, int rw) { - struct bio_vec *bvec; int i; - __bio_for_each_segment(bvec, pcol->bio, i, 0) { - struct page *page = bvec->bv_page; + for (i = 0; i < pcol->nr_pages; i++) { + struct page *page = pcol->pages[i]; if (rw == READ) update_read_page(page, ret); @@ -260,13 +262,14 @@ static int read_exec(struct page_collect *pcol, bool is_sync) struct page_collect *pcol_copy = NULL; int ret; - if (!pcol->bio) + if (!pcol->pages) return 0; /* see comment in _readpage() about sync reads */ WARN_ON(is_sync && (pcol->nr_pages != 1)); - ios->bio = pcol->bio; + ios->pages = pcol->pages; + ios->nr_pages = pcol->nr_pages; ios->length = pcol->length; ios->offset = pcol->pg_first << PAGE_CACHE_SHIFT; @@ -366,7 +369,7 @@ try_again: goto try_again; } - if (!pcol->bio) { + if (!pcol->pages) { ret = pcol_try_alloc(pcol); if (unlikely(ret)) goto fail; @@ -448,7 +451,6 @@ static int exofs_readpage(struct file *file, struct page *page) static void writepages_done(struct exofs_io_state *ios, void *p) { struct page_collect *pcol = p; - struct bio_vec *bvec; int i; u64 resid; u64 good_bytes; @@ -467,8 +469,8 @@ static void writepages_done(struct exofs_io_state *ios, void *p) pcol->inode->i_ino, _LLU(good_bytes), pcol->length, pcol->nr_pages); - __bio_for_each_segment(bvec, pcol->bio, i, 0) { - struct page *page = bvec->bv_page; + for (i = 0; i < pcol->nr_pages; i++) { + struct page *page = pcol->pages[i]; struct inode *inode = page->mapping->host; int page_stat; @@ -485,7 +487,7 @@ static void writepages_done(struct exofs_io_state *ios, void *p) EXOFS_DBGMSG2(" writepages_done(0x%lx, 0x%lx) status=%d\n", inode->i_ino, page->index, page_stat); - length += bvec->bv_len; + length += PAGE_SIZE; } pcol_free(pcol); @@ -500,7 +502,7 @@ static int write_exec(struct page_collect *pcol) struct page_collect *pcol_copy = NULL; int ret; - if (!pcol->bio) + if (!pcol->pages) return 0; pcol_copy = kmalloc(sizeof(*pcol_copy), GFP_KERNEL); @@ -512,9 +514,8 @@ static int write_exec(struct page_collect *pcol) *pcol_copy = *pcol; - pcol_copy->bio->bi_rw |= (1 << BIO_RW); /* FIXME: bio_set_dir() */ - - ios->bio = pcol_copy->bio; + ios->pages = pcol_copy->pages; + ios->nr_pages = pcol_copy->nr_pages; ios->offset = pcol_copy->pg_first << PAGE_CACHE_SHIFT; ios->length = pcol_copy->length; ios->done = writepages_done; @@ -605,7 +606,7 @@ try_again: goto try_again; } - if (!pcol->bio) { + if (!pcol->pages) { ret = pcol_try_alloc(pcol); if (unlikely(ret)) goto fail; diff --git a/fs/exofs/ios.c b/fs/exofs/ios.c index 6e446b2670b..263052c77f4 100644 --- a/fs/exofs/ios.c +++ b/fs/exofs/ios.c @@ -283,10 +283,11 @@ static void _offset_dev_unit_off(struct exofs_io_state *ios, u64 file_offset, *dev = stripe_mod / stripe_unit * ios->layout->mirrors_p1; } -static int _add_stripe_unit(struct exofs_io_state *ios, unsigned *cur_bvec, - struct exofs_per_dev_state *per_dev, int cur_len) +static int _add_stripe_unit(struct exofs_io_state *ios, unsigned *cur_pg, + unsigned pgbase, struct exofs_per_dev_state *per_dev, + int cur_len) { - unsigned bv = *cur_bvec; + unsigned pg = *cur_pg; struct request_queue *q = osd_request_queue(exofs_ios_od(ios, per_dev->dev)); @@ -295,7 +296,7 @@ static int _add_stripe_unit(struct exofs_io_state *ios, unsigned *cur_bvec, if (per_dev->bio == NULL) { unsigned pages_in_stripe = ios->layout->group_width * (ios->layout->stripe_unit / PAGE_SIZE); - unsigned bio_size = (ios->bio->bi_vcnt + pages_in_stripe) / + unsigned bio_size = (ios->nr_pages + pages_in_stripe) / ios->layout->group_width; per_dev->bio = bio_kmalloc(GFP_KERNEL, bio_size); @@ -307,21 +308,22 @@ static int _add_stripe_unit(struct exofs_io_state *ios, unsigned *cur_bvec, } while (cur_len > 0) { - int added_len; - struct bio_vec *bvec = &ios->bio->bi_io_vec[bv]; + unsigned pglen = min_t(unsigned, PAGE_SIZE - pgbase, cur_len); + unsigned added_len; - BUG_ON(ios->bio->bi_vcnt <= bv); - cur_len -= bvec->bv_len; + BUG_ON(ios->nr_pages <= pg); + cur_len -= pglen; - added_len = bio_add_pc_page(q, per_dev->bio, bvec->bv_page, - bvec->bv_len, bvec->bv_offset); - if (unlikely(bvec->bv_len != added_len)) + added_len = bio_add_pc_page(q, per_dev->bio, ios->pages[pg], + pglen, pgbase); + if (unlikely(pglen != added_len)) return -ENOMEM; - ++bv; + pgbase = 0; + ++pg; } BUG_ON(cur_len); - *cur_bvec = bv; + *cur_pg = pg; return 0; } @@ -332,10 +334,10 @@ static int _prepare_for_striping(struct exofs_io_state *ios) unsigned stripe_unit = ios->layout->stripe_unit; unsigned comp = 0; unsigned stripes = 0; - unsigned cur_bvec = 0; - int ret; + unsigned cur_pg = 0; + int ret = 0; - if (!ios->bio) { + if (!ios->pages) { if (ios->kern_buff) { struct exofs_per_dev_state *per_dev = &ios->per_dev[0]; unsigned unit_off; @@ -352,7 +354,7 @@ static int _prepare_for_striping(struct exofs_io_state *ios) while (length) { struct exofs_per_dev_state *per_dev = &ios->per_dev[comp]; - unsigned cur_len; + unsigned cur_len, page_off; if (!per_dev->length) { unsigned unit_off; @@ -362,11 +364,15 @@ static int _prepare_for_striping(struct exofs_io_state *ios) stripes++; cur_len = min_t(u64, stripe_unit - unit_off, length); offset += cur_len; + page_off = unit_off & ~PAGE_MASK; + BUG_ON(page_off != ios->pgbase); } else { cur_len = min_t(u64, stripe_unit, length); + page_off = 0; } - ret = _add_stripe_unit(ios, &cur_bvec, per_dev, cur_len); + ret = _add_stripe_unit(ios, &cur_pg, page_off , per_dev, + cur_len); if (unlikely(ret)) goto out; @@ -448,7 +454,7 @@ static int _sbi_write_mirror(struct exofs_io_state *ios, int cur_comp) per_dev->or = or; per_dev->offset = master_dev->offset; - if (ios->bio) { + if (ios->pages) { struct bio *bio; if (per_dev != master_dev) { @@ -541,7 +547,7 @@ static int _sbi_read_mirror(struct exofs_io_state *ios, unsigned cur_comp) } per_dev->or = or; - if (ios->bio) { + if (ios->pages) { osd_req_read(or, &ios->obj, per_dev->offset, per_dev->bio, per_dev->length); EXOFS_DBGMSG("read(0x%llx) offset=0x%llx length=0x%llx" -- cgit v1.2.3-70-g09d2 From 50a76fd3c352ed2740eba01512efcfceee0703be Mon Sep 17 00:00:00 2001 From: Boaz Harrosh Date: Thu, 11 Feb 2010 13:01:39 +0200 Subject: exofs: groups support * _calc_stripe_info() changes to accommodate for grouping calculations. Returns additional information * old _prepare_pages() becomes _prepare_one_group() which stores pages belonging to one device group. * New _prepare_for_striping iterates on all groups calling _prepare_one_group(). * Enable mounting of groups data_maps (group_width != 0) [QUESTION] what is faster A or B; A. x += stride; x = x % width + first_x; B x += stride if (x < last_x) x = first_x; Signed-off-by: Boaz Harrosh --- fs/exofs/exofs.h | 3 ++ fs/exofs/ios.c | 129 +++++++++++++++++++++++++++++++++++++++++++++---------- fs/exofs/super.c | 46 ++++++++++++++------ 3 files changed, 141 insertions(+), 37 deletions(-) (limited to 'fs/exofs/exofs.h') diff --git a/fs/exofs/exofs.h b/fs/exofs/exofs.h index acfebd36de8..59b8bf2825c 100644 --- a/fs/exofs/exofs.h +++ b/fs/exofs/exofs.h @@ -63,6 +63,8 @@ struct exofs_layout { unsigned mirrors_p1; unsigned group_width; + u64 group_depth; + unsigned group_count; enum exofs_inode_layout_gen_functions lay_func; @@ -132,6 +134,7 @@ struct exofs_io_state { struct page **pages; unsigned nr_pages; unsigned pgbase; + unsigned pages_consumed; /* Attributes */ unsigned in_attr_len; diff --git a/fs/exofs/ios.c b/fs/exofs/ios.c index d28febdf54a..5293bc411d1 100644 --- a/fs/exofs/ios.c +++ b/fs/exofs/ios.c @@ -262,25 +262,50 @@ int exofs_check_io(struct exofs_io_state *ios, u64 *resid) /* * L - logical offset into the file * - * U - The number of bytes in a full stripe + * U - The number of bytes in a stripe within a group * * U = stripe_unit * group_width * - * N - The stripe number + * T - The number of bytes striped within a group of component objects + * (before advancing to the next group) * - * N = L / U + * T = stripe_unit * group_width * group_depth + * + * S - The number of bytes striped across all component objects + * before the pattern repeats + * + * S = stripe_unit * group_width * group_depth * group_count + * + * M - The "major" (i.e., across all components) stripe number + * + * M = L / S + * + * G - Counts the groups from the beginning of the major stripe + * + * G = (L - (M * S)) / T [or (L % S) / T] + * + * H - The byte offset within the group + * + * H = (L - (M * S)) % T [or (L % S) % T] + * + * N - The "minor" (i.e., across the group) stripe number + * + * N = H / U * * C - The component index coresponding to L * - * C = (L - (N*U)) / stripe_unit + * C = (H - (N * U)) / stripe_unit + G * group_width + * [or (L % U) / stripe_unit + G * group_width] * * O - The component offset coresponding to L * - * (N*stripe_unit)+(L%stripe_unit) + * O = L % stripe_unit + N * stripe_unit + M * group_depth * stripe_unit */ - struct _striping_info { u64 obj_offset; + u64 group_length; + u64 total_group_length; + u64 Major; unsigned dev; unsigned unit_off; }; @@ -290,15 +315,35 @@ static void _calc_stripe_info(struct exofs_io_state *ios, u64 file_offset, { u32 stripe_unit = ios->layout->stripe_unit; u32 group_width = ios->layout->group_width; + u64 group_depth = ios->layout->group_depth; + u32 U = stripe_unit * group_width; + u64 T = U * group_depth; + u64 S = T * ios->layout->group_count; + u64 M = div64_u64(file_offset, S); + + /* + G = (L - (M * S)) / T + H = (L - (M * S)) % T + */ + u64 LmodS = file_offset - M * S; + u32 G = div64_u64(LmodS, T); + u64 H = LmodS - G * T; + + u32 N = div_u64(H, U); + + /* "H - (N * U)" is just "H % U" so it's bound to u32 */ + si->dev = (u32)(H - (N * U)) / stripe_unit + G * group_width; + si->dev *= ios->layout->mirrors_p1; - u32 LmodU; - u64 N = div_u64_rem(file_offset, U, &LmodU); + div_u64_rem(file_offset, stripe_unit, &si->unit_off); - si->unit_off = LmodU % stripe_unit; - si->obj_offset = N * stripe_unit + si->unit_off; - si->dev = LmodU / stripe_unit; - si->dev *= ios->layout->mirrors_p1; + si->obj_offset = si->unit_off + (N * stripe_unit) + + (M * group_depth * stripe_unit); + + si->group_length = T - H; + si->total_group_length = T; + si->Major = M; } static int _add_stripe_unit(struct exofs_io_state *ios, unsigned *cur_pg, @@ -345,16 +390,17 @@ static int _add_stripe_unit(struct exofs_io_state *ios, unsigned *cur_pg, return 0; } -static int _prepare_pages(struct exofs_io_state *ios, - struct _striping_info *si) +static int _prepare_one_group(struct exofs_io_state *ios, u64 length, + struct _striping_info *si, unsigned first_comp) { - u64 length = ios->length; unsigned stripe_unit = ios->layout->stripe_unit; unsigned mirrors_p1 = ios->layout->mirrors_p1; + unsigned devs_in_group = ios->layout->group_width * mirrors_p1; unsigned dev = si->dev; - unsigned comp = 0; - unsigned stripes = 0; - unsigned cur_pg = 0; + unsigned first_dev = dev - (dev % devs_in_group); + unsigned comp = first_comp + (dev - first_dev); + unsigned max_comp = ios->numdevs ? ios->numdevs - mirrors_p1 : 0; + unsigned cur_pg = ios->pages_consumed; int ret = 0; while (length) { @@ -377,10 +423,11 @@ static int _prepare_pages(struct exofs_io_state *ios, cur_len = stripe_unit; } - stripes++; + if (max_comp < comp) + max_comp = comp; dev += mirrors_p1; - dev %= ios->layout->s_numdevs; + dev = (dev % devs_in_group) + first_dev; } else { cur_len = stripe_unit; } @@ -393,18 +440,24 @@ static int _prepare_pages(struct exofs_io_state *ios, goto out; comp += mirrors_p1; - comp %= ios->layout->s_numdevs; + comp = (comp % devs_in_group) + first_comp; length -= cur_len; } out: - ios->numdevs = stripes * mirrors_p1; + ios->numdevs = max_comp + mirrors_p1; + ios->pages_consumed = cur_pg; return ret; } static int _prepare_for_striping(struct exofs_io_state *ios) { + u64 length = ios->length; struct _striping_info si; + unsigned devs_in_group = ios->layout->group_width * + ios->layout->mirrors_p1; + unsigned first_comp = 0; + int ret = 0; _calc_stripe_info(ios, ios->offset, &si); @@ -424,7 +477,31 @@ static int _prepare_for_striping(struct exofs_io_state *ios) return 0; } - return _prepare_pages(ios, &si); + while (length) { + if (length < si.group_length) + si.group_length = length; + + ret = _prepare_one_group(ios, si.group_length, &si, first_comp); + if (unlikely(ret)) + goto out; + + length -= si.group_length; + + si.group_length = si.total_group_length; + si.unit_off = 0; + ++si.Major; + si.obj_offset = si.Major * ios->layout->stripe_unit * + ios->layout->group_depth; + + si.dev = (si.dev - (si.dev % devs_in_group)) + devs_in_group; + si.dev %= ios->layout->s_numdevs; + + first_comp += devs_in_group; + first_comp %= ios->layout->s_numdevs; + } + +out: + return ret; } int exofs_sbi_create(struct exofs_io_state *ios) @@ -482,6 +559,9 @@ static int _sbi_write_mirror(struct exofs_io_state *ios, int cur_comp) unsigned last_comp = cur_comp + ios->layout->mirrors_p1; int ret = 0; + if (ios->pages && !master_dev->length) + return 0; /* Just an empty slot */ + for (; cur_comp < last_comp; ++cur_comp, ++dev) { struct exofs_per_dev_state *per_dev = &ios->per_dev[cur_comp]; struct osd_request *or; @@ -580,6 +660,9 @@ static int _sbi_read_mirror(struct exofs_io_state *ios, unsigned cur_comp) struct exofs_per_dev_state *per_dev = &ios->per_dev[cur_comp]; unsigned first_dev = (unsigned)ios->obj.id; + if (ios->pages && !per_dev->length) + return 0; /* Just an empty slot */ + first_dev = per_dev->dev + first_dev % ios->layout->mirrors_p1; or = osd_start_request(exofs_ios_od(ios, first_dev), GFP_KERNEL); if (unlikely(!or)) { diff --git a/fs/exofs/super.c b/fs/exofs/super.c index 8f4e4b37a57..6cf5e4e84d6 100644 --- a/fs/exofs/super.c +++ b/fs/exofs/super.c @@ -323,11 +323,7 @@ static int _read_and_match_data_map(struct exofs_sb_info *sbi, unsigned numdevs, sbi->data_map.odm_raid_algorithm = le32_to_cpu(dt->dt_data_map.cb_raid_algorithm); -/* FIXME: Only raid0 !group_width/depth for now. if not so, do not mount */ - if (sbi->data_map.odm_group_width || sbi->data_map.odm_group_depth) { - EXOFS_ERR("Group width/depth not supported\n"); - return -EINVAL; - } +/* FIXME: Only raid0 for now. if not so, do not mount */ if (sbi->data_map.odm_num_comps != numdevs) { EXOFS_ERR("odm_num_comps(%u) != numdevs(%u)\n", sbi->data_map.odm_num_comps, numdevs); @@ -343,14 +339,6 @@ static int _read_and_match_data_map(struct exofs_sb_info *sbi, unsigned numdevs, return -EINVAL; } - stripe_length = sbi->data_map.odm_stripe_unit * - (numdevs / (sbi->data_map.odm_mirror_cnt + 1)); - if (stripe_length >= (1ULL << 32)) { - EXOFS_ERR("Total Stripe length(0x%llx)" - " >= 32bit is not supported\n", _LLU(stripe_length)); - return -EINVAL; - } - if (0 != (sbi->data_map.odm_stripe_unit & ~PAGE_MASK)) { EXOFS_ERR("Stripe Unit(0x%llx)" " must be Multples of PAGE_SIZE(0x%lx)\n", @@ -360,8 +348,36 @@ static int _read_and_match_data_map(struct exofs_sb_info *sbi, unsigned numdevs, sbi->layout.stripe_unit = sbi->data_map.odm_stripe_unit; sbi->layout.mirrors_p1 = sbi->data_map.odm_mirror_cnt + 1; - sbi->layout.group_width = sbi->data_map.odm_num_comps / + + if (sbi->data_map.odm_group_width) { + sbi->layout.group_width = sbi->data_map.odm_group_width; + sbi->layout.group_depth = sbi->data_map.odm_group_depth; + if (!sbi->layout.group_depth) { + EXOFS_ERR("group_depth == 0 && group_width != 0\n"); + return -EINVAL; + } + sbi->layout.group_count = sbi->data_map.odm_num_comps / + sbi->layout.mirrors_p1 / + sbi->data_map.odm_group_width; + } else { + if (sbi->data_map.odm_group_depth) { + printk(KERN_NOTICE "Warning: group_depth ignored " + "group_width == 0 && group_depth == %d\n", + sbi->data_map.odm_group_depth); + sbi->data_map.odm_group_depth = 0; + } + sbi->layout.group_width = sbi->data_map.odm_num_comps / sbi->layout.mirrors_p1; + sbi->layout.group_depth = -1; + sbi->layout.group_count = 1; + } + + stripe_length = (u64)sbi->layout.group_width * sbi->layout.stripe_unit; + if (stripe_length >= (1ULL << 32)) { + EXOFS_ERR("Total Stripe length(0x%llx)" + " >= 32bit is not supported\n", _LLU(stripe_length)); + return -EINVAL; + } return 0; } @@ -540,6 +556,8 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent) sbi->layout.stripe_unit = PAGE_SIZE; sbi->layout.mirrors_p1 = 1; sbi->layout.group_width = 1; + sbi->layout.group_depth = -1; + sbi->layout.group_count = 1; sbi->layout.s_ods[0] = od; sbi->layout.s_numdevs = 1; sbi->layout.s_pid = opts->pid; -- cgit v1.2.3-70-g09d2 From a9185b41a4f84971b930c519f0c63bd450c4810d Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 5 Mar 2010 09:21:37 +0100 Subject: pass writeback_control to ->write_inode This gives the filesystem more information about the writeback that is happening. Trond requested this for the NFS unstable write handling, and other filesystems might benefit from this too by beeing able to distinguish between the different callers in more detail. Signed-off-by: Christoph Hellwig Signed-off-by: Al Viro --- fs/adfs/adfs.h | 2 +- fs/adfs/inode.c | 5 +++-- fs/affs/affs.h | 3 ++- fs/affs/inode.c | 2 +- fs/bfs/inode.c | 5 +++-- fs/btrfs/ctree.h | 2 +- fs/btrfs/inode.c | 4 ++-- fs/exofs/exofs.h | 2 +- fs/exofs/inode.c | 4 ++-- fs/ext2/ext2.h | 2 +- fs/ext2/inode.c | 11 +++++++++-- fs/ext3/inode.c | 4 ++-- fs/ext4/ext4.h | 2 +- fs/ext4/inode.c | 6 +++--- fs/fat/inode.c | 9 +++++++-- fs/fs-writeback.c | 11 +++++------ fs/gfs2/super.c | 5 +++-- fs/hfs/hfs_fs.h | 2 +- fs/hfs/inode.c | 2 +- fs/hfsplus/super.c | 3 ++- fs/jfs/inode.c | 5 ++++- fs/jfs/jfs_inode.h | 2 +- fs/minix/inode.c | 8 +++++--- fs/nfs/inode.c | 5 +++-- fs/nfs/internal.h | 2 +- fs/ntfs/dir.c | 2 +- fs/ntfs/file.c | 2 +- fs/ntfs/inode.c | 2 +- fs/ntfs/inode.h | 4 ++-- fs/ntfs/super.c | 8 ++++++++ fs/omfs/inode.c | 10 ++++++++-- fs/reiserfs/inode.c | 4 ++-- fs/sysv/inode.c | 10 ++++++++-- fs/sysv/sysv.h | 2 +- fs/ubifs/dir.c | 2 +- fs/ubifs/file.c | 8 ++++---- fs/ubifs/super.c | 2 +- fs/udf/inode.c | 4 ++-- fs/udf/udfdecl.h | 2 +- fs/ufs/inode.c | 5 +++-- fs/ufs/ufs.h | 2 +- fs/xfs/linux-2.6/xfs_super.c | 4 ++-- include/linux/ext3_fs.h | 2 +- include/linux/fs.h | 2 +- include/linux/reiserfs_fs.h | 2 +- 45 files changed, 115 insertions(+), 72 deletions(-) (limited to 'fs/exofs/exofs.h') diff --git a/fs/adfs/adfs.h b/fs/adfs/adfs.h index 9cc18775b83..2ff622f6f54 100644 --- a/fs/adfs/adfs.h +++ b/fs/adfs/adfs.h @@ -121,7 +121,7 @@ struct adfs_discmap { /* Inode stuff */ struct inode *adfs_iget(struct super_block *sb, struct object_info *obj); -int adfs_write_inode(struct inode *inode,int unused); +int adfs_write_inode(struct inode *inode, struct writeback_control *wbc); int adfs_notify_change(struct dentry *dentry, struct iattr *attr); /* map.c */ diff --git a/fs/adfs/inode.c b/fs/adfs/inode.c index 3f57ce4bee5..0f5e3097813 100644 --- a/fs/adfs/inode.c +++ b/fs/adfs/inode.c @@ -9,6 +9,7 @@ */ #include #include +#include #include "adfs.h" /* @@ -360,7 +361,7 @@ out: * The adfs-specific inode data has already been updated by * adfs_notify_change() */ -int adfs_write_inode(struct inode *inode, int wait) +int adfs_write_inode(struct inode *inode, struct writeback_control *wbc) { struct super_block *sb = inode->i_sb; struct object_info obj; @@ -375,7 +376,7 @@ int adfs_write_inode(struct inode *inode, int wait) obj.attr = ADFS_I(inode)->attr; obj.size = inode->i_size; - ret = adfs_dir_update(sb, &obj, wait); + ret = adfs_dir_update(sb, &obj, wbc->sync_mode == WB_SYNC_ALL); unlock_kernel(); return ret; } diff --git a/fs/affs/affs.h b/fs/affs/affs.h index 0e40caaba45..861dae68ac1 100644 --- a/fs/affs/affs.h +++ b/fs/affs/affs.h @@ -175,7 +175,8 @@ extern void affs_delete_inode(struct inode *inode); extern void affs_clear_inode(struct inode *inode); extern struct inode *affs_iget(struct super_block *sb, unsigned long ino); -extern int affs_write_inode(struct inode *inode, int); +extern int affs_write_inode(struct inode *inode, + struct writeback_control *wbc); extern int affs_add_entry(struct inode *dir, struct inode *inode, struct dentry *dentry, s32 type); /* file.c */ diff --git a/fs/affs/inode.c b/fs/affs/inode.c index 3c4ec7d864c..c9744d771d9 100644 --- a/fs/affs/inode.c +++ b/fs/affs/inode.c @@ -166,7 +166,7 @@ bad_inode: } int -affs_write_inode(struct inode *inode, int unused) +affs_write_inode(struct inode *inode, struct writeback_control *wbc) { struct super_block *sb = inode->i_sb; struct buffer_head *bh; diff --git a/fs/bfs/inode.c b/fs/bfs/inode.c index 8f3d9fd8960..f22a7d3dc36 100644 --- a/fs/bfs/inode.c +++ b/fs/bfs/inode.c @@ -15,6 +15,7 @@ #include #include #include +#include #include #include "bfs.h" @@ -98,7 +99,7 @@ error: return ERR_PTR(-EIO); } -static int bfs_write_inode(struct inode *inode, int wait) +static int bfs_write_inode(struct inode *inode, struct writeback_control *wbc) { struct bfs_sb_info *info = BFS_SB(inode->i_sb); unsigned int ino = (u16)inode->i_ino; @@ -147,7 +148,7 @@ static int bfs_write_inode(struct inode *inode, int wait) di->i_eoffset = cpu_to_le32(i_sblock * BFS_BSIZE + inode->i_size - 1); mark_buffer_dirty(bh); - if (wait) { + if (wbc->sync_mode == WB_SYNC_ALL) { sync_dirty_buffer(bh); if (buffer_req(bh) && !buffer_uptodate(bh)) err = -EIO; diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 2aa8ec6a098..8b5cfdd4bfc 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -2326,7 +2326,7 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf); int btrfs_readpage(struct file *file, struct page *page); void btrfs_delete_inode(struct inode *inode); void btrfs_put_inode(struct inode *inode); -int btrfs_write_inode(struct inode *inode, int wait); +int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc); void btrfs_dirty_inode(struct inode *inode); struct inode *btrfs_alloc_inode(struct super_block *sb); void btrfs_destroy_inode(struct inode *inode); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 4deb280f896..c41db6d45ab 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -3968,7 +3968,7 @@ err: return ret; } -int btrfs_write_inode(struct inode *inode, int wait) +int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc) { struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_trans_handle *trans; @@ -3977,7 +3977,7 @@ int btrfs_write_inode(struct inode *inode, int wait) if (root->fs_info->btree_inode == inode) return 0; - if (wait) { + if (wbc->sync_mode == WB_SYNC_ALL) { trans = btrfs_join_transaction(root, 1); btrfs_set_trans_block_group(trans, inode); ret = btrfs_commit_transaction(trans, root); diff --git a/fs/exofs/exofs.h b/fs/exofs/exofs.h index 59b8bf2825c..8442e353309 100644 --- a/fs/exofs/exofs.h +++ b/fs/exofs/exofs.h @@ -261,7 +261,7 @@ int exofs_write_begin(struct file *file, struct address_space *mapping, struct page **pagep, void **fsdata); extern struct inode *exofs_iget(struct super_block *, unsigned long); struct inode *exofs_new_inode(struct inode *, int); -extern int exofs_write_inode(struct inode *, int); +extern int exofs_write_inode(struct inode *, struct writeback_control *wbc); extern void exofs_delete_inode(struct inode *); /* dir.c: */ diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c index 5514f3c2c2f..a17e4b733e3 100644 --- a/fs/exofs/inode.c +++ b/fs/exofs/inode.c @@ -1280,9 +1280,9 @@ out: return ret; } -int exofs_write_inode(struct inode *inode, int wait) +int exofs_write_inode(struct inode *inode, struct writeback_control *wbc) { - return exofs_update_inode(inode, wait); + return exofs_update_inode(inode, wbc->sync_mode == WB_SYNC_ALL); } /* diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h index 061914add3c..0b038e47ad2 100644 --- a/fs/ext2/ext2.h +++ b/fs/ext2/ext2.h @@ -118,7 +118,7 @@ extern unsigned long ext2_count_free (struct buffer_head *, unsigned); /* inode.c */ extern struct inode *ext2_iget (struct super_block *, unsigned long); -extern int ext2_write_inode (struct inode *, int); +extern int ext2_write_inode (struct inode *, struct writeback_control *); extern void ext2_delete_inode (struct inode *); extern int ext2_sync_inode (struct inode *); extern int ext2_get_block(struct inode *, sector_t, struct buffer_head *, int); diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c index 71b032c65a0..36ae1cac767 100644 --- a/fs/ext2/inode.c +++ b/fs/ext2/inode.c @@ -41,6 +41,8 @@ MODULE_AUTHOR("Remy Card and others"); MODULE_DESCRIPTION("Second Extended Filesystem"); MODULE_LICENSE("GPL"); +static int __ext2_write_inode(struct inode *inode, int do_sync); + /* * Test whether an inode is a fast symlink. */ @@ -64,7 +66,7 @@ void ext2_delete_inode (struct inode * inode) goto no_delete; EXT2_I(inode)->i_dtime = get_seconds(); mark_inode_dirty(inode); - ext2_write_inode(inode, inode_needs_sync(inode)); + __ext2_write_inode(inode, inode_needs_sync(inode)); inode->i_size = 0; if (inode->i_blocks) @@ -1335,7 +1337,7 @@ bad_inode: return ERR_PTR(ret); } -int ext2_write_inode(struct inode *inode, int do_sync) +static int __ext2_write_inode(struct inode *inode, int do_sync) { struct ext2_inode_info *ei = EXT2_I(inode); struct super_block *sb = inode->i_sb; @@ -1440,6 +1442,11 @@ int ext2_write_inode(struct inode *inode, int do_sync) return err; } +int ext2_write_inode(struct inode *inode, struct writeback_control *wbc) +{ + return __ext2_write_inode(inode, wbc->sync_mode == WB_SYNC_ALL); +} + int ext2_sync_inode(struct inode *inode) { struct writeback_control wbc = { diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c index 455e6e6e5cb..7aca55fcc97 100644 --- a/fs/ext3/inode.c +++ b/fs/ext3/inode.c @@ -3096,7 +3096,7 @@ out_brelse: * `stuff()' is running, and the new i_size will be lost. Plus the inode * will no longer be on the superblock's dirty inode list. */ -int ext3_write_inode(struct inode *inode, int wait) +int ext3_write_inode(struct inode *inode, struct writeback_control *wbc) { if (current->flags & PF_MEMALLOC) return 0; @@ -3107,7 +3107,7 @@ int ext3_write_inode(struct inode *inode, int wait) return -EIO; } - if (!wait) + if (wbc->sync_mode != WB_SYNC_ALL) return 0; return ext3_force_commit(inode->i_sb); diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 4cedc91ec59..50af1a2c65e 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1416,7 +1416,7 @@ int ext4_get_block(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create); extern struct inode *ext4_iget(struct super_block *, unsigned long); -extern int ext4_write_inode(struct inode *, int); +extern int ext4_write_inode(struct inode *, struct writeback_control *); extern int ext4_setattr(struct dentry *, struct iattr *); extern int ext4_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat); diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index e11952404e0..d01a6cdbf85 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -5177,7 +5177,7 @@ out_brelse: * `stuff()' is running, and the new i_size will be lost. Plus the inode * will no longer be on the superblock's dirty inode list. */ -int ext4_write_inode(struct inode *inode, int wait) +int ext4_write_inode(struct inode *inode, struct writeback_control *wbc) { int err; @@ -5191,7 +5191,7 @@ int ext4_write_inode(struct inode *inode, int wait) return -EIO; } - if (!wait) + if (wbc->sync_mode != WB_SYNC_ALL) return 0; err = ext4_force_commit(inode->i_sb); @@ -5201,7 +5201,7 @@ int ext4_write_inode(struct inode *inode, int wait) err = ext4_get_inode_loc(inode, &iloc); if (err) return err; - if (wait) + if (wbc->sync_mode == WB_SYNC_ALL) sync_dirty_buffer(iloc.bh); if (buffer_req(iloc.bh) && !buffer_uptodate(iloc.bh)) { ext4_error(inode->i_sb, __func__, diff --git a/fs/fat/inode.c b/fs/fat/inode.c index 14da530b05c..fbeecdc194d 100644 --- a/fs/fat/inode.c +++ b/fs/fat/inode.c @@ -577,7 +577,7 @@ static inline loff_t fat_i_pos_read(struct msdos_sb_info *sbi, return i_pos; } -static int fat_write_inode(struct inode *inode, int wait) +static int __fat_write_inode(struct inode *inode, int wait) { struct super_block *sb = inode->i_sb; struct msdos_sb_info *sbi = MSDOS_SB(sb); @@ -634,9 +634,14 @@ retry: return err; } +static int fat_write_inode(struct inode *inode, struct writeback_control *wbc) +{ + return __fat_write_inode(inode, wbc->sync_mode == WB_SYNC_ALL); +} + int fat_sync_inode(struct inode *inode) { - return fat_write_inode(inode, 1); + return __fat_write_inode(inode, 1); } EXPORT_SYMBOL_GPL(fat_sync_inode); diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 5f2721b1e4b..76fc4d594ac 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -381,10 +381,10 @@ static void queue_io(struct bdi_writeback *wb, unsigned long *older_than_this) move_expired_inodes(&wb->b_dirty, &wb->b_io, older_than_this); } -static int write_inode(struct inode *inode, int sync) +static int write_inode(struct inode *inode, struct writeback_control *wbc) { if (inode->i_sb->s_op->write_inode && !is_bad_inode(inode)) - return inode->i_sb->s_op->write_inode(inode, sync); + return inode->i_sb->s_op->write_inode(inode, wbc); return 0; } @@ -421,7 +421,6 @@ static int writeback_single_inode(struct inode *inode, struct writeback_control *wbc) { struct address_space *mapping = inode->i_mapping; - int wait = wbc->sync_mode == WB_SYNC_ALL; unsigned dirty; int ret; @@ -439,7 +438,7 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc) * We'll have another go at writing back this inode when we * completed a full scan of b_io. */ - if (!wait) { + if (wbc->sync_mode != WB_SYNC_ALL) { requeue_io(inode); return 0; } @@ -466,7 +465,7 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc) * This is important for filesystems that modify metadata on data * I/O completion. */ - if (wait) { + if (wbc->sync_mode == WB_SYNC_ALL) { int err = filemap_fdatawait(mapping); if (ret == 0) ret = err; @@ -474,7 +473,7 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc) /* Don't write the inode if only I_DIRTY_PAGES was set */ if (dirty & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) { - int err = write_inode(inode, wait); + int err = write_inode(inode, wbc); if (ret == 0) ret = err; } diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index e5e22629da6..ca87598ead7 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c @@ -22,6 +22,7 @@ #include #include #include +#include #include "gfs2.h" #include "incore.h" @@ -711,7 +712,7 @@ void gfs2_unfreeze_fs(struct gfs2_sbd *sdp) * Returns: errno */ -static int gfs2_write_inode(struct inode *inode, int sync) +static int gfs2_write_inode(struct inode *inode, struct writeback_control *wbc) { struct gfs2_inode *ip = GFS2_I(inode); struct gfs2_sbd *sdp = GFS2_SB(inode); @@ -745,7 +746,7 @@ static int gfs2_write_inode(struct inode *inode, int sync) do_unlock: gfs2_glock_dq_uninit(&gh); do_flush: - if (sync != 0) + if (wbc->sync_mode == WB_SYNC_ALL) gfs2_log_flush(GFS2_SB(inode), ip->i_gl); return ret; } diff --git a/fs/hfs/hfs_fs.h b/fs/hfs/hfs_fs.h index 052387e1167..fe35e3b626c 100644 --- a/fs/hfs/hfs_fs.h +++ b/fs/hfs/hfs_fs.h @@ -188,7 +188,7 @@ extern const struct address_space_operations hfs_btree_aops; extern struct inode *hfs_new_inode(struct inode *, struct qstr *, int); extern void hfs_inode_write_fork(struct inode *, struct hfs_extent *, __be32 *, __be32 *); -extern int hfs_write_inode(struct inode *, int); +extern int hfs_write_inode(struct inode *, struct writeback_control *); extern int hfs_inode_setattr(struct dentry *, struct iattr *); extern void hfs_inode_read_fork(struct inode *inode, struct hfs_extent *ext, __be32 log_size, __be32 phys_size, u32 clump_size); diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c index a1cbff2b4d9..14f5cb1b9fd 100644 --- a/fs/hfs/inode.c +++ b/fs/hfs/inode.c @@ -381,7 +381,7 @@ void hfs_inode_write_fork(struct inode *inode, struct hfs_extent *ext, HFS_SB(inode->i_sb)->alloc_blksz); } -int hfs_write_inode(struct inode *inode, int unused) +int hfs_write_inode(struct inode *inode, struct writeback_control *wbc) { struct inode *main_inode = inode; struct hfs_find_data fd; diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c index 43022f3d514..74b473a8ef9 100644 --- a/fs/hfsplus/super.c +++ b/fs/hfsplus/super.c @@ -87,7 +87,8 @@ bad_inode: return ERR_PTR(err); } -static int hfsplus_write_inode(struct inode *inode, int unused) +static int hfsplus_write_inode(struct inode *inode, + struct writeback_control *wbc) { struct hfsplus_vh *vhdr; int ret = 0; diff --git a/fs/jfs/inode.c b/fs/jfs/inode.c index b2ae190a77b..182b78cc3e6 100644 --- a/fs/jfs/inode.c +++ b/fs/jfs/inode.c @@ -22,6 +22,7 @@ #include #include #include +#include #include "jfs_incore.h" #include "jfs_inode.h" #include "jfs_filsys.h" @@ -120,8 +121,10 @@ int jfs_commit_inode(struct inode *inode, int wait) return rc; } -int jfs_write_inode(struct inode *inode, int wait) +int jfs_write_inode(struct inode *inode, struct writeback_control *wbc) { + int wait = wbc->sync_mode == WB_SYNC_ALL; + if (test_cflag(COMMIT_Nolink, inode)) return 0; /* diff --git a/fs/jfs/jfs_inode.h b/fs/jfs/jfs_inode.h index 1eff7db34d6..15902b03c2a 100644 --- a/fs/jfs/jfs_inode.h +++ b/fs/jfs/jfs_inode.h @@ -26,7 +26,7 @@ extern long jfs_ioctl(struct file *, unsigned int, unsigned long); extern long jfs_compat_ioctl(struct file *, unsigned int, unsigned long); extern struct inode *jfs_iget(struct super_block *, unsigned long); extern int jfs_commit_inode(struct inode *, int); -extern int jfs_write_inode(struct inode*, int); +extern int jfs_write_inode(struct inode *, struct writeback_control *); extern void jfs_delete_inode(struct inode *); extern void jfs_dirty_inode(struct inode *); extern void jfs_truncate(struct inode *); diff --git a/fs/minix/inode.c b/fs/minix/inode.c index 74ea82d7216..756f8c93780 100644 --- a/fs/minix/inode.c +++ b/fs/minix/inode.c @@ -17,8 +17,10 @@ #include #include #include +#include -static int minix_write_inode(struct inode * inode, int wait); +static int minix_write_inode(struct inode *inode, + struct writeback_control *wbc); static int minix_statfs(struct dentry *dentry, struct kstatfs *buf); static int minix_remount (struct super_block * sb, int * flags, char * data); @@ -552,7 +554,7 @@ static struct buffer_head * V2_minix_update_inode(struct inode * inode) return bh; } -static int minix_write_inode(struct inode *inode, int wait) +static int minix_write_inode(struct inode *inode, struct writeback_control *wbc) { int err = 0; struct buffer_head *bh; @@ -563,7 +565,7 @@ static int minix_write_inode(struct inode *inode, int wait) bh = V2_minix_update_inode(inode); if (!bh) return -EIO; - if (wait && buffer_dirty(bh)) { + if (wbc->sync_mode == WB_SYNC_ALL && buffer_dirty(bh)) { sync_dirty_buffer(bh); if (buffer_req(bh) && !buffer_uptodate(bh)) { printk("IO error syncing minix inode [%s:%08lx]\n", diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 5ecd952cae1..7f9ecc46f3f 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -97,11 +97,12 @@ u64 nfs_compat_user_ino64(u64 fileid) return ino; } -int nfs_write_inode(struct inode *inode, int sync) +int nfs_write_inode(struct inode *inode, struct writeback_control *wbc) { int ret; - ret = nfs_commit_inode(inode, sync ? FLUSH_SYNC : 0); + ret = nfs_commit_inode(inode, + wbc->sync_mode == WB_SYNC_ALL ? FLUSH_SYNC : 0); if (ret >= 0) return 0; __mark_inode_dirty(inode, I_DIRTY_DATASYNC); diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index 29e464d23b3..11f82f03c5d 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -211,7 +211,7 @@ extern int nfs_access_cache_shrinker(int nr_to_scan, gfp_t gfp_mask); extern struct workqueue_struct *nfsiod_workqueue; extern struct inode *nfs_alloc_inode(struct super_block *sb); extern void nfs_destroy_inode(struct inode *); -extern int nfs_write_inode(struct inode *,int); +extern int nfs_write_inode(struct inode *, struct writeback_control *); extern void nfs_clear_inode(struct inode *); #ifdef CONFIG_NFS_V4 extern void nfs4_clear_inode(struct inode *); diff --git a/fs/ntfs/dir.c b/fs/ntfs/dir.c index 5a9e34475e3..9173e82a45d 100644 --- a/fs/ntfs/dir.c +++ b/fs/ntfs/dir.c @@ -1545,7 +1545,7 @@ static int ntfs_dir_fsync(struct file *filp, struct dentry *dentry, write_inode_now(bmp_vi, !datasync); iput(bmp_vi); } - ret = ntfs_write_inode(vi, 1); + ret = __ntfs_write_inode(vi, 1); write_inode_now(vi, !datasync); err = sync_blockdev(vi->i_sb->s_bdev); if (unlikely(err && !ret)) diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c index 43179ddd336..b681c71d706 100644 --- a/fs/ntfs/file.c +++ b/fs/ntfs/file.c @@ -2182,7 +2182,7 @@ static int ntfs_file_fsync(struct file *filp, struct dentry *dentry, ntfs_debug("Entering for inode 0x%lx.", vi->i_ino); BUG_ON(S_ISDIR(vi->i_mode)); if (!datasync || !NInoNonResident(NTFS_I(vi))) - ret = ntfs_write_inode(vi, 1); + ret = __ntfs_write_inode(vi, 1); write_inode_now(vi, !datasync); /* * NOTE: If we were to use mapping->private_list (see ext2 and diff --git a/fs/ntfs/inode.c b/fs/ntfs/inode.c index dc2505abb6d..4b57fb1eac2 100644 --- a/fs/ntfs/inode.c +++ b/fs/ntfs/inode.c @@ -2957,7 +2957,7 @@ out: * * Return 0 on success and -errno on error. */ -int ntfs_write_inode(struct inode *vi, int sync) +int __ntfs_write_inode(struct inode *vi, int sync) { sle64 nt; ntfs_inode *ni = NTFS_I(vi); diff --git a/fs/ntfs/inode.h b/fs/ntfs/inode.h index 117eaf8032a..9a113544605 100644 --- a/fs/ntfs/inode.h +++ b/fs/ntfs/inode.h @@ -307,12 +307,12 @@ extern void ntfs_truncate_vfs(struct inode *vi); extern int ntfs_setattr(struct dentry *dentry, struct iattr *attr); -extern int ntfs_write_inode(struct inode *vi, int sync); +extern int __ntfs_write_inode(struct inode *vi, int sync); static inline void ntfs_commit_inode(struct inode *vi) { if (!is_bad_inode(vi)) - ntfs_write_inode(vi, 1); + __ntfs_write_inode(vi, 1); return; } diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c index 80b04770e8e..1cf39dfaee7 100644 --- a/fs/ntfs/super.c +++ b/fs/ntfs/super.c @@ -39,6 +39,7 @@ #include "dir.h" #include "debug.h" #include "index.h" +#include "inode.h" #include "aops.h" #include "layout.h" #include "malloc.h" @@ -2662,6 +2663,13 @@ static int ntfs_statfs(struct dentry *dentry, struct kstatfs *sfs) return 0; } +#ifdef NTFS_RW +static int ntfs_write_inode(struct inode *vi, struct writeback_control *wbc) +{ + return __ntfs_write_inode(vi, wbc->sync_mode == WB_SYNC_ALL); +} +#endif + /** * The complete super operations. */ diff --git a/fs/omfs/inode.c b/fs/omfs/inode.c index f3b7c1541f3..75d9b5ba1d4 100644 --- a/fs/omfs/inode.c +++ b/fs/omfs/inode.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include "omfs.h" @@ -89,7 +90,7 @@ static void omfs_update_checksums(struct omfs_inode *oi) oi->i_head.h_check_xor = xor; } -static int omfs_write_inode(struct inode *inode, int wait) +static int __omfs_write_inode(struct inode *inode, int wait) { struct omfs_inode *oi; struct omfs_sb_info *sbi = OMFS_SB(inode->i_sb); @@ -162,9 +163,14 @@ out: return ret; } +static int omfs_write_inode(struct inode *inode, struct writeback_control *wbc) +{ + return __omfs_write_inode(inode, wbc->sync_mode == WB_SYNC_ALL); +} + int omfs_sync_inode(struct inode *inode) { - return omfs_write_inode(inode, 1); + return __omfs_write_inode(inode, 1); } /* diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c index 2df0f5c7c60..0d651f980a8 100644 --- a/fs/reiserfs/inode.c +++ b/fs/reiserfs/inode.c @@ -1615,7 +1615,7 @@ int reiserfs_encode_fh(struct dentry *dentry, __u32 * data, int *lenp, ** to properly mark inodes for datasync and such, but only actually ** does something when called for a synchronous update. */ -int reiserfs_write_inode(struct inode *inode, int do_sync) +int reiserfs_write_inode(struct inode *inode, struct writeback_control *wbc) { struct reiserfs_transaction_handle th; int jbegin_count = 1; @@ -1627,7 +1627,7 @@ int reiserfs_write_inode(struct inode *inode, int do_sync) ** inode needs to reach disk for safety, and they can safely be ** ignored because the altered inode has already been logged. */ - if (do_sync && !(current->flags & PF_MEMALLOC)) { + if (wbc->sync_mode == WB_SYNC_ALL && !(current->flags & PF_MEMALLOC)) { reiserfs_write_lock(inode->i_sb); if (!journal_begin(&th, inode->i_sb, jbegin_count)) { reiserfs_update_sd(&th, inode); diff --git a/fs/sysv/inode.c b/fs/sysv/inode.c index 9824743832a..4573734d723 100644 --- a/fs/sysv/inode.c +++ b/fs/sysv/inode.c @@ -26,6 +26,7 @@ #include #include #include +#include #include #include #include "sysv.h" @@ -246,7 +247,7 @@ bad_inode: return ERR_PTR(-EIO); } -int sysv_write_inode(struct inode *inode, int wait) +static int __sysv_write_inode(struct inode *inode, int wait) { struct super_block * sb = inode->i_sb; struct sysv_sb_info * sbi = SYSV_SB(sb); @@ -296,9 +297,14 @@ int sysv_write_inode(struct inode *inode, int wait) return 0; } +int sysv_write_inode(struct inode *inode, struct writeback_control *wbc) +{ + return __sysv_write_inode(inode, wbc->sync_mode == WB_SYNC_ALL); +} + int sysv_sync_inode(struct inode *inode) { - return sysv_write_inode(inode, 1); + return __sysv_write_inode(inode, 1); } static void sysv_delete_inode(struct inode *inode) diff --git a/fs/sysv/sysv.h b/fs/sysv/sysv.h index 53786eb5cf6..94cb9b4d76c 100644 --- a/fs/sysv/sysv.h +++ b/fs/sysv/sysv.h @@ -142,7 +142,7 @@ extern int __sysv_write_begin(struct file *file, struct address_space *mapping, /* inode.c */ extern struct inode *sysv_iget(struct super_block *, unsigned int); -extern int sysv_write_inode(struct inode *, int); +extern int sysv_write_inode(struct inode *, struct writeback_control *wbc); extern int sysv_sync_inode(struct inode *); extern void sysv_set_inode(struct inode *, dev_t); extern int sysv_getattr(struct vfsmount *, struct dentry *, struct kstat *); diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c index 552fb0111ff..401e503d44a 100644 --- a/fs/ubifs/dir.c +++ b/fs/ubifs/dir.c @@ -1120,7 +1120,7 @@ static int ubifs_rename(struct inode *old_dir, struct dentry *old_dentry, if (release) ubifs_release_budget(c, &ino_req); if (IS_SYNC(old_inode)) - err = old_inode->i_sb->s_op->write_inode(old_inode, 1); + err = old_inode->i_sb->s_op->write_inode(old_inode, NULL); return err; out_cancel: diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c index 16a6444330e..e26c02ab6cd 100644 --- a/fs/ubifs/file.c +++ b/fs/ubifs/file.c @@ -1011,7 +1011,7 @@ static int ubifs_writepage(struct page *page, struct writeback_control *wbc) /* Is the page fully inside @i_size? */ if (page->index < end_index) { if (page->index >= synced_i_size >> PAGE_CACHE_SHIFT) { - err = inode->i_sb->s_op->write_inode(inode, 1); + err = inode->i_sb->s_op->write_inode(inode, NULL); if (err) goto out_unlock; /* @@ -1039,7 +1039,7 @@ static int ubifs_writepage(struct page *page, struct writeback_control *wbc) kunmap_atomic(kaddr, KM_USER0); if (i_size > synced_i_size) { - err = inode->i_sb->s_op->write_inode(inode, 1); + err = inode->i_sb->s_op->write_inode(inode, NULL); if (err) goto out_unlock; } @@ -1242,7 +1242,7 @@ static int do_setattr(struct ubifs_info *c, struct inode *inode, if (release) ubifs_release_budget(c, &req); if (IS_SYNC(inode)) - err = inode->i_sb->s_op->write_inode(inode, 1); + err = inode->i_sb->s_op->write_inode(inode, NULL); return err; out: @@ -1316,7 +1316,7 @@ int ubifs_fsync(struct file *file, struct dentry *dentry, int datasync) * the inode unless this is a 'datasync()' call. */ if (!datasync || (inode->i_state & I_DIRTY_DATASYNC)) { - err = inode->i_sb->s_op->write_inode(inode, 1); + err = inode->i_sb->s_op->write_inode(inode, NULL); if (err) return err; } diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c index 43f9d19a6f3..4d2f2157dd3 100644 --- a/fs/ubifs/super.c +++ b/fs/ubifs/super.c @@ -283,7 +283,7 @@ static void ubifs_destroy_inode(struct inode *inode) /* * Note, Linux write-back code calls this without 'i_mutex'. */ -static int ubifs_write_inode(struct inode *inode, int wait) +static int ubifs_write_inode(struct inode *inode, struct writeback_control *wbc) { int err = 0; struct ubifs_info *c = inode->i_sb->s_fs_info; diff --git a/fs/udf/inode.c b/fs/udf/inode.c index 378a7592257..b0208924729 100644 --- a/fs/udf/inode.c +++ b/fs/udf/inode.c @@ -1373,12 +1373,12 @@ static mode_t udf_convert_permissions(struct fileEntry *fe) return mode; } -int udf_write_inode(struct inode *inode, int sync) +int udf_write_inode(struct inode *inode, struct writeback_control *wbc) { int ret; lock_kernel(); - ret = udf_update_inode(inode, sync); + ret = udf_update_inode(inode, wbc->sync_mode == WB_SYNC_ALL); unlock_kernel(); return ret; diff --git a/fs/udf/udfdecl.h b/fs/udf/udfdecl.h index 8d46f4294ee..4223ac855da 100644 --- a/fs/udf/udfdecl.h +++ b/fs/udf/udfdecl.h @@ -142,7 +142,7 @@ extern void udf_truncate(struct inode *); extern void udf_read_inode(struct inode *); extern void udf_delete_inode(struct inode *); extern void udf_clear_inode(struct inode *); -extern int udf_write_inode(struct inode *, int); +extern int udf_write_inode(struct inode *, struct writeback_control *wbc); extern long udf_block_map(struct inode *, sector_t); extern int udf_extend_file(struct inode *, struct extent_position *, struct kernel_long_ad *, sector_t); diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c index 7cf33379fd4..0a627e08610 100644 --- a/fs/ufs/inode.c +++ b/fs/ufs/inode.c @@ -36,6 +36,7 @@ #include #include #include +#include #include "ufs_fs.h" #include "ufs.h" @@ -890,11 +891,11 @@ static int ufs_update_inode(struct inode * inode, int do_sync) return 0; } -int ufs_write_inode (struct inode * inode, int wait) +int ufs_write_inode(struct inode *inode, struct writeback_control *wbc) { int ret; lock_kernel(); - ret = ufs_update_inode (inode, wait); + ret = ufs_update_inode(inode, wbc->sync_mode == WB_SYNC_ALL); unlock_kernel(); return ret; } diff --git a/fs/ufs/ufs.h b/fs/ufs/ufs.h index 01d0e2a3b23..43f9f5d5670 100644 --- a/fs/ufs/ufs.h +++ b/fs/ufs/ufs.h @@ -106,7 +106,7 @@ extern struct inode * ufs_new_inode (struct inode *, int); /* inode.c */ extern struct inode *ufs_iget(struct super_block *, unsigned long); -extern int ufs_write_inode (struct inode *, int); +extern int ufs_write_inode (struct inode *, struct writeback_control *); extern int ufs_sync_inode (struct inode *); extern void ufs_delete_inode (struct inode *); extern struct buffer_head * ufs_bread (struct inode *, unsigned, int, int *); diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c index 8f117db6070..71345a370d9 100644 --- a/fs/xfs/linux-2.6/xfs_super.c +++ b/fs/xfs/linux-2.6/xfs_super.c @@ -1063,7 +1063,7 @@ xfs_log_inode( STATIC int xfs_fs_write_inode( struct inode *inode, - int sync) + struct writeback_control *wbc) { struct xfs_inode *ip = XFS_I(inode); struct xfs_mount *mp = ip->i_mount; @@ -1074,7 +1074,7 @@ xfs_fs_write_inode( if (XFS_FORCED_SHUTDOWN(mp)) return XFS_ERROR(EIO); - if (sync) { + if (wbc->sync_mode == WB_SYNC_ALL) { /* * Make sure the inode has hit stable storage. By using the * log and the fsync transactions we reduce the IOs we have diff --git a/include/linux/ext3_fs.h b/include/linux/ext3_fs.h index 6b049030fbe..deac2566450 100644 --- a/include/linux/ext3_fs.h +++ b/include/linux/ext3_fs.h @@ -877,7 +877,7 @@ int ext3_get_blocks_handle(handle_t *handle, struct inode *inode, int create); extern struct inode *ext3_iget(struct super_block *, unsigned long); -extern int ext3_write_inode (struct inode *, int); +extern int ext3_write_inode (struct inode *, struct writeback_control *); extern int ext3_setattr (struct dentry *, struct iattr *); extern void ext3_delete_inode (struct inode *); extern int ext3_sync_inode (handle_t *, struct inode *); diff --git a/include/linux/fs.h b/include/linux/fs.h index 5b3182c7eb5..45689621a85 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1557,7 +1557,7 @@ struct super_operations { void (*destroy_inode)(struct inode *); void (*dirty_inode) (struct inode *); - int (*write_inode) (struct inode *, int); + int (*write_inode) (struct inode *, struct writeback_control *wbc); void (*drop_inode) (struct inode *); void (*delete_inode) (struct inode *); void (*put_super) (struct super_block *); diff --git a/include/linux/reiserfs_fs.h b/include/linux/reiserfs_fs.h index 1ba3cf6edfb..3b603f47418 100644 --- a/include/linux/reiserfs_fs.h +++ b/include/linux/reiserfs_fs.h @@ -2034,7 +2034,7 @@ void reiserfs_read_locked_inode(struct inode *inode, int reiserfs_find_actor(struct inode *inode, void *p); int reiserfs_init_locked_inode(struct inode *inode, void *p); void reiserfs_delete_inode(struct inode *inode); -int reiserfs_write_inode(struct inode *inode, int); +int reiserfs_write_inode(struct inode *inode, struct writeback_control *wbc); int reiserfs_get_block(struct inode *inode, sector_t block, struct buffer_head *bh_result, int create); struct dentry *reiserfs_fh_to_dentry(struct super_block *sb, struct fid *fid, -- cgit v1.2.3-70-g09d2