From 0e0243dc35a2349b3946e54f90e874be396fdb8b Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Thu, 12 Jan 2012 10:06:05 +0300 Subject: NFS: add an endian notation This function returns a big endian value. The implementation in fs/nfs/callback_proc.c is declared with "__be32" but the .h file uses "unsigned" instead. It makes sparse complain: fs/nfs/callback_proc.c:232:8: error: symbol 'nfs4_callback_layoutrecall' redeclared with different type (originally declared at fs/nfs/callback.h:165) - different base types Signed-off-by: Dan Carpenter Signed-off-by: Trond Myklebust --- fs/nfs/callback.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfs/callback.h b/fs/nfs/callback.h index 07df5f1d85e..c89d3b9e483 100644 --- a/fs/nfs/callback.h +++ b/fs/nfs/callback.h @@ -162,7 +162,7 @@ struct cb_layoutrecallargs { }; }; -extern unsigned nfs4_callback_layoutrecall( +extern __be32 nfs4_callback_layoutrecall( struct cb_layoutrecallargs *args, void *dummy, struct cb_process_state *cps); -- cgit v1.2.3-70-g09d2 From 13fff2f35fd21d69ee84ef6a78610420e1a42818 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Thu, 12 Jan 2012 10:07:35 +0300 Subject: NFS: cleanup endian type in decode_ds_addr() port is supposed to be a __be16 here. The existing code should work fine, but this is a cleanup. Signed-off-by: Dan Carpenter Signed-off-by: Trond Myklebust --- fs/nfs/nfs4filelayoutdev.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfs/nfs4filelayoutdev.c b/fs/nfs/nfs4filelayoutdev.c index ed388aae968..8ae91908f5a 100644 --- a/fs/nfs/nfs4filelayoutdev.c +++ b/fs/nfs/nfs4filelayoutdev.c @@ -382,7 +382,7 @@ decode_ds_addr(struct xdr_stream *streamp, gfp_t gfp_flags) { struct nfs4_pnfs_ds_addr *da = NULL; char *buf, *portstr; - u32 port; + __be16 port; int nlen, rlen; int tmp[2]; __be32 *p; -- cgit v1.2.3-70-g09d2 From 363e0df057ea8da539645fe4c3c227e3d44054cc Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Thu, 12 Jan 2012 10:16:14 +0300 Subject: nfs: check for integer overflow in decode_devicenotify_args() On 32 bit, if n is too large then "n * sizeof(*args->devs)" could overflow and args->devs would be smaller than expected. Signed-off-by: Dan Carpenter Signed-off-by: Trond Myklebust --- fs/nfs/callback_xdr.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'fs') diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c index 726e59a9e50..d50b2742f23 100644 --- a/fs/nfs/callback_xdr.c +++ b/fs/nfs/callback_xdr.c @@ -305,6 +305,10 @@ __be32 decode_devicenotify_args(struct svc_rqst *rqstp, n = ntohl(*p++); if (n <= 0) goto out; + if (n > ULONG_MAX / sizeof(*args->devs)) { + status = htonl(NFS4ERR_BADXDR); + goto out; + } args->devs = kmalloc(n * sizeof(*args->devs), GFP_KERNEL); if (!args->devs) { -- cgit v1.2.3-70-g09d2 From de040beccd52bb5fcac90031505384d037b1111c Mon Sep 17 00:00:00 2001 From: Peng Tao Date: Tue, 10 Jan 2012 22:42:47 +0800 Subject: NFS4: fix compile warnings in nfs4proc.c MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit compile in nfs-for-3.3 branch shows following warnings. Fix it here. fs/nfs/nfs4proc.c: In function ‘__nfs4_get_acl_uncached’: fs/nfs/nfs4proc.c:3589: warning: format ‘%ld’ expects type ‘long int’, but argument 4 has type ‘size_t’ fs/nfs/nfs4proc.c:3589: warning: format ‘%ld’ expects type ‘long int’, but argument 6 has type ‘size_t’ Signed-off-by: Peng Tao Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 75366dc8968..f0c849c98fe 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -3587,7 +3587,7 @@ static ssize_t __nfs4_get_acl_uncached(struct inode *inode, void *buf, size_t bu res.acl_flags |= NFS4_ACL_LEN_REQUEST; resp_buf = page_address(pages[0]); - dprintk("%s buf %p buflen %ld npages %d args.acl_len %ld\n", + dprintk("%s buf %p buflen %zu npages %d args.acl_len %zu\n", __func__, buf, buflen, npages, args.acl_len); ret = nfs4_call_sync(NFS_SERVER(inode)->client, NFS_SERVER(inode), &msg, &args.seq_args, &res.seq_res, 0); -- cgit v1.2.3-70-g09d2 From 39e567ae36fe03c2b446e1b83ee3d39bea08f90b Mon Sep 17 00:00:00 2001 From: Peng Tao Date: Thu, 12 Jan 2012 23:18:41 +0800 Subject: pnfsblock: acquire im_lock in _preload_range When calling _add_entry, we should take the im_lock to protect agains other modifiers. Cc: #3.1+ Signed-off-by: Peng Tao Signed-off-by: Benny Halevy Signed-off-by: Trond Myklebust --- fs/nfs/blocklayout/extents.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/nfs/blocklayout/extents.c b/fs/nfs/blocklayout/extents.c index 19fa7b0b8c0..c69682a4262 100644 --- a/fs/nfs/blocklayout/extents.c +++ b/fs/nfs/blocklayout/extents.c @@ -139,11 +139,13 @@ static int _set_range(struct my_tree *tree, int32_t tag, u64 s, u64 length) } /* Ensure that future operations on given range of tree will not malloc */ -static int _preload_range(struct my_tree *tree, u64 offset, u64 length) +static int _preload_range(struct pnfs_inval_markings *marks, + u64 offset, u64 length) { u64 start, end, s; int count, i, used = 0, status = -ENOMEM; struct pnfs_inval_tracking **storage; + struct my_tree *tree = &marks->im_tree; dprintk("%s(%llu, %llu) enter\n", __func__, offset, length); start = normalize(offset, tree->mtt_step_size); @@ -161,12 +163,11 @@ static int _preload_range(struct my_tree *tree, u64 offset, u64 length) goto out_cleanup; } - /* Now need lock - HOW??? */ - + spin_lock(&marks->im_lock); for (s = start; s < end; s += tree->mtt_step_size) used += _add_entry(tree, s, INTERNAL_EXISTS, storage[used]); + spin_unlock(&marks->im_lock); - /* Unlock - HOW??? */ status = 0; out_cleanup: @@ -286,7 +287,7 @@ int bl_mark_sectors_init(struct pnfs_inval_markings *marks, start = normalize(offset, marks->im_block_size); end = normalize_up(offset + length, marks->im_block_size); - if (_preload_range(&marks->im_tree, start, end - start)) + if (_preload_range(marks, start, end - start)) goto outerr; spin_lock(&marks->im_lock); -- cgit v1.2.3-70-g09d2 From 82b906d6550ee5fe0d5553359b3c9692dd0aed31 Mon Sep 17 00:00:00 2001 From: Peng Tao Date: Thu, 12 Jan 2012 23:18:43 +0800 Subject: pnfsblock: set read/write tk_status to pnfs_error To pass the IO status to upper layer. Signed-off-by: Peng Tao Signed-off-by: Benny Halevy Signed-off-by: Trond Myklebust --- fs/nfs/blocklayout/blocklayout.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c index 281ae95932c..06fe0802118 100644 --- a/fs/nfs/blocklayout/blocklayout.c +++ b/fs/nfs/blocklayout/blocklayout.c @@ -216,6 +216,7 @@ bl_end_par_io_read(void *data) { struct nfs_read_data *rdata = data; + rdata->task.tk_status = rdata->pnfs_error; INIT_WORK(&rdata->task.u.tk_work, bl_read_cleanup); schedule_work(&rdata->task.u.tk_work); } @@ -405,7 +406,7 @@ static void bl_end_par_io_write(void *data) { struct nfs_write_data *wdata = data; - wdata->task.tk_status = 0; + wdata->task.tk_status = wdata->pnfs_error; wdata->verf.committed = NFS_FILE_SYNC; INIT_WORK(&wdata->task.u.tk_work, bl_write_cleanup); schedule_work(&wdata->task.u.tk_work); -- cgit v1.2.3-70-g09d2 From 57582b372f63d0f655b1a35b0d306d73d1a46775 Mon Sep 17 00:00:00 2001 From: Peng Tao Date: Thu, 12 Jan 2012 23:18:45 +0800 Subject: pnfsblock: clean up _add_entry It is wrong to kmalloc in _add_entry() as it is inside spinlock. memory should be already allocated _add_entry() is called. Signed-off-by: Peng Tao Signed-off-by: Benny Halevy Signed-off-by: Trond Myklebust --- fs/nfs/blocklayout/extents.c | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/nfs/blocklayout/extents.c b/fs/nfs/blocklayout/extents.c index c69682a4262..369ef53e89e 100644 --- a/fs/nfs/blocklayout/extents.c +++ b/fs/nfs/blocklayout/extents.c @@ -110,13 +110,7 @@ static int _add_entry(struct my_tree *tree, u64 s, int32_t tag, return 0; } else { struct pnfs_inval_tracking *new; - if (storage) - new = storage; - else { - new = kmalloc(sizeof(*new), GFP_NOFS); - if (!new) - return -ENOMEM; - } + new = storage; new->it_sector = s; new->it_tags = (1 << tag); list_add(&new->it_link, &pos->it_link); -- cgit v1.2.3-70-g09d2 From 93a3844ee0f843b05a1df4b52e1a19ff26b98d24 Mon Sep 17 00:00:00 2001 From: Peng Tao Date: Thu, 12 Jan 2012 23:18:47 +0800 Subject: pnfsblock: don't spinlock when freeing block_dev bl_free_block_dev() may sleep. We can not call it with spinlock held. Besides, there is no need to take bm_lock as we are last user freeing bm_devlist. Cc: #3.1+ Signed-off-by: Peng Tao Signed-off-by: Benny Halevy Signed-off-by: Trond Myklebust --- fs/nfs/blocklayout/blocklayout.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c index 06fe0802118..6d39e9ab1e6 100644 --- a/fs/nfs/blocklayout/blocklayout.c +++ b/fs/nfs/blocklayout/blocklayout.c @@ -780,16 +780,13 @@ bl_cleanup_layoutcommit(struct nfs4_layoutcommit_data *lcdata) static void free_blk_mountid(struct block_mount_id *mid) { if (mid) { - struct pnfs_block_dev *dev; - spin_lock(&mid->bm_lock); - while (!list_empty(&mid->bm_devlist)) { - dev = list_first_entry(&mid->bm_devlist, - struct pnfs_block_dev, - bm_node); + struct pnfs_block_dev *dev, *tmp; + + /* No need to take bm_lock as we are last user freeing bm_devlist */ + list_for_each_entry_safe(dev, tmp, &mid->bm_devlist, bm_node) { list_del(&dev->bm_node); bl_free_block_dev(dev); } - spin_unlock(&mid->bm_lock); kfree(mid); } } -- cgit v1.2.3-70-g09d2 From 74a6eeb44ca6174d9cc93b9b8b4d58211c57bc80 Mon Sep 17 00:00:00 2001 From: Peng Tao Date: Thu, 12 Jan 2012 23:18:48 +0800 Subject: pnfsblock: limit bio page count One bio can have at most BIO_MAX_PAGES pages. We should limit it bec otherwise bio_alloc will fail when there are many pages in one read/write_pagelist. Cc: #3.1+ Signed-off-by: Peng Tao Signed-off-by: Benny Halevy Signed-off-by: Trond Myklebust --- fs/nfs/blocklayout/blocklayout.c | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c index 6d39e9ab1e6..baf0bf2acbd 100644 --- a/fs/nfs/blocklayout/blocklayout.c +++ b/fs/nfs/blocklayout/blocklayout.c @@ -146,14 +146,19 @@ static struct bio *bl_alloc_init_bio(int npg, sector_t isect, { struct bio *bio; + npg = min(npg, BIO_MAX_PAGES); bio = bio_alloc(GFP_NOIO, npg); - if (!bio) - return NULL; + if (!bio && (current->flags & PF_MEMALLOC)) { + while (!bio && (npg /= 2)) + bio = bio_alloc(GFP_NOIO, npg); + } - bio->bi_sector = isect - be->be_f_offset + be->be_v_offset; - bio->bi_bdev = be->be_mdev; - bio->bi_end_io = end_io; - bio->bi_private = par; + if (bio) { + bio->bi_sector = isect - be->be_f_offset + be->be_v_offset; + bio->bi_bdev = be->be_mdev; + bio->bi_end_io = end_io; + bio->bi_private = par; + } return bio; } -- cgit v1.2.3-70-g09d2 From 60c52e3a72fda10e82f38b6f979956eb2dcb3d4e Mon Sep 17 00:00:00 2001 From: Peng Tao Date: Thu, 12 Jan 2012 23:18:40 +0800 Subject: pnfsblock: cleanup bl_mark_sectors_init It does not need to manipulate on partial initialized blocks. Writeback code takes care of it. Signed-off-by: Peng Tao Signed-off-by: Benny Halevy Signed-off-by: Trond Myklebust --- fs/nfs/blocklayout/blocklayout.c | 6 ++-- fs/nfs/blocklayout/blocklayout.h | 3 +- fs/nfs/blocklayout/extents.c | 76 +++------------------------------------- 3 files changed, 8 insertions(+), 77 deletions(-) (limited to 'fs') diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c index baf0bf2acbd..a263810803c 100644 --- a/fs/nfs/blocklayout/blocklayout.c +++ b/fs/nfs/blocklayout/blocklayout.c @@ -577,8 +577,7 @@ fill_invalid_ext: unlock_page(page); ret = bl_mark_sectors_init(be->be_inval, isect, - PAGE_CACHE_SECTORS, - NULL); + PAGE_CACHE_SECTORS); if (unlikely(ret)) { dprintk("%s bl_mark_sectors_init fail %d\n", __func__, ret); @@ -627,8 +626,7 @@ next_page: } if (be->be_state == PNFS_BLOCK_INVALID_DATA) { ret = bl_mark_sectors_init(be->be_inval, isect, - PAGE_CACHE_SECTORS, - NULL); + PAGE_CACHE_SECTORS); if (unlikely(ret)) { dprintk("%s bl_mark_sectors_init fail %d\n", __func__, ret); diff --git a/fs/nfs/blocklayout/blocklayout.h b/fs/nfs/blocklayout/blocklayout.h index 42acf7ef599..60728acc7b9 100644 --- a/fs/nfs/blocklayout/blocklayout.h +++ b/fs/nfs/blocklayout/blocklayout.h @@ -186,8 +186,7 @@ struct pnfs_block_extent * bl_find_get_extent(struct pnfs_block_layout *bl, sector_t isect, struct pnfs_block_extent **cow_read); int bl_mark_sectors_init(struct pnfs_inval_markings *marks, - sector_t offset, sector_t length, - sector_t **pages); + sector_t offset, sector_t length); void bl_put_extent(struct pnfs_block_extent *be); struct pnfs_block_extent *bl_alloc_extent(void); int bl_is_sector_init(struct pnfs_inval_markings *marks, sector_t isect); diff --git a/fs/nfs/blocklayout/extents.c b/fs/nfs/blocklayout/extents.c index 369ef53e89e..d0f52ed2242 100644 --- a/fs/nfs/blocklayout/extents.c +++ b/fs/nfs/blocklayout/extents.c @@ -174,33 +174,6 @@ static int _preload_range(struct pnfs_inval_markings *marks, return status; } -static void set_needs_init(sector_t *array, sector_t offset) -{ - sector_t *p = array; - - dprintk("%s enter\n", __func__); - if (!p) - return; - while (*p < offset) - p++; - if (*p == offset) - return; - else if (*p == ~0) { - *p++ = offset; - *p = ~0; - return; - } else { - sector_t *save = p; - dprintk("%s Adding %llu\n", __func__, (u64)offset); - while (*p != ~0) - p++; - p++; - memmove(save + 1, save, (char *)p - (char *)save); - *save = offset; - return; - } -} - /* We are relying on page lock to serialize this */ int bl_is_sector_init(struct pnfs_inval_markings *marks, sector_t isect) { @@ -256,28 +229,15 @@ static int is_range_written(struct pnfs_inval_markings *marks, /* Marks sectors in [offest, offset_length) as having been initialized. * All lengths are step-aligned, where step is min(pagesize, blocksize). - * Notes where partial block is initialized, and helps prepare it for - * complete initialization later. + * Currently assumes offset is page-aligned */ -/* Currently assumes offset is page-aligned */ int bl_mark_sectors_init(struct pnfs_inval_markings *marks, - sector_t offset, sector_t length, - sector_t **pages) + sector_t offset, sector_t length) { - sector_t s, start, end; - sector_t *array = NULL; /* Pages to mark */ + sector_t start, end; dprintk("%s(offset=%llu,len=%llu) enter\n", __func__, (u64)offset, (u64)length); - s = max((sector_t) 3, - 2 * (marks->im_block_size / (PAGE_CACHE_SECTORS))); - dprintk("%s set max=%llu\n", __func__, (u64)s); - if (pages) { - array = kmalloc(s * sizeof(sector_t), GFP_NOFS); - if (!array) - goto outerr; - array[0] = ~0; - } start = normalize(offset, marks->im_block_size); end = normalize_up(offset + length, marks->im_block_size); @@ -285,41 +245,15 @@ int bl_mark_sectors_init(struct pnfs_inval_markings *marks, goto outerr; spin_lock(&marks->im_lock); - - for (s = normalize_up(start, PAGE_CACHE_SECTORS); - s < offset; s += PAGE_CACHE_SECTORS) { - dprintk("%s pre-area pages\n", __func__); - /* Portion of used block is not initialized */ - if (!_has_tag(&marks->im_tree, s, EXTENT_INITIALIZED)) - set_needs_init(array, s); - } if (_set_range(&marks->im_tree, EXTENT_INITIALIZED, offset, length)) goto out_unlock; - for (s = normalize_up(offset + length, PAGE_CACHE_SECTORS); - s < end; s += PAGE_CACHE_SECTORS) { - dprintk("%s post-area pages\n", __func__); - if (!_has_tag(&marks->im_tree, s, EXTENT_INITIALIZED)) - set_needs_init(array, s); - } - spin_unlock(&marks->im_lock); - if (pages) { - if (array[0] == ~0) { - kfree(array); - *pages = NULL; - } else - *pages = array; - } return 0; - out_unlock: +out_unlock: spin_unlock(&marks->im_lock); - outerr: - if (pages) { - kfree(array); - *pages = NULL; - } +outerr: return -ENOMEM; } -- cgit v1.2.3-70-g09d2 From 72c508879979522de347bcec706507e00d7c443d Mon Sep 17 00:00:00 2001 From: Peng Tao Date: Thu, 12 Jan 2012 23:18:42 +0800 Subject: pnfsblock: move find lock page logic out of bl_write_pagelist Also avoid unnecessary lock_page if page is handled by others. Signed-off-by: Peng Tao Signed-off-by: Benny Halevy Signed-off-by: Trond Myklebust --- fs/nfs/blocklayout/blocklayout.c | 78 +++++++++++++++++++++++++++------------- 1 file changed, 54 insertions(+), 24 deletions(-) (limited to 'fs') diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c index a263810803c..23427362185 100644 --- a/fs/nfs/blocklayout/blocklayout.c +++ b/fs/nfs/blocklayout/blocklayout.c @@ -490,6 +490,55 @@ cleanup: return ret; } +/* Find or create a zeroing page marked being writeback. + * Return ERR_PTR on error, NULL to indicate skip this page and page itself + * to indicate write out. + */ +static struct page * +bl_find_get_zeroing_page(struct inode *inode, pgoff_t index, + struct pnfs_block_extent *cow_read) +{ + struct page *page; + int locked = 0; + page = find_get_page(inode->i_mapping, index); + if (page) + goto check_page; + + page = find_or_create_page(inode->i_mapping, index, GFP_NOFS); + if (unlikely(!page)) { + dprintk("%s oom\n", __func__); + return ERR_PTR(-ENOMEM); + } + locked = 1; + +check_page: + /* PageDirty: Other will write this out + * PageWriteback: Other is writing this out + * PageUptodate: It was read before + */ + if (PageDirty(page) || PageWriteback(page)) { + print_page(page); + if (locked) + unlock_page(page); + page_cache_release(page); + return NULL; + } + + if (!locked) { + lock_page(page); + locked = 1; + goto check_page; + } + if (!PageUptodate(page)) { + /* New page, readin or zero it */ + init_page_for_write(page, cow_read); + } + set_page_writeback(page); + unlock_page(page); + + return page; +} + static enum pnfs_try_status bl_write_pagelist(struct nfs_write_data *wdata, int sync) { @@ -549,32 +598,13 @@ fill_invalid_ext: dprintk("%s zero %dth page: index %lu isect %llu\n", __func__, npg_zero, index, (unsigned long long)isect); - page = - find_or_create_page(wdata->inode->i_mapping, index, - GFP_NOFS); - if (!page) { - dprintk("%s oom\n", __func__); - wdata->pnfs_error = -ENOMEM; + page = bl_find_get_zeroing_page(wdata->inode, index, + cow_read); + if (unlikely(IS_ERR(page))) { + wdata->pnfs_error = PTR_ERR(page); goto out; - } - - /* PageDirty: Other will write this out - * PageWriteback: Other is writing this out - * PageUptodate: It was read before - * sector_initialized: already written out - */ - if (PageDirty(page) || PageWriteback(page)) { - print_page(page); - unlock_page(page); - page_cache_release(page); + } else if (page == NULL) goto next_page; - } - if (!PageUptodate(page)) { - /* New page, readin or zero it */ - init_page_for_write(page, cow_read); - } - set_page_writeback(page); - unlock_page(page); ret = bl_mark_sectors_init(be->be_inval, isect, PAGE_CACHE_SECTORS); -- cgit v1.2.3-70-g09d2 From c0411a94a8f318379464e29dd81db806249dbca6 Mon Sep 17 00:00:00 2001 From: Peng Tao Date: Thu, 12 Jan 2012 23:18:44 +0800 Subject: pnfsblock: remove rpc_call_ops from struct parallel_io block layout can just make use of generic read/write_done. Signed-off-by: Peng Tao Signed-off-by: Benny Halevy Signed-off-by: Trond Myklebust --- fs/nfs/blocklayout/blocklayout.c | 13 ------------- 1 file changed, 13 deletions(-) (limited to 'fs') diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c index 23427362185..9215c6644a3 100644 --- a/fs/nfs/blocklayout/blocklayout.c +++ b/fs/nfs/blocklayout/blocklayout.c @@ -90,7 +90,6 @@ static int is_writable(struct pnfs_block_extent *be, sector_t isect) */ struct parallel_io { struct kref refcnt; - struct rpc_call_ops call_ops; void (*pnfs_callback) (void *data); void *data; }; @@ -226,14 +225,6 @@ bl_end_par_io_read(void *data) schedule_work(&rdata->task.u.tk_work); } -/* We don't want normal .rpc_call_done callback used, so we replace it - * with this stub. - */ -static void bl_rpc_do_nothing(struct rpc_task *task, void *calldata) -{ - return; -} - static enum pnfs_try_status bl_read_pagelist(struct nfs_read_data *rdata) { @@ -253,8 +244,6 @@ bl_read_pagelist(struct nfs_read_data *rdata) par = alloc_parallel(rdata); if (!par) goto use_mds; - par->call_ops = *rdata->mds_ops; - par->call_ops.rpc_call_done = bl_rpc_do_nothing; par->pnfs_callback = bl_end_par_io_read; /* At this point, we can no longer jump to use_mds */ @@ -564,8 +553,6 @@ bl_write_pagelist(struct nfs_write_data *wdata, int sync) par = alloc_parallel(wdata); if (!par) return PNFS_NOT_ATTEMPTED; - par->call_ops = *wdata->mds_ops; - par->call_ops.rpc_call_done = bl_rpc_do_nothing; par->pnfs_callback = bl_end_par_io_write; /* At this point, have to be more careful with error handling */ -- cgit v1.2.3-70-g09d2 From 7c5465d6ccd759caa959828e2add5603518dafc4 Mon Sep 17 00:00:00 2001 From: Peng Tao Date: Thu, 12 Jan 2012 23:18:46 +0800 Subject: pnfsblock: alloc short extent before submit bio As discussed earlier, it is better for block client to allocate memory for tracking extents state before submitting bio. So the patch does it by allocating a short_extent for every INVALID extent touched by write pagelist and for every zeroing page we created, saving them in layout header. Then in end_io we can just use them to create commit list items and avoid memory allocation there. Signed-off-by: Peng Tao Signed-off-by: Benny Halevy Signed-off-by: Trond Myklebust --- fs/nfs/blocklayout/blocklayout.c | 74 ++++++++++++++++++++++++++-------- fs/nfs/blocklayout/blocklayout.h | 9 ++++- fs/nfs/blocklayout/extents.c | 85 ++++++++++++++++++++++++++++++---------- 3 files changed, 131 insertions(+), 37 deletions(-) (limited to 'fs') diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c index 9215c6644a3..48cfac31f64 100644 --- a/fs/nfs/blocklayout/blocklayout.c +++ b/fs/nfs/blocklayout/blocklayout.c @@ -90,8 +90,9 @@ static int is_writable(struct pnfs_block_extent *be, sector_t isect) */ struct parallel_io { struct kref refcnt; - void (*pnfs_callback) (void *data); + void (*pnfs_callback) (void *data, int num_se); void *data; + int bse_count; }; static inline struct parallel_io *alloc_parallel(void *data) @@ -102,6 +103,7 @@ static inline struct parallel_io *alloc_parallel(void *data) if (rv) { rv->data = data; kref_init(&rv->refcnt); + rv->bse_count = 0; } return rv; } @@ -116,7 +118,7 @@ static void destroy_parallel(struct kref *kref) struct parallel_io *p = container_of(kref, struct parallel_io, refcnt); dprintk("%s enter\n", __func__); - p->pnfs_callback(p->data); + p->pnfs_callback(p->data, p->bse_count); kfree(p); } @@ -216,7 +218,7 @@ static void bl_read_cleanup(struct work_struct *work) } static void -bl_end_par_io_read(void *data) +bl_end_par_io_read(void *data, int unused) { struct nfs_read_data *rdata = data; @@ -317,6 +319,7 @@ static void mark_extents_written(struct pnfs_block_layout *bl, { sector_t isect, end; struct pnfs_block_extent *be; + struct pnfs_block_short_extent *se; dprintk("%s(%llu, %u)\n", __func__, offset, count); if (count == 0) @@ -329,8 +332,11 @@ static void mark_extents_written(struct pnfs_block_layout *bl, be = bl_find_get_extent(bl, isect, NULL); BUG_ON(!be); /* FIXME */ len = min(end, be->be_f_offset + be->be_length) - isect; - if (be->be_state == PNFS_BLOCK_INVALID_DATA) - bl_mark_for_commit(be, isect, len); /* What if fails? */ + if (be->be_state == PNFS_BLOCK_INVALID_DATA) { + se = bl_pop_one_short_extent(be->be_inval); + BUG_ON(!se); + bl_mark_for_commit(be, isect, len, se); + } isect += len; bl_put_extent(be); } @@ -352,7 +358,8 @@ static void bl_end_io_write_zero(struct bio *bio, int err) end_page_writeback(page); page_cache_release(page); } while (bvec >= bio->bi_io_vec); - if (!uptodate) { + + if (unlikely(!uptodate)) { if (!wdata->pnfs_error) wdata->pnfs_error = -EIO; pnfs_set_lo_fail(wdata->lseg); @@ -361,7 +368,6 @@ static void bl_end_io_write_zero(struct bio *bio, int err) put_parallel(par); } -/* This is basically copied from mpage_end_io_read */ static void bl_end_io_write(struct bio *bio, int err) { struct parallel_io *par = bio->bi_private; @@ -387,7 +393,7 @@ static void bl_write_cleanup(struct work_struct *work) dprintk("%s enter\n", __func__); task = container_of(work, struct rpc_task, u.tk_work); wdata = container_of(task, struct nfs_write_data, task); - if (!wdata->pnfs_error) { + if (likely(!wdata->pnfs_error)) { /* Marks for LAYOUTCOMMIT */ mark_extents_written(BLK_LSEG2EXT(wdata->lseg), wdata->args.offset, wdata->args.count); @@ -396,10 +402,15 @@ static void bl_write_cleanup(struct work_struct *work) } /* Called when last of bios associated with a bl_write_pagelist call finishes */ -static void bl_end_par_io_write(void *data) +static void bl_end_par_io_write(void *data, int num_se) { struct nfs_write_data *wdata = data; + if (unlikely(wdata->pnfs_error)) { + bl_free_short_extents(&BLK_LSEG2EXT(wdata->lseg)->bl_inval, + num_se); + } + wdata->task.tk_status = wdata->pnfs_error; wdata->verf.committed = NFS_FILE_SYNC; INIT_WORK(&wdata->task.u.tk_work, bl_write_cleanup); @@ -552,7 +563,7 @@ bl_write_pagelist(struct nfs_write_data *wdata, int sync) */ par = alloc_parallel(wdata); if (!par) - return PNFS_NOT_ATTEMPTED; + goto out_mds; par->pnfs_callback = bl_end_par_io_write; /* At this point, have to be more careful with error handling */ @@ -560,12 +571,15 @@ bl_write_pagelist(struct nfs_write_data *wdata, int sync) be = bl_find_get_extent(BLK_LSEG2EXT(wdata->lseg), isect, &cow_read); if (!be || !is_writable(be, isect)) { dprintk("%s no matching extents!\n", __func__); - wdata->pnfs_error = -EINVAL; - goto out; + goto out_mds; } /* First page inside INVALID extent */ if (be->be_state == PNFS_BLOCK_INVALID_DATA) { + if (likely(!bl_push_one_short_extent(be->be_inval))) + par->bse_count++; + else + goto out_mds; temp = offset >> PAGE_CACHE_SHIFT; npg_zero = do_div(temp, npg_per_block); isect = (sector_t) (((offset - npg_zero * PAGE_CACHE_SIZE) & @@ -603,6 +617,19 @@ fill_invalid_ext: wdata->pnfs_error = ret; goto out; } + if (likely(!bl_push_one_short_extent(be->be_inval))) + par->bse_count++; + else { + end_page_writeback(page); + page_cache_release(page); + wdata->pnfs_error = -ENOMEM; + goto out; + } + /* FIXME: This should be done in bi_end_io */ + mark_extents_written(BLK_LSEG2EXT(wdata->lseg), + page->index << PAGE_CACHE_SHIFT, + PAGE_CACHE_SIZE); + bio = bl_add_page_to_bio(bio, npg_zero, WRITE, isect, page, be, bl_end_io_write_zero, par); @@ -611,10 +638,6 @@ fill_invalid_ext: bio = NULL; goto out; } - /* FIXME: This should be done in bi_end_io */ - mark_extents_written(BLK_LSEG2EXT(wdata->lseg), - page->index << PAGE_CACHE_SHIFT, - PAGE_CACHE_SIZE); next_page: isect += PAGE_CACHE_SECTORS; extent_length -= PAGE_CACHE_SECTORS; @@ -638,6 +661,15 @@ next_page: wdata->pnfs_error = -EINVAL; goto out; } + if (be->be_state == PNFS_BLOCK_INVALID_DATA) { + if (likely(!bl_push_one_short_extent( + be->be_inval))) + par->bse_count++; + else { + wdata->pnfs_error = -ENOMEM; + goto out; + } + } extent_length = be->be_length - (isect - be->be_f_offset); } @@ -685,6 +717,10 @@ out: bl_submit_bio(WRITE, bio); put_parallel(par); return PNFS_ATTEMPTED; +out_mds: + bl_put_extent(be); + kfree(par); + return PNFS_NOT_ATTEMPTED; } /* FIXME - range ignored */ @@ -711,11 +747,17 @@ static void release_inval_marks(struct pnfs_inval_markings *marks) { struct pnfs_inval_tracking *pos, *temp; + struct pnfs_block_short_extent *se, *stemp; list_for_each_entry_safe(pos, temp, &marks->im_tree.mtt_stub, it_link) { list_del(&pos->it_link); kfree(pos); } + + list_for_each_entry_safe(se, stemp, &marks->im_extents, bse_node) { + list_del(&se->bse_node); + kfree(se); + } return; } diff --git a/fs/nfs/blocklayout/blocklayout.h b/fs/nfs/blocklayout/blocklayout.h index 60728acc7b9..e31a2df28e7 100644 --- a/fs/nfs/blocklayout/blocklayout.h +++ b/fs/nfs/blocklayout/blocklayout.h @@ -70,6 +70,7 @@ struct pnfs_inval_markings { spinlock_t im_lock; struct my_tree im_tree; /* Sectors that need LAYOUTCOMMIT */ sector_t im_block_size; /* Server blocksize in sectors */ + struct list_head im_extents; /* Short extents for INVAL->RW conversion */ }; struct pnfs_inval_tracking { @@ -105,6 +106,7 @@ BL_INIT_INVAL_MARKS(struct pnfs_inval_markings *marks, sector_t blocksize) { spin_lock_init(&marks->im_lock); INIT_LIST_HEAD(&marks->im_tree.mtt_stub); + INIT_LIST_HEAD(&marks->im_extents); marks->im_block_size = blocksize; marks->im_tree.mtt_step_size = min((sector_t)PAGE_CACHE_SECTORS, blocksize); @@ -199,6 +201,11 @@ void clean_pnfs_block_layoutupdate(struct pnfs_block_layout *bl, int bl_add_merge_extent(struct pnfs_block_layout *bl, struct pnfs_block_extent *new); int bl_mark_for_commit(struct pnfs_block_extent *be, - sector_t offset, sector_t length); + sector_t offset, sector_t length, + struct pnfs_block_short_extent *new); +int bl_push_one_short_extent(struct pnfs_inval_markings *marks); +struct pnfs_block_short_extent * +bl_pop_one_short_extent(struct pnfs_inval_markings *marks); +void bl_free_short_extents(struct pnfs_inval_markings *marks, int num_to_free); #endif /* FS_NFS_NFS4BLOCKLAYOUT_H */ diff --git a/fs/nfs/blocklayout/extents.c b/fs/nfs/blocklayout/extents.c index d0f52ed2242..1abac09f7cd 100644 --- a/fs/nfs/blocklayout/extents.c +++ b/fs/nfs/blocklayout/extents.c @@ -157,10 +157,10 @@ static int _preload_range(struct pnfs_inval_markings *marks, goto out_cleanup; } - spin_lock(&marks->im_lock); + spin_lock_bh(&marks->im_lock); for (s = start; s < end; s += tree->mtt_step_size) used += _add_entry(tree, s, INTERNAL_EXISTS, storage[used]); - spin_unlock(&marks->im_lock); + spin_unlock_bh(&marks->im_lock); status = 0; @@ -179,9 +179,9 @@ int bl_is_sector_init(struct pnfs_inval_markings *marks, sector_t isect) { int rv; - spin_lock(&marks->im_lock); + spin_lock_bh(&marks->im_lock); rv = _has_tag(&marks->im_tree, isect, EXTENT_INITIALIZED); - spin_unlock(&marks->im_lock); + spin_unlock_bh(&marks->im_lock); return rv; } @@ -221,9 +221,9 @@ static int is_range_written(struct pnfs_inval_markings *marks, { int rv; - spin_lock(&marks->im_lock); + spin_lock_bh(&marks->im_lock); rv = _range_has_tag(&marks->im_tree, start, end, EXTENT_WRITTEN); - spin_unlock(&marks->im_lock); + spin_unlock_bh(&marks->im_lock); return rv; } @@ -244,15 +244,15 @@ int bl_mark_sectors_init(struct pnfs_inval_markings *marks, if (_preload_range(marks, start, end - start)) goto outerr; - spin_lock(&marks->im_lock); + spin_lock_bh(&marks->im_lock); if (_set_range(&marks->im_tree, EXTENT_INITIALIZED, offset, length)) goto out_unlock; - spin_unlock(&marks->im_lock); + spin_unlock_bh(&marks->im_lock); return 0; out_unlock: - spin_unlock(&marks->im_lock); + spin_unlock_bh(&marks->im_lock); outerr: return -ENOMEM; } @@ -267,9 +267,9 @@ static int mark_written_sectors(struct pnfs_inval_markings *marks, dprintk("%s(offset=%llu,len=%llu) enter\n", __func__, (u64)offset, (u64)length); - spin_lock(&marks->im_lock); + spin_lock_bh(&marks->im_lock); status = _set_range(&marks->im_tree, EXTENT_WRITTEN, offset, length); - spin_unlock(&marks->im_lock); + spin_unlock_bh(&marks->im_lock); return status; } @@ -369,20 +369,18 @@ static void add_to_commitlist(struct pnfs_block_layout *bl, /* Note the range described by offset, length is guaranteed to be contained * within be. + * new will be freed, either by this function or add_to_commitlist if they + * decide not to use it, or after LAYOUTCOMMIT uses it in the commitlist. */ int bl_mark_for_commit(struct pnfs_block_extent *be, - sector_t offset, sector_t length) + sector_t offset, sector_t length, + struct pnfs_block_short_extent *new) { sector_t new_end, end = offset + length; - struct pnfs_block_short_extent *new; struct pnfs_block_layout *bl = container_of(be->be_inval, struct pnfs_block_layout, bl_inval); - new = kmalloc(sizeof(*new), GFP_NOFS); - if (!new) - return -ENOMEM; - mark_written_sectors(be->be_inval, offset, length); /* We want to add the range to commit list, but it must be * block-normalized, and verified that the normalized range has @@ -412,9 +410,6 @@ int bl_mark_for_commit(struct pnfs_block_extent *be, new->bse_mdev = be->be_mdev; spin_lock(&bl->bl_ext_lock); - /* new will be freed, either by add_to_commitlist if it decides not - * to use it, or after LAYOUTCOMMIT uses it in the commitlist. - */ add_to_commitlist(bl, new); spin_unlock(&bl->bl_ext_lock); return 0; @@ -862,3 +857,53 @@ clean_pnfs_block_layoutupdate(struct pnfs_block_layout *bl, } } } + +int bl_push_one_short_extent(struct pnfs_inval_markings *marks) +{ + struct pnfs_block_short_extent *new; + + new = kmalloc(sizeof(*new), GFP_NOFS); + if (unlikely(!new)) + return -ENOMEM; + + spin_lock_bh(&marks->im_lock); + list_add(&new->bse_node, &marks->im_extents); + spin_unlock_bh(&marks->im_lock); + + return 0; +} + +struct pnfs_block_short_extent * +bl_pop_one_short_extent(struct pnfs_inval_markings *marks) +{ + struct pnfs_block_short_extent *rv = NULL; + + spin_lock_bh(&marks->im_lock); + if (!list_empty(&marks->im_extents)) { + rv = list_entry((&marks->im_extents)->next, + struct pnfs_block_short_extent, bse_node); + list_del_init(&rv->bse_node); + } + spin_unlock_bh(&marks->im_lock); + + return rv; +} + +void bl_free_short_extents(struct pnfs_inval_markings *marks, int num_to_free) +{ + struct pnfs_block_short_extent *se = NULL, *tmp; + + if (num_to_free <= 0) + return; + + spin_lock(&marks->im_lock); + list_for_each_entry_safe(se, tmp, &marks->im_extents, bse_node) { + list_del(&se->bse_node); + kfree(se); + if (--num_to_free == 0) + break; + } + spin_unlock(&marks->im_lock); + + BUG_ON(num_to_free > 0); +} -- cgit v1.2.3-70-g09d2