summaryrefslogtreecommitdiffstats
path: root/fs/nfs
diff options
context:
space:
mode:
author Jiri Kosina <jkosina@suse.cz> 2014-11-20 14:42:02 +0100
committer Jiri Kosina <jkosina@suse.cz> 2014-11-20 14:42:02 +0100
commit a02001086bbfb4da35d1228bebc2f1b442db455f (patch)
tree 62ab47936cef06fd08657ca5b6cd1df98c19be57 /fs/nfs
parent eff264efeeb0898408e8c9df72d8a32621035bed (diff)
parent fc14f9c1272f62c3e8d01300f52467c0d9af50f9 (diff)
Merge Linus' tree to be able to apply submitted patches to newer code than
current trivial.git base
Diffstat (limited to 'fs/nfs')
-rw-r--r--fs/nfs/Makefile1
-rw-r--r--fs/nfs/blocklayout/Makefile3
-rw-r--r--fs/nfs/blocklayout/blocklayout.c1435
-rw-r--r--fs/nfs/blocklayout/blocklayout.h213
-rw-r--r--fs/nfs/blocklayout/blocklayoutdev.c384
-rw-r--r--fs/nfs/blocklayout/blocklayoutdm.c108
-rw-r--r--fs/nfs/blocklayout/dev.c363
-rw-r--r--fs/nfs/blocklayout/extent_tree.c602
-rw-r--r--fs/nfs/blocklayout/extents.c908
-rw-r--r--fs/nfs/blocklayout/rpc_pipefs.c288
-rw-r--r--fs/nfs/callback.c16
-rw-r--r--fs/nfs/callback_proc.c23
-rw-r--r--fs/nfs/client.c111
-rw-r--r--fs/nfs/delegation.c59
-rw-r--r--fs/nfs/delegation.h2
-rw-r--r--fs/nfs/dir.c216
-rw-r--r--fs/nfs/direct.c60
-rw-r--r--fs/nfs/file.c69
-rw-r--r--fs/nfs/filelayout/filelayout.c334
-rw-r--r--fs/nfs/filelayout/filelayout.h7
-rw-r--r--fs/nfs/filelayout/filelayoutdev.c108
-rw-r--r--fs/nfs/fscache-index.c3
-rw-r--r--fs/nfs/getroot.c2
-rw-r--r--fs/nfs/idmap.c2
-rw-r--r--fs/nfs/inode.c20
-rw-r--r--fs/nfs/internal.h28
-rw-r--r--fs/nfs/netns.h4
-rw-r--r--fs/nfs/nfs3_fs.h34
-rw-r--r--fs/nfs/nfs3acl.c8
-rw-r--r--fs/nfs/nfs3client.c1
-rw-r--r--fs/nfs/nfs3proc.c22
-rw-r--r--fs/nfs/nfs3super.c1
-rw-r--r--fs/nfs/nfs42.h14
-rw-r--r--fs/nfs/nfs42proc.c69
-rw-r--r--fs/nfs/nfs42xdr.c98
-rw-r--r--fs/nfs/nfs4_fs.h24
-rw-r--r--fs/nfs/nfs4client.c43
-rw-r--r--fs/nfs/nfs4file.c27
-rw-r--r--fs/nfs/nfs4proc.c493
-rw-r--r--fs/nfs/nfs4renewd.c12
-rw-r--r--fs/nfs/nfs4state.c63
-rw-r--r--fs/nfs/nfs4trace.h28
-rw-r--r--fs/nfs/nfs4xdr.c188
-rw-r--r--fs/nfs/objlayout/objio_osd.c139
-rw-r--r--fs/nfs/objlayout/objlayout.c153
-rw-r--r--fs/nfs/objlayout/objlayout.h15
-rw-r--r--fs/nfs/objlayout/pnfs_osd_xdr_cli.c2
-rw-r--r--fs/nfs/pagelist.c324
-rw-r--r--fs/nfs/pnfs.c304
-rw-r--r--fs/nfs/pnfs.h92
-rw-r--r--fs/nfs/pnfs_dev.c150
-rw-r--r--fs/nfs/proc.c27
-rw-r--r--fs/nfs/read.c54
-rw-r--r--fs/nfs/super.c23
-rw-r--r--fs/nfs/write.c245
55 files changed, 3881 insertions, 4141 deletions
diff --git a/fs/nfs/Makefile b/fs/nfs/Makefile
index 4782e0840dc..04cb830fa09 100644
--- a/fs/nfs/Makefile
+++ b/fs/nfs/Makefile
@@ -28,6 +28,7 @@ nfsv4-y := nfs4proc.o nfs4xdr.o nfs4state.o nfs4renewd.o nfs4super.o nfs4file.o
nfsv4-$(CONFIG_NFS_USE_LEGACY_DNS) += cache_lib.o
nfsv4-$(CONFIG_SYSCTL) += nfs4sysctl.o
nfsv4-$(CONFIG_NFS_V4_1) += pnfs.o pnfs_dev.o
+nfsv4-$(CONFIG_NFS_V4_2) += nfs42proc.o
obj-$(CONFIG_PNFS_FILE_LAYOUT) += filelayout/
obj-$(CONFIG_PNFS_OBJLAYOUT) += objlayout/
diff --git a/fs/nfs/blocklayout/Makefile b/fs/nfs/blocklayout/Makefile
index d5815505c02..3ca14c36d08 100644
--- a/fs/nfs/blocklayout/Makefile
+++ b/fs/nfs/blocklayout/Makefile
@@ -2,4 +2,5 @@
# Makefile for the pNFS block layout driver kernel module
#
obj-$(CONFIG_PNFS_BLOCK) += blocklayoutdriver.o
-blocklayoutdriver-objs := blocklayout.o extents.o blocklayoutdev.o blocklayoutdm.o
+
+blocklayoutdriver-y += blocklayout.o dev.o extent_tree.o rpc_pipefs.o
diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c
index 9b431f44fad..4f46f7a0528 100644
--- a/fs/nfs/blocklayout/blocklayout.c
+++ b/fs/nfs/blocklayout/blocklayout.c
@@ -35,7 +35,6 @@
#include <linux/mount.h>
#include <linux/namei.h>
#include <linux/bio.h> /* struct bio */
-#include <linux/buffer_head.h> /* various write calls */
#include <linux/prefetch.h>
#include <linux/pagevec.h>
@@ -50,40 +49,16 @@ MODULE_LICENSE("GPL");
MODULE_AUTHOR("Andy Adamson <andros@citi.umich.edu>");
MODULE_DESCRIPTION("The NFSv4.1 pNFS Block layout driver");
-static void print_page(struct page *page)
+static bool is_hole(struct pnfs_block_extent *be)
{
- dprintk("PRINTPAGE page %p\n", page);
- dprintk(" PagePrivate %d\n", PagePrivate(page));
- dprintk(" PageUptodate %d\n", PageUptodate(page));
- dprintk(" PageError %d\n", PageError(page));
- dprintk(" PageDirty %d\n", PageDirty(page));
- dprintk(" PageReferenced %d\n", PageReferenced(page));
- dprintk(" PageLocked %d\n", PageLocked(page));
- dprintk(" PageWriteback %d\n", PageWriteback(page));
- dprintk(" PageMappedToDisk %d\n", PageMappedToDisk(page));
- dprintk("\n");
-}
-
-/* Given the be associated with isect, determine if page data needs to be
- * initialized.
- */
-static int is_hole(struct pnfs_block_extent *be, sector_t isect)
-{
- if (be->be_state == PNFS_BLOCK_NONE_DATA)
- return 1;
- else if (be->be_state != PNFS_BLOCK_INVALID_DATA)
- return 0;
- else
- return !bl_is_sector_init(be->be_inval, isect);
-}
-
-/* Given the be associated with isect, determine if page data can be
- * written to disk.
- */
-static int is_writable(struct pnfs_block_extent *be, sector_t isect)
-{
- return (be->be_state == PNFS_BLOCK_READWRITE_DATA ||
- be->be_state == PNFS_BLOCK_INVALID_DATA);
+ switch (be->be_state) {
+ case PNFS_BLOCK_NONE_DATA:
+ return true;
+ case PNFS_BLOCK_INVALID_DATA:
+ return be->be_tag ? false : true;
+ default:
+ return false;
+ }
}
/* The data we are handed might be spread across several bios. We need
@@ -91,9 +66,8 @@ static int is_writable(struct pnfs_block_extent *be, sector_t isect)
*/
struct parallel_io {
struct kref refcnt;
- void (*pnfs_callback) (void *data, int num_se);
+ void (*pnfs_callback) (void *data);
void *data;
- int bse_count;
};
static inline struct parallel_io *alloc_parallel(void *data)
@@ -104,7 +78,6 @@ static inline struct parallel_io *alloc_parallel(void *data)
if (rv) {
rv->data = data;
kref_init(&rv->refcnt);
- rv->bse_count = 0;
}
return rv;
}
@@ -119,7 +92,7 @@ static void destroy_parallel(struct kref *kref)
struct parallel_io *p = container_of(kref, struct parallel_io, refcnt);
dprintk("%s enter\n", __func__);
- p->pnfs_callback(p->data, p->bse_count);
+ p->pnfs_callback(p->data);
kfree(p);
}
@@ -141,10 +114,9 @@ bl_submit_bio(int rw, struct bio *bio)
return NULL;
}
-static struct bio *bl_alloc_init_bio(int npg, sector_t isect,
- struct pnfs_block_extent *be,
- void (*end_io)(struct bio *, int err),
- struct parallel_io *par)
+static struct bio *
+bl_alloc_init_bio(int npg, struct block_device *bdev, sector_t disk_sector,
+ void (*end_io)(struct bio *, int err), struct parallel_io *par)
{
struct bio *bio;
@@ -156,67 +128,73 @@ static struct bio *bl_alloc_init_bio(int npg, sector_t isect,
}
if (bio) {
- bio->bi_iter.bi_sector = isect - be->be_f_offset +
- be->be_v_offset;
- bio->bi_bdev = be->be_mdev;
+ bio->bi_iter.bi_sector = disk_sector;
+ bio->bi_bdev = bdev;
bio->bi_end_io = end_io;
bio->bi_private = par;
}
return bio;
}
-static struct bio *do_add_page_to_bio(struct bio *bio, int npg, int rw,
- sector_t isect, struct page *page,
- struct pnfs_block_extent *be,
- void (*end_io)(struct bio *, int err),
- struct parallel_io *par,
- unsigned int offset, int len)
+static struct bio *
+do_add_page_to_bio(struct bio *bio, int npg, int rw, sector_t isect,
+ struct page *page, struct pnfs_block_dev_map *map,
+ struct pnfs_block_extent *be,
+ void (*end_io)(struct bio *, int err),
+ struct parallel_io *par, unsigned int offset, int *len)
{
- isect = isect + (offset >> SECTOR_SHIFT);
+ struct pnfs_block_dev *dev =
+ container_of(be->be_device, struct pnfs_block_dev, node);
+ u64 disk_addr, end;
+
dprintk("%s: npg %d rw %d isect %llu offset %u len %d\n", __func__,
- npg, rw, (unsigned long long)isect, offset, len);
+ npg, rw, (unsigned long long)isect, offset, *len);
+
+ /* translate to device offset */
+ isect += be->be_v_offset;
+ isect -= be->be_f_offset;
+
+ /* translate to physical disk offset */
+ disk_addr = (u64)isect << SECTOR_SHIFT;
+ if (disk_addr < map->start || disk_addr >= map->start + map->len) {
+ if (!dev->map(dev, disk_addr, map))
+ return ERR_PTR(-EIO);
+ bio = bl_submit_bio(rw, bio);
+ }
+ disk_addr += map->disk_offset;
+ disk_addr -= map->start;
+
+ /* limit length to what the device mapping allows */
+ end = disk_addr + *len;
+ if (end >= map->start + map->len)
+ *len = map->start + map->len - disk_addr;
+
retry:
if (!bio) {
- bio = bl_alloc_init_bio(npg, isect, be, end_io, par);
+ bio = bl_alloc_init_bio(npg, map->bdev,
+ disk_addr >> SECTOR_SHIFT, end_io, par);
if (!bio)
return ERR_PTR(-ENOMEM);
}
- if (bio_add_page(bio, page, len, offset) < len) {
+ if (bio_add_page(bio, page, *len, offset) < *len) {
bio = bl_submit_bio(rw, bio);
goto retry;
}
return bio;
}
-static struct bio *bl_add_page_to_bio(struct bio *bio, int npg, int rw,
- sector_t isect, struct page *page,
- struct pnfs_block_extent *be,
- void (*end_io)(struct bio *, int err),
- struct parallel_io *par)
-{
- return do_add_page_to_bio(bio, npg, rw, isect, page, be,
- end_io, par, 0, PAGE_CACHE_SIZE);
-}
-
-/* This is basically copied from mpage_end_io_read */
static void bl_end_io_read(struct bio *bio, int err)
{
struct parallel_io *par = bio->bi_private;
- struct bio_vec *bvec;
- int i;
-
- if (!err)
- bio_for_each_segment_all(bvec, bio, i)
- SetPageUptodate(bvec->bv_page);
if (err) {
- struct nfs_pgio_data *rdata = par->data;
- struct nfs_pgio_header *header = rdata->header;
+ struct nfs_pgio_header *header = par->data;
if (!header->pnfs_error)
header->pnfs_error = -EIO;
pnfs_set_lo_fail(header->lseg);
}
+
bio_put(bio);
put_parallel(par);
}
@@ -224,104 +202,96 @@ static void bl_end_io_read(struct bio *bio, int err)
static void bl_read_cleanup(struct work_struct *work)
{
struct rpc_task *task;
- struct nfs_pgio_data *rdata;
+ struct nfs_pgio_header *hdr;
dprintk("%s enter\n", __func__);
task = container_of(work, struct rpc_task, u.tk_work);
- rdata = container_of(task, struct nfs_pgio_data, task);
- pnfs_ld_read_done(rdata);
+ hdr = container_of(task, struct nfs_pgio_header, task);
+ pnfs_ld_read_done(hdr);
}
static void
-bl_end_par_io_read(void *data, int unused)
+bl_end_par_io_read(void *data)
{
- struct nfs_pgio_data *rdata = data;
+ struct nfs_pgio_header *hdr = data;
- rdata->task.tk_status = rdata->header->pnfs_error;
- INIT_WORK(&rdata->task.u.tk_work, bl_read_cleanup);
- schedule_work(&rdata->task.u.tk_work);
+ hdr->task.tk_status = hdr->pnfs_error;
+ INIT_WORK(&hdr->task.u.tk_work, bl_read_cleanup);
+ schedule_work(&hdr->task.u.tk_work);
}
static enum pnfs_try_status
-bl_read_pagelist(struct nfs_pgio_data *rdata)
+bl_read_pagelist(struct nfs_pgio_header *header)
{
- struct nfs_pgio_header *header = rdata->header;
- int i, hole;
+ struct pnfs_block_layout *bl = BLK_LSEG2EXT(header->lseg);
+ struct pnfs_block_dev_map map = { .start = NFS4_MAX_UINT64 };
struct bio *bio = NULL;
- struct pnfs_block_extent *be = NULL, *cow_read = NULL;
+ struct pnfs_block_extent be;
sector_t isect, extent_length = 0;
struct parallel_io *par;
- loff_t f_offset = rdata->args.offset;
- size_t bytes_left = rdata->args.count;
+ loff_t f_offset = header->args.offset;
+ size_t bytes_left = header->args.count;
unsigned int pg_offset, pg_len;
- struct page **pages = rdata->args.pages;
- int pg_index = rdata->args.pgbase >> PAGE_CACHE_SHIFT;
+ struct page **pages = header->args.pages;
+ int pg_index = header->args.pgbase >> PAGE_CACHE_SHIFT;
const bool is_dio = (header->dreq != NULL);
+ struct blk_plug plug;
+ int i;
dprintk("%s enter nr_pages %u offset %lld count %u\n", __func__,
- rdata->pages.npages, f_offset, (unsigned int)rdata->args.count);
+ header->page_array.npages, f_offset,
+ (unsigned int)header->args.count);
- par = alloc_parallel(rdata);
+ par = alloc_parallel(header);
if (!par)
- goto use_mds;
+ return PNFS_NOT_ATTEMPTED;
par->pnfs_callback = bl_end_par_io_read;
- /* At this point, we can no longer jump to use_mds */
+
+ blk_start_plug(&plug);
isect = (sector_t) (f_offset >> SECTOR_SHIFT);
/* Code assumes extents are page-aligned */
- for (i = pg_index; i < rdata->pages.npages; i++) {
- if (!extent_length) {
+ for (i = pg_index; i < header->page_array.npages; i++) {
+ if (extent_length <= 0) {
/* We've used up the previous extent */
- bl_put_extent(be);
- bl_put_extent(cow_read);
bio = bl_submit_bio(READ, bio);
+
/* Get the next one */
- be = bl_find_get_extent(BLK_LSEG2EXT(header->lseg),
- isect, &cow_read);
- if (!be) {
+ if (!ext_tree_lookup(bl, isect, &be, false)) {
header->pnfs_error = -EIO;
goto out;
}
- extent_length = be->be_length -
- (isect - be->be_f_offset);
- if (cow_read) {
- sector_t cow_length = cow_read->be_length -
- (isect - cow_read->be_f_offset);
- extent_length = min(extent_length, cow_length);
- }
+ extent_length = be.be_length - (isect - be.be_f_offset);
}
+ pg_offset = f_offset & ~PAGE_CACHE_MASK;
if (is_dio) {
- pg_offset = f_offset & ~PAGE_CACHE_MASK;
if (pg_offset + bytes_left > PAGE_CACHE_SIZE)
pg_len = PAGE_CACHE_SIZE - pg_offset;
else
pg_len = bytes_left;
-
- f_offset += pg_len;
- bytes_left -= pg_len;
- isect += (pg_offset >> SECTOR_SHIFT);
} else {
- pg_offset = 0;
+ BUG_ON(pg_offset != 0);
pg_len = PAGE_CACHE_SIZE;
}
- hole = is_hole(be, isect);
- if (hole && !cow_read) {
+ isect += (pg_offset >> SECTOR_SHIFT);
+ extent_length -= (pg_offset >> SECTOR_SHIFT);
+
+ if (is_hole(&be)) {
bio = bl_submit_bio(READ, bio);
/* Fill hole w/ zeroes w/o accessing device */
dprintk("%s Zeroing page for hole\n", __func__);
zero_user_segment(pages[i], pg_offset, pg_len);
- print_page(pages[i]);
- SetPageUptodate(pages[i]);
- } else {
- struct pnfs_block_extent *be_read;
- be_read = (hole && cow_read) ? cow_read : be;
- bio = do_add_page_to_bio(bio, rdata->pages.npages - i,
+ /* invalidate map */
+ map.start = NFS4_MAX_UINT64;
+ } else {
+ bio = do_add_page_to_bio(bio,
+ header->page_array.npages - i,
READ,
- isect, pages[i], be_read,
+ isect, pages[i], &map, &be,
bl_end_io_read, par,
- pg_offset, pg_len);
+ pg_offset, &pg_len);
if (IS_ERR(bio)) {
header->pnfs_error = PTR_ERR(bio);
bio = NULL;
@@ -329,84 +299,28 @@ bl_read_pagelist(struct nfs_pgio_data *rdata)
}
}
isect += (pg_len >> SECTOR_SHIFT);
- extent_length -= PAGE_CACHE_SECTORS;
+ extent_length -= (pg_len >> SECTOR_SHIFT);
+ f_offset += pg_len;
+ bytes_left -= pg_len;
}
if ((isect << SECTOR_SHIFT) >= header->inode->i_size) {
- rdata->res.eof = 1;
- rdata->res.count = header->inode->i_size - rdata->args.offset;
+ header->res.eof = 1;
+ header->res.count = header->inode->i_size - header->args.offset;
} else {
- rdata->res.count = (isect << SECTOR_SHIFT) - rdata->args.offset;
+ header->res.count = (isect << SECTOR_SHIFT) - header->args.offset;
}
out:
- bl_put_extent(be);
- bl_put_extent(cow_read);
bl_submit_bio(READ, bio);
+ blk_finish_plug(&plug);
put_parallel(par);
return PNFS_ATTEMPTED;
-
- use_mds:
- dprintk("Giving up and using normal NFS\n");
- return PNFS_NOT_ATTEMPTED;
-}
-
-static void mark_extents_written(struct pnfs_block_layout *bl,
- __u64 offset, __u32 count)
-{
- sector_t isect, end;
- struct pnfs_block_extent *be;
- struct pnfs_block_short_extent *se;
-
- dprintk("%s(%llu, %u)\n", __func__, offset, count);
- if (count == 0)
- return;
- isect = (offset & (long)(PAGE_CACHE_MASK)) >> SECTOR_SHIFT;
- end = (offset + count + PAGE_CACHE_SIZE - 1) & (long)(PAGE_CACHE_MASK);
- end >>= SECTOR_SHIFT;
- while (isect < end) {
- sector_t len;
- be = bl_find_get_extent(bl, isect, NULL);
- BUG_ON(!be); /* FIXME */
- len = min(end, be->be_f_offset + be->be_length) - isect;
- if (be->be_state == PNFS_BLOCK_INVALID_DATA) {
- se = bl_pop_one_short_extent(be->be_inval);
- BUG_ON(!se);
- bl_mark_for_commit(be, isect, len, se);
- }
- isect += len;
- bl_put_extent(be);
- }
-}
-
-static void bl_end_io_write_zero(struct bio *bio, int err)
-{
- struct parallel_io *par = bio->bi_private;
- struct bio_vec *bvec;
- int i;
-
- bio_for_each_segment_all(bvec, bio, i) {
- /* This is the zeroing page we added */
- end_page_writeback(bvec->bv_page);
- page_cache_release(bvec->bv_page);
- }
-
- if (unlikely(err)) {
- struct nfs_pgio_data *data = par->data;
- struct nfs_pgio_header *header = data->header;
-
- if (!header->pnfs_error)
- header->pnfs_error = -EIO;
- pnfs_set_lo_fail(header->lseg);
- }
- bio_put(bio);
- put_parallel(par);
}
static void bl_end_io_write(struct bio *bio, int err)
{
struct parallel_io *par = bio->bi_private;
const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
- struct nfs_pgio_data *data = par->data;
- struct nfs_pgio_header *header = data->header;
+ struct nfs_pgio_header *header = par->data;
if (!uptodate) {
if (!header->pnfs_error)
@@ -422,533 +336,118 @@ static void bl_end_io_write(struct bio *bio, int err)
*/
static void bl_write_cleanup(struct work_struct *work)
{
- struct rpc_task *task;
- struct nfs_pgio_data *wdata;
- dprintk("%s enter\n", __func__);
- task = container_of(work, struct rpc_task, u.tk_work);
- wdata = container_of(task, struct nfs_pgio_data, task);
- if (likely(!wdata->header->pnfs_error)) {
- /* Marks for LAYOUTCOMMIT */
- mark_extents_written(BLK_LSEG2EXT(wdata->header->lseg),
- wdata->args.offset, wdata->args.count);
- }
- pnfs_ld_write_done(wdata);
-}
-
-/* Called when last of bios associated with a bl_write_pagelist call finishes */
-static void bl_end_par_io_write(void *data, int num_se)
-{
- struct nfs_pgio_data *wdata = data;
-
- if (unlikely(wdata->header->pnfs_error)) {
- bl_free_short_extents(&BLK_LSEG2EXT(wdata->header->lseg)->bl_inval,
- num_se);
- }
-
- wdata->task.tk_status = wdata->header->pnfs_error;
- wdata->verf.committed = NFS_FILE_SYNC;
- INIT_WORK(&wdata->task.u.tk_work, bl_write_cleanup);
- schedule_work(&wdata->task.u.tk_work);
-}
-
-/* FIXME STUB - mark intersection of layout and page as bad, so is not
- * used again.
- */
-static void mark_bad_read(void)
-{
- return;
-}
-
-/*
- * map_block: map a requested I/0 block (isect) into an offset in the LVM
- * block_device
- */
-static void
-map_block(struct buffer_head *bh, sector_t isect, struct pnfs_block_extent *be)
-{
- dprintk("%s enter be=%p\n", __func__, be);
-
- set_buffer_mapped(bh);
- bh->b_bdev = be->be_mdev;
- bh->b_blocknr = (isect - be->be_f_offset + be->be_v_offset) >>
- (be->be_mdev->bd_inode->i_blkbits - SECTOR_SHIFT);
-
- dprintk("%s isect %llu, bh->b_blocknr %ld, using bsize %Zd\n",
- __func__, (unsigned long long)isect, (long)bh->b_blocknr,
- bh->b_size);
- return;
-}
-
-static void
-bl_read_single_end_io(struct bio *bio, int error)
-{
- struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
- struct page *page = bvec->bv_page;
-
- /* Only one page in bvec */
- unlock_page(page);
-}
-
-static int
-bl_do_readpage_sync(struct page *page, struct pnfs_block_extent *be,
- unsigned int offset, unsigned int len)
-{
- struct bio *bio;
- struct page *shadow_page;
- sector_t isect;
- char *kaddr, *kshadow_addr;
- int ret = 0;
+ struct rpc_task *task = container_of(work, struct rpc_task, u.tk_work);
+ struct nfs_pgio_header *hdr =
+ container_of(task, struct nfs_pgio_header, task);
- dprintk("%s: offset %u len %u\n", __func__, offset, len);
-
- shadow_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
- if (shadow_page == NULL)
- return -ENOMEM;
-
- bio = bio_alloc(GFP_NOIO, 1);
- if (bio == NULL)
- return -ENOMEM;
-
- isect = (page->index << PAGE_CACHE_SECTOR_SHIFT) +
- (offset / SECTOR_SIZE);
-
- bio->bi_iter.bi_sector = isect - be->be_f_offset + be->be_v_offset;
- bio->bi_bdev = be->be_mdev;
- bio->bi_end_io = bl_read_single_end_io;
-
- lock_page(shadow_page);
- if (bio_add_page(bio, shadow_page,
- SECTOR_SIZE, round_down(offset, SECTOR_SIZE)) == 0) {
- unlock_page(shadow_page);
- bio_put(bio);
- return -EIO;
- }
-
- submit_bio(READ, bio);
- wait_on_page_locked(shadow_page);
- if (unlikely(!test_bit(BIO_UPTODATE, &bio->bi_flags))) {
- ret = -EIO;
- } else {
- kaddr = kmap_atomic(page);
- kshadow_addr = kmap_atomic(shadow_page);
- memcpy(kaddr + offset, kshadow_addr + offset, len);
- kunmap_atomic(kshadow_addr);
- kunmap_atomic(kaddr);
- }
- __free_page(shadow_page);
- bio_put(bio);
-
- return ret;
-}
-
-static int
-bl_read_partial_page_sync(struct page *page, struct pnfs_block_extent *be,
- unsigned int dirty_offset, unsigned int dirty_len,
- bool full_page)
-{
- int ret = 0;
- unsigned int start, end;
+ dprintk("%s enter\n", __func__);
- if (full_page) {
- start = 0;
- end = PAGE_CACHE_SIZE;
- } else {
- start = round_down(dirty_offset, SECTOR_SIZE);
- end = round_up(dirty_offset + dirty_len, SECTOR_SIZE);
- }
+ if (likely(!hdr->pnfs_error)) {
+ struct pnfs_block_layout *bl = BLK_LSEG2EXT(hdr->lseg);
+ u64 start = hdr->args.offset & (loff_t)PAGE_CACHE_MASK;
+ u64 end = (hdr->args.offset + hdr->args.count +
+ PAGE_CACHE_SIZE - 1) & (loff_t)PAGE_CACHE_MASK;
- dprintk("%s: offset %u len %d\n", __func__, dirty_offset, dirty_len);
- if (!be) {
- zero_user_segments(page, start, dirty_offset,
- dirty_offset + dirty_len, end);
- if (start == 0 && end == PAGE_CACHE_SIZE &&
- trylock_page(page)) {
- SetPageUptodate(page);
- unlock_page(page);
- }
- return ret;
+ ext_tree_mark_written(bl, start >> SECTOR_SHIFT,
+ (end - start) >> SECTOR_SHIFT);
}
- if (start != dirty_offset)
- ret = bl_do_readpage_sync(page, be, start, dirty_offset - start);
-
- if (!ret && (dirty_offset + dirty_len < end))
- ret = bl_do_readpage_sync(page, be, dirty_offset + dirty_len,
- end - dirty_offset - dirty_len);
-
- return ret;
+ pnfs_ld_write_done(hdr);
}
-/* Given an unmapped page, zero it or read in page for COW, page is locked
- * by caller.
- */
-static int
-init_page_for_write(struct page *page, struct pnfs_block_extent *cow_read)
+/* Called when last of bios associated with a bl_write_pagelist call finishes */
+static void bl_end_par_io_write(void *data)
{
- struct buffer_head *bh = NULL;
- int ret = 0;
- sector_t isect;
-
- dprintk("%s enter, %p\n", __func__, page);
- BUG_ON(PageUptodate(page));
- if (!cow_read) {
- zero_user_segment(page, 0, PAGE_SIZE);
- SetPageUptodate(page);
- goto cleanup;
- }
-
- bh = alloc_page_buffers(page, PAGE_CACHE_SIZE, 0);
- if (!bh) {
- ret = -ENOMEM;
- goto cleanup;
- }
+ struct nfs_pgio_header *hdr = data;
- isect = (sector_t) page->index << PAGE_CACHE_SECTOR_SHIFT;
- map_block(bh, isect, cow_read);
- if (!bh_uptodate_or_lock(bh))
- ret = bh_submit_read(bh);
- if (ret)
- goto cleanup;
- SetPageUptodate(page);
-
-cleanup:
- if (bh)
- free_buffer_head(bh);
- if (ret) {
- /* Need to mark layout with bad read...should now
- * just use nfs4 for reads and writes.
- */
- mark_bad_read();
- }
- return ret;
-}
-
-/* Find or create a zeroing page marked being writeback.
- * Return ERR_PTR on error, NULL to indicate skip this page and page itself
- * to indicate write out.
- */
-static struct page *
-bl_find_get_zeroing_page(struct inode *inode, pgoff_t index,
- struct pnfs_block_extent *cow_read)
-{
- struct page *page;
- int locked = 0;
- page = find_get_page(inode->i_mapping, index);
- if (page)
- goto check_page;
-
- page = find_or_create_page(inode->i_mapping, index, GFP_NOFS);
- if (unlikely(!page)) {
- dprintk("%s oom\n", __func__);
- return ERR_PTR(-ENOMEM);
- }
- locked = 1;
-
-check_page:
- /* PageDirty: Other will write this out
- * PageWriteback: Other is writing this out
- * PageUptodate: It was read before
- */
- if (PageDirty(page) || PageWriteback(page)) {
- print_page(page);
- if (locked)
- unlock_page(page);
- page_cache_release(page);
- return NULL;
- }
-
- if (!locked) {
- lock_page(page);
- locked = 1;
- goto check_page;
- }
- if (!PageUptodate(page)) {
- /* New page, readin or zero it */
- init_page_for_write(page, cow_read);
- }
- set_page_writeback(page);
- unlock_page(page);
-
- return page;
+ hdr->task.tk_status = hdr->pnfs_error;
+ hdr->verf.committed = NFS_FILE_SYNC;
+ INIT_WORK(&hdr->task.u.tk_work, bl_write_cleanup);
+ schedule_work(&hdr->task.u.tk_work);
}
static enum pnfs_try_status
-bl_write_pagelist(struct nfs_pgio_data *wdata, int sync)
+bl_write_pagelist(struct nfs_pgio_header *header, int sync)
{
- struct nfs_pgio_header *header = wdata->header;
- int i, ret, npg_zero, pg_index, last = 0;
+ struct pnfs_block_layout *bl = BLK_LSEG2EXT(header->lseg);
+ struct pnfs_block_dev_map map = { .start = NFS4_MAX_UINT64 };
struct bio *bio = NULL;
- struct pnfs_block_extent *be = NULL, *cow_read = NULL;
- sector_t isect, last_isect = 0, extent_length = 0;
+ struct pnfs_block_extent be;
+ sector_t isect, extent_length = 0;
struct parallel_io *par = NULL;
- loff_t offset = wdata->args.offset;
- size_t count = wdata->args.count;
- unsigned int pg_offset, pg_len, saved_len;
- struct page **pages = wdata->args.pages;
- struct page *page;
- pgoff_t index;
- u64 temp;
- int npg_per_block =
- NFS_SERVER(header->inode)->pnfs_blksize >> PAGE_CACHE_SHIFT;
+ loff_t offset = header->args.offset;
+ size_t count = header->args.count;
+ struct page **pages = header->args.pages;
+ int pg_index = header->args.pgbase >> PAGE_CACHE_SHIFT;
+ unsigned int pg_len;
+ struct blk_plug plug;
+ int i;
dprintk("%s enter, %Zu@%lld\n", __func__, count, offset);
- if (header->dreq != NULL &&
- (!IS_ALIGNED(offset, NFS_SERVER(header->inode)->pnfs_blksize) ||
- !IS_ALIGNED(count, NFS_SERVER(header->inode)->pnfs_blksize))) {
- dprintk("pnfsblock nonblock aligned DIO writes. Resend MDS\n");
- goto out_mds;
- }
- /* At this point, wdata->pages is a (sequential) list of nfs_pages.
+ /* At this point, header->page_array is a (sequential) list of nfs_pages.
* We want to write each, and if there is an error set pnfs_error
* to have it redone using nfs.
*/
- par = alloc_parallel(wdata);
+ par = alloc_parallel(header);
if (!par)
- goto out_mds;
+ return PNFS_NOT_ATTEMPTED;
par->pnfs_callback = bl_end_par_io_write;
- /* At this point, have to be more careful with error handling */
- isect = (sector_t) ((offset & (long)PAGE_CACHE_MASK) >> SECTOR_SHIFT);
- be = bl_find_get_extent(BLK_LSEG2EXT(header->lseg), isect, &cow_read);
- if (!be || !is_writable(be, isect)) {
- dprintk("%s no matching extents!\n", __func__);
- goto out_mds;
- }
+ blk_start_plug(&plug);
- /* First page inside INVALID extent */
- if (be->be_state == PNFS_BLOCK_INVALID_DATA) {
- if (likely(!bl_push_one_short_extent(be->be_inval)))
- par->bse_count++;
- else
- goto out_mds;
- temp = offset >> PAGE_CACHE_SHIFT;
- npg_zero = do_div(temp, npg_per_block);
- isect = (sector_t) (((offset - npg_zero * PAGE_CACHE_SIZE) &
- (long)PAGE_CACHE_MASK) >> SECTOR_SHIFT);
- extent_length = be->be_length - (isect - be->be_f_offset);
-
-fill_invalid_ext:
- dprintk("%s need to zero %d pages\n", __func__, npg_zero);
- for (;npg_zero > 0; npg_zero--) {
- if (bl_is_sector_init(be->be_inval, isect)) {
- dprintk("isect %llu already init\n",
- (unsigned long long)isect);
- goto next_page;
- }
- /* page ref released in bl_end_io_write_zero */
- index = isect >> PAGE_CACHE_SECTOR_SHIFT;
- dprintk("%s zero %dth page: index %lu isect %llu\n",
- __func__, npg_zero, index,
- (unsigned long long)isect);
- page = bl_find_get_zeroing_page(header->inode, index,
- cow_read);
- if (unlikely(IS_ERR(page))) {
- header->pnfs_error = PTR_ERR(page);
- goto out;
- } else if (page == NULL)
- goto next_page;
-
- ret = bl_mark_sectors_init(be->be_inval, isect,
- PAGE_CACHE_SECTORS);
- if (unlikely(ret)) {
- dprintk("%s bl_mark_sectors_init fail %d\n",
- __func__, ret);
- end_page_writeback(page);
- page_cache_release(page);
- header->pnfs_error = ret;
- goto out;
- }
- if (likely(!bl_push_one_short_extent(be->be_inval)))
- par->bse_count++;
- else {
- end_page_writeback(page);
- page_cache_release(page);
- header->pnfs_error = -ENOMEM;
- goto out;
- }
- /* FIXME: This should be done in bi_end_io */
- mark_extents_written(BLK_LSEG2EXT(header->lseg),
- page->index << PAGE_CACHE_SHIFT,
- PAGE_CACHE_SIZE);
-
- bio = bl_add_page_to_bio(bio, npg_zero, WRITE,
- isect, page, be,
- bl_end_io_write_zero, par);
- if (IS_ERR(bio)) {
- header->pnfs_error = PTR_ERR(bio);
- bio = NULL;
- goto out;
- }
-next_page:
- isect += PAGE_CACHE_SECTORS;
- extent_length -= PAGE_CACHE_SECTORS;
- }
- if (last)
- goto write_done;
- }
- bio = bl_submit_bio(WRITE, bio);
+ /* we always write out the whole page */
+ offset = offset & (loff_t)PAGE_CACHE_MASK;
+ isect = offset >> SECTOR_SHIFT;
- /* Middle pages */
- pg_index = wdata->args.pgbase >> PAGE_CACHE_SHIFT;
- for (i = pg_index; i < wdata->pages.npages; i++) {
- if (!extent_length) {
+ for (i = pg_index; i < header->page_array.npages; i++) {
+ if (extent_length <= 0) {
/* We've used up the previous extent */
- bl_put_extent(be);
- bl_put_extent(cow_read);
bio = bl_submit_bio(WRITE, bio);
/* Get the next one */
- be = bl_find_get_extent(BLK_LSEG2EXT(header->lseg),
- isect, &cow_read);
- if (!be || !is_writable(be, isect)) {
+ if (!ext_tree_lookup(bl, isect, &be, true)) {
header->pnfs_error = -EINVAL;
goto out;
}
- if (be->be_state == PNFS_BLOCK_INVALID_DATA) {
- if (likely(!bl_push_one_short_extent(
- be->be_inval)))
- par->bse_count++;
- else {
- header->pnfs_error = -ENOMEM;
- goto out;
- }
- }
- extent_length = be->be_length -
- (isect - be->be_f_offset);
- }
- dprintk("%s offset %lld count %Zu\n", __func__, offset, count);
- pg_offset = offset & ~PAGE_CACHE_MASK;
- if (pg_offset + count > PAGE_CACHE_SIZE)
- pg_len = PAGE_CACHE_SIZE - pg_offset;
- else
- pg_len = count;
-
- saved_len = pg_len;
- if (be->be_state == PNFS_BLOCK_INVALID_DATA &&
- !bl_is_sector_init(be->be_inval, isect)) {
- ret = bl_read_partial_page_sync(pages[i], cow_read,
- pg_offset, pg_len, true);
- if (ret) {
- dprintk("%s bl_read_partial_page_sync fail %d\n",
- __func__, ret);
- header->pnfs_error = ret;
- goto out;
- }
-
- ret = bl_mark_sectors_init(be->be_inval, isect,
- PAGE_CACHE_SECTORS);
- if (unlikely(ret)) {
- dprintk("%s bl_mark_sectors_init fail %d\n",
- __func__, ret);
- header->pnfs_error = ret;
- goto out;
- }
-
- /* Expand to full page write */
- pg_offset = 0;
- pg_len = PAGE_CACHE_SIZE;
- } else if ((pg_offset & (SECTOR_SIZE - 1)) ||
- (pg_len & (SECTOR_SIZE - 1))){
- /* ahh, nasty case. We have to do sync full sector
- * read-modify-write cycles.
- */
- unsigned int saved_offset = pg_offset;
- ret = bl_read_partial_page_sync(pages[i], be, pg_offset,
- pg_len, false);
- pg_offset = round_down(pg_offset, SECTOR_SIZE);
- pg_len = round_up(saved_offset + pg_len, SECTOR_SIZE)
- - pg_offset;
+ extent_length = be.be_length - (isect - be.be_f_offset);
}
-
- bio = do_add_page_to_bio(bio, wdata->pages.npages - i, WRITE,
- isect, pages[i], be,
+ pg_len = PAGE_CACHE_SIZE;
+ bio = do_add_page_to_bio(bio, header->page_array.npages - i,
+ WRITE, isect, pages[i], &map, &be,
bl_end_io_write, par,
- pg_offset, pg_len);
+ 0, &pg_len);
if (IS_ERR(bio)) {
header->pnfs_error = PTR_ERR(bio);
bio = NULL;
goto out;
}
- offset += saved_len;
- count -= saved_len;
- isect += PAGE_CACHE_SECTORS;
- last_isect = isect;
- extent_length -= PAGE_CACHE_SECTORS;
- }
- /* Last page inside INVALID extent */
- if (be->be_state == PNFS_BLOCK_INVALID_DATA) {
- bio = bl_submit_bio(WRITE, bio);
- temp = last_isect >> PAGE_CACHE_SECTOR_SHIFT;
- npg_zero = npg_per_block - do_div(temp, npg_per_block);
- if (npg_zero < npg_per_block) {
- last = 1;
- goto fill_invalid_ext;
- }
+ offset += pg_len;
+ count -= pg_len;
+ isect += (pg_len >> SECTOR_SHIFT);
+ extent_length -= (pg_len >> SECTOR_SHIFT);
}
-write_done:
- wdata->res.count = wdata->args.count;
+ header->res.count = header->args.count;
out:
- bl_put_extent(be);
- bl_put_extent(cow_read);
bl_submit_bio(WRITE, bio);
+ blk_finish_plug(&plug);
put_parallel(par);
return PNFS_ATTEMPTED;
-out_mds:
- bl_put_extent(be);
- bl_put_extent(cow_read);
- kfree(par);
- return PNFS_NOT_ATTEMPTED;
-}
-
-/* FIXME - range ignored */
-static void
-release_extents(struct pnfs_block_layout *bl, struct pnfs_layout_range *range)
-{
- int i;
- struct pnfs_block_extent *be;
-
- spin_lock(&bl->bl_ext_lock);
- for (i = 0; i < EXTENT_LISTS; i++) {
- while (!list_empty(&bl->bl_extents[i])) {
- be = list_first_entry(&bl->bl_extents[i],
- struct pnfs_block_extent,
- be_node);
- list_del(&be->be_node);
- bl_put_extent(be);
- }
- }
- spin_unlock(&bl->bl_ext_lock);
-}
-
-static void
-release_inval_marks(struct pnfs_inval_markings *marks)
-{
- struct pnfs_inval_tracking *pos, *temp;
- struct pnfs_block_short_extent *se, *stemp;
-
- list_for_each_entry_safe(pos, temp, &marks->im_tree.mtt_stub, it_link) {
- list_del(&pos->it_link);
- kfree(pos);
- }
-
- list_for_each_entry_safe(se, stemp, &marks->im_extents, bse_node) {
- list_del(&se->bse_node);
- kfree(se);
- }
- return;
}
static void bl_free_layout_hdr(struct pnfs_layout_hdr *lo)
{
struct pnfs_block_layout *bl = BLK_LO2EXT(lo);
+ int err;
dprintk("%s enter\n", __func__);
- release_extents(bl, NULL);
- release_inval_marks(&bl->bl_inval);
+
+ err = ext_tree_remove(bl, true, 0, LLONG_MAX);
+ WARN_ON(err);
+
kfree(bl);
}
@@ -961,14 +460,11 @@ static struct pnfs_layout_hdr *bl_alloc_layout_hdr(struct inode *inode,
bl = kzalloc(sizeof(*bl), gfp_flags);
if (!bl)
return NULL;
+
+ bl->bl_ext_rw = RB_ROOT;
+ bl->bl_ext_ro = RB_ROOT;
spin_lock_init(&bl->bl_ext_lock);
- INIT_LIST_HEAD(&bl->bl_extents[0]);
- INIT_LIST_HEAD(&bl->bl_extents[1]);
- INIT_LIST_HEAD(&bl->bl_commit);
- INIT_LIST_HEAD(&bl->bl_committing);
- bl->bl_count = 0;
- bl->bl_blocksize = NFS_SERVER(inode)->pnfs_blksize >> SECTOR_SHIFT;
- BL_INIT_INVAL_MARKS(&bl->bl_inval, bl->bl_blocksize);
+
return &bl->bl_layout;
}
@@ -978,215 +474,318 @@ static void bl_free_lseg(struct pnfs_layout_segment *lseg)
kfree(lseg);
}
-/* We pretty much ignore lseg, and store all data layout wide, so we
- * can correctly merge.
- */
-static struct pnfs_layout_segment *bl_alloc_lseg(struct pnfs_layout_hdr *lo,
- struct nfs4_layoutget_res *lgr,
- gfp_t gfp_flags)
-{
- struct pnfs_layout_segment *lseg;
- int status;
+/* Tracks info needed to ensure extents in layout obey constraints of spec */
+struct layout_verification {
+ u32 mode; /* R or RW */
+ u64 start; /* Expected start of next non-COW extent */
+ u64 inval; /* Start of INVAL coverage */
+ u64 cowread; /* End of COW read coverage */
+};
- dprintk("%s enter\n", __func__);
- lseg = kzalloc(sizeof(*lseg), gfp_flags);
- if (!lseg)
- return ERR_PTR(-ENOMEM);
- status = nfs4_blk_process_layoutget(lo, lgr, gfp_flags);
- if (status) {
- /* We don't want to call the full-blown bl_free_lseg,
- * since on error extents were not touched.
- */
- kfree(lseg);
- return ERR_PTR(status);
+/* Verify the extent meets the layout requirements of the pnfs-block draft,
+ * section 2.3.1.
+ */
+static int verify_extent(struct pnfs_block_extent *be,
+ struct layout_verification *lv)
+{
+ if (lv->mode == IOMODE_READ) {
+ if (be->be_state == PNFS_BLOCK_READWRITE_DATA ||
+ be->be_state == PNFS_BLOCK_INVALID_DATA)
+ return -EIO;
+ if (be->be_f_offset != lv->start)
+ return -EIO;
+ lv->start += be->be_length;
+ return 0;
}
- return lseg;
+ /* lv->mode == IOMODE_RW */
+ if (be->be_state == PNFS_BLOCK_READWRITE_DATA) {
+ if (be->be_f_offset != lv->start)
+ return -EIO;
+ if (lv->cowread > lv->start)
+ return -EIO;
+ lv->start += be->be_length;
+ lv->inval = lv->start;
+ return 0;
+ } else if (be->be_state == PNFS_BLOCK_INVALID_DATA) {
+ if (be->be_f_offset != lv->start)
+ return -EIO;
+ lv->start += be->be_length;
+ return 0;
+ } else if (be->be_state == PNFS_BLOCK_READ_DATA) {
+ if (be->be_f_offset > lv->start)
+ return -EIO;
+ if (be->be_f_offset < lv->inval)
+ return -EIO;
+ if (be->be_f_offset < lv->cowread)
+ return -EIO;
+ /* It looks like you might want to min this with lv->start,
+ * but you really don't.
+ */
+ lv->inval = lv->inval + be->be_length;
+ lv->cowread = be->be_f_offset + be->be_length;
+ return 0;
+ } else
+ return -EIO;
}
-static void
-bl_encode_layoutcommit(struct pnfs_layout_hdr *lo, struct xdr_stream *xdr,
- const struct nfs4_layoutcommit_args *arg)
+static int decode_sector_number(__be32 **rp, sector_t *sp)
{
- dprintk("%s enter\n", __func__);
- encode_pnfs_block_layoutupdate(BLK_LO2EXT(lo), xdr, arg);
+ uint64_t s;
+
+ *rp = xdr_decode_hyper(*rp, &s);
+ if (s & 0x1ff) {
+ printk(KERN_WARNING "NFS: %s: sector not aligned\n", __func__);
+ return -1;
+ }
+ *sp = s >> SECTOR_SHIFT;
+ return 0;
}
-static void
-bl_cleanup_layoutcommit(struct nfs4_layoutcommit_data *lcdata)
+static int
+bl_alloc_extent(struct xdr_stream *xdr, struct pnfs_layout_hdr *lo,
+ struct layout_verification *lv, struct list_head *extents,
+ gfp_t gfp_mask)
{
- struct pnfs_layout_hdr *lo = NFS_I(lcdata->args.inode)->layout;
+ struct pnfs_block_extent *be;
+ struct nfs4_deviceid id;
+ int error;
+ __be32 *p;
- dprintk("%s enter\n", __func__);
- clean_pnfs_block_layoutupdate(BLK_LO2EXT(lo), &lcdata->args, lcdata->res.status);
-}
+ p = xdr_inline_decode(xdr, 28 + NFS4_DEVICEID4_SIZE);
+ if (!p)
+ return -EIO;
-static void free_blk_mountid(struct block_mount_id *mid)
-{
- if (mid) {
- struct pnfs_block_dev *dev, *tmp;
+ be = kzalloc(sizeof(*be), GFP_NOFS);
+ if (!be)
+ return -ENOMEM;
- /* No need to take bm_lock as we are last user freeing bm_devlist */
- list_for_each_entry_safe(dev, tmp, &mid->bm_devlist, bm_node) {
- list_del(&dev->bm_node);
- bl_free_block_dev(dev);
- }
- kfree(mid);
+ memcpy(&id, p, NFS4_DEVICEID4_SIZE);
+ p += XDR_QUADLEN(NFS4_DEVICEID4_SIZE);
+
+ error = -EIO;
+ be->be_device = nfs4_find_get_deviceid(NFS_SERVER(lo->plh_inode), &id,
+ lo->plh_lc_cred, gfp_mask);
+ if (!be->be_device)
+ goto out_free_be;
+
+ /*
+ * The next three values are read in as bytes, but stored in the
+ * extent structure in 512-byte granularity.
+ */
+ if (decode_sector_number(&p, &be->be_f_offset) < 0)
+ goto out_put_deviceid;
+ if (decode_sector_number(&p, &be->be_length) < 0)
+ goto out_put_deviceid;
+ if (decode_sector_number(&p, &be->be_v_offset) < 0)
+ goto out_put_deviceid;
+ be->be_state = be32_to_cpup(p++);
+
+ error = verify_extent(be, lv);
+ if (error) {
+ dprintk("%s: extent verification failed\n", __func__);
+ goto out_put_deviceid;
}
+
+ list_add_tail(&be->be_list, extents);
+ return 0;
+
+out_put_deviceid:
+ nfs4_put_deviceid_node(be->be_device);
+out_free_be:
+ kfree(be);
+ return error;
}
-/* This is mostly copied from the filelayout_get_device_info function.
- * It seems much of this should be at the generic pnfs level.
- */
-static struct pnfs_block_dev *
-nfs4_blk_get_deviceinfo(struct nfs_server *server, const struct nfs_fh *fh,
- struct nfs4_deviceid *d_id)
+static struct pnfs_layout_segment *
+bl_alloc_lseg(struct pnfs_layout_hdr *lo, struct nfs4_layoutget_res *lgr,
+ gfp_t gfp_mask)
{
- struct pnfs_device *dev;
- struct pnfs_block_dev *rv;
- u32 max_resp_sz;
- int max_pages;
- struct page **pages = NULL;
- int i, rc;
+ struct layout_verification lv = {
+ .mode = lgr->range.iomode,
+ .start = lgr->range.offset >> SECTOR_SHIFT,
+ .inval = lgr->range.offset >> SECTOR_SHIFT,
+ .cowread = lgr->range.offset >> SECTOR_SHIFT,
+ };
+ struct pnfs_block_layout *bl = BLK_LO2EXT(lo);
+ struct pnfs_layout_segment *lseg;
+ struct xdr_buf buf;
+ struct xdr_stream xdr;
+ struct page *scratch;
+ int status, i;
+ uint32_t count;
+ __be32 *p;
+ LIST_HEAD(extents);
+
+ dprintk("---> %s\n", __func__);
+
+ lseg = kzalloc(sizeof(*lseg), gfp_mask);
+ if (!lseg)
+ return ERR_PTR(-ENOMEM);
+
+ status = -ENOMEM;
+ scratch = alloc_page(gfp_mask);
+ if (!scratch)
+ goto out;
+
+ xdr_init_decode_pages(&xdr, &buf,
+ lgr->layoutp->pages, lgr->layoutp->len);
+ xdr_set_scratch_buffer(&xdr, page_address(scratch), PAGE_SIZE);
+
+ status = -EIO;
+ p = xdr_inline_decode(&xdr, 4);
+ if (unlikely(!p))
+ goto out_free_scratch;
+
+ count = be32_to_cpup(p++);
+ dprintk("%s: number of extents %d\n", __func__, count);
/*
- * Use the session max response size as the basis for setting
- * GETDEVICEINFO's maxcount
+ * Decode individual extents, putting them in temporary staging area
+ * until whole layout is decoded to make error recovery easier.
*/
- max_resp_sz = server->nfs_client->cl_session->fc_attrs.max_resp_sz;
- max_pages = nfs_page_array_len(0, max_resp_sz);
- dprintk("%s max_resp_sz %u max_pages %d\n",
- __func__, max_resp_sz, max_pages);
-
- dev = kmalloc(sizeof(*dev), GFP_NOFS);
- if (!dev) {
- dprintk("%s kmalloc failed\n", __func__);
- return ERR_PTR(-ENOMEM);
+ for (i = 0; i < count; i++) {
+ status = bl_alloc_extent(&xdr, lo, &lv, &extents, gfp_mask);
+ if (status)
+ goto process_extents;
}
- pages = kzalloc(max_pages * sizeof(struct page *), GFP_NOFS);
- if (pages == NULL) {
- kfree(dev);
- return ERR_PTR(-ENOMEM);
+ if (lgr->range.offset + lgr->range.length !=
+ lv.start << SECTOR_SHIFT) {
+ dprintk("%s Final length mismatch\n", __func__);
+ status = -EIO;
+ goto process_extents;
}
- for (i = 0; i < max_pages; i++) {
- pages[i] = alloc_page(GFP_NOFS);
- if (!pages[i]) {
- rv = ERR_PTR(-ENOMEM);
- goto out_free;
- }
+
+ if (lv.start < lv.cowread) {
+ dprintk("%s Final uncovered COW extent\n", __func__);
+ status = -EIO;
}
- memcpy(&dev->dev_id, d_id, sizeof(*d_id));
- dev->layout_type = LAYOUT_BLOCK_VOLUME;
- dev->pages = pages;
- dev->pgbase = 0;
- dev->pglen = PAGE_SIZE * max_pages;
- dev->mincount = 0;
- dev->maxcount = max_resp_sz - nfs41_maxgetdevinfo_overhead;
-
- dprintk("%s: dev_id: %s\n", __func__, dev->dev_id.data);
- rc = nfs4_proc_getdeviceinfo(server, dev, NULL);
- dprintk("%s getdevice info returns %d\n", __func__, rc);
- if (rc) {
- rv = ERR_PTR(rc);
- goto out_free;
+process_extents:
+ while (!list_empty(&extents)) {
+ struct pnfs_block_extent *be =
+ list_first_entry(&extents, struct pnfs_block_extent,
+ be_list);
+ list_del(&be->be_list);
+
+ if (!status)
+ status = ext_tree_insert(bl, be);
+
+ if (status) {
+ nfs4_put_deviceid_node(be->be_device);
+ kfree(be);
+ }
}
- rv = nfs4_blk_decode_device(server, dev);
- out_free:
- for (i = 0; i < max_pages; i++)
- __free_page(pages[i]);
- kfree(pages);
- kfree(dev);
- return rv;
+out_free_scratch:
+ __free_page(scratch);
+out:
+ dprintk("%s returns %d\n", __func__, status);
+ if (status) {
+ kfree(lseg);
+ return ERR_PTR(status);
+ }
+ return lseg;
}
-static int
-bl_set_layoutdriver(struct nfs_server *server, const struct nfs_fh *fh)
+static void
+bl_return_range(struct pnfs_layout_hdr *lo,
+ struct pnfs_layout_range *range)
{
- struct block_mount_id *b_mt_id = NULL;
- struct pnfs_devicelist *dlist = NULL;
- struct pnfs_block_dev *bdev;
- LIST_HEAD(block_disklist);
- int status, i;
-
- dprintk("%s enter\n", __func__);
+ struct pnfs_block_layout *bl = BLK_LO2EXT(lo);
+ sector_t offset = range->offset >> SECTOR_SHIFT, end;
- if (server->pnfs_blksize == 0) {
- dprintk("%s Server did not return blksize\n", __func__);
- return -EINVAL;
- }
- b_mt_id = kzalloc(sizeof(struct block_mount_id), GFP_NOFS);
- if (!b_mt_id) {
- status = -ENOMEM;
- goto out_error;
- }
- /* Initialize nfs4 block layout mount id */
- spin_lock_init(&b_mt_id->bm_lock);
- INIT_LIST_HEAD(&b_mt_id->bm_devlist);
-
- dlist = kmalloc(sizeof(struct pnfs_devicelist), GFP_NOFS);
- if (!dlist) {
- status = -ENOMEM;
- goto out_error;
+ if (range->offset % 8) {
+ dprintk("%s: offset %lld not block size aligned\n",
+ __func__, range->offset);
+ return;
}
- dlist->eof = 0;
- while (!dlist->eof) {
- status = nfs4_proc_getdevicelist(server, fh, dlist);
- if (status)
- goto out_error;
- dprintk("%s GETDEVICELIST numdevs=%i, eof=%i\n",
- __func__, dlist->num_devs, dlist->eof);
- for (i = 0; i < dlist->num_devs; i++) {
- bdev = nfs4_blk_get_deviceinfo(server, fh,
- &dlist->dev_id[i]);
- if (IS_ERR(bdev)) {
- status = PTR_ERR(bdev);
- goto out_error;
- }
- spin_lock(&b_mt_id->bm_lock);
- list_add(&bdev->bm_node, &b_mt_id->bm_devlist);
- spin_unlock(&b_mt_id->bm_lock);
+
+ if (range->length != NFS4_MAX_UINT64) {
+ if (range->length % 8) {
+ dprintk("%s: length %lld not block size aligned\n",
+ __func__, range->length);
+ return;
}
- }
- dprintk("%s SUCCESS\n", __func__);
- server->pnfs_ld_data = b_mt_id;
- out_return:
- kfree(dlist);
- return status;
+ end = offset + (range->length >> SECTOR_SHIFT);
+ } else {
+ end = round_down(NFS4_MAX_UINT64, PAGE_SIZE);
+ }
- out_error:
- free_blk_mountid(b_mt_id);
- goto out_return;
+ ext_tree_remove(bl, range->iomode & IOMODE_RW, offset, end);
}
static int
-bl_clear_layoutdriver(struct nfs_server *server)
+bl_prepare_layoutcommit(struct nfs4_layoutcommit_args *arg)
{
- struct block_mount_id *b_mt_id = server->pnfs_ld_data;
+ return ext_tree_prepare_commit(arg);
+}
+static void
+bl_cleanup_layoutcommit(struct nfs4_layoutcommit_data *lcdata)
+{
+ ext_tree_mark_committed(&lcdata->args, lcdata->res.status);
+}
+
+static int
+bl_set_layoutdriver(struct nfs_server *server, const struct nfs_fh *fh)
+{
dprintk("%s enter\n", __func__);
- free_blk_mountid(b_mt_id);
- dprintk("%s RETURNS\n", __func__);
+
+ if (server->pnfs_blksize == 0) {
+ dprintk("%s Server did not return blksize\n", __func__);
+ return -EINVAL;
+ }
+ if (server->pnfs_blksize > PAGE_SIZE) {
+ printk(KERN_ERR "%s: pNFS blksize %d not supported.\n",
+ __func__, server->pnfs_blksize);
+ return -EINVAL;
+ }
+
return 0;
}
static bool
-is_aligned_req(struct nfs_page *req, unsigned int alignment)
+is_aligned_req(struct nfs_pageio_descriptor *pgio,
+ struct nfs_page *req, unsigned int alignment)
{
- return IS_ALIGNED(req->wb_offset, alignment) &&
- IS_ALIGNED(req->wb_bytes, alignment);
+ /*
+ * Always accept buffered writes, higher layers take care of the
+ * right alignment.
+ */
+ if (pgio->pg_dreq == NULL)
+ return true;
+
+ if (!IS_ALIGNED(req->wb_offset, alignment))
+ return false;
+
+ if (IS_ALIGNED(req->wb_bytes, alignment))
+ return true;
+
+ if (req_offset(req) + req->wb_bytes == i_size_read(pgio->pg_inode)) {
+ /*
+ * If the write goes up to the inode size, just write
+ * the full page. Data past the inode size is
+ * guaranteed to be zeroed by the higher level client
+ * code, and this behaviour is mandated by RFC 5663
+ * section 2.3.2.
+ */
+ return true;
+ }
+
+ return false;
}
static void
bl_pg_init_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *req)
{
- if (pgio->pg_dreq != NULL &&
- !is_aligned_req(req, SECTOR_SIZE))
+ if (!is_aligned_req(pgio, req, SECTOR_SIZE)) {
nfs_pageio_reset_read_mds(pgio);
- else
- pnfs_generic_pg_init_read(pgio, req);
+ return;
+ }
+
+ pnfs_generic_pg_init_read(pgio, req);
}
/*
@@ -1197,10 +796,8 @@ static size_t
bl_pg_test_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev,
struct nfs_page *req)
{
- if (pgio->pg_dreq != NULL &&
- !is_aligned_req(req, SECTOR_SIZE))
+ if (!is_aligned_req(pgio, req, SECTOR_SIZE))
return 0;
-
return pnfs_generic_pg_test(pgio, prev, req);
}
@@ -1230,19 +827,20 @@ static u64 pnfs_num_cont_bytes(struct inode *inode, pgoff_t idx)
static void
bl_pg_init_write(struct nfs_pageio_descriptor *pgio, struct nfs_page *req)
{
- if (pgio->pg_dreq != NULL &&
- !is_aligned_req(req, PAGE_CACHE_SIZE)) {
+ u64 wb_size;
+
+ if (!is_aligned_req(pgio, req, PAGE_SIZE)) {
nfs_pageio_reset_write_mds(pgio);
- } else {
- u64 wb_size;
- if (pgio->pg_dreq == NULL)
- wb_size = pnfs_num_cont_bytes(pgio->pg_inode,
- req->wb_index);
- else
- wb_size = nfs_dreq_bytes_left(pgio->pg_dreq);
-
- pnfs_generic_pg_init_write(pgio, req, wb_size);
+ return;
}
+
+ if (pgio->pg_dreq == NULL)
+ wb_size = pnfs_num_cont_bytes(pgio->pg_inode,
+ req->wb_index);
+ else
+ wb_size = nfs_dreq_bytes_left(pgio->pg_dreq);
+
+ pnfs_generic_pg_init_write(pgio, req, wb_size);
}
/*
@@ -1253,10 +851,8 @@ static size_t
bl_pg_test_write(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev,
struct nfs_page *req)
{
- if (pgio->pg_dreq != NULL &&
- !is_aligned_req(req, PAGE_CACHE_SIZE))
+ if (!is_aligned_req(pgio, req, PAGE_SIZE))
return 0;
-
return pnfs_generic_pg_test(pgio, prev, req);
}
@@ -1276,146 +872,24 @@ static struct pnfs_layoutdriver_type blocklayout_type = {
.id = LAYOUT_BLOCK_VOLUME,
.name = "LAYOUT_BLOCK_VOLUME",
.owner = THIS_MODULE,
+ .flags = PNFS_LAYOUTRET_ON_SETATTR |
+ PNFS_READ_WHOLE_PAGE,
.read_pagelist = bl_read_pagelist,
.write_pagelist = bl_write_pagelist,
.alloc_layout_hdr = bl_alloc_layout_hdr,
.free_layout_hdr = bl_free_layout_hdr,
.alloc_lseg = bl_alloc_lseg,
.free_lseg = bl_free_lseg,
- .encode_layoutcommit = bl_encode_layoutcommit,
+ .return_range = bl_return_range,
+ .prepare_layoutcommit = bl_prepare_layoutcommit,
.cleanup_layoutcommit = bl_cleanup_layoutcommit,
.set_layoutdriver = bl_set_layoutdriver,
- .clear_layoutdriver = bl_clear_layoutdriver,
+ .alloc_deviceid_node = bl_alloc_deviceid_node,
+ .free_deviceid_node = bl_free_deviceid_node,
.pg_read_ops = &bl_pg_read_ops,
.pg_write_ops = &bl_pg_write_ops,
};
-static const struct rpc_pipe_ops bl_upcall_ops = {
- .upcall = rpc_pipe_generic_upcall,
- .downcall = bl_pipe_downcall,
- .destroy_msg = bl_pipe_destroy_msg,
-};
-
-static struct dentry *nfs4blocklayout_register_sb(struct super_block *sb,
- struct rpc_pipe *pipe)
-{
- struct dentry *dir, *dentry;
-
- dir = rpc_d_lookup_sb(sb, NFS_PIPE_DIRNAME);
- if (dir == NULL)
- return ERR_PTR(-ENOENT);
- dentry = rpc_mkpipe_dentry(dir, "blocklayout", NULL, pipe);
- dput(dir);
- return dentry;
-}
-
-static void nfs4blocklayout_unregister_sb(struct super_block *sb,
- struct rpc_pipe *pipe)
-{
- if (pipe->dentry)
- rpc_unlink(pipe->dentry);
-}
-
-static int rpc_pipefs_event(struct notifier_block *nb, unsigned long event,
- void *ptr)
-{
- struct super_block *sb = ptr;
- struct net *net = sb->s_fs_info;
- struct nfs_net *nn = net_generic(net, nfs_net_id);
- struct dentry *dentry;
- int ret = 0;
-
- if (!try_module_get(THIS_MODULE))
- return 0;
-
- if (nn->bl_device_pipe == NULL) {
- module_put(THIS_MODULE);
- return 0;
- }
-
- switch (event) {
- case RPC_PIPEFS_MOUNT:
- dentry = nfs4blocklayout_register_sb(sb, nn->bl_device_pipe);
- if (IS_ERR(dentry)) {
- ret = PTR_ERR(dentry);
- break;
- }
- nn->bl_device_pipe->dentry = dentry;
- break;
- case RPC_PIPEFS_UMOUNT:
- if (nn->bl_device_pipe->dentry)
- nfs4blocklayout_unregister_sb(sb, nn->bl_device_pipe);
- break;
- default:
- ret = -ENOTSUPP;
- break;
- }
- module_put(THIS_MODULE);
- return ret;
-}
-
-static struct notifier_block nfs4blocklayout_block = {
- .notifier_call = rpc_pipefs_event,
-};
-
-static struct dentry *nfs4blocklayout_register_net(struct net *net,
- struct rpc_pipe *pipe)
-{
- struct super_block *pipefs_sb;
- struct dentry *dentry;
-
- pipefs_sb = rpc_get_sb_net(net);
- if (!pipefs_sb)
- return NULL;
- dentry = nfs4blocklayout_register_sb(pipefs_sb, pipe);
- rpc_put_sb_net(net);
- return dentry;
-}
-
-static void nfs4blocklayout_unregister_net(struct net *net,
- struct rpc_pipe *pipe)
-{
- struct super_block *pipefs_sb;
-
- pipefs_sb = rpc_get_sb_net(net);
- if (pipefs_sb) {
- nfs4blocklayout_unregister_sb(pipefs_sb, pipe);
- rpc_put_sb_net(net);
- }
-}
-
-static int nfs4blocklayout_net_init(struct net *net)
-{
- struct nfs_net *nn = net_generic(net, nfs_net_id);
- struct dentry *dentry;
-
- init_waitqueue_head(&nn->bl_wq);
- nn->bl_device_pipe = rpc_mkpipe_data(&bl_upcall_ops, 0);
- if (IS_ERR(nn->bl_device_pipe))
- return PTR_ERR(nn->bl_device_pipe);
- dentry = nfs4blocklayout_register_net(net, nn->bl_device_pipe);
- if (IS_ERR(dentry)) {
- rpc_destroy_pipe_data(nn->bl_device_pipe);
- return PTR_ERR(dentry);
- }
- nn->bl_device_pipe->dentry = dentry;
- return 0;
-}
-
-static void nfs4blocklayout_net_exit(struct net *net)
-{
- struct nfs_net *nn = net_generic(net, nfs_net_id);
-
- nfs4blocklayout_unregister_net(net, nn->bl_device_pipe);
- rpc_destroy_pipe_data(nn->bl_device_pipe);
- nn->bl_device_pipe = NULL;
-}
-
-static struct pernet_operations nfs4blocklayout_net_ops = {
- .init = nfs4blocklayout_net_init,
- .exit = nfs4blocklayout_net_exit,
-};
-
static int __init nfs4blocklayout_init(void)
{
int ret;
@@ -1425,20 +899,14 @@ static int __init nfs4blocklayout_init(void)
ret = pnfs_register_layoutdriver(&blocklayout_type);
if (ret)
goto out;
-
- ret = rpc_pipefs_notifier_register(&nfs4blocklayout_block);
+ ret = bl_init_pipefs();
if (ret)
- goto out_remove;
- ret = register_pernet_subsys(&nfs4blocklayout_net_ops);
- if (ret)
- goto out_notifier;
-out:
- return ret;
+ goto out_unregister;
+ return 0;
-out_notifier:
- rpc_pipefs_notifier_unregister(&nfs4blocklayout_block);
-out_remove:
+out_unregister:
pnfs_unregister_layoutdriver(&blocklayout_type);
+out:
return ret;
}
@@ -1447,8 +915,7 @@ static void __exit nfs4blocklayout_exit(void)
dprintk("%s: NFSv4 Block Layout Driver Unregistering...\n",
__func__);
- rpc_pipefs_notifier_unregister(&nfs4blocklayout_block);
- unregister_pernet_subsys(&nfs4blocklayout_net_ops);
+ bl_cleanup_pipefs();
pnfs_unregister_layoutdriver(&blocklayout_type);
}
diff --git a/fs/nfs/blocklayout/blocklayout.h b/fs/nfs/blocklayout/blocklayout.h
index 9838fb02047..92dca9e90d8 100644
--- a/fs/nfs/blocklayout/blocklayout.h
+++ b/fs/nfs/blocklayout/blocklayout.h
@@ -44,105 +44,112 @@
#define PAGE_CACHE_SECTOR_SHIFT (PAGE_CACHE_SHIFT - SECTOR_SHIFT)
#define SECTOR_SIZE (1 << SECTOR_SHIFT)
-struct block_mount_id {
- spinlock_t bm_lock; /* protects list */
- struct list_head bm_devlist; /* holds pnfs_block_dev */
-};
+struct pnfs_block_dev;
-struct pnfs_block_dev {
- struct list_head bm_node;
- struct nfs4_deviceid bm_mdevid; /* associated devid */
- struct block_device *bm_mdev; /* meta device itself */
- struct net *net;
+enum pnfs_block_volume_type {
+ PNFS_BLOCK_VOLUME_SIMPLE = 0,
+ PNFS_BLOCK_VOLUME_SLICE = 1,
+ PNFS_BLOCK_VOLUME_CONCAT = 2,
+ PNFS_BLOCK_VOLUME_STRIPE = 3,
};
-enum exstate4 {
- PNFS_BLOCK_READWRITE_DATA = 0,
- PNFS_BLOCK_READ_DATA = 1,
- PNFS_BLOCK_INVALID_DATA = 2, /* mapped, but data is invalid */
- PNFS_BLOCK_NONE_DATA = 3 /* unmapped, it's a hole */
+#define PNFS_BLOCK_MAX_UUIDS 4
+#define PNFS_BLOCK_MAX_DEVICES 64
+
+/*
+ * Random upper cap for the uuid length to avoid unbounded allocation.
+ * Not actually limited by the protocol.
+ */
+#define PNFS_BLOCK_UUID_LEN 128
+
+
+struct pnfs_block_volume {
+ enum pnfs_block_volume_type type;
+ union {
+ struct {
+ int len;
+ int nr_sigs;
+ struct {
+ u64 offset;
+ u32 sig_len;
+ u8 sig[PNFS_BLOCK_UUID_LEN];
+ } sigs[PNFS_BLOCK_MAX_UUIDS];
+ } simple;
+ struct {
+ u64 start;
+ u64 len;
+ u32 volume;
+ } slice;
+ struct {
+ u32 volumes_count;
+ u32 volumes[PNFS_BLOCK_MAX_DEVICES];
+ } concat;
+ struct {
+ u64 chunk_size;
+ u32 volumes_count;
+ u32 volumes[PNFS_BLOCK_MAX_DEVICES];
+ } stripe;
+ };
};
-#define MY_MAX_TAGS (15) /* tag bitnums used must be less than this */
+struct pnfs_block_dev_map {
+ sector_t start;
+ sector_t len;
-struct my_tree {
- sector_t mtt_step_size; /* Internal sector alignment */
- struct list_head mtt_stub; /* Should be a radix tree */
+ sector_t disk_offset;
+ struct block_device *bdev;
};
-struct pnfs_inval_markings {
- spinlock_t im_lock;
- struct my_tree im_tree; /* Sectors that need LAYOUTCOMMIT */
- sector_t im_block_size; /* Server blocksize in sectors */
- struct list_head im_extents; /* Short extents for INVAL->RW conversion */
+struct pnfs_block_dev {
+ struct nfs4_deviceid_node node;
+
+ u64 start;
+ u64 len;
+
+ u32 nr_children;
+ struct pnfs_block_dev *children;
+ u64 chunk_size;
+
+ struct block_device *bdev;
+ u64 disk_offset;
+
+ bool (*map)(struct pnfs_block_dev *dev, u64 offset,
+ struct pnfs_block_dev_map *map);
};
-struct pnfs_inval_tracking {
- struct list_head it_link;
- int it_sector;
- int it_tags;
+enum exstate4 {
+ PNFS_BLOCK_READWRITE_DATA = 0,
+ PNFS_BLOCK_READ_DATA = 1,
+ PNFS_BLOCK_INVALID_DATA = 2, /* mapped, but data is invalid */
+ PNFS_BLOCK_NONE_DATA = 3 /* unmapped, it's a hole */
};
/* sector_t fields are all in 512-byte sectors */
struct pnfs_block_extent {
- struct kref be_refcnt;
- struct list_head be_node; /* link into lseg list */
- struct nfs4_deviceid be_devid; /* FIXME: could use device cache instead */
- struct block_device *be_mdev;
+ union {
+ struct rb_node be_node;
+ struct list_head be_list;
+ };
+ struct nfs4_deviceid_node *be_device;
sector_t be_f_offset; /* the starting offset in the file */
sector_t be_length; /* the size of the extent */
sector_t be_v_offset; /* the starting offset in the volume */
enum exstate4 be_state; /* the state of this extent */
- struct pnfs_inval_markings *be_inval; /* tracks INVAL->RW transition */
+#define EXTENT_WRITTEN 1
+#define EXTENT_COMMITTING 2
+ unsigned int be_tag;
};
-/* Shortened extent used by LAYOUTCOMMIT */
-struct pnfs_block_short_extent {
- struct list_head bse_node;
- struct nfs4_deviceid bse_devid;
- struct block_device *bse_mdev;
- sector_t bse_f_offset; /* the starting offset in the file */
- sector_t bse_length; /* the size of the extent */
-};
-
-static inline void
-BL_INIT_INVAL_MARKS(struct pnfs_inval_markings *marks, sector_t blocksize)
-{
- spin_lock_init(&marks->im_lock);
- INIT_LIST_HEAD(&marks->im_tree.mtt_stub);
- INIT_LIST_HEAD(&marks->im_extents);
- marks->im_block_size = blocksize;
- marks->im_tree.mtt_step_size = min((sector_t)PAGE_CACHE_SECTORS,
- blocksize);
-}
-
-enum extentclass4 {
- RW_EXTENT = 0, /* READWRTE and INVAL */
- RO_EXTENT = 1, /* READ and NONE */
- EXTENT_LISTS = 2,
-};
-
-static inline int bl_choose_list(enum exstate4 state)
-{
- if (state == PNFS_BLOCK_READ_DATA || state == PNFS_BLOCK_NONE_DATA)
- return RO_EXTENT;
- else
- return RW_EXTENT;
-}
+/* on the wire size of the extent */
+#define BL_EXTENT_SIZE (7 * sizeof(__be32) + NFS4_DEVICEID4_SIZE)
struct pnfs_block_layout {
- struct pnfs_layout_hdr bl_layout;
- struct pnfs_inval_markings bl_inval; /* tracks INVAL->RW transition */
+ struct pnfs_layout_hdr bl_layout;
+ struct rb_root bl_ext_rw;
+ struct rb_root bl_ext_ro;
spinlock_t bl_ext_lock; /* Protects list manipulation */
- struct list_head bl_extents[EXTENT_LISTS]; /* R and RW extents */
- struct list_head bl_commit; /* Needs layout commit */
- struct list_head bl_committing; /* Layout committing */
- unsigned int bl_count; /* entries in bl_commit */
- sector_t bl_blocksize; /* Server blocksize in sectors */
};
-#define BLK_ID(lo) ((struct block_mount_id *)(NFS_SERVER(lo->plh_inode)->pnfs_ld_data))
-
static inline struct pnfs_block_layout *
BLK_LO2EXT(struct pnfs_layout_hdr *lo)
{
@@ -171,41 +178,27 @@ struct bl_msg_hdr {
#define BL_DEVICE_REQUEST_PROC 0x1 /* User level process succeeds */
#define BL_DEVICE_REQUEST_ERR 0x2 /* User level process fails */
-/* blocklayoutdev.c */
-ssize_t bl_pipe_downcall(struct file *, const char __user *, size_t);
-void bl_pipe_destroy_msg(struct rpc_pipe_msg *);
-void nfs4_blkdev_put(struct block_device *bdev);
-struct pnfs_block_dev *nfs4_blk_decode_device(struct nfs_server *server,
- struct pnfs_device *dev);
-int nfs4_blk_process_layoutget(struct pnfs_layout_hdr *lo,
- struct nfs4_layoutget_res *lgr, gfp_t gfp_flags);
-
-/* blocklayoutdm.c */
-void bl_free_block_dev(struct pnfs_block_dev *bdev);
-
-/* extents.c */
-struct pnfs_block_extent *
-bl_find_get_extent(struct pnfs_block_layout *bl, sector_t isect,
- struct pnfs_block_extent **cow_read);
-int bl_mark_sectors_init(struct pnfs_inval_markings *marks,
- sector_t offset, sector_t length);
-void bl_put_extent(struct pnfs_block_extent *be);
-struct pnfs_block_extent *bl_alloc_extent(void);
-int bl_is_sector_init(struct pnfs_inval_markings *marks, sector_t isect);
-int encode_pnfs_block_layoutupdate(struct pnfs_block_layout *bl,
- struct xdr_stream *xdr,
- const struct nfs4_layoutcommit_args *arg);
-void clean_pnfs_block_layoutupdate(struct pnfs_block_layout *bl,
- const struct nfs4_layoutcommit_args *arg,
- int status);
-int bl_add_merge_extent(struct pnfs_block_layout *bl,
- struct pnfs_block_extent *new);
-int bl_mark_for_commit(struct pnfs_block_extent *be,
- sector_t offset, sector_t length,
- struct pnfs_block_short_extent *new);
-int bl_push_one_short_extent(struct pnfs_inval_markings *marks);
-struct pnfs_block_short_extent *
-bl_pop_one_short_extent(struct pnfs_inval_markings *marks);
-void bl_free_short_extents(struct pnfs_inval_markings *marks, int num_to_free);
+/* dev.c */
+struct nfs4_deviceid_node *bl_alloc_deviceid_node(struct nfs_server *server,
+ struct pnfs_device *pdev, gfp_t gfp_mask);
+void bl_free_deviceid_node(struct nfs4_deviceid_node *d);
+
+/* extent_tree.c */
+int ext_tree_insert(struct pnfs_block_layout *bl,
+ struct pnfs_block_extent *new);
+int ext_tree_remove(struct pnfs_block_layout *bl, bool rw, sector_t start,
+ sector_t end);
+int ext_tree_mark_written(struct pnfs_block_layout *bl, sector_t start,
+ sector_t len);
+bool ext_tree_lookup(struct pnfs_block_layout *bl, sector_t isect,
+ struct pnfs_block_extent *ret, bool rw);
+int ext_tree_prepare_commit(struct nfs4_layoutcommit_args *arg);
+void ext_tree_mark_committed(struct nfs4_layoutcommit_args *arg, int status);
+
+/* rpc_pipefs.c */
+dev_t bl_resolve_deviceid(struct nfs_server *server,
+ struct pnfs_block_volume *b, gfp_t gfp_mask);
+int __init bl_init_pipefs(void);
+void __exit bl_cleanup_pipefs(void);
#endif /* FS_NFS_NFS4BLOCKLAYOUT_H */
diff --git a/fs/nfs/blocklayout/blocklayoutdev.c b/fs/nfs/blocklayout/blocklayoutdev.c
deleted file mode 100644
index 04303b5c936..00000000000
--- a/fs/nfs/blocklayout/blocklayoutdev.c
+++ /dev/null
@@ -1,384 +0,0 @@
-/*
- * linux/fs/nfs/blocklayout/blocklayoutdev.c
- *
- * Device operations for the pnfs nfs4 file layout driver.
- *
- * Copyright (c) 2006 The Regents of the University of Michigan.
- * All rights reserved.
- *
- * Andy Adamson <andros@citi.umich.edu>
- * Fred Isaman <iisaman@umich.edu>
- *
- * permission is granted to use, copy, create derivative works and
- * redistribute this software and such derivative works for any purpose,
- * so long as the name of the university of michigan is not used in
- * any advertising or publicity pertaining to the use or distribution
- * of this software without specific, written prior authorization. if
- * the above copyright notice or any other identification of the
- * university of michigan is included in any copy of any portion of
- * this software, then the disclaimer below must also be included.
- *
- * this software is provided as is, without representation from the
- * university of michigan as to its fitness for any purpose, and without
- * warranty by the university of michigan of any kind, either express
- * or implied, including without limitation the implied warranties of
- * merchantability and fitness for a particular purpose. the regents
- * of the university of michigan shall not be liable for any damages,
- * including special, indirect, incidental, or consequential damages,
- * with respect to any claim arising out or in connection with the use
- * of the software, even if it has been or is hereafter advised of the
- * possibility of such damages.
- */
-#include <linux/module.h>
-#include <linux/buffer_head.h> /* __bread */
-
-#include <linux/genhd.h>
-#include <linux/blkdev.h>
-#include <linux/hash.h>
-
-#include "blocklayout.h"
-
-#define NFSDBG_FACILITY NFSDBG_PNFS_LD
-
-static int decode_sector_number(__be32 **rp, sector_t *sp)
-{
- uint64_t s;
-
- *rp = xdr_decode_hyper(*rp, &s);
- if (s & 0x1ff) {
- printk(KERN_WARNING "NFS: %s: sector not aligned\n", __func__);
- return -1;
- }
- *sp = s >> SECTOR_SHIFT;
- return 0;
-}
-
-/*
- * Release the block device
- */
-void nfs4_blkdev_put(struct block_device *bdev)
-{
- dprintk("%s for device %d:%d\n", __func__, MAJOR(bdev->bd_dev),
- MINOR(bdev->bd_dev));
- blkdev_put(bdev, FMODE_READ);
-}
-
-ssize_t bl_pipe_downcall(struct file *filp, const char __user *src,
- size_t mlen)
-{
- struct nfs_net *nn = net_generic(filp->f_dentry->d_sb->s_fs_info,
- nfs_net_id);
-
- if (mlen != sizeof (struct bl_dev_msg))
- return -EINVAL;
-
- if (copy_from_user(&nn->bl_mount_reply, src, mlen) != 0)
- return -EFAULT;
-
- wake_up(&nn->bl_wq);
-
- return mlen;
-}
-
-void bl_pipe_destroy_msg(struct rpc_pipe_msg *msg)
-{
- struct bl_pipe_msg *bl_pipe_msg = container_of(msg, struct bl_pipe_msg, msg);
-
- if (msg->errno >= 0)
- return;
- wake_up(bl_pipe_msg->bl_wq);
-}
-
-/*
- * Decodes pnfs_block_deviceaddr4 which is XDR encoded in dev->dev_addr_buf.
- */
-struct pnfs_block_dev *
-nfs4_blk_decode_device(struct nfs_server *server,
- struct pnfs_device *dev)
-{
- struct pnfs_block_dev *rv;
- struct block_device *bd = NULL;
- struct bl_pipe_msg bl_pipe_msg;
- struct rpc_pipe_msg *msg = &bl_pipe_msg.msg;
- struct bl_msg_hdr bl_msg = {
- .type = BL_DEVICE_MOUNT,
- .totallen = dev->mincount,
- };
- uint8_t *dataptr;
- DECLARE_WAITQUEUE(wq, current);
- int offset, len, i, rc;
- struct net *net = server->nfs_client->cl_net;
- struct nfs_net *nn = net_generic(net, nfs_net_id);
- struct bl_dev_msg *reply = &nn->bl_mount_reply;
-
- dprintk("%s CREATING PIPEFS MESSAGE\n", __func__);
- dprintk("%s: deviceid: %s, mincount: %d\n", __func__, dev->dev_id.data,
- dev->mincount);
-
- bl_pipe_msg.bl_wq = &nn->bl_wq;
- memset(msg, 0, sizeof(*msg));
- msg->data = kzalloc(sizeof(bl_msg) + dev->mincount, GFP_NOFS);
- if (!msg->data) {
- rv = ERR_PTR(-ENOMEM);
- goto out;
- }
-
- memcpy(msg->data, &bl_msg, sizeof(bl_msg));
- dataptr = (uint8_t *) msg->data;
- len = dev->mincount;
- offset = sizeof(bl_msg);
- for (i = 0; len > 0; i++) {
- memcpy(&dataptr[offset], page_address(dev->pages[i]),
- len < PAGE_CACHE_SIZE ? len : PAGE_CACHE_SIZE);
- len -= PAGE_CACHE_SIZE;
- offset += PAGE_CACHE_SIZE;
- }
- msg->len = sizeof(bl_msg) + dev->mincount;
-
- dprintk("%s CALLING USERSPACE DAEMON\n", __func__);
- add_wait_queue(&nn->bl_wq, &wq);
- rc = rpc_queue_upcall(nn->bl_device_pipe, msg);
- if (rc < 0) {
- remove_wait_queue(&nn->bl_wq, &wq);
- rv = ERR_PTR(rc);
- goto out;
- }
-
- set_current_state(TASK_UNINTERRUPTIBLE);
- schedule();
- __set_current_state(TASK_RUNNING);
- remove_wait_queue(&nn->bl_wq, &wq);
-
- if (reply->status != BL_DEVICE_REQUEST_PROC) {
- dprintk("%s failed to open device: %d\n",
- __func__, reply->status);
- rv = ERR_PTR(-EINVAL);
- goto out;
- }
-
- bd = blkdev_get_by_dev(MKDEV(reply->major, reply->minor),
- FMODE_READ, NULL);
- if (IS_ERR(bd)) {
- dprintk("%s failed to open device : %ld\n", __func__,
- PTR_ERR(bd));
- rv = ERR_CAST(bd);
- goto out;
- }
-
- rv = kzalloc(sizeof(*rv), GFP_NOFS);
- if (!rv) {
- rv = ERR_PTR(-ENOMEM);
- goto out;
- }
-
- rv->bm_mdev = bd;
- memcpy(&rv->bm_mdevid, &dev->dev_id, sizeof(struct nfs4_deviceid));
- rv->net = net;
- dprintk("%s Created device %s with bd_block_size %u\n",
- __func__,
- bd->bd_disk->disk_name,
- bd->bd_block_size);
-
-out:
- kfree(msg->data);
- return rv;
-}
-
-/* Map deviceid returned by the server to constructed block_device */
-static struct block_device *translate_devid(struct pnfs_layout_hdr *lo,
- struct nfs4_deviceid *id)
-{
- struct block_device *rv = NULL;
- struct block_mount_id *mid;
- struct pnfs_block_dev *dev;
-
- dprintk("%s enter, lo=%p, id=%p\n", __func__, lo, id);
- mid = BLK_ID(lo);
- spin_lock(&mid->bm_lock);
- list_for_each_entry(dev, &mid->bm_devlist, bm_node) {
- if (memcmp(id->data, dev->bm_mdevid.data,
- NFS4_DEVICEID4_SIZE) == 0) {
- rv = dev->bm_mdev;
- goto out;
- }
- }
- out:
- spin_unlock(&mid->bm_lock);
- dprintk("%s returning %p\n", __func__, rv);
- return rv;
-}
-
-/* Tracks info needed to ensure extents in layout obey constraints of spec */
-struct layout_verification {
- u32 mode; /* R or RW */
- u64 start; /* Expected start of next non-COW extent */
- u64 inval; /* Start of INVAL coverage */
- u64 cowread; /* End of COW read coverage */
-};
-
-/* Verify the extent meets the layout requirements of the pnfs-block draft,
- * section 2.3.1.
- */
-static int verify_extent(struct pnfs_block_extent *be,
- struct layout_verification *lv)
-{
- if (lv->mode == IOMODE_READ) {
- if (be->be_state == PNFS_BLOCK_READWRITE_DATA ||
- be->be_state == PNFS_BLOCK_INVALID_DATA)
- return -EIO;
- if (be->be_f_offset != lv->start)
- return -EIO;
- lv->start += be->be_length;
- return 0;
- }
- /* lv->mode == IOMODE_RW */
- if (be->be_state == PNFS_BLOCK_READWRITE_DATA) {
- if (be->be_f_offset != lv->start)
- return -EIO;
- if (lv->cowread > lv->start)
- return -EIO;
- lv->start += be->be_length;
- lv->inval = lv->start;
- return 0;
- } else if (be->be_state == PNFS_BLOCK_INVALID_DATA) {
- if (be->be_f_offset != lv->start)
- return -EIO;
- lv->start += be->be_length;
- return 0;
- } else if (be->be_state == PNFS_BLOCK_READ_DATA) {
- if (be->be_f_offset > lv->start)
- return -EIO;
- if (be->be_f_offset < lv->inval)
- return -EIO;
- if (be->be_f_offset < lv->cowread)
- return -EIO;
- /* It looks like you might want to min this with lv->start,
- * but you really don't.
- */
- lv->inval = lv->inval + be->be_length;
- lv->cowread = be->be_f_offset + be->be_length;
- return 0;
- } else
- return -EIO;
-}
-
-/* XDR decode pnfs_block_layout4 structure */
-int
-nfs4_blk_process_layoutget(struct pnfs_layout_hdr *lo,
- struct nfs4_layoutget_res *lgr, gfp_t gfp_flags)
-{
- struct pnfs_block_layout *bl = BLK_LO2EXT(lo);
- int i, status = -EIO;
- uint32_t count;
- struct pnfs_block_extent *be = NULL, *save;
- struct xdr_stream stream;
- struct xdr_buf buf;
- struct page *scratch;
- __be32 *p;
- struct layout_verification lv = {
- .mode = lgr->range.iomode,
- .start = lgr->range.offset >> SECTOR_SHIFT,
- .inval = lgr->range.offset >> SECTOR_SHIFT,
- .cowread = lgr->range.offset >> SECTOR_SHIFT,
- };
- LIST_HEAD(extents);
-
- dprintk("---> %s\n", __func__);
-
- scratch = alloc_page(gfp_flags);
- if (!scratch)
- return -ENOMEM;
-
- xdr_init_decode_pages(&stream, &buf, lgr->layoutp->pages, lgr->layoutp->len);
- xdr_set_scratch_buffer(&stream, page_address(scratch), PAGE_SIZE);
-
- p = xdr_inline_decode(&stream, 4);
- if (unlikely(!p))
- goto out_err;
-
- count = be32_to_cpup(p++);
-
- dprintk("%s enter, number of extents %i\n", __func__, count);
- p = xdr_inline_decode(&stream, (28 + NFS4_DEVICEID4_SIZE) * count);
- if (unlikely(!p))
- goto out_err;
-
- /* Decode individual extents, putting them in temporary
- * staging area until whole layout is decoded to make error
- * recovery easier.
- */
- for (i = 0; i < count; i++) {
- be = bl_alloc_extent();
- if (!be) {
- status = -ENOMEM;
- goto out_err;
- }
- memcpy(&be->be_devid, p, NFS4_DEVICEID4_SIZE);
- p += XDR_QUADLEN(NFS4_DEVICEID4_SIZE);
- be->be_mdev = translate_devid(lo, &be->be_devid);
- if (!be->be_mdev)
- goto out_err;
-
- /* The next three values are read in as bytes,
- * but stored as 512-byte sector lengths
- */
- if (decode_sector_number(&p, &be->be_f_offset) < 0)
- goto out_err;
- if (decode_sector_number(&p, &be->be_length) < 0)
- goto out_err;
- if (decode_sector_number(&p, &be->be_v_offset) < 0)
- goto out_err;
- be->be_state = be32_to_cpup(p++);
- if (be->be_state == PNFS_BLOCK_INVALID_DATA)
- be->be_inval = &bl->bl_inval;
- if (verify_extent(be, &lv)) {
- dprintk("%s verify failed\n", __func__);
- goto out_err;
- }
- list_add_tail(&be->be_node, &extents);
- }
- if (lgr->range.offset + lgr->range.length !=
- lv.start << SECTOR_SHIFT) {
- dprintk("%s Final length mismatch\n", __func__);
- be = NULL;
- goto out_err;
- }
- if (lv.start < lv.cowread) {
- dprintk("%s Final uncovered COW extent\n", __func__);
- be = NULL;
- goto out_err;
- }
- /* Extents decoded properly, now try to merge them in to
- * existing layout extents.
- */
- spin_lock(&bl->bl_ext_lock);
- list_for_each_entry_safe(be, save, &extents, be_node) {
- list_del(&be->be_node);
- status = bl_add_merge_extent(bl, be);
- if (status) {
- spin_unlock(&bl->bl_ext_lock);
- /* This is a fairly catastrophic error, as the
- * entire layout extent lists are now corrupted.
- * We should have some way to distinguish this.
- */
- be = NULL;
- goto out_err;
- }
- }
- spin_unlock(&bl->bl_ext_lock);
- status = 0;
- out:
- __free_page(scratch);
- dprintk("%s returns %i\n", __func__, status);
- return status;
-
- out_err:
- bl_put_extent(be);
- while (!list_empty(&extents)) {
- be = list_first_entry(&extents, struct pnfs_block_extent,
- be_node);
- list_del(&be->be_node);
- bl_put_extent(be);
- }
- goto out;
-}
diff --git a/fs/nfs/blocklayout/blocklayoutdm.c b/fs/nfs/blocklayout/blocklayoutdm.c
deleted file mode 100644
index 8999cfddd86..00000000000
--- a/fs/nfs/blocklayout/blocklayoutdm.c
+++ /dev/null
@@ -1,108 +0,0 @@
-/*
- * linux/fs/nfs/blocklayout/blocklayoutdm.c
- *
- * Module for the NFSv4.1 pNFS block layout driver.
- *
- * Copyright (c) 2007 The Regents of the University of Michigan.
- * All rights reserved.
- *
- * Fred Isaman <iisaman@umich.edu>
- * Andy Adamson <andros@citi.umich.edu>
- *
- * permission is granted to use, copy, create derivative works and
- * redistribute this software and such derivative works for any purpose,
- * so long as the name of the university of michigan is not used in
- * any advertising or publicity pertaining to the use or distribution
- * of this software without specific, written prior authorization. if
- * the above copyright notice or any other identification of the
- * university of michigan is included in any copy of any portion of
- * this software, then the disclaimer below must also be included.
- *
- * this software is provided as is, without representation from the
- * university of michigan as to its fitness for any purpose, and without
- * warranty by the university of michigan of any kind, either express
- * or implied, including without limitation the implied warranties of
- * merchantability and fitness for a particular purpose. the regents
- * of the university of michigan shall not be liable for any damages,
- * including special, indirect, incidental, or consequential damages,
- * with respect to any claim arising out or in connection with the use
- * of the software, even if it has been or is hereafter advised of the
- * possibility of such damages.
- */
-
-#include <linux/genhd.h> /* gendisk - used in a dprintk*/
-#include <linux/sched.h>
-#include <linux/hash.h>
-
-#include "blocklayout.h"
-
-#define NFSDBG_FACILITY NFSDBG_PNFS_LD
-
-static void dev_remove(struct net *net, dev_t dev)
-{
- struct bl_pipe_msg bl_pipe_msg;
- struct rpc_pipe_msg *msg = &bl_pipe_msg.msg;
- struct bl_dev_msg bl_umount_request;
- struct bl_msg_hdr bl_msg = {
- .type = BL_DEVICE_UMOUNT,
- .totallen = sizeof(bl_umount_request),
- };
- uint8_t *dataptr;
- DECLARE_WAITQUEUE(wq, current);
- struct nfs_net *nn = net_generic(net, nfs_net_id);
-
- dprintk("Entering %s\n", __func__);
-
- bl_pipe_msg.bl_wq = &nn->bl_wq;
- memset(msg, 0, sizeof(*msg));
- msg->len = sizeof(bl_msg) + bl_msg.totallen;
- msg->data = kzalloc(msg->len, GFP_NOFS);
- if (!msg->data)
- goto out;
-
- memset(&bl_umount_request, 0, sizeof(bl_umount_request));
- bl_umount_request.major = MAJOR(dev);
- bl_umount_request.minor = MINOR(dev);
-
- memcpy(msg->data, &bl_msg, sizeof(bl_msg));
- dataptr = (uint8_t *) msg->data;
- memcpy(&dataptr[sizeof(bl_msg)], &bl_umount_request, sizeof(bl_umount_request));
-
- add_wait_queue(&nn->bl_wq, &wq);
- if (rpc_queue_upcall(nn->bl_device_pipe, msg) < 0) {
- remove_wait_queue(&nn->bl_wq, &wq);
- goto out;
- }
-
- set_current_state(TASK_UNINTERRUPTIBLE);
- schedule();
- __set_current_state(TASK_RUNNING);
- remove_wait_queue(&nn->bl_wq, &wq);
-
-out:
- kfree(msg->data);
-}
-
-/*
- * Release meta device
- */
-static void nfs4_blk_metadev_release(struct pnfs_block_dev *bdev)
-{
- dprintk("%s Releasing\n", __func__);
- nfs4_blkdev_put(bdev->bm_mdev);
- dev_remove(bdev->net, bdev->bm_mdev->bd_dev);
-}
-
-void bl_free_block_dev(struct pnfs_block_dev *bdev)
-{
- if (bdev) {
- if (bdev->bm_mdev) {
- dprintk("%s Removing DM device: %d:%d\n",
- __func__,
- MAJOR(bdev->bm_mdev->bd_dev),
- MINOR(bdev->bm_mdev->bd_dev));
- nfs4_blk_metadev_release(bdev);
- }
- kfree(bdev);
- }
-}
diff --git a/fs/nfs/blocklayout/dev.c b/fs/nfs/blocklayout/dev.c
new file mode 100644
index 00000000000..5aed4f98df4
--- /dev/null
+++ b/fs/nfs/blocklayout/dev.c
@@ -0,0 +1,363 @@
+/*
+ * Copyright (c) 2014 Christoph Hellwig.
+ */
+#include <linux/sunrpc/svc.h>
+#include <linux/blkdev.h>
+#include <linux/nfs4.h>
+#include <linux/nfs_fs.h>
+#include <linux/nfs_xdr.h>
+
+#include "blocklayout.h"
+
+#define NFSDBG_FACILITY NFSDBG_PNFS_LD
+
+/*
+ * Release everything hanging off @dev without freeing @dev itself.
+ *
+ * A device with children (nr_children != 0) is torn down recursively and
+ * its children array is freed; a device without children drops its block
+ * device reference, if it holds a valid one.
+ */
+static void
+bl_free_device(struct pnfs_block_dev *dev)
+{
+	int i;
+
+	if (!dev->nr_children) {
+		/*
+		 * ->bdev may hold an ERR_PTR rather than NULL if opening the
+		 * device failed during parsing; never pass that to
+		 * blkdev_put().
+		 */
+		if (!IS_ERR_OR_NULL(dev->bdev))
+			blkdev_put(dev->bdev, FMODE_READ);
+		return;
+	}
+
+	for (i = 0; i < dev->nr_children; i++)
+		bl_free_device(&dev->children[i]);
+	kfree(dev->children);
+}
+
+/*
+ * Tear down the block device tree that embeds deviceid node @d, then
+ * free the top-level device itself.
+ */
+void
+bl_free_deviceid_node(struct nfs4_deviceid_node *d)
+{
+	struct pnfs_block_dev *top;
+
+	top = container_of(d, struct pnfs_block_dev, node);
+
+	bl_free_device(top);
+	kfree(top);
+}
+
+/*
+ * Decode one volume description from @xdr into @b.
+ *
+ * Every count and length below comes off the wire, so each one is checked
+ * against the array embedded in struct pnfs_block_volume that it indexes
+ * or fills before anything is stored.  ARRAY_SIZE()/sizeof are used so
+ * the limits follow the structure definition; they fail to build if those
+ * members ever stop being arrays.
+ *
+ * Returns 0 on success, or -EIO on a short buffer, an unknown volume type,
+ * or a count/length that does not fit.
+ */
+static int
+nfs4_block_decode_volume(struct xdr_stream *xdr, struct pnfs_block_volume *b)
+{
+	__be32 *p;
+	int i;
+
+	p = xdr_inline_decode(xdr, 4);
+	if (!p)
+		return -EIO;
+	b->type = be32_to_cpup(p++);
+
+	switch (b->type) {
+	case PNFS_BLOCK_VOLUME_SIMPLE:
+		p = xdr_inline_decode(xdr, 4);
+		if (!p)
+			return -EIO;
+		b->simple.nr_sigs = be32_to_cpup(p++);
+		if (!b->simple.nr_sigs) {
+			dprintk("no signature\n");
+			return -EIO;
+		}
+		/*
+		 * The comparison is done in an unsigned type, so a count
+		 * that wrapped negative is rejected here as well.
+		 */
+		if (b->simple.nr_sigs > ARRAY_SIZE(b->simple.sigs)) {
+			dprintk("too many signatures: %d\n",
+				b->simple.nr_sigs);
+			return -EIO;
+		}
+
+		/* running size of the decoded volume: type + nr_sigs so far */
+		b->simple.len = 4 + 4;
+		for (i = 0; i < b->simple.nr_sigs; i++) {
+			p = xdr_inline_decode(xdr, 8 + 4);
+			if (!p)
+				return -EIO;
+			p = xdr_decode_hyper(p, &b->simple.sigs[i].offset);
+			b->simple.sigs[i].sig_len = be32_to_cpup(p++);
+			if (b->simple.sigs[i].sig_len >
+			    sizeof(b->simple.sigs[i].sig)) {
+				dprintk("signature too long: %u\n",
+					b->simple.sigs[i].sig_len);
+				return -EIO;
+			}
+
+			p = xdr_inline_decode(xdr, b->simple.sigs[i].sig_len);
+			if (!p)
+				return -EIO;
+			memcpy(&b->simple.sigs[i].sig, p,
+				b->simple.sigs[i].sig_len);
+
+			b->simple.len += 8 + 4 + b->simple.sigs[i].sig_len;
+		}
+		break;
+	case PNFS_BLOCK_VOLUME_SLICE:
+		p = xdr_inline_decode(xdr, 8 + 8 + 4);
+		if (!p)
+			return -EIO;
+		p = xdr_decode_hyper(p, &b->slice.start);
+		p = xdr_decode_hyper(p, &b->slice.len);
+		b->slice.volume = be32_to_cpup(p++);
+		break;
+	case PNFS_BLOCK_VOLUME_CONCAT:
+		p = xdr_inline_decode(xdr, 4);
+		if (!p)
+			return -EIO;
+		b->concat.volumes_count = be32_to_cpup(p++);
+		if (b->concat.volumes_count > ARRAY_SIZE(b->concat.volumes)) {
+			dprintk("too many concat volumes: %u\n",
+				b->concat.volumes_count);
+			return -EIO;
+		}
+
+		p = xdr_inline_decode(xdr, b->concat.volumes_count * 4);
+		if (!p)
+			return -EIO;
+		for (i = 0; i < b->concat.volumes_count; i++)
+			b->concat.volumes[i] = be32_to_cpup(p++);
+		break;
+	case PNFS_BLOCK_VOLUME_STRIPE:
+		p = xdr_inline_decode(xdr, 8 + 4);
+		if (!p)
+			return -EIO;
+		p = xdr_decode_hyper(p, &b->stripe.chunk_size);
+		b->stripe.volumes_count = be32_to_cpup(p++);
+		if (b->stripe.volumes_count > ARRAY_SIZE(b->stripe.volumes)) {
+			dprintk("too many stripe volumes: %u\n",
+				b->stripe.volumes_count);
+			return -EIO;
+		}
+
+		p = xdr_inline_decode(xdr, b->stripe.volumes_count * 4);
+		if (!p)
+			return -EIO;
+		for (i = 0; i < b->stripe.volumes_count; i++)
+			b->stripe.volumes[i] = be32_to_cpup(p++);
+		break;
+	default:
+		dprintk("unknown volume type!\n");
+		return -EIO;
+	}
+
+	return 0;
+}
+
+/*
+ * ->map method for a device backed directly by a block device: report the
+ * device's own extent and its backing block device.  @offset is not
+ * consulted.  Always returns true.
+ */
+static bool bl_map_simple(struct pnfs_block_dev *dev, u64 offset,
+		struct pnfs_block_dev_map *map)
+{
+	map->bdev = dev->bdev;
+	map->disk_offset = dev->disk_offset;
+	map->len = dev->len;
+	map->start = dev->start;
+
+	return true;
+}
+
+/*
+ * ->map method for a concatenation: find the child whose
+ * [start, start + len) range contains @offset and let it fill in @map,
+ * passing the offset relative to that child's start.
+ *
+ * Returns the child's result, or false if no child covers @offset.
+ */
+static bool bl_map_concat(struct pnfs_block_dev *dev, u64 offset,
+		struct pnfs_block_dev_map *map)
+{
+	int i;
+
+	for (i = 0; i < dev->nr_children; i++) {
+		struct pnfs_block_dev *child = &dev->children[i];
+
+		if (child->start > offset ||
+		    child->start + child->len <= offset)
+			continue;
+
+		/*
+		 * Propagate the child's verdict: reporting success after a
+		 * nested mapping failed would hand the caller a @map that
+		 * was never filled in.
+		 */
+		return child->map(child, offset - child->start, map);
+	}
+
+	dprintk("%s: ran off loop!\n", __func__);
+	return false;
+}
+
+/*
+ * ->map method for a striped device: work out which chunk @offset falls
+ * into, pick the child holding that chunk, and let the child fill in
+ * @map.  The result is then adjusted to describe exactly one chunk.
+ *
+ * Returns false if the chunk index is out of range or the child cannot
+ * map the offset, true otherwise.
+ */
+static bool bl_map_stripe(struct pnfs_block_dev *dev, u64 offset,
+		struct pnfs_block_dev_map *map)
+{
+	struct pnfs_block_dev *child;
+	u64 chunk;
+	u32 chunk_idx;
+	u64 disk_offset;
+
+	chunk = div_u64(offset, dev->chunk_size);
+	div_u64_rem(chunk, dev->nr_children, &chunk_idx);
+
+	/* valid indices are 0 .. nr_children - 1, so reject equality too */
+	if (chunk_idx >= dev->nr_children) {
+		dprintk("%s: invalid chunk idx %d (%lld/%lld)\n",
+			__func__, chunk_idx, offset, dev->chunk_size);
+		/* error, should not happen */
+		return false;
+	}
+
+	/* truncate offset to the beginning of the stripe */
+	offset = chunk * dev->chunk_size;
+
+	/* disk offset of the stripe */
+	disk_offset = div_u64(offset, dev->nr_children);
+
+	child = &dev->children[chunk_idx];
+	if (!child->map(child, disk_offset, map))
+		return false;
+
+	map->start += offset;
+	map->disk_offset += disk_offset;
+	map->len = dev->chunk_size;
+	return true;
+}
+
+static int
+bl_parse_deviceid(struct nfs_server *server, struct pnfs_block_dev *d,
+ struct pnfs_block_volume *volumes, int idx, gfp_t gfp_mask);
+
+
+/*
+ * Set up @d for the simple volume volumes[@idx]: resolve the volume to a
+ * dev_t, open that block device read-only and record its size.
+ *
+ * Returns 0 on success, -EIO if the volume cannot be resolved, or the
+ * error from opening the block device.  On failure d->bdev is left
+ * untouched so that teardown never sees an error pointer there.
+ */
+static int
+bl_parse_simple(struct nfs_server *server, struct pnfs_block_dev *d,
+		struct pnfs_block_volume *volumes, int idx, gfp_t gfp_mask)
+{
+	struct pnfs_block_volume *v = &volumes[idx];
+	struct block_device *bdev;
+	dev_t dev;
+
+	dev = bl_resolve_deviceid(server, v, gfp_mask);
+	if (!dev)
+		return -EIO;
+
+	/* open into a local first; only a valid pointer is published */
+	bdev = blkdev_get_by_dev(dev, FMODE_READ, NULL);
+	if (IS_ERR(bdev)) {
+		printk(KERN_WARNING "pNFS: failed to open device %d:%d (%ld)\n",
+			MAJOR(dev), MINOR(dev), PTR_ERR(bdev));
+		return PTR_ERR(bdev);
+	}
+	d->bdev = bdev;
+
+	d->len = i_size_read(d->bdev->bd_inode);
+	d->map = bl_map_simple;
+
+	printk(KERN_INFO "pNFS: using block device %s\n",
+		d->bdev->bd_disk->disk_name);
+	return 0;
+}
+
+/*
+ * Set up @d for the slice volume volumes[@idx]: parse the volume the
+ * slice refers to into @d, then override the disk offset and length with
+ * the slice's own start and len.
+ *
+ * Returns 0 on success or the error from parsing the underlying volume.
+ */
+static int
+bl_parse_slice(struct nfs_server *server, struct pnfs_block_dev *d,
+		struct pnfs_block_volume *volumes, int idx, gfp_t gfp_mask)
+{
+	struct pnfs_block_volume *slice = &volumes[idx];
+	int err;
+
+	err = bl_parse_deviceid(server, d, volumes, slice->slice.volume,
+				gfp_mask);
+	if (!err) {
+		d->disk_offset = slice->slice.start;
+		d->len = slice->slice.len;
+	}
+
+	return err;
+}
+
+/*
+ * Set up @d for the concat volume volumes[@idx]: allocate one child per
+ * member volume, parse each member, and lay the children out back to
+ * back so that child i starts where child i - 1 ends.
+ *
+ * Returns 0 on success, -ENOMEM if the children array cannot be
+ * allocated, or the error from parsing a member volume.
+ */
+static int
+bl_parse_concat(struct nfs_server *server, struct pnfs_block_dev *d,
+		struct pnfs_block_volume *volumes, int idx, gfp_t gfp_mask)
+{
+	struct pnfs_block_volume *v = &volumes[idx];
+	u64 len = 0;
+	int ret, i;
+
+	/* honour the caller's allocation context, as the children do */
+	d->children = kcalloc(v->concat.volumes_count,
+			sizeof(struct pnfs_block_dev), gfp_mask);
+	if (!d->children)
+		return -ENOMEM;
+
+	for (i = 0; i < v->concat.volumes_count; i++) {
+		ret = bl_parse_deviceid(server, &d->children[i],
+				volumes, v->concat.volumes[i], gfp_mask);
+		if (ret)
+			goto out_err;
+
+		d->nr_children++;
+		d->children[i].start += len;
+		len += d->children[i].len;
+	}
+
+	d->len = len;
+	d->map = bl_map_concat;
+	return 0;
+
+out_err:
+	/*
+	 * bl_free_device() only frees ->children when nr_children is
+	 * non-zero, so if the very first member failed the array would
+	 * otherwise be leaked.
+	 */
+	if (!d->nr_children) {
+		kfree(d->children);
+		d->children = NULL;
+	}
+	return ret;
+}
+
+/*
+ * Set up @d for the stripe volume volumes[@idx]: allocate one child per
+ * member volume, parse each member, and record the chunk size.
+ *
+ * Returns 0 on success, -EIO if the stripe has no members or a zero chunk
+ * size, -ENOMEM if the children array cannot be allocated, or the error
+ * from parsing a member volume.
+ */
+static int
+bl_parse_stripe(struct nfs_server *server, struct pnfs_block_dev *d,
+		struct pnfs_block_volume *volumes, int idx, gfp_t gfp_mask)
+{
+	struct pnfs_block_volume *v = &volumes[idx];
+	u64 len = 0;
+	int ret, i;
+
+	/*
+	 * bl_map_stripe() divides by both the chunk size and the number of
+	 * children; refuse a description that would make either zero.
+	 */
+	if (!v->stripe.volumes_count || !v->stripe.chunk_size) {
+		dprintk("%s: invalid stripe geometry\n", __func__);
+		return -EIO;
+	}
+
+	/* honour the caller's allocation context, as the children do */
+	d->children = kcalloc(v->stripe.volumes_count,
+			sizeof(struct pnfs_block_dev), gfp_mask);
+	if (!d->children)
+		return -ENOMEM;
+
+	for (i = 0; i < v->stripe.volumes_count; i++) {
+		ret = bl_parse_deviceid(server, &d->children[i],
+				volumes, v->stripe.volumes[i], gfp_mask);
+		if (ret)
+			goto out_err;
+
+		d->nr_children++;
+		len += d->children[i].len;
+	}
+
+	d->len = len;
+	d->chunk_size = v->stripe.chunk_size;
+	d->map = bl_map_stripe;
+	return 0;
+
+out_err:
+	/*
+	 * bl_free_device() only frees ->children when nr_children is
+	 * non-zero, so if the very first member failed the array would
+	 * otherwise be leaked.
+	 */
+	if (!d->nr_children) {
+		kfree(d->children);
+		d->children = NULL;
+	}
+	return ret;
+}
+
+/*
+ * Dispatch on the type of volumes[@idx] and let the matching parser fill
+ * in @d.  Returns that parser's result, or -EIO for a type none of them
+ * handles.
+ */
+static int
+bl_parse_deviceid(struct nfs_server *server, struct pnfs_block_dev *d,
+		struct pnfs_block_volume *volumes, int idx, gfp_t gfp_mask)
+{
+	struct pnfs_block_volume *vol = &volumes[idx];
+
+	if (vol->type == PNFS_BLOCK_VOLUME_SIMPLE)
+		return bl_parse_simple(server, d, volumes, idx, gfp_mask);
+	if (vol->type == PNFS_BLOCK_VOLUME_SLICE)
+		return bl_parse_slice(server, d, volumes, idx, gfp_mask);
+	if (vol->type == PNFS_BLOCK_VOLUME_CONCAT)
+		return bl_parse_concat(server, d, volumes, idx, gfp_mask);
+	if (vol->type == PNFS_BLOCK_VOLUME_STRIPE)
+		return bl_parse_stripe(server, d, volumes, idx, gfp_mask);
+
+	dprintk("unsupported volume type: %d\n", volumes[idx].type);
+	return -EIO;
+}
+
+/*
+ * Build a block device tree from the device description in @pdev.
+ *
+ * The description is an array of volumes; all of them are decoded first,
+ * then the tree is built starting from the last one.
+ *
+ * Returns the deviceid node embedded in the new top-level device, or NULL
+ * on any failure (allocation, malformed description, device setup).
+ */
+struct nfs4_deviceid_node *
+bl_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev,
+		gfp_t gfp_mask)
+{
+	struct nfs4_deviceid_node *node = NULL;
+	struct pnfs_block_volume *volumes;
+	struct pnfs_block_dev *top;
+	struct xdr_stream xdr;
+	struct xdr_buf buf;
+	struct page *scratch;
+	int nr_volumes, ret, i;
+	__be32 *p;
+
+	scratch = alloc_page(gfp_mask);
+	if (!scratch)
+		goto out;
+
+	xdr_init_decode_pages(&xdr, &buf, pdev->pages, pdev->pglen);
+	xdr_set_scratch_buffer(&xdr, page_address(scratch), PAGE_SIZE);
+
+	p = xdr_inline_decode(&xdr, sizeof(__be32));
+	if (!p)
+		goto out_free_scratch;
+	nr_volumes = be32_to_cpup(p++);
+
+	/*
+	 * The count comes off the wire.  With zero volumes the root index
+	 * nr_volumes - 1 used below would be -1, so reject it up front
+	 * together with counts that wrapped negative.
+	 */
+	if (nr_volumes <= 0) {
+		dprintk("%s: invalid volume count %d\n", __func__, nr_volumes);
+		goto out_free_scratch;
+	}
+
+	volumes = kcalloc(nr_volumes, sizeof(struct pnfs_block_volume),
+			gfp_mask);
+	if (!volumes)
+		goto out_free_scratch;
+
+	for (i = 0; i < nr_volumes; i++) {
+		ret = nfs4_block_decode_volume(&xdr, &volumes[i]);
+		if (ret < 0)
+			goto out_free_volumes;
+	}
+
+	top = kzalloc(sizeof(*top), gfp_mask);
+	if (!top)
+		goto out_free_volumes;
+
+	/* the last volume in the array is the root of the tree */
+	ret = bl_parse_deviceid(server, top, volumes, nr_volumes - 1, gfp_mask);
+	if (ret) {
+		bl_free_device(top);
+		kfree(top);
+		goto out_free_volumes;
+	}
+
+	node = &top->node;
+	nfs4_init_deviceid_node(node, server, &pdev->dev_id);
+
+out_free_volumes:
+	kfree(volumes);
+out_free_scratch:
+	__free_page(scratch);
+out:
+	return node;
+}
diff --git a/fs/nfs/blocklayout/extent_tree.c b/fs/nfs/blocklayout/extent_tree.c
new file mode 100644
index 00000000000..31d0b5e53df
--- /dev/null
+++ b/fs/nfs/blocklayout/extent_tree.c
@@ -0,0 +1,602 @@
+/*
+ * Copyright (c) 2014 Christoph Hellwig.
+ */
+
+#include <linux/vmalloc.h>
+
+#include "blocklayout.h"
+
+#define NFSDBG_FACILITY NFSDBG_PNFS_LD
+
+/* Convert an rb-tree node back into the extent that embeds it. */
+static inline struct pnfs_block_extent *
+ext_node(struct rb_node *node)
+{
+	return container_of(node, struct pnfs_block_extent, be_node);
+}
+
+/* First extent in @root in tree order, or NULL if the tree is empty. */
+static struct pnfs_block_extent *
+ext_tree_first(struct rb_root *root)
+{
+	struct rb_node *first = rb_first(root);
+
+	if (!first)
+		return NULL;
+	return ext_node(first);
+}
+
+/* Extent preceding @be in tree order, or NULL if @be is the first one. */
+static struct pnfs_block_extent *
+ext_tree_prev(struct pnfs_block_extent *be)
+{
+	struct rb_node *prev = rb_prev(&be->be_node);
+
+	if (!prev)
+		return NULL;
+	return ext_node(prev);
+}
+
+/* Extent following @be in tree order, or NULL if @be is the last one. */
+static struct pnfs_block_extent *
+ext_tree_next(struct pnfs_block_extent *be)
+{
+	struct rb_node *next = rb_next(&be->be_node);
+
+	if (!next)
+		return NULL;
+	return ext_node(next);
+}
+
+/* First file sector past the end of @be (exclusive end). */
+static inline sector_t
+ext_f_end(struct pnfs_block_extent *be)
+{
+	sector_t end = be->be_f_offset;
+
+	end += be->be_length;
+	return end;
+}
+
+/*
+ * Find the extent in @root containing sector @start.  If no extent
+ * contains it, return the extent that follows @start in tree order
+ * instead, or NULL if there is none.
+ */
+static struct pnfs_block_extent *
+__ext_tree_search(struct rb_root *root, sector_t start)
+{
+	struct rb_node *cur = root->rb_node;
+	struct pnfs_block_extent *last = NULL;
+
+	while (cur) {
+		last = ext_node(cur);
+
+		if (start < last->be_f_offset) {
+			cur = cur->rb_left;
+			continue;
+		}
+		if (start >= ext_f_end(last)) {
+			cur = cur->rb_right;
+			continue;
+		}
+		return last;
+	}
+
+	/* empty tree */
+	if (!last)
+		return NULL;
+
+	/* @last is the leaf where the descent stopped */
+	if (start < last->be_f_offset)
+		return last;
+	if (start >= ext_f_end(last))
+		return ext_tree_next(last);
+
+	return NULL;
+}
+
+/*
+ * Decide whether @be2 can be folded into @be1.  That requires the same
+ * state and device, @be2 starting at the file sector where @be1 ends,
+ * matching volume offsets unless the state is PNFS_BLOCK_NONE_DATA, and,
+ * for PNFS_BLOCK_INVALID_DATA extents, the same tag.
+ */
+static bool
+ext_can_merge(struct pnfs_block_extent *be1, struct pnfs_block_extent *be2)
+{
+	if (be1->be_state != be2->be_state ||
+	    be1->be_device != be2->be_device)
+		return false;
+
+	if (be2->be_f_offset != be1->be_f_offset + be1->be_length)
+		return false;
+
+	if (be1->be_state != PNFS_BLOCK_NONE_DATA) {
+		if (be2->be_v_offset != be1->be_v_offset + be1->be_length)
+			return false;
+	}
+
+	if (be1->be_state == PNFS_BLOCK_INVALID_DATA)
+		return be1->be_tag == be2->be_tag;
+
+	return true;
+}
+
+/*
+ * Fold @be into its predecessor in @root if the two can be merged.  On a
+ * merge @be is erased from the tree, its device reference is dropped, it
+ * is freed, and the predecessor is returned; otherwise @be is returned
+ * unchanged.
+ */
+static struct pnfs_block_extent *
+ext_try_to_merge_left(struct rb_root *root, struct pnfs_block_extent *be)
+{
+	struct pnfs_block_extent *prev = ext_tree_prev(be);
+
+	if (!prev || !ext_can_merge(prev, be))
+		return be;
+
+	prev->be_length += be->be_length;
+	rb_erase(&be->be_node, root);
+	nfs4_put_deviceid_node(be->be_device);
+	kfree(be);
+
+	return prev;
+}
+
+/*
+ * Fold the successor of @be in @root into @be if the two can be merged.
+ * On a merge the successor is erased from the tree, its device reference
+ * is dropped and it is freed.  Always returns @be.
+ */
+static struct pnfs_block_extent *
+ext_try_to_merge_right(struct rb_root *root, struct pnfs_block_extent *be)
+{
+	struct pnfs_block_extent *next = ext_tree_next(be);
+
+	if (!next || !ext_can_merge(be, next))
+		return be;
+
+	be->be_length += next->be_length;
+	rb_erase(&next->be_node, root);
+	nfs4_put_deviceid_node(next->be_device);
+	kfree(next);
+
+	return be;
+}
+
+/*
+ * Link @new into @root, which must not already hold an extent containing
+ * new->be_f_offset (that case hits BUG()).
+ *
+ * With @merge_ok set, @new is absorbed into a mergeable extent met on the
+ * way down instead of being linked; in that case @new's device reference
+ * is dropped and @new is freed.
+ */
+static void
+__ext_tree_insert(struct rb_root *root,
+		struct pnfs_block_extent *new, bool merge_ok)
+{
+	struct rb_node **link = &root->rb_node;
+	struct rb_node *parent = NULL;
+
+	while (*link) {
+		struct pnfs_block_extent *cur;
+
+		parent = *link;
+		cur = ext_node(parent);
+
+		if (new->be_f_offset < cur->be_f_offset) {
+			if (merge_ok && ext_can_merge(new, cur)) {
+				/* grow @cur downwards to cover @new */
+				cur->be_f_offset = new->be_f_offset;
+				if (cur->be_state != PNFS_BLOCK_NONE_DATA)
+					cur->be_v_offset = new->be_v_offset;
+				cur->be_length += new->be_length;
+				ext_try_to_merge_left(root, cur);
+				goto absorbed;
+			}
+			link = &parent->rb_left;
+		} else if (new->be_f_offset >= ext_f_end(cur)) {
+			if (merge_ok && ext_can_merge(cur, new)) {
+				/* grow @cur upwards to cover @new */
+				cur->be_length += new->be_length;
+				ext_try_to_merge_right(root, cur);
+				goto absorbed;
+			}
+			link = &parent->rb_right;
+		} else {
+			/* @new starts inside an existing extent */
+			BUG();
+		}
+	}
+
+	rb_link_node(&new->be_node, parent, link);
+	rb_insert_color(&new->be_node, root);
+	return;
+
+absorbed:
+	nfs4_put_deviceid_node(new->be_device);
+	kfree(new);
+}
+/*
+ * Remove the sector range [start, end) from the extent tree at @root.
+ *
+ * Extents lying wholly inside the range are erased and freed, dropping
+ * their device reference; an extent straddling either boundary is
+ * trimmed.  A range that falls strictly inside a single extent splits
+ * it in two, which needs a GFP_ATOMIC allocation.
+ *
+ * Returns 0 on success (including when nothing overlaps the range), or
+ * -ENOMEM if the extent needed for a split cannot be allocated.
+ */
+static int
+__ext_tree_remove(struct rb_root *root, sector_t start, sector_t end)
+{
+ struct pnfs_block_extent *be;
+ sector_t len1 = 0, len2 = 0; /* surviving head / tail of first hit */
+ sector_t orig_v_offset;
+ sector_t orig_len;
+
+ be = __ext_tree_search(root, start); /* containing or following extent */
+ if (!be)
+ return 0;
+ if (be->be_f_offset >= end) /* first candidate starts past the range */
+ return 0;
+
+ orig_v_offset = be->be_v_offset;
+ orig_len = be->be_length;
+
+ if (start > be->be_f_offset)
+ len1 = start - be->be_f_offset;
+ if (ext_f_end(be) > end)
+ len2 = ext_f_end(be) - end;
+
+ if (len2 > 0) { /* range ends inside this extent */
+ if (len1 > 0) { /* ... and starts inside it too: split */
+ struct pnfs_block_extent *new;
+
+ new = kzalloc(sizeof(*new), GFP_ATOMIC);
+ if (!new)
+ return -ENOMEM;
+
+ be->be_length = len1; /* @be keeps the head */
+
+ new->be_f_offset = end; /* @new becomes the tail */
+ if (be->be_state != PNFS_BLOCK_NONE_DATA) {
+ new->be_v_offset =
+ orig_v_offset + orig_len - len2;
+ }
+ new->be_length = len2;
+ new->be_state = be->be_state;
+ new->be_tag = be->be_tag;
+ new->be_device = nfs4_get_deviceid(be->be_device);
+
+ __ext_tree_insert(root, new, true);
+ } else { /* range covers the front: keep only the tail */
+ be->be_f_offset = end;
+ if (be->be_state != PNFS_BLOCK_NONE_DATA) {
+ be->be_v_offset =
+ orig_v_offset + orig_len - len2;
+ }
+ be->be_length = len2;
+ }
+ } else { /* range reaches the end of this extent or beyond */
+ if (len1 > 0) { /* keep the head, continue with the next one */
+ be->be_length = len1;
+ be = ext_tree_next(be);
+ }
+
+ while (be && ext_f_end(be) <= end) { /* wholly covered: drop */
+ struct pnfs_block_extent *next = ext_tree_next(be);
+
+ rb_erase(&be->be_node, root);
+ nfs4_put_deviceid_node(be->be_device);
+ kfree(be);
+ be = next;
+ }
+
+ if (be && be->be_f_offset < end) { /* trim front of last overlap */
+ len1 = ext_f_end(be) - end;
+ be->be_f_offset = end;
+ if (be->be_state != PNFS_BLOCK_NONE_DATA)
+ be->be_v_offset += be->be_length - len1;
+ be->be_length = len1;
+ }
+ }
+
+ return 0;
+}
+
+/*
+ * Insert @new into the extent tree of @bl selected by its state:
+ * READWRITE_DATA and INVALID_DATA extents go into bl_ext_rw, READ_DATA
+ * and NONE_DATA extents into bl_ext_ro.
+ *
+ * Existing extents win over @new: only the parts of @new not already
+ * covered are inserted, trimming or splitting @new as needed.  If @new
+ * is covered completely, its device reference is dropped and it is
+ * freed.
+ *
+ * Returns 0 on success, -EINVAL for an unknown extent state, or -ENOMEM
+ * if the extent needed for a split cannot be allocated.  On error @new
+ * has been neither freed nor linked into the tree by this function.
+ */
+int
+ext_tree_insert(struct pnfs_block_layout *bl, struct pnfs_block_extent *new)
+{
+	struct pnfs_block_extent *be;
+	struct rb_root *root;
+	int err = 0;
+
+	switch (new->be_state) {
+	case PNFS_BLOCK_READWRITE_DATA:
+	case PNFS_BLOCK_INVALID_DATA:
+		root = &bl->bl_ext_rw;
+		break;
+	case PNFS_BLOCK_READ_DATA:
+	case PNFS_BLOCK_NONE_DATA:
+		root = &bl->bl_ext_ro;
+		break;
+	default:
+		dprintk("invalid extent type\n");
+		return -EINVAL;
+	}
+
+	spin_lock(&bl->bl_ext_lock);
+retry:
+	be = __ext_tree_search(root, new->be_f_offset);
+	if (!be || be->be_f_offset >= ext_f_end(new)) {
+		/* no overlap at all */
+		__ext_tree_insert(root, new, true);
+	} else if (new->be_f_offset >= be->be_f_offset) {
+		if (ext_f_end(new) <= ext_f_end(be)) {
+			/* @new is fully covered by @be */
+			nfs4_put_deviceid_node(new->be_device);
+			kfree(new);
+		} else {
+			/* drop the covered front of @new and go again */
+			sector_t new_len = ext_f_end(new) - ext_f_end(be);
+			sector_t diff = new->be_length - new_len;
+
+			new->be_f_offset += diff;
+			new->be_v_offset += diff;
+			new->be_length = new_len;
+			goto retry;
+		}
+	} else if (ext_f_end(new) <= ext_f_end(be)) {
+		/* only the front of @new is uncovered */
+		new->be_length = be->be_f_offset - new->be_f_offset;
+		__ext_tree_insert(root, new, true);
+	} else {
+		/* @new sticks out on both sides of @be */
+		struct pnfs_block_extent *split;
+		sector_t new_len = ext_f_end(new) - ext_f_end(be);
+		sector_t diff = new->be_length - new_len;
+
+		split = kmemdup(new, sizeof(*new), GFP_ATOMIC);
+		if (!split) {
+			/* an allocation failure, not a bad argument */
+			err = -ENOMEM;
+			goto out;
+		}
+
+		/* @split takes the part in front of @be ... */
+		split->be_length = be->be_f_offset - split->be_f_offset;
+		split->be_device = nfs4_get_deviceid(new->be_device);
+		__ext_tree_insert(root, split, true);
+
+		/* ... and @new is reduced to the part behind it */
+		new->be_f_offset += diff;
+		new->be_v_offset += diff;
+		new->be_length = new_len;
+		goto retry;
+	}
+out:
+	spin_unlock(&bl->bl_ext_lock);
+	return err;
+}
+
+/*
+ * Look for the extent in @root that contains sector @isect.  On a hit a
+ * copy of the extent is stored in @ret and true is returned; otherwise
+ * @ret is left alone and false is returned.
+ */
+static bool
+__ext_tree_lookup(struct rb_root *root, sector_t isect,
+		struct pnfs_block_extent *ret)
+{
+	struct rb_node *cur = root->rb_node;
+
+	while (cur) {
+		struct pnfs_block_extent *ext = ext_node(cur);
+
+		if (isect < ext->be_f_offset) {
+			cur = cur->rb_left;
+			continue;
+		}
+		if (isect >= ext_f_end(ext)) {
+			cur = cur->rb_right;
+			continue;
+		}
+
+		*ret = *ext;
+		return true;
+	}
+
+	return false;
+}
+
+/*
+ * Look up the extent of @bl containing sector @isect under bl_ext_lock.
+ * For a read lookup (@rw false) the read-only tree is consulted first;
+ * the read-write tree is consulted whenever that did not produce a hit.
+ * A copy of the extent found is stored in @ret.
+ */
+bool
+ext_tree_lookup(struct pnfs_block_layout *bl, sector_t isect,
+		struct pnfs_block_extent *ret, bool rw)
+{
+	bool hit;
+
+	spin_lock(&bl->bl_ext_lock);
+	hit = !rw && __ext_tree_lookup(&bl->bl_ext_ro, isect, ret);
+	if (!hit)
+		hit = __ext_tree_lookup(&bl->bl_ext_rw, isect, ret);
+	spin_unlock(&bl->bl_ext_lock);
+
+	return hit;
+}
+
+/*
+ * Remove [start, end) from the read-only tree of @bl and, if @rw is set,
+ * from the read-write tree as well, all under bl_ext_lock.  Both removals
+ * are attempted; the first error encountered is the one returned.
+ */
+int ext_tree_remove(struct pnfs_block_layout *bl, bool rw,
+		sector_t start, sector_t end)
+{
+	int ro_err, rw_err = 0;
+
+	spin_lock(&bl->bl_ext_lock);
+	ro_err = __ext_tree_remove(&bl->bl_ext_ro, start, end);
+	if (rw)
+		rw_err = __ext_tree_remove(&bl->bl_ext_rw, start, end);
+	spin_unlock(&bl->bl_ext_lock);
+
+	return ro_err ? ro_err : rw_err;
+}
+
+/*
+ * Split @be at file sector @split: @be keeps the sectors in front of
+ * @split, and a newly allocated extent with the same state, tag and
+ * device (taking its own device reference) is inserted for the rest,
+ * without merging.
+ *
+ * Returns 0 on success, or -ENOMEM if the new extent cannot be
+ * allocated, in which case @be is left unchanged.
+ */
+static int
+ext_tree_split(struct rb_root *root, struct pnfs_block_extent *be,
+		sector_t split)
+{
+	sector_t head_len = split - be->be_f_offset;
+	sector_t tail_len = be->be_length - head_len;
+	struct pnfs_block_extent *tail;
+
+	tail = kzalloc(sizeof(*tail), GFP_ATOMIC);
+	if (!tail)
+		return -ENOMEM;
+
+	be->be_length = head_len;
+
+	tail->be_f_offset = split;
+	tail->be_length = tail_len;
+	if (be->be_state != PNFS_BLOCK_NONE_DATA)
+		tail->be_v_offset = be->be_v_offset + head_len;
+	tail->be_state = be->be_state;
+	tail->be_tag = be->be_tag;
+	tail->be_device = nfs4_get_deviceid(be->be_device);
+
+	__ext_tree_insert(root, tail, false);
+	return 0;
+}
+
+/*
+ * Record that sectors [@start, @start + @len) of layout @bl have been
+ * written.
+ *
+ * Everything in that range is first dropped from the read-only tree.  Then
+ * every untagged PNFS_BLOCK_INVALID_DATA extent of the read-write tree that
+ * overlaps the range is trimmed to the range (by shifting the boundary into
+ * a mergeable neighbour, or by splitting) and tagged EXTENT_WRITTEN.
+ *
+ * Runs under bl_ext_lock.  Returns 0 on success or the error reported by
+ * __ext_tree_remove() / ext_tree_split().
+ */
+int
+ext_tree_mark_written(struct pnfs_block_layout *bl, sector_t start,
+		sector_t len)
+{
+	struct rb_root *root = &bl->bl_ext_rw;
+	sector_t end = start + len;
+	struct pnfs_block_extent *be;
+	int err = 0;
+
+	spin_lock(&bl->bl_ext_lock);
+	/*
+	 * First remove all COW extents or holes from written to range.
+	 */
+	err = __ext_tree_remove(&bl->bl_ext_ro, start, end);
+	if (err)
+		goto out;
+
+	/*
+	 * Then mark all invalid extents in the range as written to.
+	 */
+	for (be = __ext_tree_search(root, start); be; be = ext_tree_next(be)) {
+		if (be->be_f_offset >= end)
+			break;
+
+		/* only untagged invalid extents are candidates */
+		if (be->be_state != PNFS_BLOCK_INVALID_DATA || be->be_tag)
+			continue;
+
+		if (be->be_f_offset < start) {
+			struct pnfs_block_extent *left = ext_tree_prev(be);
+
+			if (left && ext_can_merge(left, be)) {
+				/* hand the unwritten head over to @left */
+				sector_t diff = start - be->be_f_offset;
+
+				left->be_length += diff;
+
+				be->be_f_offset += diff;
+				be->be_v_offset += diff;
+				be->be_length -= diff;
+			} else {
+				/*
+				 * @be keeps the head; the part starting at
+				 * @start becomes a new extent that the loop
+				 * visits on its next iteration.
+				 */
+				err = ext_tree_split(root, be, start);
+				if (err)
+					goto out;
+			}
+		}
+
+		if (ext_f_end(be) > end) {
+			struct pnfs_block_extent *right = ext_tree_next(be);
+
+			if (right && ext_can_merge(be, right)) {
+				/*
+				 * Hand the unwritten tail [end, ext_f_end(be))
+				 * over to @right so that @be ends exactly at
+				 * @end.  The amount moved is the tail length,
+				 * not the length that stays in @be.
+				 */
+				sector_t diff = ext_f_end(be) - end;
+
+				be->be_length -= diff;
+
+				right->be_f_offset -= diff;
+				right->be_v_offset -= diff;
+				right->be_length += diff;
+			} else {
+				err = ext_tree_split(root, be, end);
+				if (err)
+					goto out;
+			}
+		}
+
+		/* @be now lies fully inside the written range: tag and merge */
+		if (be->be_f_offset >= start && ext_f_end(be) <= end) {
+			be->be_tag = EXTENT_WRITTEN;
+			be = ext_try_to_merge_left(root, be);
+			be = ext_try_to_merge_right(root, be);
+		}
+	}
+out:
+	spin_unlock(&bl->bl_ext_lock);
+	return err;
+}
+
+/*
+ * Drop the page references held for a layoutupdate buffer of @buffer_size
+ * bytes.
+ *
+ * When @arg uses its embedded single page, only that page is put.
+ * Otherwise every entry of the separately allocated page array covering
+ * @buffer_size is put and the array itself is freed.
+ *
+ * NOTE(review): nothing here releases the vmalloc'ed buffer the page array
+ * is built from in ext_tree_prepare_commit() - confirm whether that memory
+ * is freed elsewhere.
+ */
+static void ext_tree_free_commitdata(struct nfs4_layoutcommit_args *arg,
+		size_t buffer_size)
+{
+	int nr_pages, idx;
+
+	if (arg->layoutupdate_pages == &arg->layoutupdate_page) {
+		put_page(arg->layoutupdate_page);
+		return;
+	}
+
+	nr_pages = DIV_ROUND_UP(buffer_size, PAGE_SIZE);
+	for (idx = 0; idx < nr_pages; idx++)
+		put_page(arg->layoutupdate_pages[idx]);
+	kfree(arg->layoutupdate_pages);
+}
+
+/*
+ * XDR-encode every written-but-uncommitted extent of @bl (state
+ * PNFS_BLOCK_INVALID_DATA, tag EXTENT_WRITTEN) into the buffer at @p and
+ * tag each encoded extent EXTENT_COMMITTING.
+ *
+ * @p:           first extent slot; the caller passes the address just past
+ *               the leading __be32 count word of its buffer
+ * @buffer_size: size in bytes of the caller's whole buffer, count word
+ *               included
+ * @count:       incremented once per matching extent, whether or not the
+ *               extents fit
+ *
+ * Returns 0 when all matching extents were encoded.  Returns -ENOSPC when
+ * they do not fit; in that case nothing is encoded and no extent is
+ * retagged, so the caller can retry with a buffer sized from *@count and
+ * still see every extent.  (Tagging a partial prefix as COMMITTING would
+ * make the retry skip those extents.)
+ *
+ * Counting and encoding happen under a single hold of bl_ext_lock, so both
+ * passes observe the same set of extents.
+ */
+static int ext_tree_encode_commit(struct pnfs_block_layout *bl, __be32 *p,
+		size_t buffer_size, size_t *count)
+{
+	struct pnfs_block_extent *be;
+	int ret = 0;
+
+	spin_lock(&bl->bl_ext_lock);
+
+	/* Pass 1: find out how many extents need to be sent. */
+	for (be = ext_tree_first(&bl->bl_ext_rw); be; be = ext_tree_next(be)) {
+		if (be->be_state != PNFS_BLOCK_INVALID_DATA ||
+		    be->be_tag != EXTENT_WRITTEN)
+			continue;
+
+		(*count)++;
+	}
+
+	/* The count word in front of @p shares the buffer with the extents. */
+	if (sizeof(__be32) + *count * BL_EXTENT_SIZE > buffer_size) {
+		ret = -ENOSPC;
+		goto out;
+	}
+
+	/* Pass 2: everything fits, encode and mark as being committed. */
+	for (be = ext_tree_first(&bl->bl_ext_rw); be; be = ext_tree_next(be)) {
+		if (be->be_state != PNFS_BLOCK_INVALID_DATA ||
+		    be->be_tag != EXTENT_WRITTEN)
+			continue;
+
+		p = xdr_encode_opaque_fixed(p, be->be_device->deviceid.data,
+				NFS4_DEVICEID4_SIZE);
+		p = xdr_encode_hyper(p, be->be_f_offset << SECTOR_SHIFT);
+		p = xdr_encode_hyper(p, be->be_length << SECTOR_SHIFT);
+		p = xdr_encode_hyper(p, 0LL);
+		*p++ = cpu_to_be32(PNFS_BLOCK_READWRITE_DATA);
+
+		be->be_tag = EXTENT_COMMITTING;
+	}
+out:
+	spin_unlock(&bl->bl_ext_lock);
+
+	return ret;
+}
+
+/*
+ * Build the layoutupdate payload for a LAYOUTCOMMIT described by @arg.
+ *
+ * The payload is a __be32 extent count followed by BL_EXTENT_SIZE bytes per
+ * extent.  A single page is tried first; if ext_tree_encode_commit()
+ * reports that the extents do not fit, the buffer is replaced by a
+ * vmalloc'ed one sized from the reported count and the encode is retried.
+ *
+ * On success arg->layoutupdate_pages, arg->layoutupdate_len and (for the
+ * single-page case) arg->layoutupdate_page describe the payload and 0 is
+ * returned.  Returns -ENOMEM if any allocation fails.
+ */
+int
+ext_tree_prepare_commit(struct nfs4_layoutcommit_args *arg)
+{
+	struct pnfs_block_layout *bl = BLK_LO2EXT(NFS_I(arg->inode)->layout);
+	size_t count = 0, buffer_size = PAGE_SIZE;
+	__be32 *start_p;
+	int ret;
+
+	dprintk("%s enter\n", __func__);
+
+	arg->layoutupdate_page = alloc_page(GFP_NOFS);
+	if (!arg->layoutupdate_page)
+		return -ENOMEM;
+	start_p = page_address(arg->layoutupdate_page);
+	arg->layoutupdate_pages = &arg->layoutupdate_page;
+
+retry:
+	/* slot 0 is reserved for the extent count written below */
+	ret = ext_tree_encode_commit(bl, start_p + 1, buffer_size, &count);
+	if (unlikely(ret)) {
+		if (arg->layoutupdate_pages == &arg->layoutupdate_page) {
+			/* first attempt: give back the single page */
+			ext_tree_free_commitdata(arg, buffer_size);
+		} else {
+			/*
+			 * A previous retry was still too small.  The page
+			 * array has not been populated yet, so it must not
+			 * go through ext_tree_free_commitdata(); release the
+			 * vmalloc'ed buffer and the array directly.
+			 */
+			vfree(start_p);
+			kfree(arg->layoutupdate_pages);
+		}
+
+		buffer_size = sizeof(__be32) + BL_EXTENT_SIZE * count;
+		count = 0;
+
+		arg->layoutupdate_pages =
+			kcalloc(DIV_ROUND_UP(buffer_size, PAGE_SIZE),
+				sizeof(struct page *), GFP_NOFS);
+		if (!arg->layoutupdate_pages)
+			return -ENOMEM;
+
+		start_p = __vmalloc(buffer_size, GFP_NOFS, PAGE_KERNEL);
+		if (!start_p) {
+			kfree(arg->layoutupdate_pages);
+			return -ENOMEM;
+		}
+
+		goto retry;
+	}
+
+	*start_p = cpu_to_be32(count);
+	arg->layoutupdate_len = sizeof(__be32) + BL_EXTENT_SIZE * count;
+
+	if (unlikely(arg->layoutupdate_pages != &arg->layoutupdate_page)) {
+		/*
+		 * Walk the buffer in bytes: layoutupdate_len and PAGE_SIZE
+		 * are byte quantities, so stepping a __be32 pointer by them
+		 * would stride four pages at a time and run past the buffer.
+		 */
+		char *pos = (char *)start_p;
+		char *last = pos + arg->layoutupdate_len;
+		int i = 0;
+
+		for ( ; pos < last; pos += PAGE_SIZE)
+			arg->layoutupdate_pages[i++] = vmalloc_to_page(pos);
+	}
+
+	dprintk("%s found %zu ranges\n", __func__, count);
+	return 0;
+}
+
+/*
+ * Finish a LAYOUTCOMMIT whose payload is described by @arg.
+ *
+ * The layoutupdate pages are released, then every extent of the read-write
+ * tree in state PNFS_BLOCK_INVALID_DATA with tag EXTENT_COMMITTING is
+ * updated under bl_ext_lock:
+ *  - @status != 0: the tag goes back to EXTENT_WRITTEN so the extent is
+ *    picked up by a later commit;
+ *  - @status == 0: the extent becomes PNFS_BLOCK_READWRITE_DATA and its tag
+ *    is cleared.
+ * Each updated extent is then offered for merging with its neighbours.
+ */
+void
+ext_tree_mark_committed(struct nfs4_layoutcommit_args *arg, int status)
+{
+	struct pnfs_block_layout *bl = BLK_LO2EXT(NFS_I(arg->inode)->layout);
+	struct rb_root *root = &bl->bl_ext_rw;
+	struct pnfs_block_extent *ext;
+
+	dprintk("%s status %d\n", __func__, status);
+
+	ext_tree_free_commitdata(arg, arg->layoutupdate_len);
+
+	spin_lock(&bl->bl_ext_lock);
+	ext = ext_tree_first(root);
+	while (ext) {
+		if (ext->be_state == PNFS_BLOCK_INVALID_DATA &&
+		    ext->be_tag == EXTENT_COMMITTING) {
+			if (!status) {
+				ext->be_state = PNFS_BLOCK_READWRITE_DATA;
+				ext->be_tag = 0;
+			} else {
+				/*
+				 * Mark as written and try again.
+				 *
+				 * XXX: some real error handling here wouldn't hurt..
+				 */
+				ext->be_tag = EXTENT_WRITTEN;
+			}
+
+			ext = ext_try_to_merge_left(root, ext);
+			ext = ext_try_to_merge_right(root, ext);
+		}
+
+		ext = ext_tree_next(ext);
+	}
+	spin_unlock(&bl->bl_ext_lock);
+}
diff --git a/fs/nfs/blocklayout/extents.c b/fs/nfs/blocklayout/extents.c
deleted file mode 100644
index 4d016144256..00000000000
--- a/fs/nfs/blocklayout/extents.c
+++ /dev/null
@@ -1,908 +0,0 @@
-/*
- * linux/fs/nfs/blocklayout/blocklayout.h
- *
- * Module for the NFSv4.1 pNFS block layout driver.
- *
- * Copyright (c) 2006 The Regents of the University of Michigan.
- * All rights reserved.
- *
- * Andy Adamson <andros@citi.umich.edu>
- * Fred Isaman <iisaman@umich.edu>
- *
- * permission is granted to use, copy, create derivative works and
- * redistribute this software and such derivative works for any purpose,
- * so long as the name of the university of michigan is not used in
- * any advertising or publicity pertaining to the use or distribution
- * of this software without specific, written prior authorization. if
- * the above copyright notice or any other identification of the
- * university of michigan is included in any copy of any portion of
- * this software, then the disclaimer below must also be included.
- *
- * this software is provided as is, without representation from the
- * university of michigan as to its fitness for any purpose, and without
- * warranty by the university of michigan of any kind, either express
- * or implied, including without limitation the implied warranties of
- * merchantability and fitness for a particular purpose. the regents
- * of the university of michigan shall not be liable for any damages,
- * including special, indirect, incidental, or consequential damages,
- * with respect to any claim arising out or in connection with the use
- * of the software, even if it has been or is hereafter advised of the
- * possibility of such damages.
- */
-
-#include "blocklayout.h"
-#define NFSDBG_FACILITY NFSDBG_PNFS_LD
-
-/* Bit numbers */
-#define EXTENT_INITIALIZED 0
-#define EXTENT_WRITTEN 1
-#define EXTENT_IN_COMMIT 2
-#define INTERNAL_EXISTS MY_MAX_TAGS
-#define INTERNAL_MASK ((1 << INTERNAL_EXISTS) - 1)
-
-/* Returns largest t<=s s.t. t%base==0 */
-static inline sector_t normalize(sector_t s, int base)
-{
- sector_t tmp = s; /* Since do_div modifies its argument */
- return s - sector_div(tmp, base);
-}
-
-static inline sector_t normalize_up(sector_t s, int base)
-{
- return normalize(s + base - 1, base);
-}
-
-/* Complete stub using list while determine API wanted */
-
-/* Returns tags, or negative */
-static int32_t _find_entry(struct my_tree *tree, u64 s)
-{
- struct pnfs_inval_tracking *pos;
-
- dprintk("%s(%llu) enter\n", __func__, s);
- list_for_each_entry_reverse(pos, &tree->mtt_stub, it_link) {
- if (pos->it_sector > s)
- continue;
- else if (pos->it_sector == s)
- return pos->it_tags & INTERNAL_MASK;
- else
- break;
- }
- return -ENOENT;
-}
-
-static inline
-int _has_tag(struct my_tree *tree, u64 s, int32_t tag)
-{
- int32_t tags;
-
- dprintk("%s(%llu, %i) enter\n", __func__, s, tag);
- s = normalize(s, tree->mtt_step_size);
- tags = _find_entry(tree, s);
- if ((tags < 0) || !(tags & (1 << tag)))
- return 0;
- else
- return 1;
-}
-
-/* Creates entry with tag, or if entry already exists, unions tag to it.
- * If storage is not NULL, newly created entry will use it.
- * Returns number of entries added, or negative on error.
- */
-static int _add_entry(struct my_tree *tree, u64 s, int32_t tag,
- struct pnfs_inval_tracking *storage)
-{
- int found = 0;
- struct pnfs_inval_tracking *pos;
-
- dprintk("%s(%llu, %i, %p) enter\n", __func__, s, tag, storage);
- list_for_each_entry_reverse(pos, &tree->mtt_stub, it_link) {
- if (pos->it_sector > s)
- continue;
- else if (pos->it_sector == s) {
- found = 1;
- break;
- } else
- break;
- }
- if (found) {
- pos->it_tags |= (1 << tag);
- return 0;
- } else {
- struct pnfs_inval_tracking *new;
- new = storage;
- new->it_sector = s;
- new->it_tags = (1 << tag);
- list_add(&new->it_link, &pos->it_link);
- return 1;
- }
-}
-
-/* XXXX Really want option to not create */
-/* Over range, unions tag with existing entries, else creates entry with tag */
-static int _set_range(struct my_tree *tree, int32_t tag, u64 s, u64 length)
-{
- u64 i;
-
- dprintk("%s(%i, %llu, %llu) enter\n", __func__, tag, s, length);
- for (i = normalize(s, tree->mtt_step_size); i < s + length;
- i += tree->mtt_step_size)
- if (_add_entry(tree, i, tag, NULL))
- return -ENOMEM;
- return 0;
-}
-
-/* Ensure that future operations on given range of tree will not malloc */
-static int _preload_range(struct pnfs_inval_markings *marks,
- u64 offset, u64 length)
-{
- u64 start, end, s;
- int count, i, used = 0, status = -ENOMEM;
- struct pnfs_inval_tracking **storage;
- struct my_tree *tree = &marks->im_tree;
-
- dprintk("%s(%llu, %llu) enter\n", __func__, offset, length);
- start = normalize(offset, tree->mtt_step_size);
- end = normalize_up(offset + length, tree->mtt_step_size);
- count = (int)(end - start) / (int)tree->mtt_step_size;
-
- /* Pre-malloc what memory we might need */
- storage = kcalloc(count, sizeof(*storage), GFP_NOFS);
- if (!storage)
- return -ENOMEM;
- for (i = 0; i < count; i++) {
- storage[i] = kmalloc(sizeof(struct pnfs_inval_tracking),
- GFP_NOFS);
- if (!storage[i])
- goto out_cleanup;
- }
-
- spin_lock_bh(&marks->im_lock);
- for (s = start; s < end; s += tree->mtt_step_size)
- used += _add_entry(tree, s, INTERNAL_EXISTS, storage[used]);
- spin_unlock_bh(&marks->im_lock);
-
- status = 0;
-
- out_cleanup:
- for (i = used; i < count; i++) {
- if (!storage[i])
- break;
- kfree(storage[i]);
- }
- kfree(storage);
- return status;
-}
-
-/* We are relying on page lock to serialize this */
-int bl_is_sector_init(struct pnfs_inval_markings *marks, sector_t isect)
-{
- int rv;
-
- spin_lock_bh(&marks->im_lock);
- rv = _has_tag(&marks->im_tree, isect, EXTENT_INITIALIZED);
- spin_unlock_bh(&marks->im_lock);
- return rv;
-}
-
-/* Assume start, end already sector aligned */
-static int
-_range_has_tag(struct my_tree *tree, u64 start, u64 end, int32_t tag)
-{
- struct pnfs_inval_tracking *pos;
- u64 expect = 0;
-
- dprintk("%s(%llu, %llu, %i) enter\n", __func__, start, end, tag);
- list_for_each_entry_reverse(pos, &tree->mtt_stub, it_link) {
- if (pos->it_sector >= end)
- continue;
- if (!expect) {
- if ((pos->it_sector == end - tree->mtt_step_size) &&
- (pos->it_tags & (1 << tag))) {
- expect = pos->it_sector - tree->mtt_step_size;
- if (pos->it_sector < tree->mtt_step_size || expect < start)
- return 1;
- continue;
- } else {
- return 0;
- }
- }
- if (pos->it_sector != expect || !(pos->it_tags & (1 << tag)))
- return 0;
- expect -= tree->mtt_step_size;
- if (expect < start)
- return 1;
- }
- return 0;
-}
-
-static int is_range_written(struct pnfs_inval_markings *marks,
- sector_t start, sector_t end)
-{
- int rv;
-
- spin_lock_bh(&marks->im_lock);
- rv = _range_has_tag(&marks->im_tree, start, end, EXTENT_WRITTEN);
- spin_unlock_bh(&marks->im_lock);
- return rv;
-}
-
-/* Marks sectors in [offest, offset_length) as having been initialized.
- * All lengths are step-aligned, where step is min(pagesize, blocksize).
- * Currently assumes offset is page-aligned
- */
-int bl_mark_sectors_init(struct pnfs_inval_markings *marks,
- sector_t offset, sector_t length)
-{
- sector_t start, end;
-
- dprintk("%s(offset=%llu,len=%llu) enter\n",
- __func__, (u64)offset, (u64)length);
-
- start = normalize(offset, marks->im_block_size);
- end = normalize_up(offset + length, marks->im_block_size);
- if (_preload_range(marks, start, end - start))
- goto outerr;
-
- spin_lock_bh(&marks->im_lock);
- if (_set_range(&marks->im_tree, EXTENT_INITIALIZED, offset, length))
- goto out_unlock;
- spin_unlock_bh(&marks->im_lock);
-
- return 0;
-
-out_unlock:
- spin_unlock_bh(&marks->im_lock);
-outerr:
- return -ENOMEM;
-}
-
-/* Marks sectors in [offest, offset+length) as having been written to disk.
- * All lengths should be block aligned.
- */
-static int mark_written_sectors(struct pnfs_inval_markings *marks,
- sector_t offset, sector_t length)
-{
- int status;
-
- dprintk("%s(offset=%llu,len=%llu) enter\n", __func__,
- (u64)offset, (u64)length);
- spin_lock_bh(&marks->im_lock);
- status = _set_range(&marks->im_tree, EXTENT_WRITTEN, offset, length);
- spin_unlock_bh(&marks->im_lock);
- return status;
-}
-
-static void print_short_extent(struct pnfs_block_short_extent *be)
-{
- dprintk("PRINT SHORT EXTENT extent %p\n", be);
- if (be) {
- dprintk(" be_f_offset %llu\n", (u64)be->bse_f_offset);
- dprintk(" be_length %llu\n", (u64)be->bse_length);
- }
-}
-
-static void print_clist(struct list_head *list, unsigned int count)
-{
- struct pnfs_block_short_extent *be;
- unsigned int i = 0;
-
- ifdebug(FACILITY) {
- printk(KERN_DEBUG "****************\n");
- printk(KERN_DEBUG "Extent list looks like:\n");
- list_for_each_entry(be, list, bse_node) {
- i++;
- print_short_extent(be);
- }
- if (i != count)
- printk(KERN_DEBUG "\n\nExpected %u entries\n\n\n", count);
- printk(KERN_DEBUG "****************\n");
- }
-}
-
-/* Note: In theory, we should do more checking that devid's match between
- * old and new, but if they don't, the lists are too corrupt to salvage anyway.
- */
-/* Note this is very similar to bl_add_merge_extent */
-static void add_to_commitlist(struct pnfs_block_layout *bl,
- struct pnfs_block_short_extent *new)
-{
- struct list_head *clist = &bl->bl_commit;
- struct pnfs_block_short_extent *old, *save;
- sector_t end = new->bse_f_offset + new->bse_length;
-
- dprintk("%s enter\n", __func__);
- print_short_extent(new);
- print_clist(clist, bl->bl_count);
- bl->bl_count++;
- /* Scan for proper place to insert, extending new to the left
- * as much as possible.
- */
- list_for_each_entry_safe(old, save, clist, bse_node) {
- if (new->bse_f_offset < old->bse_f_offset)
- break;
- if (end <= old->bse_f_offset + old->bse_length) {
- /* Range is already in list */
- bl->bl_count--;
- kfree(new);
- return;
- } else if (new->bse_f_offset <=
- old->bse_f_offset + old->bse_length) {
- /* new overlaps or abuts existing be */
- if (new->bse_mdev == old->bse_mdev) {
- /* extend new to fully replace old */
- new->bse_length += new->bse_f_offset -
- old->bse_f_offset;
- new->bse_f_offset = old->bse_f_offset;
- list_del(&old->bse_node);
- bl->bl_count--;
- kfree(old);
- }
- }
- }
- /* Note that if we never hit the above break, old will not point to a
- * valid extent. However, in that case &old->bse_node==list.
- */
- list_add_tail(&new->bse_node, &old->bse_node);
- /* Scan forward for overlaps. If we find any, extend new and
- * remove the overlapped extent.
- */
- old = list_prepare_entry(new, clist, bse_node);
- list_for_each_entry_safe_continue(old, save, clist, bse_node) {
- if (end < old->bse_f_offset)
- break;
- /* new overlaps or abuts old */
- if (new->bse_mdev == old->bse_mdev) {
- if (end < old->bse_f_offset + old->bse_length) {
- /* extend new to fully cover old */
- end = old->bse_f_offset + old->bse_length;
- new->bse_length = end - new->bse_f_offset;
- }
- list_del(&old->bse_node);
- bl->bl_count--;
- kfree(old);
- }
- }
- dprintk("%s: after merging\n", __func__);
- print_clist(clist, bl->bl_count);
-}
-
-/* Note the range described by offset, length is guaranteed to be contained
- * within be.
- * new will be freed, either by this function or add_to_commitlist if they
- * decide not to use it, or after LAYOUTCOMMIT uses it in the commitlist.
- */
-int bl_mark_for_commit(struct pnfs_block_extent *be,
- sector_t offset, sector_t length,
- struct pnfs_block_short_extent *new)
-{
- sector_t new_end, end = offset + length;
- struct pnfs_block_layout *bl = container_of(be->be_inval,
- struct pnfs_block_layout,
- bl_inval);
-
- mark_written_sectors(be->be_inval, offset, length);
- /* We want to add the range to commit list, but it must be
- * block-normalized, and verified that the normalized range has
- * been entirely written to disk.
- */
- new->bse_f_offset = offset;
- offset = normalize(offset, bl->bl_blocksize);
- if (offset < new->bse_f_offset) {
- if (is_range_written(be->be_inval, offset, new->bse_f_offset))
- new->bse_f_offset = offset;
- else
- new->bse_f_offset = offset + bl->bl_blocksize;
- }
- new_end = normalize_up(end, bl->bl_blocksize);
- if (end < new_end) {
- if (is_range_written(be->be_inval, end, new_end))
- end = new_end;
- else
- end = new_end - bl->bl_blocksize;
- }
- if (end <= new->bse_f_offset) {
- kfree(new);
- return 0;
- }
- new->bse_length = end - new->bse_f_offset;
- new->bse_devid = be->be_devid;
- new->bse_mdev = be->be_mdev;
-
- spin_lock(&bl->bl_ext_lock);
- add_to_commitlist(bl, new);
- spin_unlock(&bl->bl_ext_lock);
- return 0;
-}
-
-static void print_bl_extent(struct pnfs_block_extent *be)
-{
- dprintk("PRINT EXTENT extent %p\n", be);
- if (be) {
- dprintk(" be_f_offset %llu\n", (u64)be->be_f_offset);
- dprintk(" be_length %llu\n", (u64)be->be_length);
- dprintk(" be_v_offset %llu\n", (u64)be->be_v_offset);
- dprintk(" be_state %d\n", be->be_state);
- }
-}
-
-static void
-destroy_extent(struct kref *kref)
-{
- struct pnfs_block_extent *be;
-
- be = container_of(kref, struct pnfs_block_extent, be_refcnt);
- dprintk("%s be=%p\n", __func__, be);
- kfree(be);
-}
-
-void
-bl_put_extent(struct pnfs_block_extent *be)
-{
- if (be) {
- dprintk("%s enter %p (%i)\n", __func__, be,
- atomic_read(&be->be_refcnt.refcount));
- kref_put(&be->be_refcnt, destroy_extent);
- }
-}
-
-struct pnfs_block_extent *bl_alloc_extent(void)
-{
- struct pnfs_block_extent *be;
-
- be = kmalloc(sizeof(struct pnfs_block_extent), GFP_NOFS);
- if (!be)
- return NULL;
- INIT_LIST_HEAD(&be->be_node);
- kref_init(&be->be_refcnt);
- be->be_inval = NULL;
- return be;
-}
-
-static void print_elist(struct list_head *list)
-{
- struct pnfs_block_extent *be;
- dprintk("****************\n");
- dprintk("Extent list looks like:\n");
- list_for_each_entry(be, list, be_node) {
- print_bl_extent(be);
- }
- dprintk("****************\n");
-}
-
-static inline int
-extents_consistent(struct pnfs_block_extent *old, struct pnfs_block_extent *new)
-{
- /* Note this assumes new->be_f_offset >= old->be_f_offset */
- return (new->be_state == old->be_state) &&
- ((new->be_state == PNFS_BLOCK_NONE_DATA) ||
- ((new->be_v_offset - old->be_v_offset ==
- new->be_f_offset - old->be_f_offset) &&
- new->be_mdev == old->be_mdev));
-}
-
-/* Adds new to appropriate list in bl, modifying new and removing existing
- * extents as appropriate to deal with overlaps.
- *
- * See bl_find_get_extent for list constraints.
- *
- * Refcount on new is already set. If end up not using it, or error out,
- * need to put the reference.
- *
- * bl->bl_ext_lock is held by caller.
- */
-int
-bl_add_merge_extent(struct pnfs_block_layout *bl,
- struct pnfs_block_extent *new)
-{
- struct pnfs_block_extent *be, *tmp;
- sector_t end = new->be_f_offset + new->be_length;
- struct list_head *list;
-
- dprintk("%s enter with be=%p\n", __func__, new);
- print_bl_extent(new);
- list = &bl->bl_extents[bl_choose_list(new->be_state)];
- print_elist(list);
-
- /* Scan for proper place to insert, extending new to the left
- * as much as possible.
- */
- list_for_each_entry_safe_reverse(be, tmp, list, be_node) {
- if (new->be_f_offset >= be->be_f_offset + be->be_length)
- break;
- if (new->be_f_offset >= be->be_f_offset) {
- if (end <= be->be_f_offset + be->be_length) {
- /* new is a subset of existing be*/
- if (extents_consistent(be, new)) {
- dprintk("%s: new is subset, ignoring\n",
- __func__);
- bl_put_extent(new);
- return 0;
- } else {
- goto out_err;
- }
- } else {
- /* |<-- be -->|
- * |<-- new -->| */
- if (extents_consistent(be, new)) {
- /* extend new to fully replace be */
- new->be_length += new->be_f_offset -
- be->be_f_offset;
- new->be_f_offset = be->be_f_offset;
- new->be_v_offset = be->be_v_offset;
- dprintk("%s: removing %p\n", __func__, be);
- list_del(&be->be_node);
- bl_put_extent(be);
- } else {
- goto out_err;
- }
- }
- } else if (end >= be->be_f_offset + be->be_length) {
- /* new extent overlap existing be */
- if (extents_consistent(be, new)) {
- /* extend new to fully replace be */
- dprintk("%s: removing %p\n", __func__, be);
- list_del(&be->be_node);
- bl_put_extent(be);
- } else {
- goto out_err;
- }
- } else if (end > be->be_f_offset) {
- /* |<-- be -->|
- *|<-- new -->| */
- if (extents_consistent(new, be)) {
- /* extend new to fully replace be */
- new->be_length += be->be_f_offset + be->be_length -
- new->be_f_offset - new->be_length;
- dprintk("%s: removing %p\n", __func__, be);
- list_del(&be->be_node);
- bl_put_extent(be);
- } else {
- goto out_err;
- }
- }
- }
- /* Note that if we never hit the above break, be will not point to a
- * valid extent. However, in that case &be->be_node==list.
- */
- list_add(&new->be_node, &be->be_node);
- dprintk("%s: inserting new\n", __func__);
- print_elist(list);
- /* FIXME - The per-list consistency checks have all been done,
- * should now check cross-list consistency.
- */
- return 0;
-
- out_err:
- bl_put_extent(new);
- return -EIO;
-}
-
-/* Returns extent, or NULL. If a second READ extent exists, it is returned
- * in cow_read, if given.
- *
- * The extents are kept in two seperate ordered lists, one for READ and NONE,
- * one for READWRITE and INVALID. Within each list, we assume:
- * 1. Extents are ordered by file offset.
- * 2. For any given isect, there is at most one extents that matches.
- */
-struct pnfs_block_extent *
-bl_find_get_extent(struct pnfs_block_layout *bl, sector_t isect,
- struct pnfs_block_extent **cow_read)
-{
- struct pnfs_block_extent *be, *cow, *ret;
- int i;
-
- dprintk("%s enter with isect %llu\n", __func__, (u64)isect);
- cow = ret = NULL;
- spin_lock(&bl->bl_ext_lock);
- for (i = 0; i < EXTENT_LISTS; i++) {
- list_for_each_entry_reverse(be, &bl->bl_extents[i], be_node) {
- if (isect >= be->be_f_offset + be->be_length)
- break;
- if (isect >= be->be_f_offset) {
- /* We have found an extent */
- dprintk("%s Get %p (%i)\n", __func__, be,
- atomic_read(&be->be_refcnt.refcount));
- kref_get(&be->be_refcnt);
- if (!ret)
- ret = be;
- else if (be->be_state != PNFS_BLOCK_READ_DATA)
- bl_put_extent(be);
- else
- cow = be;
- break;
- }
- }
- if (ret &&
- (!cow_read || ret->be_state != PNFS_BLOCK_INVALID_DATA))
- break;
- }
- spin_unlock(&bl->bl_ext_lock);
- if (cow_read)
- *cow_read = cow;
- print_bl_extent(ret);
- return ret;
-}
-
-/* Similar to bl_find_get_extent, but called with lock held, and ignores cow */
-static struct pnfs_block_extent *
-bl_find_get_extent_locked(struct pnfs_block_layout *bl, sector_t isect)
-{
- struct pnfs_block_extent *be, *ret = NULL;
- int i;
-
- dprintk("%s enter with isect %llu\n", __func__, (u64)isect);
- for (i = 0; i < EXTENT_LISTS; i++) {
- if (ret)
- break;
- list_for_each_entry_reverse(be, &bl->bl_extents[i], be_node) {
- if (isect >= be->be_f_offset + be->be_length)
- break;
- if (isect >= be->be_f_offset) {
- /* We have found an extent */
- dprintk("%s Get %p (%i)\n", __func__, be,
- atomic_read(&be->be_refcnt.refcount));
- kref_get(&be->be_refcnt);
- ret = be;
- break;
- }
- }
- }
- print_bl_extent(ret);
- return ret;
-}
-
-int
-encode_pnfs_block_layoutupdate(struct pnfs_block_layout *bl,
- struct xdr_stream *xdr,
- const struct nfs4_layoutcommit_args *arg)
-{
- struct pnfs_block_short_extent *lce, *save;
- unsigned int count = 0;
- __be32 *p, *xdr_start;
-
- dprintk("%s enter\n", __func__);
- /* BUG - creation of bl_commit is buggy - need to wait for
- * entire block to be marked WRITTEN before it can be added.
- */
- spin_lock(&bl->bl_ext_lock);
- /* Want to adjust for possible truncate */
- /* We now want to adjust argument range */
-
- /* XDR encode the ranges found */
- xdr_start = xdr_reserve_space(xdr, 8);
- if (!xdr_start)
- goto out;
- list_for_each_entry_safe(lce, save, &bl->bl_commit, bse_node) {
- p = xdr_reserve_space(xdr, 7 * 4 + sizeof(lce->bse_devid.data));
- if (!p)
- break;
- p = xdr_encode_opaque_fixed(p, lce->bse_devid.data, NFS4_DEVICEID4_SIZE);
- p = xdr_encode_hyper(p, lce->bse_f_offset << SECTOR_SHIFT);
- p = xdr_encode_hyper(p, lce->bse_length << SECTOR_SHIFT);
- p = xdr_encode_hyper(p, 0LL);
- *p++ = cpu_to_be32(PNFS_BLOCK_READWRITE_DATA);
- list_move_tail(&lce->bse_node, &bl->bl_committing);
- bl->bl_count--;
- count++;
- }
- xdr_start[0] = cpu_to_be32((xdr->p - xdr_start - 1) * 4);
- xdr_start[1] = cpu_to_be32(count);
-out:
- spin_unlock(&bl->bl_ext_lock);
- dprintk("%s found %i ranges\n", __func__, count);
- return 0;
-}
-
-/* Helper function to set_to_rw that initialize a new extent */
-static void
-_prep_new_extent(struct pnfs_block_extent *new,
- struct pnfs_block_extent *orig,
- sector_t offset, sector_t length, int state)
-{
- kref_init(&new->be_refcnt);
- /* don't need to INIT_LIST_HEAD(&new->be_node) */
- memcpy(&new->be_devid, &orig->be_devid, sizeof(struct nfs4_deviceid));
- new->be_mdev = orig->be_mdev;
- new->be_f_offset = offset;
- new->be_length = length;
- new->be_v_offset = orig->be_v_offset - orig->be_f_offset + offset;
- new->be_state = state;
- new->be_inval = orig->be_inval;
-}
-
-/* Tries to merge be with extent in front of it in list.
- * Frees storage if not used.
- */
-static struct pnfs_block_extent *
-_front_merge(struct pnfs_block_extent *be, struct list_head *head,
- struct pnfs_block_extent *storage)
-{
- struct pnfs_block_extent *prev;
-
- if (!storage)
- goto no_merge;
- if (&be->be_node == head || be->be_node.prev == head)
- goto no_merge;
- prev = list_entry(be->be_node.prev, struct pnfs_block_extent, be_node);
- if ((prev->be_f_offset + prev->be_length != be->be_f_offset) ||
- !extents_consistent(prev, be))
- goto no_merge;
- _prep_new_extent(storage, prev, prev->be_f_offset,
- prev->be_length + be->be_length, prev->be_state);
- list_replace(&prev->be_node, &storage->be_node);
- bl_put_extent(prev);
- list_del(&be->be_node);
- bl_put_extent(be);
- return storage;
-
- no_merge:
- kfree(storage);
- return be;
-}
-
-static u64
-set_to_rw(struct pnfs_block_layout *bl, u64 offset, u64 length)
-{
- u64 rv = offset + length;
- struct pnfs_block_extent *be, *e1, *e2, *e3, *new, *old;
- struct pnfs_block_extent *children[3];
- struct pnfs_block_extent *merge1 = NULL, *merge2 = NULL;
- int i = 0, j;
-
- dprintk("%s(%llu, %llu)\n", __func__, offset, length);
- /* Create storage for up to three new extents e1, e2, e3 */
- e1 = kmalloc(sizeof(*e1), GFP_ATOMIC);
- e2 = kmalloc(sizeof(*e2), GFP_ATOMIC);
- e3 = kmalloc(sizeof(*e3), GFP_ATOMIC);
- /* BUG - we are ignoring any failure */
- if (!e1 || !e2 || !e3)
- goto out_nosplit;
-
- spin_lock(&bl->bl_ext_lock);
- be = bl_find_get_extent_locked(bl, offset);
- rv = be->be_f_offset + be->be_length;
- if (be->be_state != PNFS_BLOCK_INVALID_DATA) {
- spin_unlock(&bl->bl_ext_lock);
- goto out_nosplit;
- }
- /* Add e* to children, bumping e*'s krefs */
- if (be->be_f_offset != offset) {
- _prep_new_extent(e1, be, be->be_f_offset,
- offset - be->be_f_offset,
- PNFS_BLOCK_INVALID_DATA);
- children[i++] = e1;
- print_bl_extent(e1);
- } else
- merge1 = e1;
- _prep_new_extent(e2, be, offset,
- min(length, be->be_f_offset + be->be_length - offset),
- PNFS_BLOCK_READWRITE_DATA);
- children[i++] = e2;
- print_bl_extent(e2);
- if (offset + length < be->be_f_offset + be->be_length) {
- _prep_new_extent(e3, be, e2->be_f_offset + e2->be_length,
- be->be_f_offset + be->be_length -
- offset - length,
- PNFS_BLOCK_INVALID_DATA);
- children[i++] = e3;
- print_bl_extent(e3);
- } else
- merge2 = e3;
-
- /* Remove be from list, and insert the e* */
- /* We don't get refs on e*, since this list is the base reference
- * set when init'ed.
- */
- if (i < 3)
- children[i] = NULL;
- new = children[0];
- list_replace(&be->be_node, &new->be_node);
- bl_put_extent(be);
- new = _front_merge(new, &bl->bl_extents[RW_EXTENT], merge1);
- for (j = 1; j < i; j++) {
- old = new;
- new = children[j];
- list_add(&new->be_node, &old->be_node);
- }
- if (merge2) {
- /* This is a HACK, should just create a _back_merge function */
- new = list_entry(new->be_node.next,
- struct pnfs_block_extent, be_node);
- new = _front_merge(new, &bl->bl_extents[RW_EXTENT], merge2);
- }
- spin_unlock(&bl->bl_ext_lock);
-
- /* Since we removed the base reference above, be is now scheduled for
- * destruction.
- */
- bl_put_extent(be);
- dprintk("%s returns %llu after split\n", __func__, rv);
- return rv;
-
- out_nosplit:
- kfree(e1);
- kfree(e2);
- kfree(e3);
- dprintk("%s returns %llu without splitting\n", __func__, rv);
- return rv;
-}
-
-void
-clean_pnfs_block_layoutupdate(struct pnfs_block_layout *bl,
- const struct nfs4_layoutcommit_args *arg,
- int status)
-{
- struct pnfs_block_short_extent *lce, *save;
-
- dprintk("%s status %d\n", __func__, status);
- list_for_each_entry_safe(lce, save, &bl->bl_committing, bse_node) {
- if (likely(!status)) {
- u64 offset = lce->bse_f_offset;
- u64 end = offset + lce->bse_length;
-
- do {
- offset = set_to_rw(bl, offset, end - offset);
- } while (offset < end);
- list_del(&lce->bse_node);
-
- kfree(lce);
- } else {
- list_del(&lce->bse_node);
- spin_lock(&bl->bl_ext_lock);
- add_to_commitlist(bl, lce);
- spin_unlock(&bl->bl_ext_lock);
- }
- }
-}
-
-int bl_push_one_short_extent(struct pnfs_inval_markings *marks)
-{
- struct pnfs_block_short_extent *new;
-
- new = kmalloc(sizeof(*new), GFP_NOFS);
- if (unlikely(!new))
- return -ENOMEM;
-
- spin_lock_bh(&marks->im_lock);
- list_add(&new->bse_node, &marks->im_extents);
- spin_unlock_bh(&marks->im_lock);
-
- return 0;
-}
-
-struct pnfs_block_short_extent *
-bl_pop_one_short_extent(struct pnfs_inval_markings *marks)
-{
- struct pnfs_block_short_extent *rv = NULL;
-
- spin_lock_bh(&marks->im_lock);
- if (!list_empty(&marks->im_extents)) {
- rv = list_entry((&marks->im_extents)->next,
- struct pnfs_block_short_extent, bse_node);
- list_del_init(&rv->bse_node);
- }
- spin_unlock_bh(&marks->im_lock);
-
- return rv;
-}
-
-void bl_free_short_extents(struct pnfs_inval_markings *marks, int num_to_free)
-{
- struct pnfs_block_short_extent *se = NULL, *tmp;
-
- if (num_to_free <= 0)
- return;
-
- spin_lock(&marks->im_lock);
- list_for_each_entry_safe(se, tmp, &marks->im_extents, bse_node) {
- list_del(&se->bse_node);
- kfree(se);
- if (--num_to_free == 0)
- break;
- }
- spin_unlock(&marks->im_lock);
-
- BUG_ON(num_to_free > 0);
-}
diff --git a/fs/nfs/blocklayout/rpc_pipefs.c b/fs/nfs/blocklayout/rpc_pipefs.c
new file mode 100644
index 00000000000..acbf9ca4018
--- /dev/null
+++ b/fs/nfs/blocklayout/rpc_pipefs.c
@@ -0,0 +1,288 @@
+/*
+ * Copyright (c) 2006,2007 The Regents of the University of Michigan.
+ * All rights reserved.
+ *
+ * Andy Adamson <andros@citi.umich.edu>
+ * Fred Isaman <iisaman@umich.edu>
+ *
+ * permission is granted to use, copy, create derivative works and
+ * redistribute this software and such derivative works for any purpose,
+ * so long as the name of the university of michigan is not used in
+ * any advertising or publicity pertaining to the use or distribution
+ * of this software without specific, written prior authorization. if
+ * the above copyright notice or any other identification of the
+ * university of michigan is included in any copy of any portion of
+ * this software, then the disclaimer below must also be included.
+ *
+ * this software is provided as is, without representation from the
+ * university of michigan as to its fitness for any purpose, and without
+ * warranty by the university of michigan of any kind, either express
+ * or implied, including without limitation the implied warranties of
+ * merchantability and fitness for a particular purpose. the regents
+ * of the university of michigan shall not be liable for any damages,
+ * including special, indirect, incidental, or consequential damages,
+ * with respect to any claim arising out or in connection with the use
+ * of the software, even if it has been or is hereafter advised of the
+ * possibility of such damages.
+ */
+
+#include <linux/module.h>
+#include <linux/genhd.h>
+#include <linux/blkdev.h>
+
+#include "blocklayout.h"
+
+#define NFSDBG_FACILITY NFSDBG_PNFS_LD
+
+static void
+nfs4_encode_simple(__be32 *p, struct pnfs_block_volume *b)
+{
+ int i;
+
+ *p++ = cpu_to_be32(1);
+ *p++ = cpu_to_be32(b->type);
+ *p++ = cpu_to_be32(b->simple.nr_sigs);
+ for (i = 0; i < b->simple.nr_sigs; i++) {
+ p = xdr_encode_hyper(p, b->simple.sigs[i].offset);
+ p = xdr_encode_opaque(p, b->simple.sigs[i].sig,
+ b->simple.sigs[i].sig_len);
+ }
+}
+
+dev_t
+bl_resolve_deviceid(struct nfs_server *server, struct pnfs_block_volume *b,
+ gfp_t gfp_mask)
+{
+ struct net *net = server->nfs_client->cl_net;
+ struct nfs_net *nn = net_generic(net, nfs_net_id);
+ struct bl_dev_msg *reply = &nn->bl_mount_reply;
+ struct bl_pipe_msg bl_pipe_msg;
+ struct rpc_pipe_msg *msg = &bl_pipe_msg.msg;
+ struct bl_msg_hdr *bl_msg;
+ DECLARE_WAITQUEUE(wq, current);
+ dev_t dev = 0;
+ int rc;
+
+ dprintk("%s CREATING PIPEFS MESSAGE\n", __func__);
+
+ mutex_lock(&nn->bl_mutex);
+ bl_pipe_msg.bl_wq = &nn->bl_wq;
+
+ b->simple.len += 4; /* single volume */
+ if (b->simple.len > PAGE_SIZE)
+ goto out_unlock;
+
+ memset(msg, 0, sizeof(*msg));
+ msg->len = sizeof(*bl_msg) + b->simple.len;
+ msg->data = kzalloc(msg->len, gfp_mask);
+ if (!msg->data)
+ goto out_free_data;
+
+ bl_msg = msg->data;
+ bl_msg->type = BL_DEVICE_MOUNT,
+ bl_msg->totallen = b->simple.len;
+ nfs4_encode_simple(msg->data + sizeof(*bl_msg), b);
+
+ dprintk("%s CALLING USERSPACE DAEMON\n", __func__);
+ add_wait_queue(&nn->bl_wq, &wq);
+ rc = rpc_queue_upcall(nn->bl_device_pipe, msg);
+ if (rc < 0) {
+ remove_wait_queue(&nn->bl_wq, &wq);
+ goto out_free_data;
+ }
+
+ set_current_state(TASK_UNINTERRUPTIBLE);
+ schedule();
+ remove_wait_queue(&nn->bl_wq, &wq);
+
+ if (reply->status != BL_DEVICE_REQUEST_PROC) {
+ printk(KERN_WARNING "%s failed to decode device: %d\n",
+ __func__, reply->status);
+ goto out_free_data;
+ }
+
+ dev = MKDEV(reply->major, reply->minor);
+out_free_data:
+ kfree(msg->data);
+out_unlock:
+ mutex_unlock(&nn->bl_mutex);
+ return dev;
+}
+
+static ssize_t bl_pipe_downcall(struct file *filp, const char __user *src,
+ size_t mlen)
+{
+ struct nfs_net *nn = net_generic(filp->f_dentry->d_sb->s_fs_info,
+ nfs_net_id);
+
+ if (mlen != sizeof (struct bl_dev_msg))
+ return -EINVAL;
+
+ if (copy_from_user(&nn->bl_mount_reply, src, mlen) != 0)
+ return -EFAULT;
+
+ wake_up(&nn->bl_wq);
+
+ return mlen;
+}
+
+static void bl_pipe_destroy_msg(struct rpc_pipe_msg *msg)
+{
+ struct bl_pipe_msg *bl_pipe_msg =
+ container_of(msg, struct bl_pipe_msg, msg);
+
+ if (msg->errno >= 0)
+ return;
+ wake_up(bl_pipe_msg->bl_wq);
+}
+
+static const struct rpc_pipe_ops bl_upcall_ops = {
+ .upcall = rpc_pipe_generic_upcall,
+ .downcall = bl_pipe_downcall,
+ .destroy_msg = bl_pipe_destroy_msg,
+};
+
+static struct dentry *nfs4blocklayout_register_sb(struct super_block *sb,
+ struct rpc_pipe *pipe)
+{
+ struct dentry *dir, *dentry;
+
+ dir = rpc_d_lookup_sb(sb, NFS_PIPE_DIRNAME);
+ if (dir == NULL)
+ return ERR_PTR(-ENOENT);
+ dentry = rpc_mkpipe_dentry(dir, "blocklayout", NULL, pipe);
+ dput(dir);
+ return dentry;
+}
+
+static void nfs4blocklayout_unregister_sb(struct super_block *sb,
+ struct rpc_pipe *pipe)
+{
+ if (pipe->dentry)
+ rpc_unlink(pipe->dentry);
+}
+
+static int rpc_pipefs_event(struct notifier_block *nb, unsigned long event,
+ void *ptr)
+{
+ struct super_block *sb = ptr;
+ struct net *net = sb->s_fs_info;
+ struct nfs_net *nn = net_generic(net, nfs_net_id);
+ struct dentry *dentry;
+ int ret = 0;
+
+ if (!try_module_get(THIS_MODULE))
+ return 0;
+
+ if (nn->bl_device_pipe == NULL) {
+ module_put(THIS_MODULE);
+ return 0;
+ }
+
+ switch (event) {
+ case RPC_PIPEFS_MOUNT:
+ dentry = nfs4blocklayout_register_sb(sb, nn->bl_device_pipe);
+ if (IS_ERR(dentry)) {
+ ret = PTR_ERR(dentry);
+ break;
+ }
+ nn->bl_device_pipe->dentry = dentry;
+ break;
+ case RPC_PIPEFS_UMOUNT:
+ if (nn->bl_device_pipe->dentry)
+ nfs4blocklayout_unregister_sb(sb, nn->bl_device_pipe);
+ break;
+ default:
+ ret = -ENOTSUPP;
+ break;
+ }
+ module_put(THIS_MODULE);
+ return ret;
+}
+
+static struct notifier_block nfs4blocklayout_block = {
+ .notifier_call = rpc_pipefs_event,
+};
+
+static struct dentry *nfs4blocklayout_register_net(struct net *net,
+ struct rpc_pipe *pipe)
+{
+ struct super_block *pipefs_sb;
+ struct dentry *dentry;
+
+ pipefs_sb = rpc_get_sb_net(net);
+ if (!pipefs_sb)
+ return NULL;
+ dentry = nfs4blocklayout_register_sb(pipefs_sb, pipe);
+ rpc_put_sb_net(net);
+ return dentry;
+}
+
+static void nfs4blocklayout_unregister_net(struct net *net,
+ struct rpc_pipe *pipe)
+{
+ struct super_block *pipefs_sb;
+
+ pipefs_sb = rpc_get_sb_net(net);
+ if (pipefs_sb) {
+ nfs4blocklayout_unregister_sb(pipefs_sb, pipe);
+ rpc_put_sb_net(net);
+ }
+}
+
+static int nfs4blocklayout_net_init(struct net *net)
+{
+ struct nfs_net *nn = net_generic(net, nfs_net_id);
+ struct dentry *dentry;
+
+ mutex_init(&nn->bl_mutex);
+ init_waitqueue_head(&nn->bl_wq);
+ nn->bl_device_pipe = rpc_mkpipe_data(&bl_upcall_ops, 0);
+ if (IS_ERR(nn->bl_device_pipe))
+ return PTR_ERR(nn->bl_device_pipe);
+ dentry = nfs4blocklayout_register_net(net, nn->bl_device_pipe);
+ if (IS_ERR(dentry)) {
+ rpc_destroy_pipe_data(nn->bl_device_pipe);
+ return PTR_ERR(dentry);
+ }
+ nn->bl_device_pipe->dentry = dentry;
+ return 0;
+}
+
+static void nfs4blocklayout_net_exit(struct net *net)
+{
+ struct nfs_net *nn = net_generic(net, nfs_net_id);
+
+ nfs4blocklayout_unregister_net(net, nn->bl_device_pipe);
+ rpc_destroy_pipe_data(nn->bl_device_pipe);
+ nn->bl_device_pipe = NULL;
+}
+
+static struct pernet_operations nfs4blocklayout_net_ops = {
+ .init = nfs4blocklayout_net_init,
+ .exit = nfs4blocklayout_net_exit,
+};
+
+int __init bl_init_pipefs(void)
+{
+ int ret;
+
+ ret = rpc_pipefs_notifier_register(&nfs4blocklayout_block);
+ if (ret)
+ goto out;
+ ret = register_pernet_subsys(&nfs4blocklayout_net_ops);
+ if (ret)
+ goto out_unregister_notifier;
+ return 0;
+
+out_unregister_notifier:
+ rpc_pipefs_notifier_unregister(&nfs4blocklayout_block);
+out:
+ return ret;
+}
+
+void __exit bl_cleanup_pipefs(void)
+{
+ rpc_pipefs_notifier_unregister(&nfs4blocklayout_block);
+ unregister_pernet_subsys(&nfs4blocklayout_net_ops);
+}
diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c
index 073b4cf67ed..b8fb3a4ef64 100644
--- a/fs/nfs/callback.c
+++ b/fs/nfs/callback.c
@@ -235,7 +235,7 @@ static int nfs_callback_start_svc(int minorversion, struct rpc_xprt *xprt,
cb_info->serv = serv;
cb_info->rqst = rqstp;
- cb_info->task = kthread_run(callback_svc, cb_info->rqst,
+ cb_info->task = kthread_create(callback_svc, cb_info->rqst,
"nfsv4.%u-svc", minorversion);
if (IS_ERR(cb_info->task)) {
ret = PTR_ERR(cb_info->task);
@@ -244,6 +244,8 @@ static int nfs_callback_start_svc(int minorversion, struct rpc_xprt *xprt,
cb_info->task = NULL;
return ret;
}
+ rqstp->rq_task = cb_info->task;
+ wake_up_process(cb_info->task);
dprintk("nfs_callback_up: service started\n");
return 0;
}
@@ -428,6 +430,18 @@ check_gss_callback_principal(struct nfs_client *clp, struct svc_rqst *rqstp)
if (p == NULL)
return 0;
+ /*
+ * Did we get the acceptor from userland during the SETCLIENTID
+ * negotiation?
+ */
+ if (clp->cl_acceptor)
+ return !strcmp(p, clp->cl_acceptor);
+
+ /*
+ * Otherwise try to verify it using the cl_hostname. Note that this
+ * doesn't work if a non-canonical hostname was used in the devname.
+ */
+
/* Expect a GSS_C_NT_HOSTBASED_NAME like "nfs@serverhostname" */
if (memcmp(p, "nfs@", 4) != 0)
diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c
index 41db5258e7a..73466b93409 100644
--- a/fs/nfs/callback_proc.c
+++ b/fs/nfs/callback_proc.c
@@ -171,14 +171,26 @@ static u32 initiate_file_draining(struct nfs_client *clp,
goto out;
ino = lo->plh_inode;
+
+ spin_lock(&ino->i_lock);
+ pnfs_set_layout_stateid(lo, &args->cbl_stateid, true);
+ spin_unlock(&ino->i_lock);
+
+ pnfs_layoutcommit_inode(ino, false);
+
spin_lock(&ino->i_lock);
if (test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags) ||
pnfs_mark_matching_lsegs_invalid(lo, &free_me_list,
- &args->cbl_range))
+ &args->cbl_range)) {
rv = NFS4ERR_DELAY;
- else
- rv = NFS4ERR_NOMATCHING_LAYOUT;
- pnfs_set_layout_stateid(lo, &args->cbl_stateid, true);
+ goto unlock;
+ }
+
+ if (NFS_SERVER(ino)->pnfs_curr_ld->return_range) {
+ NFS_SERVER(ino)->pnfs_curr_ld->return_range(lo,
+ &args->cbl_range);
+ }
+unlock:
spin_unlock(&ino->i_lock);
pnfs_free_lseg_list(&free_me_list);
pnfs_put_layout_hdr(lo);
@@ -277,9 +289,6 @@ __be32 nfs4_callback_devicenotify(struct cb_devicenotifyargs *args,
}
found:
- if (dev->cbd_notify_type == NOTIFY_DEVICEID4_CHANGE)
- dprintk("%s: NOTIFY_DEVICEID4_CHANGE not supported, "
- "deleting instead\n", __func__);
nfs4_delete_deviceid(server->pnfs_curr_ld, clp, &dev->cbd_dev_id);
}
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index 1d09289c8f0..f9f4845db98 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -110,8 +110,8 @@ struct nfs_subversion *get_nfs_version(unsigned int version)
mutex_unlock(&nfs_version_mutex);
}
- if (!IS_ERR(nfs))
- try_module_get(nfs->owner);
+ if (!IS_ERR(nfs) && !try_module_get(nfs->owner))
+ return ERR_PTR(-EAGAIN);
return nfs;
}
@@ -158,7 +158,8 @@ struct nfs_client *nfs_alloc_client(const struct nfs_client_initdata *cl_init)
goto error_0;
clp->cl_nfs_mod = cl_init->nfs_mod;
- try_module_get(clp->cl_nfs_mod->owner);
+ if (!try_module_get(clp->cl_nfs_mod->owner))
+ goto error_dealloc;
clp->rpc_ops = clp->cl_nfs_mod->rpc_ops;
@@ -190,6 +191,7 @@ struct nfs_client *nfs_alloc_client(const struct nfs_client_initdata *cl_init)
error_cleanup:
put_nfs_version(clp->cl_nfs_mod);
+error_dealloc:
kfree(clp);
error_0:
return ERR_PTR(err);
@@ -252,6 +254,7 @@ void nfs_free_client(struct nfs_client *clp)
put_net(clp->cl_net);
put_nfs_version(clp->cl_nfs_mod);
kfree(clp->cl_hostname);
+ kfree(clp->cl_acceptor);
kfree(clp);
dprintk("<-- nfs_free_client()\n");
@@ -482,8 +485,13 @@ nfs_get_client(const struct nfs_client_initdata *cl_init,
struct nfs_net *nn = net_generic(cl_init->net, nfs_net_id);
const struct nfs_rpc_ops *rpc_ops = cl_init->nfs_mod->rpc_ops;
+ if (cl_init->hostname == NULL) {
+ WARN_ON(1);
+ return NULL;
+ }
+
dprintk("--> nfs_get_client(%s,v%u)\n",
- cl_init->hostname ?: "", rpc_ops->version);
+ cl_init->hostname, rpc_ops->version);
/* see if the client already exists */
do {
@@ -510,7 +518,7 @@ nfs_get_client(const struct nfs_client_initdata *cl_init,
} while (!IS_ERR(new));
dprintk("<-- nfs_get_client() Failed to find %s (%ld)\n",
- cl_init->hostname ?: "", PTR_ERR(new));
+ cl_init->hostname, PTR_ERR(new));
return new;
}
EXPORT_SYMBOL_GPL(nfs_get_client);
@@ -1205,7 +1213,7 @@ static const struct file_operations nfs_server_list_fops = {
.open = nfs_server_list_open,
.read = seq_read,
.llseek = seq_lseek,
- .release = seq_release,
+ .release = seq_release_net,
.owner = THIS_MODULE,
};
@@ -1226,7 +1234,7 @@ static const struct file_operations nfs_volume_list_fops = {
.open = nfs_volume_list_open,
.read = seq_read,
.llseek = seq_lseek,
- .release = seq_release,
+ .release = seq_release_net,
.owner = THIS_MODULE,
};
@@ -1236,27 +1244,17 @@ static const struct file_operations nfs_volume_list_fops = {
*/
static int nfs_server_list_open(struct inode *inode, struct file *file)
{
- struct seq_file *m;
- int ret;
- struct pid_namespace *pid_ns = file->f_dentry->d_sb->s_fs_info;
- struct net *net = pid_ns->child_reaper->nsproxy->net_ns;
-
- ret = seq_open(file, &nfs_server_list_ops);
- if (ret < 0)
- return ret;
-
- m = file->private_data;
- m->private = net;
-
- return 0;
+ return seq_open_net(inode, file, &nfs_server_list_ops,
+ sizeof(struct seq_net_private));
}
/*
* set up the iterator to start reading from the server list and return the first item
*/
static void *nfs_server_list_start(struct seq_file *m, loff_t *_pos)
+ __acquires(&nn->nfs_client_lock)
{
- struct nfs_net *nn = net_generic(m->private, nfs_net_id);
+ struct nfs_net *nn = net_generic(seq_file_net(m), nfs_net_id);
/* lock the list against modification */
spin_lock(&nn->nfs_client_lock);
@@ -1268,7 +1266,7 @@ static void *nfs_server_list_start(struct seq_file *m, loff_t *_pos)
*/
static void *nfs_server_list_next(struct seq_file *p, void *v, loff_t *pos)
{
- struct nfs_net *nn = net_generic(p->private, nfs_net_id);
+ struct nfs_net *nn = net_generic(seq_file_net(p), nfs_net_id);
return seq_list_next(v, &nn->nfs_client_list, pos);
}
@@ -1277,8 +1275,9 @@ static void *nfs_server_list_next(struct seq_file *p, void *v, loff_t *pos)
* clean up after reading from the transports list
*/
static void nfs_server_list_stop(struct seq_file *p, void *v)
+ __releases(&nn->nfs_client_lock)
{
- struct nfs_net *nn = net_generic(p->private, nfs_net_id);
+ struct nfs_net *nn = net_generic(seq_file_net(p), nfs_net_id);
spin_unlock(&nn->nfs_client_lock);
}
@@ -1289,7 +1288,7 @@ static void nfs_server_list_stop(struct seq_file *p, void *v)
static int nfs_server_list_show(struct seq_file *m, void *v)
{
struct nfs_client *clp;
- struct nfs_net *nn = net_generic(m->private, nfs_net_id);
+ struct nfs_net *nn = net_generic(seq_file_net(m), nfs_net_id);
/* display header on line 1 */
if (v == &nn->nfs_client_list) {
@@ -1321,27 +1320,17 @@ static int nfs_server_list_show(struct seq_file *m, void *v)
*/
static int nfs_volume_list_open(struct inode *inode, struct file *file)
{
- struct seq_file *m;
- int ret;
- struct pid_namespace *pid_ns = file->f_dentry->d_sb->s_fs_info;
- struct net *net = pid_ns->child_reaper->nsproxy->net_ns;
-
- ret = seq_open(file, &nfs_volume_list_ops);
- if (ret < 0)
- return ret;
-
- m = file->private_data;
- m->private = net;
-
- return 0;
+ return seq_open_net(inode, file, &nfs_volume_list_ops,
+ sizeof(struct seq_net_private));
}
/*
* set up the iterator to start reading from the volume list and return the first item
*/
static void *nfs_volume_list_start(struct seq_file *m, loff_t *_pos)
+ __acquires(&nn->nfs_client_lock)
{
- struct nfs_net *nn = net_generic(m->private, nfs_net_id);
+ struct nfs_net *nn = net_generic(seq_file_net(m), nfs_net_id);
/* lock the list against modification */
spin_lock(&nn->nfs_client_lock);
@@ -1353,7 +1342,7 @@ static void *nfs_volume_list_start(struct seq_file *m, loff_t *_pos)
*/
static void *nfs_volume_list_next(struct seq_file *p, void *v, loff_t *pos)
{
- struct nfs_net *nn = net_generic(p->private, nfs_net_id);
+ struct nfs_net *nn = net_generic(seq_file_net(p), nfs_net_id);
return seq_list_next(v, &nn->nfs_volume_list, pos);
}
@@ -1362,8 +1351,9 @@ static void *nfs_volume_list_next(struct seq_file *p, void *v, loff_t *pos)
* clean up after reading from the transports list
*/
static void nfs_volume_list_stop(struct seq_file *p, void *v)
+ __releases(&nn->nfs_client_lock)
{
- struct nfs_net *nn = net_generic(p->private, nfs_net_id);
+ struct nfs_net *nn = net_generic(seq_file_net(p), nfs_net_id);
spin_unlock(&nn->nfs_client_lock);
}
@@ -1376,7 +1366,7 @@ static int nfs_volume_list_show(struct seq_file *m, void *v)
struct nfs_server *server;
struct nfs_client *clp;
char dev[8], fsid[17];
- struct nfs_net *nn = net_generic(m->private, nfs_net_id);
+ struct nfs_net *nn = net_generic(seq_file_net(m), nfs_net_id);
/* display header on line 1 */
if (v == &nn->nfs_volume_list) {
@@ -1407,6 +1397,39 @@ static int nfs_volume_list_show(struct seq_file *m, void *v)
return 0;
}
+int nfs_fs_proc_net_init(struct net *net)
+{
+ struct nfs_net *nn = net_generic(net, nfs_net_id);
+ struct proc_dir_entry *p;
+
+ nn->proc_nfsfs = proc_net_mkdir(net, "nfsfs", net->proc_net);
+ if (!nn->proc_nfsfs)
+ goto error_0;
+
+ /* a file of servers with which we're dealing */
+ p = proc_create("servers", S_IFREG|S_IRUGO,
+ nn->proc_nfsfs, &nfs_server_list_fops);
+ if (!p)
+ goto error_1;
+
+ /* a file of volumes that we have mounted */
+ p = proc_create("volumes", S_IFREG|S_IRUGO,
+ nn->proc_nfsfs, &nfs_volume_list_fops);
+ if (!p)
+ goto error_1;
+ return 0;
+
+error_1:
+ remove_proc_subtree("nfsfs", net->proc_net);
+error_0:
+ return -ENOMEM;
+}
+
+void nfs_fs_proc_net_exit(struct net *net)
+{
+ remove_proc_subtree("nfsfs", net->proc_net);
+}
+
/*
* initialise the /proc/fs/nfsfs/ directory
*/
@@ -1419,14 +1442,12 @@ int __init nfs_fs_proc_init(void)
goto error_0;
/* a file of servers with which we're dealing */
- p = proc_create("servers", S_IFREG|S_IRUGO,
- proc_fs_nfs, &nfs_server_list_fops);
+ p = proc_symlink("servers", proc_fs_nfs, "../../net/nfsfs/servers");
if (!p)
goto error_1;
/* a file of volumes that we have mounted */
- p = proc_create("volumes", S_IFREG|S_IRUGO,
- proc_fs_nfs, &nfs_volume_list_fops);
+ p = proc_symlink("volumes", proc_fs_nfs, "../../net/nfsfs/volumes");
if (!p)
goto error_2;
return 0;
diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c
index 5d8ccecf5f5..7f3f6064134 100644
--- a/fs/nfs/delegation.c
+++ b/fs/nfs/delegation.c
@@ -41,14 +41,8 @@ void nfs_mark_delegation_referenced(struct nfs_delegation *delegation)
set_bit(NFS_DELEGATION_REFERENCED, &delegation->flags);
}
-/**
- * nfs_have_delegation - check if inode has a delegation
- * @inode: inode to check
- * @flags: delegation types to check for
- *
- * Returns one if inode has the indicated delegation, otherwise zero.
- */
-int nfs4_have_delegation(struct inode *inode, fmode_t flags)
+static int
+nfs4_do_check_delegation(struct inode *inode, fmode_t flags, bool mark)
{
struct nfs_delegation *delegation;
int ret = 0;
@@ -58,12 +52,34 @@ int nfs4_have_delegation(struct inode *inode, fmode_t flags)
delegation = rcu_dereference(NFS_I(inode)->delegation);
if (delegation != NULL && (delegation->type & flags) == flags &&
!test_bit(NFS_DELEGATION_RETURNING, &delegation->flags)) {
- nfs_mark_delegation_referenced(delegation);
+ if (mark)
+ nfs_mark_delegation_referenced(delegation);
ret = 1;
}
rcu_read_unlock();
return ret;
}
+/**
+ * nfs4_have_delegation - check if inode has a delegation, mark it
+ * NFS_DELEGATION_REFERENCED if there is one.
+ * @inode: inode to check
+ * @flags: delegation types to check for
+ *
+ * Returns one if inode has the indicated delegation, otherwise zero.
+ */
+int nfs4_have_delegation(struct inode *inode, fmode_t flags)
+{
+ return nfs4_do_check_delegation(inode, flags, true);
+}
+
+/*
+ * nfs4_check_delegation - check if inode has a delegation, do not mark
+ * NFS_DELEGATION_REFERENCED if it has one.
+ */
+int nfs4_check_delegation(struct inode *inode, fmode_t flags)
+{
+ return nfs4_do_check_delegation(inode, flags, false);
+}
static int nfs_delegation_claim_locks(struct nfs_open_context *ctx, struct nfs4_state *state, const nfs4_stateid *stateid)
{
@@ -109,6 +125,8 @@ again:
continue;
if (!test_bit(NFS_DELEGATED_STATE, &state->flags))
continue;
+ if (!nfs4_valid_open_stateid(state))
+ continue;
if (!nfs4_stateid_match(&state->stateid, stateid))
continue;
get_nfs_open_context(ctx);
@@ -177,7 +195,11 @@ static int nfs_do_return_delegation(struct inode *inode, struct nfs_delegation *
{
int res = 0;
- res = nfs4_proc_delegreturn(inode, delegation->cred, &delegation->stateid, issync);
+ if (!test_bit(NFS_DELEGATION_REVOKED, &delegation->flags))
+ res = nfs4_proc_delegreturn(inode,
+ delegation->cred,
+ &delegation->stateid,
+ issync);
nfs_free_delegation(delegation);
return res;
}
@@ -364,11 +386,13 @@ static int nfs_end_delegation_return(struct inode *inode, struct nfs_delegation
{
struct nfs_client *clp = NFS_SERVER(inode)->nfs_client;
struct nfs_inode *nfsi = NFS_I(inode);
- int err;
+ int err = 0;
if (delegation == NULL)
return 0;
do {
+ if (test_bit(NFS_DELEGATION_REVOKED, &delegation->flags))
+ break;
err = nfs_delegation_claim_opens(inode, &delegation->stateid);
if (!issync || err != -EAGAIN)
break;
@@ -589,10 +613,23 @@ static void nfs_client_mark_return_unused_delegation_types(struct nfs_client *cl
rcu_read_unlock();
}
+static void nfs_revoke_delegation(struct inode *inode)
+{
+ struct nfs_delegation *delegation;
+ rcu_read_lock();
+ delegation = rcu_dereference(NFS_I(inode)->delegation);
+ if (delegation != NULL) {
+ set_bit(NFS_DELEGATION_REVOKED, &delegation->flags);
+ nfs_mark_return_delegation(NFS_SERVER(inode), delegation);
+ }
+ rcu_read_unlock();
+}
+
void nfs_remove_bad_delegation(struct inode *inode)
{
struct nfs_delegation *delegation;
+ nfs_revoke_delegation(inode);
delegation = nfs_inode_detach_delegation(inode);
if (delegation) {
nfs_inode_find_state_and_recover(inode, &delegation->stateid);
diff --git a/fs/nfs/delegation.h b/fs/nfs/delegation.h
index 9a79c7a99d6..e3c20a3ccc9 100644
--- a/fs/nfs/delegation.h
+++ b/fs/nfs/delegation.h
@@ -31,6 +31,7 @@ enum {
NFS_DELEGATION_RETURN_IF_CLOSED,
NFS_DELEGATION_REFERENCED,
NFS_DELEGATION_RETURNING,
+ NFS_DELEGATION_REVOKED,
};
int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct nfs_openres *res);
@@ -59,6 +60,7 @@ bool nfs4_copy_delegation_stateid(nfs4_stateid *dst, struct inode *inode, fmode_
void nfs_mark_delegation_referenced(struct nfs_delegation *delegation);
int nfs4_have_delegation(struct inode *inode, fmode_t flags);
+int nfs4_check_delegation(struct inode *inode, fmode_t flags);
#endif
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 4a3d4ef7612..6e62155abf2 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -486,8 +486,7 @@ void nfs_prime_dcache(struct dentry *parent, struct nfs_entry *entry)
nfs_setsecurity(dentry->d_inode, entry->fattr, entry->label);
goto out;
} else {
- if (d_invalidate(dentry) != 0)
- goto out;
+ d_invalidate(dentry);
dput(dentry);
}
}
@@ -988,9 +987,13 @@ EXPORT_SYMBOL_GPL(nfs_force_lookup_revalidate);
* A check for whether or not the parent directory has changed.
* In the case it has, we assume that the dentries are untrustworthy
* and may need to be looked up again.
+ * If rcu_walk prevents us from performing a full check, return 0.
*/
-static int nfs_check_verifier(struct inode *dir, struct dentry *dentry)
+static int nfs_check_verifier(struct inode *dir, struct dentry *dentry,
+ int rcu_walk)
{
+ int ret;
+
if (IS_ROOT(dentry))
return 1;
if (NFS_SERVER(dir)->flags & NFS_MOUNT_LOOKUP_CACHE_NONE)
@@ -998,7 +1001,11 @@ static int nfs_check_verifier(struct inode *dir, struct dentry *dentry)
if (!nfs_verify_change_attribute(dir, dentry->d_time))
return 0;
/* Revalidate nfsi->cache_change_attribute before we declare a match */
- if (nfs_revalidate_inode(NFS_SERVER(dir), dir) < 0)
+ if (rcu_walk)
+ ret = nfs_revalidate_inode_rcu(NFS_SERVER(dir), dir);
+ else
+ ret = nfs_revalidate_inode(NFS_SERVER(dir), dir);
+ if (ret < 0)
return 0;
if (!nfs_verify_change_attribute(dir, dentry->d_time))
return 0;
@@ -1042,6 +1049,8 @@ int nfs_lookup_verify_inode(struct inode *inode, unsigned int flags)
out:
return (inode->i_nlink == 0) ? -ENOENT : 0;
out_force:
+ if (flags & LOOKUP_RCU)
+ return -ECHILD;
ret = __nfs_revalidate_inode(server, inode);
if (ret != 0)
return ret;
@@ -1054,6 +1063,9 @@ out_force:
*
* If parent mtime has changed, we revalidate, else we wait for a
* period corresponding to the parent's attribute cache timeout value.
+ *
+ * If LOOKUP_RCU prevents us from performing a full check, return 1
+ * suggesting a reval is needed.
*/
static inline
int nfs_neg_need_reval(struct inode *dir, struct dentry *dentry,
@@ -1064,7 +1076,7 @@ int nfs_neg_need_reval(struct inode *dir, struct dentry *dentry,
return 0;
if (NFS_SERVER(dir)->flags & NFS_MOUNT_LOOKUP_CACHE_NONEG)
return 1;
- return !nfs_check_verifier(dir, dentry);
+ return !nfs_check_verifier(dir, dentry, flags & LOOKUP_RCU);
}
/*
@@ -1088,21 +1100,30 @@ static int nfs_lookup_revalidate(struct dentry *dentry, unsigned int flags)
struct nfs4_label *label = NULL;
int error;
- if (flags & LOOKUP_RCU)
- return -ECHILD;
-
- parent = dget_parent(dentry);
- dir = parent->d_inode;
+ if (flags & LOOKUP_RCU) {
+ parent = ACCESS_ONCE(dentry->d_parent);
+ dir = ACCESS_ONCE(parent->d_inode);
+ if (!dir)
+ return -ECHILD;
+ } else {
+ parent = dget_parent(dentry);
+ dir = parent->d_inode;
+ }
nfs_inc_stats(dir, NFSIOS_DENTRYREVALIDATE);
inode = dentry->d_inode;
if (!inode) {
- if (nfs_neg_need_reval(dir, dentry, flags))
+ if (nfs_neg_need_reval(dir, dentry, flags)) {
+ if (flags & LOOKUP_RCU)
+ return -ECHILD;
goto out_bad;
+ }
goto out_valid_noent;
}
if (is_bad_inode(inode)) {
+ if (flags & LOOKUP_RCU)
+ return -ECHILD;
dfprintk(LOOKUPCACHE, "%s: %pd2 has dud inode\n",
__func__, dentry);
goto out_bad;
@@ -1112,12 +1133,20 @@ static int nfs_lookup_revalidate(struct dentry *dentry, unsigned int flags)
goto out_set_verifier;
/* Force a full look up iff the parent directory has changed */
- if (!nfs_is_exclusive_create(dir, flags) && nfs_check_verifier(dir, dentry)) {
- if (nfs_lookup_verify_inode(inode, flags))
+ if (!nfs_is_exclusive_create(dir, flags) &&
+ nfs_check_verifier(dir, dentry, flags & LOOKUP_RCU)) {
+
+ if (nfs_lookup_verify_inode(inode, flags)) {
+ if (flags & LOOKUP_RCU)
+ return -ECHILD;
goto out_zap_parent;
+ }
goto out_valid;
}
+ if (flags & LOOKUP_RCU)
+ return -ECHILD;
+
if (NFS_STALE(inode))
goto out_bad;
@@ -1153,13 +1182,18 @@ out_set_verifier:
/* Success: notify readdir to use READDIRPLUS */
nfs_advise_use_readdirplus(dir);
out_valid_noent:
- dput(parent);
+ if (flags & LOOKUP_RCU) {
+ if (parent != ACCESS_ONCE(dentry->d_parent))
+ return -ECHILD;
+ } else
+ dput(parent);
dfprintk(LOOKUPCACHE, "NFS: %s(%pd2) is valid\n",
__func__, dentry);
return 1;
out_zap_parent:
nfs_zap_caches(dir);
out_bad:
+ WARN_ON(flags & LOOKUP_RCU);
nfs_free_fattr(fattr);
nfs_free_fhandle(fhandle);
nfs4_label_free(label);
@@ -1176,15 +1210,12 @@ out_zap_parent:
if (IS_ROOT(dentry))
goto out_valid;
}
- /* If we have submounts, don't unhash ! */
- if (check_submounts_and_drop(dentry) != 0)
- goto out_valid;
-
dput(parent);
dfprintk(LOOKUPCACHE, "NFS: %s(%pd2) is invalid\n",
__func__, dentry);
return 0;
out_error:
+ WARN_ON(flags & LOOKUP_RCU);
nfs_free_fattr(fattr);
nfs_free_fhandle(fhandle);
nfs4_label_free(label);
@@ -1496,6 +1527,7 @@ int nfs_atomic_open(struct inode *dir, struct dentry *dentry,
case -ENOENT:
d_drop(dentry);
d_add(dentry, NULL);
+ nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
break;
case -EISDIR:
case -ENOTDIR:
@@ -1529,14 +1561,9 @@ EXPORT_SYMBOL_GPL(nfs_atomic_open);
static int nfs4_lookup_revalidate(struct dentry *dentry, unsigned int flags)
{
- struct dentry *parent = NULL;
struct inode *inode;
- struct inode *dir;
int ret = 0;
- if (flags & LOOKUP_RCU)
- return -ECHILD;
-
if (!(flags & LOOKUP_OPEN) || (flags & LOOKUP_DIRECTORY))
goto no_open;
if (d_mountpoint(dentry))
@@ -1545,34 +1572,47 @@ static int nfs4_lookup_revalidate(struct dentry *dentry, unsigned int flags)
goto no_open;
inode = dentry->d_inode;
- parent = dget_parent(dentry);
- dir = parent->d_inode;
/* We can't create new files in nfs_open_revalidate(), so we
* optimize away revalidation of negative dentries.
*/
if (inode == NULL) {
+ struct dentry *parent;
+ struct inode *dir;
+
+ if (flags & LOOKUP_RCU) {
+ parent = ACCESS_ONCE(dentry->d_parent);
+ dir = ACCESS_ONCE(parent->d_inode);
+ if (!dir)
+ return -ECHILD;
+ } else {
+ parent = dget_parent(dentry);
+ dir = parent->d_inode;
+ }
if (!nfs_neg_need_reval(dir, dentry, flags))
ret = 1;
+ else if (flags & LOOKUP_RCU)
+ ret = -ECHILD;
+ if (!(flags & LOOKUP_RCU))
+ dput(parent);
+ else if (parent != ACCESS_ONCE(dentry->d_parent))
+ return -ECHILD;
goto out;
}
/* NFS only supports OPEN on regular files */
if (!S_ISREG(inode->i_mode))
- goto no_open_dput;
+ goto no_open;
/* We cannot do exclusive creation on a positive dentry */
if (flags & LOOKUP_EXCL)
- goto no_open_dput;
+ goto no_open;
/* Let f_op->open() actually open (and revalidate) the file */
ret = 1;
out:
- dput(parent);
return ret;
-no_open_dput:
- dput(parent);
no_open:
return nfs_lookup_revalidate(dentry, flags);
}
@@ -2028,10 +2068,14 @@ static DEFINE_SPINLOCK(nfs_access_lru_lock);
static LIST_HEAD(nfs_access_lru_list);
static atomic_long_t nfs_access_nr_entries;
+static unsigned long nfs_access_max_cachesize = ULONG_MAX;
+module_param(nfs_access_max_cachesize, ulong, 0644);
+MODULE_PARM_DESC(nfs_access_max_cachesize, "NFS access maximum total cache length");
+
static void nfs_access_free_entry(struct nfs_access_entry *entry)
{
put_rpccred(entry->cred);
- kfree(entry);
+ kfree_rcu(entry, rcu_head);
smp_mb__before_atomic();
atomic_long_dec(&nfs_access_nr_entries);
smp_mb__after_atomic();
@@ -2048,19 +2092,14 @@ static void nfs_access_free_list(struct list_head *head)
}
}
-unsigned long
-nfs_access_cache_scan(struct shrinker *shrink, struct shrink_control *sc)
+static unsigned long
+nfs_do_access_cache_scan(unsigned int nr_to_scan)
{
LIST_HEAD(head);
struct nfs_inode *nfsi, *next;
struct nfs_access_entry *cache;
- int nr_to_scan = sc->nr_to_scan;
- gfp_t gfp_mask = sc->gfp_mask;
long freed = 0;
- if ((gfp_mask & GFP_KERNEL) != GFP_KERNEL)
- return SHRINK_STOP;
-
spin_lock(&nfs_access_lru_lock);
list_for_each_entry_safe(nfsi, next, &nfs_access_lru_list, access_cache_inode_lru) {
struct inode *inode;
@@ -2094,11 +2133,39 @@ remove_lru_entry:
}
unsigned long
+nfs_access_cache_scan(struct shrinker *shrink, struct shrink_control *sc)
+{
+ int nr_to_scan = sc->nr_to_scan;
+ gfp_t gfp_mask = sc->gfp_mask;
+
+ if ((gfp_mask & GFP_KERNEL) != GFP_KERNEL)
+ return SHRINK_STOP;
+ return nfs_do_access_cache_scan(nr_to_scan);
+}
+
+
+unsigned long
nfs_access_cache_count(struct shrinker *shrink, struct shrink_control *sc)
{
return vfs_pressure_ratio(atomic_long_read(&nfs_access_nr_entries));
}
+static void
+nfs_access_cache_enforce_limit(void)
+{
+ long nr_entries = atomic_long_read(&nfs_access_nr_entries);
+ unsigned long diff;
+ unsigned int nr_to_scan;
+
+ if (nr_entries < 0 || nr_entries <= nfs_access_max_cachesize)
+ return;
+ nr_to_scan = 100;
+ diff = nr_entries - nfs_access_max_cachesize;
+ if (diff < nr_to_scan)
+ nr_to_scan = diff;
+ nfs_do_access_cache_scan(nr_to_scan);
+}
+
static void __nfs_access_zap_cache(struct nfs_inode *nfsi, struct list_head *head)
{
struct rb_root *root_node = &nfsi->access_cache;
@@ -2186,6 +2253,38 @@ out_zap:
return -ENOENT;
}
+static int nfs_access_get_cached_rcu(struct inode *inode, struct rpc_cred *cred, struct nfs_access_entry *res)
+{
+ /* Only check the most recently returned cache entry,
+ * but do it without locking.
+ */
+ struct nfs_inode *nfsi = NFS_I(inode);
+ struct nfs_access_entry *cache;
+ int err = -ECHILD;
+ struct list_head *lh;
+
+ rcu_read_lock();
+ if (nfsi->cache_validity & NFS_INO_INVALID_ACCESS)
+ goto out;
+ lh = rcu_dereference(nfsi->access_cache_entry_lru.prev);
+ cache = list_entry(lh, struct nfs_access_entry, lru);
+ if (lh == &nfsi->access_cache_entry_lru ||
+ cred != cache->cred)
+ cache = NULL;
+ if (cache == NULL)
+ goto out;
+ if (!nfs_have_delegated_attributes(inode) &&
+ !time_in_range_open(jiffies, cache->jiffies, cache->jiffies + nfsi->attrtimeo))
+ goto out;
+ res->jiffies = cache->jiffies;
+ res->cred = cache->cred;
+ res->mask = cache->mask;
+ err = 0;
+out:
+ rcu_read_unlock();
+ return err;
+}
+
static void nfs_access_add_rbtree(struct inode *inode, struct nfs_access_entry *set)
{
struct nfs_inode *nfsi = NFS_I(inode);
@@ -2229,6 +2328,11 @@ void nfs_access_add_cache(struct inode *inode, struct nfs_access_entry *set)
cache->cred = get_rpccred(set->cred);
cache->mask = set->mask;
+ /* The above field assignments must be visible
+ * before this item appears on the lru. We cannot easily
+ * use rcu_assign_pointer, so just force the memory barrier.
+ */
+ smp_wmb();
nfs_access_add_rbtree(inode, cache);
/* Update accounting */
@@ -2244,6 +2348,7 @@ void nfs_access_add_cache(struct inode *inode, struct nfs_access_entry *set)
&nfs_access_lru_list);
spin_unlock(&nfs_access_lru_lock);
}
+ nfs_access_cache_enforce_limit();
}
EXPORT_SYMBOL_GPL(nfs_access_add_cache);
@@ -2267,10 +2372,16 @@ static int nfs_do_access(struct inode *inode, struct rpc_cred *cred, int mask)
trace_nfs_access_enter(inode);
- status = nfs_access_get_cached(inode, cred, &cache);
+ status = nfs_access_get_cached_rcu(inode, cred, &cache);
+ if (status != 0)
+ status = nfs_access_get_cached(inode, cred, &cache);
if (status == 0)
goto out_cached;
+ status = -ECHILD;
+ if (mask & MAY_NOT_BLOCK)
+ goto out;
+
/* Be clever: ask server to check for all possible rights */
cache.mask = MAY_EXEC | MAY_WRITE | MAY_READ;
cache.cred = cred;
@@ -2321,9 +2432,6 @@ int nfs_permission(struct inode *inode, int mask)
struct rpc_cred *cred;
int res = 0;
- if (mask & MAY_NOT_BLOCK)
- return -ECHILD;
-
nfs_inc_stats(inode, NFSIOS_VFSACCESS);
if ((mask & (MAY_READ | MAY_WRITE | MAY_EXEC)) == 0)
@@ -2350,12 +2458,23 @@ force_lookup:
if (!NFS_PROTO(inode)->access)
goto out_notsup;
- cred = rpc_lookup_cred();
- if (!IS_ERR(cred)) {
- res = nfs_do_access(inode, cred, mask);
- put_rpccred(cred);
- } else
+ /* Always try fast lookups first */
+ rcu_read_lock();
+ cred = rpc_lookup_cred_nonblock();
+ if (!IS_ERR(cred))
+ res = nfs_do_access(inode, cred, mask|MAY_NOT_BLOCK);
+ else
res = PTR_ERR(cred);
+ rcu_read_unlock();
+ if (res == -ECHILD && !(mask & MAY_NOT_BLOCK)) {
+ /* Fast lookup failed, try the slow way */
+ cred = rpc_lookup_cred();
+ if (!IS_ERR(cred)) {
+ res = nfs_do_access(inode, cred, mask);
+ put_rpccred(cred);
+ } else
+ res = PTR_ERR(cred);
+ }
out:
if (!res && (mask & MAY_EXEC) && !execute_ok(inode))
res = -EACCES;
@@ -2364,6 +2483,9 @@ out:
inode->i_sb->s_id, inode->i_ino, mask, res);
return res;
out_notsup:
+ if (mask & MAY_NOT_BLOCK)
+ return -ECHILD;
+
res = nfs_revalidate_inode(NFS_SERVER(inode), inode);
if (res == 0)
res = generic_permission(inode, mask);
diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index f11b9eed0de..10bf07280f4 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -148,8 +148,8 @@ static void nfs_direct_set_hdr_verf(struct nfs_direct_req *dreq,
{
struct nfs_writeverf *verfp;
- verfp = nfs_direct_select_verf(dreq, hdr->data->ds_clp,
- hdr->data->ds_idx);
+ verfp = nfs_direct_select_verf(dreq, hdr->ds_clp,
+ hdr->ds_idx);
WARN_ON_ONCE(verfp->committed >= 0);
memcpy(verfp, &hdr->verf, sizeof(struct nfs_writeverf));
WARN_ON_ONCE(verfp->committed < 0);
@@ -169,8 +169,8 @@ static int nfs_direct_set_or_cmp_hdr_verf(struct nfs_direct_req *dreq,
{
struct nfs_writeverf *verfp;
- verfp = nfs_direct_select_verf(dreq, hdr->data->ds_clp,
- hdr->data->ds_idx);
+ verfp = nfs_direct_select_verf(dreq, hdr->ds_clp,
+ hdr->ds_idx);
if (verfp->committed < 0) {
nfs_direct_set_hdr_verf(dreq, hdr);
return 0;
@@ -178,7 +178,6 @@ static int nfs_direct_set_or_cmp_hdr_verf(struct nfs_direct_req *dreq,
return memcmp(verfp, &hdr->verf, sizeof(struct nfs_writeverf));
}
-#if IS_ENABLED(CONFIG_NFS_V3) || IS_ENABLED(CONFIG_NFS_V4)
/*
* nfs_direct_cmp_commit_data_verf - compare verifier for commit data
* @dreq - direct request possibly spanning multiple servers
@@ -197,7 +196,6 @@ static int nfs_direct_cmp_commit_data_verf(struct nfs_direct_req *dreq,
WARN_ON_ONCE(verfp->committed < 0);
return memcmp(verfp, &data->verf, sizeof(struct nfs_writeverf));
}
-#endif
/**
* nfs_direct_IO - NFS address space operation for direct I/O
@@ -222,11 +220,9 @@ ssize_t nfs_direct_IO(int rw, struct kiocb *iocb, struct iov_iter *iter, loff_t
#else
VM_BUG_ON(iocb->ki_nbytes != PAGE_SIZE);
- if (rw == READ || rw == KERNEL_READ)
- return nfs_file_direct_read(iocb, iter, pos,
- rw == READ ? true : false);
- return nfs_file_direct_write(iocb, iter, pos,
- rw == WRITE ? true : false);
+ if (rw == READ)
+ return nfs_file_direct_read(iocb, iter, pos);
+ return nfs_file_direct_write(iocb, iter, pos);
#endif /* CONFIG_NFS_SWAP */
}
@@ -270,6 +266,7 @@ static void nfs_direct_req_free(struct kref *kref)
{
struct nfs_direct_req *dreq = container_of(kref, struct nfs_direct_req, kref);
+ nfs_free_pnfs_ds_cinfo(&dreq->ds_cinfo);
if (dreq->l_ctx != NULL)
nfs_put_lock_context(dreq->l_ctx);
if (dreq->ctx != NULL)
@@ -512,7 +509,7 @@ static ssize_t nfs_direct_read_schedule_iovec(struct nfs_direct_req *dreq,
* cache.
*/
ssize_t nfs_file_direct_read(struct kiocb *iocb, struct iov_iter *iter,
- loff_t pos, bool uio)
+ loff_t pos)
{
struct file *file = iocb->ki_filp;
struct address_space *mapping = file->f_mapping;
@@ -576,7 +573,6 @@ out:
return result;
}
-#if IS_ENABLED(CONFIG_NFS_V3) || IS_ENABLED(CONFIG_NFS_V4)
static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq)
{
struct nfs_pageio_descriptor desc;
@@ -700,22 +696,11 @@ static void nfs_direct_write_complete(struct nfs_direct_req *dreq, struct inode
schedule_work(&dreq->work); /* Calls nfs_direct_write_schedule_work */
}
-#else
-static void nfs_direct_write_schedule_work(struct work_struct *work)
-{
-}
-
-static void nfs_direct_write_complete(struct nfs_direct_req *dreq, struct inode *inode)
-{
- nfs_direct_complete(dreq, true);
-}
-#endif
-
static void nfs_direct_write_completion(struct nfs_pgio_header *hdr)
{
struct nfs_direct_req *dreq = hdr->dreq;
struct nfs_commit_info cinfo;
- int bit = -1;
+ bool request_commit = false;
struct nfs_page *req = nfs_list_entry(hdr->pages.next);
if (test_bit(NFS_IOHDR_REDO, &hdr->flags))
@@ -729,27 +714,20 @@ static void nfs_direct_write_completion(struct nfs_pgio_header *hdr)
dreq->flags = 0;
dreq->error = hdr->error;
}
- if (dreq->error != 0)
- bit = NFS_IOHDR_ERROR;
- else {
+ if (dreq->error == 0) {
dreq->count += hdr->good_bytes;
- if (test_bit(NFS_IOHDR_NEED_RESCHED, &hdr->flags)) {
- dreq->flags = NFS_ODIRECT_RESCHED_WRITES;
- bit = NFS_IOHDR_NEED_RESCHED;
- } else if (test_bit(NFS_IOHDR_NEED_COMMIT, &hdr->flags)) {
+ if (nfs_write_need_commit(hdr)) {
if (dreq->flags == NFS_ODIRECT_RESCHED_WRITES)
- bit = NFS_IOHDR_NEED_RESCHED;
+ request_commit = true;
else if (dreq->flags == 0) {
nfs_direct_set_hdr_verf(dreq, hdr);
- bit = NFS_IOHDR_NEED_COMMIT;
+ request_commit = true;
dreq->flags = NFS_ODIRECT_DO_COMMIT;
} else if (dreq->flags == NFS_ODIRECT_DO_COMMIT) {
- if (nfs_direct_set_or_cmp_hdr_verf(dreq, hdr)) {
+ request_commit = true;
+ if (nfs_direct_set_or_cmp_hdr_verf(dreq, hdr))
dreq->flags =
NFS_ODIRECT_RESCHED_WRITES;
- bit = NFS_IOHDR_NEED_RESCHED;
- } else
- bit = NFS_IOHDR_NEED_COMMIT;
}
}
}
@@ -759,9 +737,7 @@ static void nfs_direct_write_completion(struct nfs_pgio_header *hdr)
req = nfs_list_entry(hdr->pages.next);
nfs_list_remove_request(req);
- switch (bit) {
- case NFS_IOHDR_NEED_RESCHED:
- case NFS_IOHDR_NEED_COMMIT:
+ if (request_commit) {
kref_get(&req->wb_kref);
nfs_mark_request_commit(req, hdr->lseg, &cinfo);
}
@@ -902,7 +878,7 @@ static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq,
* is no atomic O_APPEND write facility in the NFS protocol.
*/
ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter,
- loff_t pos, bool uio)
+ loff_t pos)
{
ssize_t result = -EINVAL;
struct file *file = iocb->ki_filp;
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index 524dd80d189..2ab6f00dba5 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -36,6 +36,7 @@
#include "internal.h"
#include "iostat.h"
#include "fscache.h"
+#include "pnfs.h"
#include "nfstrace.h"
@@ -171,7 +172,7 @@ nfs_file_read(struct kiocb *iocb, struct iov_iter *to)
ssize_t result;
if (iocb->ki_filp->f_flags & O_DIRECT)
- return nfs_file_direct_read(iocb, to, iocb->ki_pos, true);
+ return nfs_file_direct_read(iocb, to, iocb->ki_pos);
dprintk("NFS: read(%pD2, %zu@%lu)\n",
iocb->ki_filp,
@@ -327,6 +328,12 @@ static int nfs_want_read_modify_write(struct file *file, struct page *page,
unsigned int offset = pos & (PAGE_CACHE_SIZE - 1);
unsigned int end = offset + len;
+ if (pnfs_ld_read_whole_page(file->f_mapping->host)) {
+ if (!PageUptodate(page))
+ return 1;
+ return 0;
+ }
+
if ((file->f_mode & FMODE_READ) && /* open for read? */
!PageUptodate(page) && /* Uptodate? */
!PagePrivate(page) && /* i/o request already? */
@@ -468,17 +475,26 @@ static int nfs_release_page(struct page *page, gfp_t gfp)
dfprintk(PAGECACHE, "NFS: release_page(%p)\n", page);
- /* Only do I/O if gfp is a superset of GFP_KERNEL, and we're not
- * doing this memory reclaim for a fs-related allocation.
+ /* Always try to initiate a 'commit' if relevant, but only
+ * wait for it if __GFP_WAIT is set. Even then, only wait 1
+ * second and only if the 'bdi' is not congested.
+ * Waiting indefinitely can cause deadlocks when the NFS
+ * server is on this machine, when a new TCP connection is
+ * needed and in other rare cases. There is no particular
+ * need to wait extensively here. A short wait has the
+ * benefit that someone else can worry about the freezer.
*/
- if (mapping && (gfp & GFP_KERNEL) == GFP_KERNEL &&
- !(current->flags & PF_FSTRANS)) {
- int how = FLUSH_SYNC;
-
- /* Don't let kswapd deadlock waiting for OOM RPC calls */
- if (current_is_kswapd())
- how = 0;
- nfs_commit_inode(mapping->host, how);
+ if (mapping) {
+ struct nfs_server *nfss = NFS_SERVER(mapping->host);
+ nfs_commit_inode(mapping->host, 0);
+ if ((gfp & __GFP_WAIT) &&
+ !bdi_write_congested(&nfss->backing_dev_info)) {
+ wait_on_page_bit_killable_timeout(page, PG_private,
+ HZ);
+ if (PagePrivate(page))
+ set_bdi_congested(&nfss->backing_dev_info,
+ BLK_RW_ASYNC);
+ }
}
/* If PagePrivate() is set, then the page is not freeable */
if (PagePrivate(page))
@@ -539,13 +555,25 @@ static int nfs_launder_page(struct page *page)
static int nfs_swap_activate(struct swap_info_struct *sis, struct file *file,
sector_t *span)
{
+ int ret;
+ struct rpc_clnt *clnt = NFS_CLIENT(file->f_mapping->host);
+
*span = sis->pages;
- return xs_swapper(NFS_CLIENT(file->f_mapping->host)->cl_xprt, 1);
+
+ rcu_read_lock();
+ ret = xs_swapper(rcu_dereference(clnt->cl_xprt), 1);
+ rcu_read_unlock();
+
+ return ret;
}
static void nfs_swap_deactivate(struct file *file)
{
- xs_swapper(NFS_CLIENT(file->f_mapping->host)->cl_xprt, 0);
+ struct rpc_clnt *clnt = NFS_CLIENT(file->f_mapping->host);
+
+ rcu_read_lock();
+ xs_swapper(rcu_dereference(clnt->cl_xprt), 0);
+ rcu_read_unlock();
}
#endif
@@ -648,7 +676,7 @@ ssize_t nfs_file_write(struct kiocb *iocb, struct iov_iter *from)
return result;
if (file->f_flags & O_DIRECT)
- return nfs_file_direct_write(iocb, from, pos, true);
+ return nfs_file_direct_write(iocb, from, pos);
dprintk("NFS: write(%pD2, %zu@%Ld)\n",
file, count, (long long) pos);
@@ -891,17 +919,6 @@ int nfs_flock(struct file *filp, int cmd, struct file_lock *fl)
}
EXPORT_SYMBOL_GPL(nfs_flock);
-/*
- * There is no protocol support for leases, so we have no way to implement
- * them correctly in the face of opens by other clients.
- */
-int nfs_setlease(struct file *file, long arg, struct file_lock **fl)
-{
- dprintk("NFS: setlease(%pD2, arg=%ld)\n", file, arg);
- return -EINVAL;
-}
-EXPORT_SYMBOL_GPL(nfs_setlease);
-
const struct file_operations nfs_file_operations = {
.llseek = nfs_file_llseek,
.read = new_sync_read,
@@ -918,6 +935,6 @@ const struct file_operations nfs_file_operations = {
.splice_read = nfs_file_splice_read,
.splice_write = iter_file_splice_write,
.check_flags = nfs_check_flags,
- .setlease = nfs_setlease,
+ .setlease = simple_nosetlease,
};
EXPORT_SYMBOL_GPL(nfs_file_operations);
diff --git a/fs/nfs/filelayout/filelayout.c b/fs/nfs/filelayout/filelayout.c
index d2eba1c13b7..7afb52f6a25 100644
--- a/fs/nfs/filelayout/filelayout.c
+++ b/fs/nfs/filelayout/filelayout.c
@@ -84,45 +84,37 @@ filelayout_get_dserver_offset(struct pnfs_layout_segment *lseg, loff_t offset)
BUG();
}
-static void filelayout_reset_write(struct nfs_pgio_data *data)
+static void filelayout_reset_write(struct nfs_pgio_header *hdr)
{
- struct nfs_pgio_header *hdr = data->header;
- struct rpc_task *task = &data->task;
+ struct rpc_task *task = &hdr->task;
if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) {
dprintk("%s Reset task %5u for i/o through MDS "
"(req %s/%llu, %u bytes @ offset %llu)\n", __func__,
- data->task.tk_pid,
+ hdr->task.tk_pid,
hdr->inode->i_sb->s_id,
(unsigned long long)NFS_FILEID(hdr->inode),
- data->args.count,
- (unsigned long long)data->args.offset);
+ hdr->args.count,
+ (unsigned long long)hdr->args.offset);
- task->tk_status = pnfs_write_done_resend_to_mds(hdr->inode,
- &hdr->pages,
- hdr->completion_ops,
- hdr->dreq);
+ task->tk_status = pnfs_write_done_resend_to_mds(hdr);
}
}
-static void filelayout_reset_read(struct nfs_pgio_data *data)
+static void filelayout_reset_read(struct nfs_pgio_header *hdr)
{
- struct nfs_pgio_header *hdr = data->header;
- struct rpc_task *task = &data->task;
+ struct rpc_task *task = &hdr->task;
if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) {
dprintk("%s Reset task %5u for i/o through MDS "
"(req %s/%llu, %u bytes @ offset %llu)\n", __func__,
- data->task.tk_pid,
+ hdr->task.tk_pid,
hdr->inode->i_sb->s_id,
(unsigned long long)NFS_FILEID(hdr->inode),
- data->args.count,
- (unsigned long long)data->args.offset);
+ hdr->args.count,
+ (unsigned long long)hdr->args.offset);
- task->tk_status = pnfs_read_done_resend_to_mds(hdr->inode,
- &hdr->pages,
- hdr->completion_ops,
- hdr->dreq);
+ task->tk_status = pnfs_read_done_resend_to_mds(hdr);
}
}
@@ -153,9 +145,6 @@ static int filelayout_async_handle_error(struct rpc_task *task,
case -NFS4ERR_DELEG_REVOKED:
case -NFS4ERR_ADMIN_REVOKED:
case -NFS4ERR_BAD_STATEID:
- if (state == NULL)
- break;
- nfs_remove_bad_delegation(state->inode);
case -NFS4ERR_OPENMODE:
if (state == NULL)
break;
@@ -243,18 +232,17 @@ wait_on_recovery:
/* NFS_PROTO call done callback routines */
static int filelayout_read_done_cb(struct rpc_task *task,
- struct nfs_pgio_data *data)
+ struct nfs_pgio_header *hdr)
{
- struct nfs_pgio_header *hdr = data->header;
int err;
- trace_nfs4_pnfs_read(data, task->tk_status);
- err = filelayout_async_handle_error(task, data->args.context->state,
- data->ds_clp, hdr->lseg);
+ trace_nfs4_pnfs_read(hdr, task->tk_status);
+ err = filelayout_async_handle_error(task, hdr->args.context->state,
+ hdr->ds_clp, hdr->lseg);
switch (err) {
case -NFS4ERR_RESET_TO_MDS:
- filelayout_reset_read(data);
+ filelayout_reset_read(hdr);
return task->tk_status;
case -EAGAIN:
rpc_restart_call_prepare(task);
@@ -270,15 +258,14 @@ static int filelayout_read_done_cb(struct rpc_task *task,
* rfc5661 is not clear about which credential should be used.
*/
static void
-filelayout_set_layoutcommit(struct nfs_pgio_data *wdata)
+filelayout_set_layoutcommit(struct nfs_pgio_header *hdr)
{
- struct nfs_pgio_header *hdr = wdata->header;
if (FILELAYOUT_LSEG(hdr->lseg)->commit_through_mds ||
- wdata->res.verf->committed == NFS_FILE_SYNC)
+ hdr->res.verf->committed != NFS_DATA_SYNC)
return;
- pnfs_set_layoutcommit(wdata);
+ pnfs_set_layoutcommit(hdr);
dprintk("%s inode %lu pls_end_pos %lu\n", __func__, hdr->inode->i_ino,
(unsigned long) NFS_I(hdr->inode)->layout->plh_lwb);
}
@@ -305,83 +292,82 @@ filelayout_reset_to_mds(struct pnfs_layout_segment *lseg)
*/
static void filelayout_read_prepare(struct rpc_task *task, void *data)
{
- struct nfs_pgio_data *rdata = data;
+ struct nfs_pgio_header *hdr = data;
- if (unlikely(test_bit(NFS_CONTEXT_BAD, &rdata->args.context->flags))) {
+ if (unlikely(test_bit(NFS_CONTEXT_BAD, &hdr->args.context->flags))) {
rpc_exit(task, -EIO);
return;
}
- if (filelayout_reset_to_mds(rdata->header->lseg)) {
+ if (filelayout_reset_to_mds(hdr->lseg)) {
dprintk("%s task %u reset io to MDS\n", __func__, task->tk_pid);
- filelayout_reset_read(rdata);
+ filelayout_reset_read(hdr);
rpc_exit(task, 0);
return;
}
- rdata->pgio_done_cb = filelayout_read_done_cb;
+ hdr->pgio_done_cb = filelayout_read_done_cb;
- if (nfs41_setup_sequence(rdata->ds_clp->cl_session,
- &rdata->args.seq_args,
- &rdata->res.seq_res,
+ if (nfs41_setup_sequence(hdr->ds_clp->cl_session,
+ &hdr->args.seq_args,
+ &hdr->res.seq_res,
task))
return;
- if (nfs4_set_rw_stateid(&rdata->args.stateid, rdata->args.context,
- rdata->args.lock_context, FMODE_READ) == -EIO)
+ if (nfs4_set_rw_stateid(&hdr->args.stateid, hdr->args.context,
+ hdr->args.lock_context, FMODE_READ) == -EIO)
rpc_exit(task, -EIO); /* lost lock, terminate I/O */
}
static void filelayout_read_call_done(struct rpc_task *task, void *data)
{
- struct nfs_pgio_data *rdata = data;
+ struct nfs_pgio_header *hdr = data;
dprintk("--> %s task->tk_status %d\n", __func__, task->tk_status);
- if (test_bit(NFS_IOHDR_REDO, &rdata->header->flags) &&
+ if (test_bit(NFS_IOHDR_REDO, &hdr->flags) &&
task->tk_status == 0) {
- nfs41_sequence_done(task, &rdata->res.seq_res);
+ nfs41_sequence_done(task, &hdr->res.seq_res);
return;
}
/* Note this may cause RPC to be resent */
- rdata->header->mds_ops->rpc_call_done(task, data);
+ hdr->mds_ops->rpc_call_done(task, data);
}
static void filelayout_read_count_stats(struct rpc_task *task, void *data)
{
- struct nfs_pgio_data *rdata = data;
+ struct nfs_pgio_header *hdr = data;
- rpc_count_iostats(task, NFS_SERVER(rdata->header->inode)->client->cl_metrics);
+ rpc_count_iostats(task, NFS_SERVER(hdr->inode)->client->cl_metrics);
}
static void filelayout_read_release(void *data)
{
- struct nfs_pgio_data *rdata = data;
- struct pnfs_layout_hdr *lo = rdata->header->lseg->pls_layout;
+ struct nfs_pgio_header *hdr = data;
+ struct pnfs_layout_hdr *lo = hdr->lseg->pls_layout;
filelayout_fenceme(lo->plh_inode, lo);
- nfs_put_client(rdata->ds_clp);
- rdata->header->mds_ops->rpc_release(data);
+ nfs_put_client(hdr->ds_clp);
+ hdr->mds_ops->rpc_release(data);
}
static int filelayout_write_done_cb(struct rpc_task *task,
- struct nfs_pgio_data *data)
+ struct nfs_pgio_header *hdr)
{
- struct nfs_pgio_header *hdr = data->header;
int err;
- trace_nfs4_pnfs_write(data, task->tk_status);
- err = filelayout_async_handle_error(task, data->args.context->state,
- data->ds_clp, hdr->lseg);
+ trace_nfs4_pnfs_write(hdr, task->tk_status);
+ err = filelayout_async_handle_error(task, hdr->args.context->state,
+ hdr->ds_clp, hdr->lseg);
switch (err) {
case -NFS4ERR_RESET_TO_MDS:
- filelayout_reset_write(data);
+ filelayout_reset_write(hdr);
return task->tk_status;
case -EAGAIN:
rpc_restart_call_prepare(task);
return -EAGAIN;
}
- filelayout_set_layoutcommit(data);
+ filelayout_set_layoutcommit(hdr);
return 0;
}
@@ -414,62 +400,65 @@ static int filelayout_commit_done_cb(struct rpc_task *task,
return -EAGAIN;
}
+ if (data->verf.committed == NFS_UNSTABLE)
+ pnfs_commit_set_layoutcommit(data);
+
return 0;
}
static void filelayout_write_prepare(struct rpc_task *task, void *data)
{
- struct nfs_pgio_data *wdata = data;
+ struct nfs_pgio_header *hdr = data;
- if (unlikely(test_bit(NFS_CONTEXT_BAD, &wdata->args.context->flags))) {
+ if (unlikely(test_bit(NFS_CONTEXT_BAD, &hdr->args.context->flags))) {
rpc_exit(task, -EIO);
return;
}
- if (filelayout_reset_to_mds(wdata->header->lseg)) {
+ if (filelayout_reset_to_mds(hdr->lseg)) {
dprintk("%s task %u reset io to MDS\n", __func__, task->tk_pid);
- filelayout_reset_write(wdata);
+ filelayout_reset_write(hdr);
rpc_exit(task, 0);
return;
}
- if (nfs41_setup_sequence(wdata->ds_clp->cl_session,
- &wdata->args.seq_args,
- &wdata->res.seq_res,
+ if (nfs41_setup_sequence(hdr->ds_clp->cl_session,
+ &hdr->args.seq_args,
+ &hdr->res.seq_res,
task))
return;
- if (nfs4_set_rw_stateid(&wdata->args.stateid, wdata->args.context,
- wdata->args.lock_context, FMODE_WRITE) == -EIO)
+ if (nfs4_set_rw_stateid(&hdr->args.stateid, hdr->args.context,
+ hdr->args.lock_context, FMODE_WRITE) == -EIO)
rpc_exit(task, -EIO); /* lost lock, terminate I/O */
}
static void filelayout_write_call_done(struct rpc_task *task, void *data)
{
- struct nfs_pgio_data *wdata = data;
+ struct nfs_pgio_header *hdr = data;
- if (test_bit(NFS_IOHDR_REDO, &wdata->header->flags) &&
+ if (test_bit(NFS_IOHDR_REDO, &hdr->flags) &&
task->tk_status == 0) {
- nfs41_sequence_done(task, &wdata->res.seq_res);
+ nfs41_sequence_done(task, &hdr->res.seq_res);
return;
}
/* Note this may cause RPC to be resent */
- wdata->header->mds_ops->rpc_call_done(task, data);
+ hdr->mds_ops->rpc_call_done(task, data);
}
static void filelayout_write_count_stats(struct rpc_task *task, void *data)
{
- struct nfs_pgio_data *wdata = data;
+ struct nfs_pgio_header *hdr = data;
- rpc_count_iostats(task, NFS_SERVER(wdata->header->inode)->client->cl_metrics);
+ rpc_count_iostats(task, NFS_SERVER(hdr->inode)->client->cl_metrics);
}
static void filelayout_write_release(void *data)
{
- struct nfs_pgio_data *wdata = data;
- struct pnfs_layout_hdr *lo = wdata->header->lseg->pls_layout;
+ struct nfs_pgio_header *hdr = data;
+ struct pnfs_layout_hdr *lo = hdr->lseg->pls_layout;
filelayout_fenceme(lo->plh_inode, lo);
- nfs_put_client(wdata->ds_clp);
- wdata->header->mds_ops->rpc_release(data);
+ nfs_put_client(hdr->ds_clp);
+ hdr->mds_ops->rpc_release(data);
}
static void filelayout_commit_prepare(struct rpc_task *task, void *data)
@@ -529,19 +518,18 @@ static const struct rpc_call_ops filelayout_commit_call_ops = {
};
static enum pnfs_try_status
-filelayout_read_pagelist(struct nfs_pgio_data *data)
+filelayout_read_pagelist(struct nfs_pgio_header *hdr)
{
- struct nfs_pgio_header *hdr = data->header;
struct pnfs_layout_segment *lseg = hdr->lseg;
struct nfs4_pnfs_ds *ds;
struct rpc_clnt *ds_clnt;
- loff_t offset = data->args.offset;
+ loff_t offset = hdr->args.offset;
u32 j, idx;
struct nfs_fh *fh;
dprintk("--> %s ino %lu pgbase %u req %Zu@%llu\n",
__func__, hdr->inode->i_ino,
- data->args.pgbase, (size_t)data->args.count, offset);
+ hdr->args.pgbase, (size_t)hdr->args.count, offset);
/* Retrieve the correct rpc_client for the byte range */
j = nfs4_fl_calc_j_index(lseg, offset);
@@ -559,30 +547,29 @@ filelayout_read_pagelist(struct nfs_pgio_data *data)
/* No multipath support. Use first DS */
atomic_inc(&ds->ds_clp->cl_count);
- data->ds_clp = ds->ds_clp;
- data->ds_idx = idx;
+ hdr->ds_clp = ds->ds_clp;
+ hdr->ds_idx = idx;
fh = nfs4_fl_select_ds_fh(lseg, j);
if (fh)
- data->args.fh = fh;
+ hdr->args.fh = fh;
- data->args.offset = filelayout_get_dserver_offset(lseg, offset);
- data->mds_offset = offset;
+ hdr->args.offset = filelayout_get_dserver_offset(lseg, offset);
+ hdr->mds_offset = offset;
/* Perform an asynchronous read to ds */
- nfs_initiate_pgio(ds_clnt, data,
+ nfs_initiate_pgio(ds_clnt, hdr,
&filelayout_read_call_ops, 0, RPC_TASK_SOFTCONN);
return PNFS_ATTEMPTED;
}
/* Perform async writes. */
static enum pnfs_try_status
-filelayout_write_pagelist(struct nfs_pgio_data *data, int sync)
+filelayout_write_pagelist(struct nfs_pgio_header *hdr, int sync)
{
- struct nfs_pgio_header *hdr = data->header;
struct pnfs_layout_segment *lseg = hdr->lseg;
struct nfs4_pnfs_ds *ds;
struct rpc_clnt *ds_clnt;
- loff_t offset = data->args.offset;
+ loff_t offset = hdr->args.offset;
u32 j, idx;
struct nfs_fh *fh;
@@ -598,21 +585,20 @@ filelayout_write_pagelist(struct nfs_pgio_data *data, int sync)
return PNFS_NOT_ATTEMPTED;
dprintk("%s ino %lu sync %d req %Zu@%llu DS: %s cl_count %d\n",
- __func__, hdr->inode->i_ino, sync, (size_t) data->args.count,
+ __func__, hdr->inode->i_ino, sync, (size_t) hdr->args.count,
offset, ds->ds_remotestr, atomic_read(&ds->ds_clp->cl_count));
- data->pgio_done_cb = filelayout_write_done_cb;
+ hdr->pgio_done_cb = filelayout_write_done_cb;
atomic_inc(&ds->ds_clp->cl_count);
- data->ds_clp = ds->ds_clp;
- data->ds_idx = idx;
+ hdr->ds_clp = ds->ds_clp;
+ hdr->ds_idx = idx;
fh = nfs4_fl_select_ds_fh(lseg, j);
if (fh)
- data->args.fh = fh;
-
- data->args.offset = filelayout_get_dserver_offset(lseg, offset);
+ hdr->args.fh = fh;
+ hdr->args.offset = filelayout_get_dserver_offset(lseg, offset);
/* Perform an asynchronous write */
- nfs_initiate_pgio(ds_clnt, data,
+ nfs_initiate_pgio(ds_clnt, hdr,
&filelayout_write_call_ops, sync,
RPC_TASK_SOFTCONN);
return PNFS_ATTEMPTED;
@@ -660,18 +646,15 @@ filelayout_check_layout(struct pnfs_layout_hdr *lo,
}
/* find and reference the deviceid */
- d = nfs4_find_get_deviceid(NFS_SERVER(lo->plh_inode)->pnfs_curr_ld,
- NFS_SERVER(lo->plh_inode)->nfs_client, id);
- if (d == NULL) {
- dsaddr = filelayout_get_device_info(lo->plh_inode, id,
- lo->plh_lc_cred, gfp_flags);
- if (dsaddr == NULL)
- goto out;
- } else
- dsaddr = container_of(d, struct nfs4_file_layout_dsaddr, id_node);
+ d = nfs4_find_get_deviceid(NFS_SERVER(lo->plh_inode), id,
+ lo->plh_lc_cred, gfp_flags);
+ if (d == NULL)
+ goto out;
+
+ dsaddr = container_of(d, struct nfs4_file_layout_dsaddr, id_node);
/* Found deviceid is unavailable */
if (filelayout_test_devid_unavailable(&dsaddr->id_node))
- goto out_put;
+ goto out_put;
fl->dsaddr = dsaddr;
@@ -1023,6 +1006,7 @@ static u32 select_bucket_index(struct nfs4_filelayout_segment *fl, u32 j)
/* The generic layer is about to remove the req from the commit list.
* If this will make the bucket empty, it will need to put the lseg reference.
+ * Note this must be called holding the inode (/cinfo) lock
*/
static void
filelayout_clear_request_commit(struct nfs_page *req,
@@ -1030,7 +1014,6 @@ filelayout_clear_request_commit(struct nfs_page *req,
{
struct pnfs_layout_segment *freeme = NULL;
- spin_lock(cinfo->lock);
if (!test_and_clear_bit(PG_COMMIT_TO_DS, &req->wb_flags))
goto out;
cinfo->ds->nwritten--;
@@ -1045,22 +1028,25 @@ filelayout_clear_request_commit(struct nfs_page *req,
}
out:
nfs_request_remove_commit_list(req, cinfo);
- spin_unlock(cinfo->lock);
- pnfs_put_lseg(freeme);
+ pnfs_put_lseg_locked(freeme);
}
-static struct list_head *
-filelayout_choose_commit_list(struct nfs_page *req,
- struct pnfs_layout_segment *lseg,
- struct nfs_commit_info *cinfo)
+static void
+filelayout_mark_request_commit(struct nfs_page *req,
+ struct pnfs_layout_segment *lseg,
+ struct nfs_commit_info *cinfo)
+
{
struct nfs4_filelayout_segment *fl = FILELAYOUT_LSEG(lseg);
u32 i, j;
struct list_head *list;
struct pnfs_commit_bucket *buckets;
- if (fl->commit_through_mds)
- return &cinfo->mds->list;
+ if (fl->commit_through_mds) {
+ list = &cinfo->mds->list;
+ spin_lock(cinfo->lock);
+ goto mds_commit;
+ }
/* Note that we are calling nfs4_fl_calc_j_index on each page
* that ends up being committed to a data server. An attractive
@@ -1084,19 +1070,22 @@ filelayout_choose_commit_list(struct nfs_page *req,
}
set_bit(PG_COMMIT_TO_DS, &req->wb_flags);
cinfo->ds->nwritten++;
- spin_unlock(cinfo->lock);
- return list;
-}
-
-static void
-filelayout_mark_request_commit(struct nfs_page *req,
- struct pnfs_layout_segment *lseg,
- struct nfs_commit_info *cinfo)
-{
- struct list_head *list;
- list = filelayout_choose_commit_list(req, lseg, cinfo);
- nfs_request_add_commit_list(req, list, cinfo);
+mds_commit:
+ /* Open-coded nfs_request_add_commit_list(): we need to add req to the
+ * list without dropping the cinfo lock.
+ */
+ set_bit(PG_CLEAN, &(req)->wb_flags);
+ nfs_list_add_request(req, list);
+ cinfo->mds->ncommit++;
+ spin_unlock(cinfo->lock);
+ if (!cinfo->dreq) {
+ inc_zone_page_state(req->wb_page, NR_UNSTABLE_NFS);
+ inc_bdi_stat(page_file_mapping(req->wb_page)->backing_dev_info,
+ BDI_RECLAIMABLE);
+ __mark_inode_dirty(req->wb_context->dentry->d_inode,
+ I_DIRTY_DATASYNC);
+ }
}
static u32 calc_ds_index_from_commit(struct pnfs_layout_segment *lseg, u32 i)
@@ -1244,15 +1233,64 @@ restart:
spin_unlock(cinfo->lock);
}
+/* filelayout_search_commit_reqs - Search lists in @cinfo for the head request
+ * for @page
+ * @cinfo - commit info for current inode
+ * @page - page to search for matching head request
+ *
+ * Returns the head request if one is found, otherwise returns NULL.
+ */
+static struct nfs_page *
+filelayout_search_commit_reqs(struct nfs_commit_info *cinfo, struct page *page)
+{
+ struct nfs_page *freq, *t;
+ struct pnfs_commit_bucket *b;
+ int i;
+
+ /* Linearly search the commit lists for each bucket until a matching
+ * request is found */
+ for (i = 0, b = cinfo->ds->buckets; i < cinfo->ds->nbuckets; i++, b++) {
+ list_for_each_entry_safe(freq, t, &b->written, wb_list) {
+ if (freq->wb_page == page)
+ return freq->wb_head;
+ }
+ list_for_each_entry_safe(freq, t, &b->committing, wb_list) {
+ if (freq->wb_page == page)
+ return freq->wb_head;
+ }
+ }
+
+ return NULL;
+}
+
+static void filelayout_retry_commit(struct nfs_commit_info *cinfo, int idx)
+{
+ struct pnfs_ds_commit_info *fl_cinfo = cinfo->ds;
+ struct pnfs_commit_bucket *bucket;
+ struct pnfs_layout_segment *freeme;
+ int i;
+
+ for (i = idx; i < fl_cinfo->nbuckets; i++) {
+ bucket = &fl_cinfo->buckets[i];
+ if (list_empty(&bucket->committing))
+ continue;
+ nfs_retry_commit(&bucket->committing, bucket->clseg, cinfo);
+ spin_lock(cinfo->lock);
+ freeme = bucket->clseg;
+ bucket->clseg = NULL;
+ spin_unlock(cinfo->lock);
+ pnfs_put_lseg(freeme);
+ }
+}
+
static unsigned int
alloc_ds_commits(struct nfs_commit_info *cinfo, struct list_head *list)
{
struct pnfs_ds_commit_info *fl_cinfo;
struct pnfs_commit_bucket *bucket;
struct nfs_commit_data *data;
- int i, j;
+ int i;
unsigned int nreq = 0;
- struct pnfs_layout_segment *freeme;
fl_cinfo = cinfo->ds;
bucket = fl_cinfo->buckets;
@@ -1272,16 +1310,7 @@ alloc_ds_commits(struct nfs_commit_info *cinfo, struct list_head *list)
}
/* Clean up on error */
- for (j = i; j < fl_cinfo->nbuckets; j++, bucket++) {
- if (list_empty(&bucket->committing))
- continue;
- nfs_retry_commit(&bucket->committing, bucket->clseg, cinfo);
- spin_lock(cinfo->lock);
- freeme = bucket->clseg;
- bucket->clseg = NULL;
- spin_unlock(cinfo->lock);
- pnfs_put_lseg(freeme);
- }
+ filelayout_retry_commit(cinfo, i);
/* Caller will clean up entries put on list */
return nreq;
}
@@ -1301,8 +1330,12 @@ filelayout_commit_pagelist(struct inode *inode, struct list_head *mds_pages,
data->lseg = NULL;
list_add(&data->pages, &list);
nreq++;
- } else
+ } else {
nfs_retry_commit(mds_pages, NULL, cinfo);
+ filelayout_retry_commit(cinfo, 0);
+ cinfo->completion_ops->error_cleanup(NFS_I(inode));
+ return -ENOMEM;
+ }
}
nreq += alloc_ds_commits(cinfo, &list);
@@ -1332,6 +1365,17 @@ out:
cinfo->ds->ncommitting = 0;
return PNFS_ATTEMPTED;
}
+static struct nfs4_deviceid_node *
+filelayout_alloc_deviceid_node(struct nfs_server *server,
+ struct pnfs_device *pdev, gfp_t gfp_flags)
+{
+ struct nfs4_file_layout_dsaddr *dsaddr;
+
+ dsaddr = nfs4_fl_alloc_deviceid_node(server, pdev, gfp_flags);
+ if (!dsaddr)
+ return NULL;
+ return &dsaddr->id_node;
+}
static void
filelayout_free_deveiceid_node(struct nfs4_deviceid_node *d)
@@ -1380,9 +1424,11 @@ static struct pnfs_layoutdriver_type filelayout_type = {
.clear_request_commit = filelayout_clear_request_commit,
.scan_commit_lists = filelayout_scan_commit_lists,
.recover_commit_reqs = filelayout_recover_commit_reqs,
+ .search_commit_reqs = filelayout_search_commit_reqs,
.commit_pagelist = filelayout_commit_pagelist,
.read_pagelist = filelayout_read_pagelist,
.write_pagelist = filelayout_write_pagelist,
+ .alloc_deviceid_node = filelayout_alloc_deviceid_node,
.free_deviceid_node = filelayout_free_deveiceid_node,
};
diff --git a/fs/nfs/filelayout/filelayout.h b/fs/nfs/filelayout/filelayout.h
index ffbddf2219e..7c9f800c49d 100644
--- a/fs/nfs/filelayout/filelayout.h
+++ b/fs/nfs/filelayout/filelayout.h
@@ -147,10 +147,11 @@ u32 nfs4_fl_calc_j_index(struct pnfs_layout_segment *lseg, loff_t offset);
u32 nfs4_fl_calc_ds_index(struct pnfs_layout_segment *lseg, u32 j);
struct nfs4_pnfs_ds *nfs4_fl_prepare_ds(struct pnfs_layout_segment *lseg,
u32 ds_idx);
+
+extern struct nfs4_file_layout_dsaddr *
+nfs4_fl_alloc_deviceid_node(struct nfs_server *server,
+ struct pnfs_device *pdev, gfp_t gfp_flags);
extern void nfs4_fl_put_deviceid(struct nfs4_file_layout_dsaddr *dsaddr);
extern void nfs4_fl_free_deviceid(struct nfs4_file_layout_dsaddr *dsaddr);
-struct nfs4_file_layout_dsaddr *
-filelayout_get_device_info(struct inode *inode, struct nfs4_deviceid *dev_id,
- struct rpc_cred *cred, gfp_t gfp_flags);
#endif /* FS_NFS_NFS4FILELAYOUT_H */
diff --git a/fs/nfs/filelayout/filelayoutdev.c b/fs/nfs/filelayout/filelayoutdev.c
index e2a0361e24c..9bb806a76d9 100644
--- a/fs/nfs/filelayout/filelayoutdev.c
+++ b/fs/nfs/filelayout/filelayoutdev.c
@@ -484,8 +484,9 @@ out_err:
}
/* Decode opaque device data and return the result */
-static struct nfs4_file_layout_dsaddr*
-decode_device(struct inode *ino, struct pnfs_device *pdev, gfp_t gfp_flags)
+struct nfs4_file_layout_dsaddr *
+nfs4_fl_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev,
+ gfp_t gfp_flags)
{
int i;
u32 cnt, num;
@@ -570,10 +571,7 @@ decode_device(struct inode *ino, struct pnfs_device *pdev, gfp_t gfp_flags)
dsaddr->stripe_indices = stripe_indices;
stripe_indices = NULL;
dsaddr->ds_num = num;
- nfs4_init_deviceid_node(&dsaddr->id_node,
- NFS_SERVER(ino)->pnfs_curr_ld,
- NFS_SERVER(ino)->nfs_client,
- &pdev->dev_id);
+ nfs4_init_deviceid_node(&dsaddr->id_node, server, &pdev->dev_id);
INIT_LIST_HEAD(&dsaddrs);
@@ -587,7 +585,7 @@ decode_device(struct inode *ino, struct pnfs_device *pdev, gfp_t gfp_flags)
mp_count = be32_to_cpup(p); /* multipath count */
for (j = 0; j < mp_count; j++) {
- da = decode_ds_addr(NFS_SERVER(ino)->nfs_client->cl_net,
+ da = decode_ds_addr(server->nfs_client->cl_net,
&stream, gfp_flags);
if (da)
list_add_tail(&da->da_node, &dsaddrs);
@@ -637,102 +635,6 @@ out_err:
return NULL;
}
-/*
- * Decode the opaque device specified in 'dev' and add it to the cache of
- * available devices.
- */
-static struct nfs4_file_layout_dsaddr *
-decode_and_add_device(struct inode *inode, struct pnfs_device *dev, gfp_t gfp_flags)
-{
- struct nfs4_deviceid_node *d;
- struct nfs4_file_layout_dsaddr *n, *new;
-
- new = decode_device(inode, dev, gfp_flags);
- if (!new) {
- printk(KERN_WARNING "NFS: %s: Could not decode or add device\n",
- __func__);
- return NULL;
- }
-
- d = nfs4_insert_deviceid_node(&new->id_node);
- n = container_of(d, struct nfs4_file_layout_dsaddr, id_node);
- if (n != new) {
- nfs4_fl_free_deviceid(new);
- return n;
- }
-
- return new;
-}
-
-/*
- * Retrieve the information for dev_id, add it to the list
- * of available devices, and return it.
- */
-struct nfs4_file_layout_dsaddr *
-filelayout_get_device_info(struct inode *inode,
- struct nfs4_deviceid *dev_id,
- struct rpc_cred *cred,
- gfp_t gfp_flags)
-{
- struct pnfs_device *pdev = NULL;
- u32 max_resp_sz;
- int max_pages;
- struct page **pages = NULL;
- struct nfs4_file_layout_dsaddr *dsaddr = NULL;
- int rc, i;
- struct nfs_server *server = NFS_SERVER(inode);
-
- /*
- * Use the session max response size as the basis for setting
- * GETDEVICEINFO's maxcount
- */
- max_resp_sz = server->nfs_client->cl_session->fc_attrs.max_resp_sz;
- max_pages = nfs_page_array_len(0, max_resp_sz);
- dprintk("%s inode %p max_resp_sz %u max_pages %d\n",
- __func__, inode, max_resp_sz, max_pages);
-
- pdev = kzalloc(sizeof(struct pnfs_device), gfp_flags);
- if (pdev == NULL)
- return NULL;
-
- pages = kzalloc(max_pages * sizeof(struct page *), gfp_flags);
- if (pages == NULL) {
- kfree(pdev);
- return NULL;
- }
- for (i = 0; i < max_pages; i++) {
- pages[i] = alloc_page(gfp_flags);
- if (!pages[i])
- goto out_free;
- }
-
- memcpy(&pdev->dev_id, dev_id, sizeof(*dev_id));
- pdev->layout_type = LAYOUT_NFSV4_1_FILES;
- pdev->pages = pages;
- pdev->pgbase = 0;
- pdev->pglen = max_resp_sz;
- pdev->mincount = 0;
- pdev->maxcount = max_resp_sz - nfs41_maxgetdevinfo_overhead;
-
- rc = nfs4_proc_getdeviceinfo(server, pdev, cred);
- dprintk("%s getdevice info returns %d\n", __func__, rc);
- if (rc)
- goto out_free;
-
- /*
- * Found new device, need to decode it and then add it to the
- * list of known devices for this mountpoint.
- */
- dsaddr = decode_and_add_device(inode, pdev, gfp_flags);
-out_free:
- for (i = 0; i < max_pages; i++)
- __free_page(pages[i]);
- kfree(pages);
- kfree(pdev);
- dprintk("<-- %s dsaddr %p\n", __func__, dsaddr);
- return dsaddr;
-}
-
void
nfs4_fl_put_deviceid(struct nfs4_file_layout_dsaddr *dsaddr)
{
diff --git a/fs/nfs/fscache-index.c b/fs/nfs/fscache-index.c
index 7cf2c4699b0..777b055063f 100644
--- a/fs/nfs/fscache-index.c
+++ b/fs/nfs/fscache-index.c
@@ -74,11 +74,10 @@ static uint16_t nfs_server_get_key(const void *cookie_netfs_data,
struct nfs_server_key *key = buffer;
uint16_t len = sizeof(struct nfs_server_key);
+ memset(key, 0, len);
key->nfsversion = clp->rpc_ops->version;
key->family = clp->cl_addr.ss_family;
- memset(key, 0, len);
-
switch (clp->cl_addr.ss_family) {
case AF_INET:
key->port = sin->sin_port;
diff --git a/fs/nfs/getroot.c b/fs/nfs/getroot.c
index b94f80420a5..880618a8b04 100644
--- a/fs/nfs/getroot.c
+++ b/fs/nfs/getroot.c
@@ -112,7 +112,7 @@ struct dentry *nfs_get_root(struct super_block *sb, struct nfs_fh *mntfh,
* if the dentry tree reaches them; however if the dentry already
* exists, we'll pick it up at this point and use it as the root
*/
- ret = d_obtain_alias(inode);
+ ret = d_obtain_root(inode);
if (IS_ERR(ret)) {
dprintk("nfs_get_root: get root dentry failed\n");
goto out;
diff --git a/fs/nfs/idmap.c b/fs/nfs/idmap.c
index 7dd55b745c4..2f5db844c17 100644
--- a/fs/nfs/idmap.c
+++ b/fs/nfs/idmap.c
@@ -177,7 +177,6 @@ static struct key_type key_type_id_resolver = {
.preparse = user_preparse,
.free_preparse = user_free_preparse,
.instantiate = generic_key_instantiate,
- .match = user_match,
.revoke = user_revoke,
.destroy = user_destroy,
.describe = user_describe,
@@ -401,7 +400,6 @@ static struct key_type key_type_id_resolver_legacy = {
.preparse = user_preparse,
.free_preparse = user_free_preparse,
.instantiate = generic_key_instantiate,
- .match = user_match,
.revoke = user_revoke,
.destroy = user_destroy,
.describe = user_describe,
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index abd37a38053..00689a8a85e 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -505,7 +505,9 @@ nfs_setattr(struct dentry *dentry, struct iattr *attr)
attr->ia_valid &= ~ATTR_MODE;
if (attr->ia_valid & ATTR_SIZE) {
- if (!S_ISREG(inode->i_mode) || attr->ia_size == i_size_read(inode))
+ BUG_ON(!S_ISREG(inode->i_mode));
+
+ if (attr->ia_size == i_size_read(inode))
attr->ia_valid &= ~ATTR_SIZE;
}
@@ -624,7 +626,7 @@ int nfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
{
struct inode *inode = dentry->d_inode;
int need_atime = NFS_I(inode)->cache_validity & NFS_INO_INVALID_ATIME;
- int err;
+ int err = 0;
trace_nfs_getattr_enter(inode);
/* Flush out writes to the server in order to update c/mtime. */
@@ -716,6 +718,7 @@ struct nfs_lock_context *nfs_get_lock_context(struct nfs_open_context *ctx)
kfree(new);
return res;
}
+EXPORT_SYMBOL_GPL(nfs_get_lock_context);
void nfs_put_lock_context(struct nfs_lock_context *l_ctx)
{
@@ -728,6 +731,7 @@ void nfs_put_lock_context(struct nfs_lock_context *l_ctx)
spin_unlock(&inode->i_lock);
kfree(l_ctx);
}
+EXPORT_SYMBOL_GPL(nfs_put_lock_context);
/**
* nfs_close_context - Common close_context() routine NFSv2/v3
@@ -1002,6 +1006,15 @@ int nfs_revalidate_inode(struct nfs_server *server, struct inode *inode)
}
EXPORT_SYMBOL_GPL(nfs_revalidate_inode);
+int nfs_revalidate_inode_rcu(struct nfs_server *server, struct inode *inode)
+{
+ if (!(NFS_I(inode)->cache_validity &
+ (NFS_INO_INVALID_ATTR|NFS_INO_INVALID_LABEL))
+ && !nfs_attribute_cache_expired(inode))
+ return NFS_STALE(inode) ? -ESTALE : 0;
+ return -ECHILD;
+}
+
static int nfs_invalidate_mapping(struct inode *inode, struct address_space *mapping)
{
struct nfs_inode *nfsi = NFS_I(inode);
@@ -1840,11 +1853,12 @@ EXPORT_SYMBOL_GPL(nfs_net_id);
static int nfs_net_init(struct net *net)
{
nfs_clients_init(net);
- return 0;
+ return nfs_fs_proc_net_init(net);
}
static void nfs_net_exit(struct net *net)
{
+ nfs_fs_proc_net_exit(net);
nfs_cleanup_cb_ident_idr(net);
}
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 617f36611d4..efaa31c70fb 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -195,7 +195,16 @@ extern struct rpc_clnt *nfs4_find_or_create_ds_client(struct nfs_client *,
#ifdef CONFIG_PROC_FS
extern int __init nfs_fs_proc_init(void);
extern void nfs_fs_proc_exit(void);
+extern int nfs_fs_proc_net_init(struct net *net);
+extern void nfs_fs_proc_net_exit(struct net *net);
#else
+static inline int nfs_fs_proc_net_init(struct net *net)
+{
+ return 0;
+}
+static inline void nfs_fs_proc_net_exit(struct net *net)
+{
+}
static inline int nfs_fs_proc_init(void)
{
return 0;
@@ -209,13 +218,6 @@ static inline void nfs_fs_proc_exit(void)
int nfs_sockaddr_match_ipaddr(const struct sockaddr *, const struct sockaddr *);
#endif
-/* nfs3client.c */
-#if IS_ENABLED(CONFIG_NFS_V3)
-struct nfs_server *nfs3_create_server(struct nfs_mount_info *, struct nfs_subversion *);
-struct nfs_server *nfs3_clone_server(struct nfs_server *, struct nfs_fh *,
- struct nfs_fattr *, rpc_authflavor_t);
-#endif
-
/* callback_xdr.c */
extern struct svc_version nfs4_callback_version1;
extern struct svc_version nfs4_callback_version4;
@@ -238,11 +240,11 @@ void nfs_set_pgio_error(struct nfs_pgio_header *hdr, int error, loff_t pos);
int nfs_iocounter_wait(struct nfs_io_counter *c);
extern const struct nfs_pageio_ops nfs_pgio_rw_ops;
-struct nfs_rw_header *nfs_rw_header_alloc(const struct nfs_rw_ops *);
-void nfs_rw_header_free(struct nfs_pgio_header *);
-void nfs_pgio_data_release(struct nfs_pgio_data *);
+struct nfs_pgio_header *nfs_pgio_header_alloc(const struct nfs_rw_ops *);
+void nfs_pgio_header_free(struct nfs_pgio_header *);
+void nfs_pgio_data_destroy(struct nfs_pgio_header *);
int nfs_generic_pgio(struct nfs_pageio_descriptor *, struct nfs_pgio_header *);
-int nfs_initiate_pgio(struct rpc_clnt *, struct nfs_pgio_data *,
+int nfs_initiate_pgio(struct rpc_clnt *, struct nfs_pgio_header *,
const struct rpc_call_ops *, int, int);
void nfs_free_request(struct nfs_page *req);
@@ -337,7 +339,6 @@ int nfs_file_release(struct inode *, struct file *);
int nfs_lock(struct file *, int, struct file_lock *);
int nfs_flock(struct file *, int, struct file_lock *);
int nfs_check_flags(int);
-int nfs_setlease(struct file *, long, struct file_lock **);
/* inode.c */
extern struct workqueue_struct *nfsiod_workqueue;
@@ -442,6 +443,7 @@ int nfs_scan_commit(struct inode *inode, struct list_head *dst,
void nfs_mark_request_commit(struct nfs_page *req,
struct pnfs_layout_segment *lseg,
struct nfs_commit_info *cinfo);
+int nfs_write_need_commit(struct nfs_pgio_header *);
int nfs_generic_commit_list(struct inode *inode, struct list_head *head,
int how, struct nfs_commit_info *cinfo);
void nfs_retry_commit(struct list_head *page_list,
@@ -482,7 +484,7 @@ static inline void nfs_inode_dio_wait(struct inode *inode)
extern ssize_t nfs_dreq_bytes_left(struct nfs_direct_req *dreq);
/* nfs4proc.c */
-extern void __nfs4_read_done_cb(struct nfs_pgio_data *);
+extern void __nfs4_read_done_cb(struct nfs_pgio_header *);
extern struct nfs_client *nfs4_init_client(struct nfs_client *clp,
const struct rpc_timeout *timeparms,
const char *ip_addr);
diff --git a/fs/nfs/netns.h b/fs/nfs/netns.h
index 8ee1fab8326..f0e06e4acbe 100644
--- a/fs/nfs/netns.h
+++ b/fs/nfs/netns.h
@@ -19,6 +19,7 @@ struct nfs_net {
struct rpc_pipe *bl_device_pipe;
struct bl_dev_msg bl_mount_reply;
wait_queue_head_t bl_wq;
+ struct mutex bl_mutex;
struct list_head nfs_client_list;
struct list_head nfs_volume_list;
#if IS_ENABLED(CONFIG_NFS_V4)
@@ -29,6 +30,9 @@ struct nfs_net {
#endif
spinlock_t nfs_client_lock;
struct timespec boot_time;
+#ifdef CONFIG_PROC_FS
+ struct proc_dir_entry *proc_nfsfs;
+#endif
};
extern int nfs_net_id;
diff --git a/fs/nfs/nfs3_fs.h b/fs/nfs/nfs3_fs.h
new file mode 100644
index 00000000000..333ae406850
--- /dev/null
+++ b/fs/nfs/nfs3_fs.h
@@ -0,0 +1,34 @@
+/*
+ * Copyright (C) 2014 Anna Schumaker.
+ *
+ * NFSv3-specific filesystem definitions and declarations
+ */
+#ifndef __LINUX_FS_NFS_NFS3_FS_H
+#define __LINUX_FS_NFS_NFS3_FS_H
+
+/*
+ * nfs3acl.c
+ */
+#ifdef CONFIG_NFS_V3_ACL
+extern struct posix_acl *nfs3_get_acl(struct inode *inode, int type);
+extern int nfs3_set_acl(struct inode *inode, struct posix_acl *acl, int type);
+extern int nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl,
+ struct posix_acl *dfacl);
+extern ssize_t nfs3_listxattr(struct dentry *, char *, size_t);
+extern const struct xattr_handler *nfs3_xattr_handlers[];
+#else
+static inline int nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl,
+ struct posix_acl *dfacl)
+{
+ return 0;
+}
+#define nfs3_listxattr NULL
+#endif /* CONFIG_NFS_V3_ACL */
+
+/* nfs3client.c */
+struct nfs_server *nfs3_create_server(struct nfs_mount_info *, struct nfs_subversion *);
+struct nfs_server *nfs3_clone_server(struct nfs_server *, struct nfs_fh *,
+ struct nfs_fattr *, rpc_authflavor_t);
+
+
+#endif /* __LINUX_FS_NFS_NFS3_FS_H */
diff --git a/fs/nfs/nfs3acl.c b/fs/nfs/nfs3acl.c
index 8f854dde415..658e586ca43 100644
--- a/fs/nfs/nfs3acl.c
+++ b/fs/nfs/nfs3acl.c
@@ -7,6 +7,7 @@
#include <linux/nfsacl.h>
#include "internal.h"
+#include "nfs3_fs.h"
#define NFSDBG_FACILITY NFSDBG_PROC
@@ -129,7 +130,10 @@ static int __nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl,
.rpc_argp = &args,
.rpc_resp = &fattr,
};
- int status;
+ int status = 0;
+
+ if (acl == NULL && (!S_ISDIR(inode->i_mode) || dfacl == NULL))
+ goto out;
status = -EOPNOTSUPP;
if (!nfs_server_capable(inode, NFS_CAP_ACLS))
@@ -256,7 +260,7 @@ nfs3_list_one_acl(struct inode *inode, int type, const char *name, void *data,
char *p = data + *result;
acl = get_acl(inode, type);
- if (!acl)
+ if (IS_ERR_OR_NULL(acl))
return 0;
posix_acl_release(acl);
diff --git a/fs/nfs/nfs3client.c b/fs/nfs/nfs3client.c
index b3fc65ef39c..8c1b437c540 100644
--- a/fs/nfs/nfs3client.c
+++ b/fs/nfs/nfs3client.c
@@ -1,6 +1,7 @@
#include <linux/nfs_fs.h>
#include <linux/nfs_mount.h>
#include "internal.h"
+#include "nfs3_fs.h"
#ifdef CONFIG_NFS_V3_ACL
static struct rpc_stat nfsacl_rpcstat = { &nfsacl_program };
diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c
index f0afa291fd5..524f9f83740 100644
--- a/fs/nfs/nfs3proc.c
+++ b/fs/nfs/nfs3proc.c
@@ -22,6 +22,7 @@
#include "iostat.h"
#include "internal.h"
+#include "nfs3_fs.h"
#define NFSDBG_FACILITY NFSDBG_PROC
@@ -795,41 +796,44 @@ nfs3_proc_pathconf(struct nfs_server *server, struct nfs_fh *fhandle,
return status;
}
-static int nfs3_read_done(struct rpc_task *task, struct nfs_pgio_data *data)
+static int nfs3_read_done(struct rpc_task *task, struct nfs_pgio_header *hdr)
{
- struct inode *inode = data->header->inode;
+ struct inode *inode = hdr->inode;
if (nfs3_async_handle_jukebox(task, inode))
return -EAGAIN;
nfs_invalidate_atime(inode);
- nfs_refresh_inode(inode, &data->fattr);
+ nfs_refresh_inode(inode, &hdr->fattr);
return 0;
}
-static void nfs3_proc_read_setup(struct nfs_pgio_data *data, struct rpc_message *msg)
+static void nfs3_proc_read_setup(struct nfs_pgio_header *hdr,
+ struct rpc_message *msg)
{
msg->rpc_proc = &nfs3_procedures[NFS3PROC_READ];
}
-static int nfs3_proc_pgio_rpc_prepare(struct rpc_task *task, struct nfs_pgio_data *data)
+static int nfs3_proc_pgio_rpc_prepare(struct rpc_task *task,
+ struct nfs_pgio_header *hdr)
{
rpc_call_start(task);
return 0;
}
-static int nfs3_write_done(struct rpc_task *task, struct nfs_pgio_data *data)
+static int nfs3_write_done(struct rpc_task *task, struct nfs_pgio_header *hdr)
{
- struct inode *inode = data->header->inode;
+ struct inode *inode = hdr->inode;
if (nfs3_async_handle_jukebox(task, inode))
return -EAGAIN;
if (task->tk_status >= 0)
- nfs_post_op_update_inode_force_wcc(inode, data->res.fattr);
+ nfs_post_op_update_inode_force_wcc(inode, hdr->res.fattr);
return 0;
}
-static void nfs3_proc_write_setup(struct nfs_pgio_data *data, struct rpc_message *msg)
+static void nfs3_proc_write_setup(struct nfs_pgio_header *hdr,
+ struct rpc_message *msg)
{
msg->rpc_proc = &nfs3_procedures[NFS3PROC_WRITE];
}
diff --git a/fs/nfs/nfs3super.c b/fs/nfs/nfs3super.c
index d6a98949af1..6af29c2da35 100644
--- a/fs/nfs/nfs3super.c
+++ b/fs/nfs/nfs3super.c
@@ -4,6 +4,7 @@
#include <linux/module.h>
#include <linux/nfs_fs.h>
#include "internal.h"
+#include "nfs3_fs.h"
#include "nfs.h"
static struct nfs_subversion nfs_v3 = {
diff --git a/fs/nfs/nfs42.h b/fs/nfs/nfs42.h
new file mode 100644
index 00000000000..d10333a197b
--- /dev/null
+++ b/fs/nfs/nfs42.h
@@ -0,0 +1,14 @@
+/*
+ * Copyright (c) 2014 Anna Schumaker <Anna.Schumaker@Netapp.com>
+ */
+
+#ifndef __LINUX_FS_NFS_NFS4_2_H
+#define __LINUX_FS_NFS_NFS4_2_H
+
+/* nfs42proc.c */
+loff_t nfs42_proc_llseek(struct file *, loff_t, int);
+
+/* nfs42xdr.c */
+extern struct rpc_procinfo nfs4_2_procedures[];
+
+#endif /* __LINUX_FS_NFS_NFS4_2_H */
diff --git a/fs/nfs/nfs42proc.c b/fs/nfs/nfs42proc.c
new file mode 100644
index 00000000000..0886f1db591
--- /dev/null
+++ b/fs/nfs/nfs42proc.c
@@ -0,0 +1,69 @@
+/*
+ * Copyright (c) 2014 Anna Schumaker <Anna.Schumaker@Netapp.com>
+ */
+#include <linux/fs.h>
+#include <linux/sunrpc/sched.h>
+#include <linux/nfs.h>
+#include <linux/nfs3.h>
+#include <linux/nfs4.h>
+#include <linux/nfs_xdr.h>
+#include <linux/nfs_fs.h>
+#include "nfs4_fs.h"
+#include "nfs42.h"
+
+static int nfs42_set_rw_stateid(nfs4_stateid *dst, struct file *file,
+ fmode_t fmode)
+{
+ struct nfs_open_context *open;
+ struct nfs_lock_context *lock;
+ int ret;
+
+ open = get_nfs_open_context(nfs_file_open_context(file));
+ lock = nfs_get_lock_context(open);
+ if (IS_ERR(lock)) {
+ put_nfs_open_context(open);
+ return PTR_ERR(lock);
+ }
+
+ ret = nfs4_set_rw_stateid(dst, open, lock, fmode);
+
+ nfs_put_lock_context(lock);
+ put_nfs_open_context(open);
+ return ret;
+}
+
+loff_t nfs42_proc_llseek(struct file *filep, loff_t offset, int whence)
+{
+ struct inode *inode = file_inode(filep);
+ struct nfs42_seek_args args = {
+ .sa_fh = NFS_FH(inode),
+ .sa_offset = offset,
+ .sa_what = (whence == SEEK_HOLE) ?
+ NFS4_CONTENT_HOLE : NFS4_CONTENT_DATA,
+ };
+ struct nfs42_seek_res res;
+ struct rpc_message msg = {
+ .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SEEK],
+ .rpc_argp = &args,
+ .rpc_resp = &res,
+ };
+ struct nfs_server *server = NFS_SERVER(inode);
+ int status;
+
+ if (!(server->caps & NFS_CAP_SEEK))
+ return -ENOTSUPP;
+
+ status = nfs42_set_rw_stateid(&args.sa_stateid, filep, FMODE_READ);
+ if (status)
+ return status;
+
+ nfs_wb_all(inode);
+ status = nfs4_call_sync(server->client, server, &msg,
+ &args.seq_args, &res.seq_res, 0);
+ if (status == -ENOTSUPP)
+ server->caps &= ~NFS_CAP_SEEK;
+ if (status)
+ return status;
+
+ return vfs_setpos(filep, res.sr_offset, inode->i_sb->s_maxbytes);
+}
diff --git a/fs/nfs/nfs42xdr.c b/fs/nfs/nfs42xdr.c
new file mode 100644
index 00000000000..c90469b604b
--- /dev/null
+++ b/fs/nfs/nfs42xdr.c
@@ -0,0 +1,98 @@
+/*
+ * Copyright (c) 2014 Anna Schumaker <Anna.Schumaker@Netapp.com>
+ */
+#ifndef __LINUX_FS_NFS_NFS4_2XDR_H
+#define __LINUX_FS_NFS_NFS4_2XDR_H
+
+#define encode_seek_maxsz (op_encode_hdr_maxsz + \
+ encode_stateid_maxsz + \
+ 2 /* offset */ + \
+ 1 /* whence */)
+#define decode_seek_maxsz (op_decode_hdr_maxsz + \
+ 1 /* eof */ + \
+ 1 /* whence */ + \
+ 2 /* offset */ + \
+ 2 /* length */)
+
+#define NFS4_enc_seek_sz (compound_encode_hdr_maxsz + \
+ encode_putfh_maxsz + \
+ encode_seek_maxsz)
+#define NFS4_dec_seek_sz (compound_decode_hdr_maxsz + \
+ decode_putfh_maxsz + \
+ decode_seek_maxsz)
+
+
+static void encode_seek(struct xdr_stream *xdr,
+ struct nfs42_seek_args *args,
+ struct compound_hdr *hdr)
+{
+ encode_op_hdr(xdr, OP_SEEK, decode_seek_maxsz, hdr);
+ encode_nfs4_stateid(xdr, &args->sa_stateid);
+ encode_uint64(xdr, args->sa_offset);
+ encode_uint32(xdr, args->sa_what);
+}
+
+/*
+ * Encode SEEK request
+ */
+static void nfs4_xdr_enc_seek(struct rpc_rqst *req,
+ struct xdr_stream *xdr,
+ struct nfs42_seek_args *args)
+{
+ struct compound_hdr hdr = {
+ .minorversion = nfs4_xdr_minorversion(&args->seq_args),
+ };
+
+ encode_compound_hdr(xdr, req, &hdr);
+ encode_sequence(xdr, &args->seq_args, &hdr);
+ encode_putfh(xdr, args->sa_fh, &hdr);
+ encode_seek(xdr, args, &hdr);
+ encode_nops(&hdr);
+}
+
+static int decode_seek(struct xdr_stream *xdr, struct nfs42_seek_res *res)
+{
+ int status;
+ __be32 *p;
+
+ status = decode_op_hdr(xdr, OP_SEEK);
+ if (status)
+ return status;
+
+ p = xdr_inline_decode(xdr, 4 + 8);
+ if (unlikely(!p))
+ goto out_overflow;
+
+ res->sr_eof = be32_to_cpup(p++);
+ p = xdr_decode_hyper(p, &res->sr_offset);
+ return 0;
+
+out_overflow:
+ print_overflow_msg(__func__, xdr);
+ return -EIO;
+}
+
+/*
+ * Decode SEEK response
+ */
+static int nfs4_xdr_dec_seek(struct rpc_rqst *rqstp,
+ struct xdr_stream *xdr,
+ struct nfs42_seek_res *res)
+{
+ struct compound_hdr hdr;
+ int status;
+
+ status = decode_compound_hdr(xdr, &hdr);
+ if (status)
+ goto out;
+ status = decode_sequence(xdr, &res->seq_res, rqstp);
+ if (status)
+ goto out;
+ status = decode_putfh(xdr);
+ if (status)
+ goto out;
+ status = decode_seek(xdr, res);
+out:
+ return status;
+}
+#endif /* __LINUX_FS_NFS_NFS4_2XDR_H */
diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index ba2affa5194..be6cac37ea1 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -54,7 +54,7 @@ struct nfs4_minor_version_ops {
const nfs4_stateid *);
int (*find_root_sec)(struct nfs_server *, struct nfs_fh *,
struct nfs_fsinfo *);
- int (*free_lock_state)(struct nfs_server *,
+ void (*free_lock_state)(struct nfs_server *,
struct nfs4_lock_state *);
const struct rpc_call_ops *call_sync_ops;
const struct nfs4_state_recovery_ops *reboot_recovery_ops;
@@ -129,17 +129,6 @@ enum {
* LOCK: one nfs4_state (LOCK) to hold the lock stateid nfs4_state(OPEN)
*/
-struct nfs4_lock_owner {
- unsigned int lo_type;
-#define NFS4_ANY_LOCK_TYPE (0U)
-#define NFS4_FLOCK_LOCK_TYPE (1U << 0)
-#define NFS4_POSIX_LOCK_TYPE (1U << 1)
- union {
- fl_owner_t posix_owner;
- pid_t flock_owner;
- } lo_u;
-};
-
struct nfs4_lock_state {
struct list_head ls_locks; /* Other lock stateids */
struct nfs4_state * ls_state; /* Pointer to open state */
@@ -149,7 +138,7 @@ struct nfs4_lock_state {
struct nfs_seqid_counter ls_seqid;
nfs4_stateid ls_stateid;
atomic_t ls_count;
- struct nfs4_lock_owner ls_owner;
+ fl_owner_t ls_owner;
};
/* bits for nfs4_state->flags */
@@ -237,6 +226,9 @@ int nfs4_replace_transport(struct nfs_server *server,
const struct nfs4_fs_locations *locations);
/* nfs4proc.c */
+extern int nfs4_call_sync(struct rpc_clnt *, struct nfs_server *,
+ struct rpc_message *, struct nfs4_sequence_args *,
+ struct nfs4_sequence_res *, int);
extern int nfs4_proc_setclientid(struct nfs_client *, u32, unsigned short, struct rpc_cred *, struct nfs4_setclientid_res *);
extern int nfs4_proc_setclientid_confirm(struct nfs_client *, struct nfs4_setclientid_res *arg, struct rpc_cred *);
extern int nfs4_proc_get_rootfh(struct nfs_server *, struct nfs_fh *, struct nfs_fsinfo *, bool);
@@ -337,11 +329,11 @@ nfs4_state_protect(struct nfs_client *clp, unsigned long sp4_mode,
*/
static inline void
nfs4_state_protect_write(struct nfs_client *clp, struct rpc_clnt **clntp,
- struct rpc_message *msg, struct nfs_pgio_data *wdata)
+ struct rpc_message *msg, struct nfs_pgio_header *hdr)
{
if (_nfs4_state_protect(clp, NFS_SP4_MACH_CRED_WRITE, clntp, msg) &&
!test_bit(NFS_SP4_MACH_CRED_COMMIT, &clp->cl_sp4_flags))
- wdata->args.stable = NFS_FILE_SYNC;
+ hdr->args.stable = NFS_FILE_SYNC;
}
#else /* CONFIG_NFS_v4_1 */
static inline struct nfs4_session *nfs4_get_session(const struct nfs_server *server)
@@ -369,7 +361,7 @@ nfs4_state_protect(struct nfs_client *clp, unsigned long sp4_flags,
static inline void
nfs4_state_protect_write(struct nfs_client *clp, struct rpc_clnt **clntp,
- struct rpc_message *msg, struct nfs_pgio_data *wdata)
+ struct rpc_message *msg, struct nfs_pgio_header *hdr)
{
}
#endif /* CONFIG_NFS_V4_1 */
diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c
index aa9ef487604..ffdb28d86cf 100644
--- a/fs/nfs/nfs4client.c
+++ b/fs/nfs/nfs4client.c
@@ -482,6 +482,16 @@ int nfs40_walk_client_list(struct nfs_client *new,
spin_lock(&nn->nfs_client_lock);
list_for_each_entry(pos, &nn->nfs_client_list, cl_share_link) {
+
+ if (pos->rpc_ops != new->rpc_ops)
+ continue;
+
+ if (pos->cl_proto != new->cl_proto)
+ continue;
+
+ if (pos->cl_minorversion != new->cl_minorversion)
+ continue;
+
/* If "pos" isn't marked ready, we can't trust the
* remaining fields in "pos" */
if (pos->cl_cons_state > NFS_CS_READY) {
@@ -501,15 +511,6 @@ int nfs40_walk_client_list(struct nfs_client *new,
if (pos->cl_cons_state != NFS_CS_READY)
continue;
- if (pos->rpc_ops != new->rpc_ops)
- continue;
-
- if (pos->cl_proto != new->cl_proto)
- continue;
-
- if (pos->cl_minorversion != new->cl_minorversion)
- continue;
-
if (pos->cl_clientid != new->cl_clientid)
continue;
@@ -622,6 +623,16 @@ int nfs41_walk_client_list(struct nfs_client *new,
spin_lock(&nn->nfs_client_lock);
list_for_each_entry(pos, &nn->nfs_client_list, cl_share_link) {
+
+ if (pos->rpc_ops != new->rpc_ops)
+ continue;
+
+ if (pos->cl_proto != new->cl_proto)
+ continue;
+
+ if (pos->cl_minorversion != new->cl_minorversion)
+ continue;
+
/* If "pos" isn't marked ready, we can't trust the
* remaining fields in "pos", especially the client
* ID and serverowner fields. Wait for CREATE_SESSION
@@ -647,15 +658,6 @@ int nfs41_walk_client_list(struct nfs_client *new,
if (pos->cl_cons_state != NFS_CS_READY)
continue;
- if (pos->rpc_ops != new->rpc_ops)
- continue;
-
- if (pos->cl_proto != new->cl_proto)
- continue;
-
- if (pos->cl_minorversion != new->cl_minorversion)
- continue;
-
if (!nfs4_match_clientids(pos, new))
continue;
@@ -855,6 +857,11 @@ struct nfs_client *nfs4_set_ds_client(struct nfs_client* mds_clp,
};
struct rpc_timeout ds_timeout;
struct nfs_client *clp;
+ char buf[INET6_ADDRSTRLEN + 1];
+
+ if (rpc_ntop(ds_addr, buf, sizeof(buf)) <= 0)
+ return ERR_PTR(-EINVAL);
+ cl_init.hostname = buf;
/*
* Set an authflavor equal to the MDS value. Use the MDS nfs_client
diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c
index a816f0627a6..c51fb4db9bf 100644
--- a/fs/nfs/nfs4file.c
+++ b/fs/nfs/nfs4file.c
@@ -8,6 +8,10 @@
#include "fscache.h"
#include "pnfs.h"
+#ifdef CONFIG_NFS_V4_2
+#include "nfs42.h"
+#endif
+
#define NFSDBG_FACILITY NFSDBG_FILE
static int
@@ -115,8 +119,29 @@ nfs4_file_fsync(struct file *file, loff_t start, loff_t end, int datasync)
return ret;
}
+#ifdef CONFIG_NFS_V4_2
+static loff_t nfs4_file_llseek(struct file *filep, loff_t offset, int whence)
+{
+ loff_t ret;
+
+ switch (whence) {
+ case SEEK_HOLE:
+ case SEEK_DATA:
+ ret = nfs42_proc_llseek(filep, offset, whence);
+ if (ret != -ENOTSUPP)
+ return ret;
+ default:
+ return nfs_file_llseek(filep, offset, whence);
+ }
+}
+#endif /* CONFIG_NFS_V4_2 */
+
const struct file_operations nfs4_file_operations = {
+#ifdef CONFIG_NFS_V4_2
+ .llseek = nfs4_file_llseek,
+#else
.llseek = nfs_file_llseek,
+#endif
.read = new_sync_read,
.write = new_sync_write,
.read_iter = nfs_file_read,
@@ -131,5 +156,5 @@ const struct file_operations nfs4_file_operations = {
.splice_read = nfs_file_splice_read,
.splice_write = iter_file_splice_write,
.check_flags = nfs_check_flags,
- .setlease = nfs_setlease,
+ .setlease = simple_nosetlease,
};
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 4bf3d97cc5a..69dc20a743f 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -77,7 +77,7 @@ struct nfs4_opendata;
static int _nfs4_proc_open(struct nfs4_opendata *data);
static int _nfs4_recover_proc_open(struct nfs4_opendata *data);
static int nfs4_do_fsinfo(struct nfs_server *, struct nfs_fh *, struct nfs_fsinfo *);
-static int nfs4_async_handle_error(struct rpc_task *, const struct nfs_server *, struct nfs4_state *);
+static int nfs4_async_handle_error(struct rpc_task *, const struct nfs_server *, struct nfs4_state *, long *);
static void nfs_fixup_referral_attributes(struct nfs_fattr *fattr);
static int nfs4_proc_getattr(struct nfs_server *, struct nfs_fh *, struct nfs_fattr *, struct nfs4_label *label);
static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fattr *fattr, struct nfs4_label *label);
@@ -314,20 +314,30 @@ static void nfs4_setup_readdir(u64 cookie, __be32 *verifier, struct dentry *dent
kunmap_atomic(start);
}
+static long nfs4_update_delay(long *timeout)
+{
+ long ret;
+ if (!timeout)
+ return NFS4_POLL_RETRY_MAX;
+ if (*timeout <= 0)
+ *timeout = NFS4_POLL_RETRY_MIN;
+ if (*timeout > NFS4_POLL_RETRY_MAX)
+ *timeout = NFS4_POLL_RETRY_MAX;
+ ret = *timeout;
+ *timeout <<= 1;
+ return ret;
+}
+
static int nfs4_delay(struct rpc_clnt *clnt, long *timeout)
{
int res = 0;
might_sleep();
- if (*timeout <= 0)
- *timeout = NFS4_POLL_RETRY_MIN;
- if (*timeout > NFS4_POLL_RETRY_MAX)
- *timeout = NFS4_POLL_RETRY_MAX;
- freezable_schedule_timeout_killable_unsafe(*timeout);
+ freezable_schedule_timeout_killable_unsafe(
+ nfs4_update_delay(timeout));
if (fatal_signal_pending(current))
res = -ERESTARTSYS;
- *timeout <<= 1;
return res;
}
@@ -360,11 +370,6 @@ static int nfs4_handle_exception(struct nfs_server *server, int errorcode, struc
case -NFS4ERR_DELEG_REVOKED:
case -NFS4ERR_ADMIN_REVOKED:
case -NFS4ERR_BAD_STATEID:
- if (inode != NULL && nfs4_have_delegation(inode, FMODE_READ)) {
- nfs_remove_bad_delegation(inode);
- exception->retry = 1;
- break;
- }
if (state == NULL)
break;
ret = nfs4_schedule_stateid_recovery(server, state);
@@ -875,7 +880,6 @@ static int nfs4_call_sync_sequence(struct rpc_clnt *clnt,
return ret;
}
-static
int nfs4_call_sync(struct rpc_clnt *clnt,
struct nfs_server *server,
struct rpc_message *msg,
@@ -1307,15 +1311,13 @@ static struct nfs4_state *nfs4_try_open_cached(struct nfs4_opendata *opendata)
int ret = -EAGAIN;
for (;;) {
+ spin_lock(&state->owner->so_lock);
if (can_open_cached(state, fmode, open_mode)) {
- spin_lock(&state->owner->so_lock);
- if (can_open_cached(state, fmode, open_mode)) {
- update_open_stateflags(state, fmode);
- spin_unlock(&state->owner->so_lock);
- goto out_return_state;
- }
+ update_open_stateflags(state, fmode);
spin_unlock(&state->owner->so_lock);
+ goto out_return_state;
}
+ spin_unlock(&state->owner->so_lock);
rcu_read_lock();
delegation = rcu_dereference(nfsi->delegation);
if (!can_open_delegated(delegation, fmode)) {
@@ -1647,7 +1649,7 @@ static int nfs4_handle_delegation_recall_error(struct nfs_server *server, struct
nfs_inode_find_state_and_recover(state->inode,
stateid);
nfs4_schedule_stateid_recovery(server, state);
- return 0;
+ return -EAGAIN;
case -NFS4ERR_DELAY:
case -NFS4ERR_GRACE:
set_bit(NFS_DELEGATED_STATE, &state->flags);
@@ -1952,6 +1954,14 @@ static int _nfs4_recover_proc_open(struct nfs4_opendata *data)
return status;
}
+/*
+ * Additional permission checks in order to distinguish between an
+ * open for read, and an open for execute. This works around the
+ * fact that NFSv4 OPEN treats read and execute permissions as being
+ * the same.
+ * Note that in the non-execute case, we want to turn off permission
+ * checking if we just created a new file (POSIX open() semantics).
+ */
static int nfs4_opendata_access(struct rpc_cred *cred,
struct nfs4_opendata *opendata,
struct nfs4_state *state, fmode_t fmode,
@@ -1966,14 +1976,14 @@ static int nfs4_opendata_access(struct rpc_cred *cred,
return 0;
mask = 0;
- /* don't check MAY_WRITE - a newly created file may not have
- * write mode bits, but POSIX allows the creating process to write.
- * use openflags to check for exec, because fmode won't
- * always have FMODE_EXEC set when file open for exec. */
+ /*
+ * Use openflags to check for exec, because fmode won't
+ * always have FMODE_EXEC set when file open for exec.
+ */
if (openflags & __FMODE_EXEC) {
/* ONLY check for exec rights */
mask = MAY_EXEC;
- } else if (fmode & FMODE_READ)
+ } else if ((fmode & FMODE_READ) && !opendata->file_created)
mask = MAY_READ;
cache.cred = cred;
@@ -2094,46 +2104,60 @@ static int nfs4_open_expired(struct nfs4_state_owner *sp, struct nfs4_state *sta
return ret;
}
+static void nfs_finish_clear_delegation_stateid(struct nfs4_state *state)
+{
+ nfs_remove_bad_delegation(state->inode);
+ write_seqlock(&state->seqlock);
+ nfs4_stateid_copy(&state->stateid, &state->open_stateid);
+ write_sequnlock(&state->seqlock);
+ clear_bit(NFS_DELEGATED_STATE, &state->flags);
+}
+
+static void nfs40_clear_delegation_stateid(struct nfs4_state *state)
+{
+ if (rcu_access_pointer(NFS_I(state->inode)->delegation) != NULL)
+ nfs_finish_clear_delegation_stateid(state);
+}
+
+static int nfs40_open_expired(struct nfs4_state_owner *sp, struct nfs4_state *state)
+{
+ /* NFSv4.0 doesn't allow for delegation recovery on open expire */
+ nfs40_clear_delegation_stateid(state);
+ return nfs4_open_expired(sp, state);
+}
+
#if defined(CONFIG_NFS_V4_1)
-static void nfs41_clear_delegation_stateid(struct nfs4_state *state)
+static void nfs41_check_delegation_stateid(struct nfs4_state *state)
{
struct nfs_server *server = NFS_SERVER(state->inode);
- nfs4_stateid *stateid = &state->stateid;
+ nfs4_stateid stateid;
struct nfs_delegation *delegation;
- struct rpc_cred *cred = NULL;
- int status = -NFS4ERR_BAD_STATEID;
-
- /* If a state reset has been done, test_stateid is unneeded */
- if (test_bit(NFS_DELEGATED_STATE, &state->flags) == 0)
- return;
+ struct rpc_cred *cred;
+ int status;
/* Get the delegation credential for use by test/free_stateid */
rcu_read_lock();
delegation = rcu_dereference(NFS_I(state->inode)->delegation);
- if (delegation != NULL &&
- nfs4_stateid_match(&delegation->stateid, stateid)) {
- cred = get_rpccred(delegation->cred);
- rcu_read_unlock();
- status = nfs41_test_stateid(server, stateid, cred);
- trace_nfs4_test_delegation_stateid(state, NULL, status);
- } else
+ if (delegation == NULL) {
rcu_read_unlock();
+ return;
+ }
+
+ nfs4_stateid_copy(&stateid, &delegation->stateid);
+ cred = get_rpccred(delegation->cred);
+ rcu_read_unlock();
+ status = nfs41_test_stateid(server, &stateid, cred);
+ trace_nfs4_test_delegation_stateid(state, NULL, status);
if (status != NFS_OK) {
/* Free the stateid unless the server explicitly
* informs us the stateid is unrecognized. */
if (status != -NFS4ERR_BAD_STATEID)
- nfs41_free_stateid(server, stateid, cred);
- nfs_remove_bad_delegation(state->inode);
-
- write_seqlock(&state->seqlock);
- nfs4_stateid_copy(&state->stateid, &state->open_stateid);
- write_sequnlock(&state->seqlock);
- clear_bit(NFS_DELEGATED_STATE, &state->flags);
+ nfs41_free_stateid(server, &stateid, cred);
+ nfs_finish_clear_delegation_stateid(state);
}
- if (cred != NULL)
- put_rpccred(cred);
+ put_rpccred(cred);
}
/**
@@ -2177,7 +2201,7 @@ static int nfs41_open_expired(struct nfs4_state_owner *sp, struct nfs4_state *st
{
int status;
- nfs41_clear_delegation_stateid(state);
+ nfs41_check_delegation_stateid(state);
status = nfs41_check_open_stateid(state);
if (status != NFS_OK)
status = nfs4_open_expired(sp, state);
@@ -2545,6 +2569,7 @@ static void nfs4_close_done(struct rpc_task *task, void *data)
struct nfs4_closedata *calldata = data;
struct nfs4_state *state = calldata->state;
struct nfs_server *server = NFS_SERVER(calldata->inode);
+ nfs4_stateid *res_stateid = NULL;
dprintk("%s: begin!\n", __func__);
if (!nfs4_sequence_done(task, &calldata->res.seq_res))
@@ -2555,12 +2580,12 @@ static void nfs4_close_done(struct rpc_task *task, void *data)
*/
switch (task->tk_status) {
case 0:
- if (calldata->roc)
+ res_stateid = &calldata->res.stateid;
+ if (calldata->arg.fmode == 0 && calldata->roc)
pnfs_roc_set_barrier(state->inode,
calldata->roc_barrier);
- nfs_clear_open_stateid(state, &calldata->res.stateid, 0);
renew_lease(server, calldata->timestamp);
- goto out_release;
+ break;
case -NFS4ERR_ADMIN_REVOKED:
case -NFS4ERR_STALE_STATEID:
case -NFS4ERR_OLD_STATEID:
@@ -2569,12 +2594,12 @@ static void nfs4_close_done(struct rpc_task *task, void *data)
if (calldata->arg.fmode == 0)
break;
default:
- if (nfs4_async_handle_error(task, server, state) == -EAGAIN) {
+ if (nfs4_async_handle_error(task, server, state, NULL) == -EAGAIN) {
rpc_restart_call_prepare(task);
goto out_release;
}
}
- nfs_clear_open_stateid(state, NULL, calldata->arg.fmode);
+ nfs_clear_open_stateid(state, res_stateid, calldata->arg.fmode);
out_release:
nfs_release_seqid(calldata->arg.seqid);
nfs_refresh_inode(calldata->inode, calldata->res.fattr);
@@ -2586,6 +2611,7 @@ static void nfs4_close_prepare(struct rpc_task *task, void *data)
struct nfs4_closedata *calldata = data;
struct nfs4_state *state = calldata->state;
struct inode *inode = calldata->inode;
+ bool is_rdonly, is_wronly, is_rdwr;
int call_close = 0;
dprintk("%s: begin!\n", __func__);
@@ -2593,21 +2619,27 @@ static void nfs4_close_prepare(struct rpc_task *task, void *data)
goto out_wait;
task->tk_msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_OPEN_DOWNGRADE];
- calldata->arg.fmode = FMODE_READ|FMODE_WRITE;
spin_lock(&state->owner->so_lock);
+ is_rdwr = test_bit(NFS_O_RDWR_STATE, &state->flags);
+ is_rdonly = test_bit(NFS_O_RDONLY_STATE, &state->flags);
+ is_wronly = test_bit(NFS_O_WRONLY_STATE, &state->flags);
/* Calculate the change in open mode */
+ calldata->arg.fmode = 0;
if (state->n_rdwr == 0) {
- if (state->n_rdonly == 0) {
- call_close |= test_bit(NFS_O_RDONLY_STATE, &state->flags);
- call_close |= test_bit(NFS_O_RDWR_STATE, &state->flags);
- calldata->arg.fmode &= ~FMODE_READ;
- }
- if (state->n_wronly == 0) {
- call_close |= test_bit(NFS_O_WRONLY_STATE, &state->flags);
- call_close |= test_bit(NFS_O_RDWR_STATE, &state->flags);
- calldata->arg.fmode &= ~FMODE_WRITE;
- }
- }
+ if (state->n_rdonly == 0)
+ call_close |= is_rdonly;
+ else if (is_rdonly)
+ calldata->arg.fmode |= FMODE_READ;
+ if (state->n_wronly == 0)
+ call_close |= is_wronly;
+ else if (is_wronly)
+ calldata->arg.fmode |= FMODE_WRITE;
+ } else if (is_rdwr)
+ calldata->arg.fmode |= FMODE_READ|FMODE_WRITE;
+
+ if (calldata->arg.fmode == 0)
+ call_close |= is_rdwr;
+
if (!nfs4_valid_open_stateid(state))
call_close = 0;
spin_unlock(&state->owner->so_lock);
@@ -2647,6 +2679,48 @@ static const struct rpc_call_ops nfs4_close_ops = {
.rpc_release = nfs4_free_closedata,
};
+static bool nfs4_state_has_opener(struct nfs4_state *state)
+{
+ /* first check existing openers */
+ if (test_bit(NFS_O_RDONLY_STATE, &state->flags) != 0 &&
+ state->n_rdonly != 0)
+ return true;
+
+ if (test_bit(NFS_O_WRONLY_STATE, &state->flags) != 0 &&
+ state->n_wronly != 0)
+ return true;
+
+ if (test_bit(NFS_O_RDWR_STATE, &state->flags) != 0 &&
+ state->n_rdwr != 0)
+ return true;
+
+ return false;
+}
+
+static bool nfs4_roc(struct inode *inode)
+{
+ struct nfs_inode *nfsi = NFS_I(inode);
+ struct nfs_open_context *ctx;
+ struct nfs4_state *state;
+
+ spin_lock(&inode->i_lock);
+ list_for_each_entry(ctx, &nfsi->open_files, list) {
+ state = ctx->state;
+ if (state == NULL)
+ continue;
+ if (nfs4_state_has_opener(state)) {
+ spin_unlock(&inode->i_lock);
+ return false;
+ }
+ }
+ spin_unlock(&inode->i_lock);
+
+ if (nfs4_check_delegation(inode, FMODE_READ))
+ return false;
+
+ return pnfs_roc(inode);
+}
+
/*
* It is possible for data to be read/written from a mem-mapped file
* after the sys_close call (which hits the vfs layer as a flush).
@@ -2697,7 +2771,7 @@ int nfs4_do_close(struct nfs4_state *state, gfp_t gfp_mask, int wait)
calldata->res.fattr = &calldata->fattr;
calldata->res.seqid = calldata->arg.seqid;
calldata->res.server = server;
- calldata->roc = pnfs_roc(state->inode);
+ calldata->roc = nfs4_roc(state->inode);
nfs_sb_active(calldata->inode->i_sb);
msg.rpc_argp = &calldata->arg;
@@ -3148,7 +3222,9 @@ nfs4_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr,
struct nfs4_label *label = NULL;
int status;
- if (pnfs_ld_layoutret_on_setattr(inode))
+ if (pnfs_ld_layoutret_on_setattr(inode) &&
+ sattr->ia_valid & ATTR_SIZE &&
+ sattr->ia_size < i_size_read(inode))
pnfs_commit_and_return_layout(inode);
nfs_fattr_init(fattr);
@@ -3507,7 +3583,8 @@ static int nfs4_proc_unlink_done(struct rpc_task *task, struct inode *dir)
if (!nfs4_sequence_done(task, &res->seq_res))
return 0;
- if (nfs4_async_handle_error(task, res->server, NULL) == -EAGAIN)
+ if (nfs4_async_handle_error(task, res->server, NULL,
+ &data->timeout) == -EAGAIN)
return 0;
update_changeattr(dir, &res->cinfo);
return 1;
@@ -3540,7 +3617,7 @@ static int nfs4_proc_rename_done(struct rpc_task *task, struct inode *old_dir,
if (!nfs4_sequence_done(task, &res->seq_res))
return 0;
- if (nfs4_async_handle_error(task, res->server, NULL) == -EAGAIN)
+ if (nfs4_async_handle_error(task, res->server, NULL, &data->timeout) == -EAGAIN)
return 0;
update_changeattr(old_dir, &res->old_cinfo);
@@ -4033,24 +4110,26 @@ static bool nfs4_error_stateid_expired(int err)
return false;
}
-void __nfs4_read_done_cb(struct nfs_pgio_data *data)
+void __nfs4_read_done_cb(struct nfs_pgio_header *hdr)
{
- nfs_invalidate_atime(data->header->inode);
+ nfs_invalidate_atime(hdr->inode);
}
-static int nfs4_read_done_cb(struct rpc_task *task, struct nfs_pgio_data *data)
+static int nfs4_read_done_cb(struct rpc_task *task, struct nfs_pgio_header *hdr)
{
- struct nfs_server *server = NFS_SERVER(data->header->inode);
+ struct nfs_server *server = NFS_SERVER(hdr->inode);
- trace_nfs4_read(data, task->tk_status);
- if (nfs4_async_handle_error(task, server, data->args.context->state) == -EAGAIN) {
+ trace_nfs4_read(hdr, task->tk_status);
+ if (nfs4_async_handle_error(task, server,
+ hdr->args.context->state,
+ NULL) == -EAGAIN) {
rpc_restart_call_prepare(task);
return -EAGAIN;
}
- __nfs4_read_done_cb(data);
+ __nfs4_read_done_cb(hdr);
if (task->tk_status > 0)
- renew_lease(server, data->timestamp);
+ renew_lease(server, hdr->timestamp);
return 0;
}
@@ -4068,54 +4147,60 @@ static bool nfs4_read_stateid_changed(struct rpc_task *task,
return true;
}
-static int nfs4_read_done(struct rpc_task *task, struct nfs_pgio_data *data)
+static int nfs4_read_done(struct rpc_task *task, struct nfs_pgio_header *hdr)
{
dprintk("--> %s\n", __func__);
- if (!nfs4_sequence_done(task, &data->res.seq_res))
+ if (!nfs4_sequence_done(task, &hdr->res.seq_res))
return -EAGAIN;
- if (nfs4_read_stateid_changed(task, &data->args))
+ if (nfs4_read_stateid_changed(task, &hdr->args))
return -EAGAIN;
- return data->pgio_done_cb ? data->pgio_done_cb(task, data) :
- nfs4_read_done_cb(task, data);
+ return hdr->pgio_done_cb ? hdr->pgio_done_cb(task, hdr) :
+ nfs4_read_done_cb(task, hdr);
}
-static void nfs4_proc_read_setup(struct nfs_pgio_data *data, struct rpc_message *msg)
+static void nfs4_proc_read_setup(struct nfs_pgio_header *hdr,
+ struct rpc_message *msg)
{
- data->timestamp = jiffies;
- data->pgio_done_cb = nfs4_read_done_cb;
+ hdr->timestamp = jiffies;
+ hdr->pgio_done_cb = nfs4_read_done_cb;
msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_READ];
- nfs4_init_sequence(&data->args.seq_args, &data->res.seq_res, 0);
+ nfs4_init_sequence(&hdr->args.seq_args, &hdr->res.seq_res, 0);
}
-static int nfs4_proc_pgio_rpc_prepare(struct rpc_task *task, struct nfs_pgio_data *data)
+static int nfs4_proc_pgio_rpc_prepare(struct rpc_task *task,
+ struct nfs_pgio_header *hdr)
{
- if (nfs4_setup_sequence(NFS_SERVER(data->header->inode),
- &data->args.seq_args,
- &data->res.seq_res,
+ if (nfs4_setup_sequence(NFS_SERVER(hdr->inode),
+ &hdr->args.seq_args,
+ &hdr->res.seq_res,
task))
return 0;
- if (nfs4_set_rw_stateid(&data->args.stateid, data->args.context,
- data->args.lock_context, data->header->rw_ops->rw_mode) == -EIO)
+ if (nfs4_set_rw_stateid(&hdr->args.stateid, hdr->args.context,
+ hdr->args.lock_context,
+ hdr->rw_ops->rw_mode) == -EIO)
return -EIO;
- if (unlikely(test_bit(NFS_CONTEXT_BAD, &data->args.context->flags)))
+ if (unlikely(test_bit(NFS_CONTEXT_BAD, &hdr->args.context->flags)))
return -EIO;
return 0;
}
-static int nfs4_write_done_cb(struct rpc_task *task, struct nfs_pgio_data *data)
+static int nfs4_write_done_cb(struct rpc_task *task,
+ struct nfs_pgio_header *hdr)
{
- struct inode *inode = data->header->inode;
-
- trace_nfs4_write(data, task->tk_status);
- if (nfs4_async_handle_error(task, NFS_SERVER(inode), data->args.context->state) == -EAGAIN) {
+ struct inode *inode = hdr->inode;
+
+ trace_nfs4_write(hdr, task->tk_status);
+ if (nfs4_async_handle_error(task, NFS_SERVER(inode),
+ hdr->args.context->state,
+ NULL) == -EAGAIN) {
rpc_restart_call_prepare(task);
return -EAGAIN;
}
if (task->tk_status >= 0) {
- renew_lease(NFS_SERVER(inode), data->timestamp);
- nfs_post_op_update_inode_force_wcc(inode, &data->fattr);
+ renew_lease(NFS_SERVER(inode), hdr->timestamp);
+ nfs_post_op_update_inode_force_wcc(inode, &hdr->fattr);
}
return 0;
}
@@ -4134,23 +4219,21 @@ static bool nfs4_write_stateid_changed(struct rpc_task *task,
return true;
}
-static int nfs4_write_done(struct rpc_task *task, struct nfs_pgio_data *data)
+static int nfs4_write_done(struct rpc_task *task, struct nfs_pgio_header *hdr)
{
- if (!nfs4_sequence_done(task, &data->res.seq_res))
+ if (!nfs4_sequence_done(task, &hdr->res.seq_res))
return -EAGAIN;
- if (nfs4_write_stateid_changed(task, &data->args))
+ if (nfs4_write_stateid_changed(task, &hdr->args))
return -EAGAIN;
- return data->pgio_done_cb ? data->pgio_done_cb(task, data) :
- nfs4_write_done_cb(task, data);
+ return hdr->pgio_done_cb ? hdr->pgio_done_cb(task, hdr) :
+ nfs4_write_done_cb(task, hdr);
}
static
-bool nfs4_write_need_cache_consistency_data(const struct nfs_pgio_data *data)
+bool nfs4_write_need_cache_consistency_data(struct nfs_pgio_header *hdr)
{
- const struct nfs_pgio_header *hdr = data->header;
-
/* Don't request attributes for pNFS or O_DIRECT writes */
- if (data->ds_clp != NULL || hdr->dreq != NULL)
+ if (hdr->ds_clp != NULL || hdr->dreq != NULL)
return false;
/* Otherwise, request attributes if and only if we don't hold
* a delegation
@@ -4158,23 +4241,24 @@ bool nfs4_write_need_cache_consistency_data(const struct nfs_pgio_data *data)
return nfs4_have_delegation(hdr->inode, FMODE_READ) == 0;
}
-static void nfs4_proc_write_setup(struct nfs_pgio_data *data, struct rpc_message *msg)
+static void nfs4_proc_write_setup(struct nfs_pgio_header *hdr,
+ struct rpc_message *msg)
{
- struct nfs_server *server = NFS_SERVER(data->header->inode);
+ struct nfs_server *server = NFS_SERVER(hdr->inode);
- if (!nfs4_write_need_cache_consistency_data(data)) {
- data->args.bitmask = NULL;
- data->res.fattr = NULL;
+ if (!nfs4_write_need_cache_consistency_data(hdr)) {
+ hdr->args.bitmask = NULL;
+ hdr->res.fattr = NULL;
} else
- data->args.bitmask = server->cache_consistency_bitmask;
+ hdr->args.bitmask = server->cache_consistency_bitmask;
- if (!data->pgio_done_cb)
- data->pgio_done_cb = nfs4_write_done_cb;
- data->res.server = server;
- data->timestamp = jiffies;
+ if (!hdr->pgio_done_cb)
+ hdr->pgio_done_cb = nfs4_write_done_cb;
+ hdr->res.server = server;
+ hdr->timestamp = jiffies;
msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_WRITE];
- nfs4_init_sequence(&data->args.seq_args, &data->res.seq_res, 1);
+ nfs4_init_sequence(&hdr->args.seq_args, &hdr->res.seq_res, 1);
}
static void nfs4_proc_commit_rpc_prepare(struct rpc_task *task, struct nfs_commit_data *data)
@@ -4190,7 +4274,8 @@ static int nfs4_commit_done_cb(struct rpc_task *task, struct nfs_commit_data *da
struct inode *inode = data->inode;
trace_nfs4_commit(data, task->tk_status);
- if (nfs4_async_handle_error(task, NFS_SERVER(inode), NULL) == -EAGAIN) {
+ if (nfs4_async_handle_error(task, NFS_SERVER(inode),
+ NULL, NULL) == -EAGAIN) {
rpc_restart_call_prepare(task);
return -EAGAIN;
}
@@ -4743,7 +4828,8 @@ out:
static int
-nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server, struct nfs4_state *state)
+nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server,
+ struct nfs4_state *state, long *timeout)
{
struct nfs_client *clp = server->nfs_client;
@@ -4753,9 +4839,6 @@ nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server,
case -NFS4ERR_DELEG_REVOKED:
case -NFS4ERR_ADMIN_REVOKED:
case -NFS4ERR_BAD_STATEID:
- if (state == NULL)
- break;
- nfs_remove_bad_delegation(state->inode);
case -NFS4ERR_OPENMODE:
if (state == NULL)
break;
@@ -4793,6 +4876,8 @@ nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server,
#endif /* CONFIG_NFS_V4_1 */
case -NFS4ERR_DELAY:
nfs_inc_server_stats(server, NFSIOS_DELAY);
+ rpc_delay(task, nfs4_update_delay(timeout));
+ goto restart_call;
case -NFS4ERR_GRACE:
rpc_delay(task, NFS4_POLL_RETRY_MAX);
case -NFS4ERR_RETRY_UNCACHED_REP:
@@ -4881,6 +4966,18 @@ nfs4_init_callback_netid(const struct nfs_client *clp, char *buf, size_t len)
return scnprintf(buf, len, "tcp");
}
+static void nfs4_setclientid_done(struct rpc_task *task, void *calldata)
+{
+ struct nfs4_setclientid *sc = calldata;
+
+ if (task->tk_status == 0)
+ sc->sc_cred = get_rpccred(task->tk_rqstp->rq_cred);
+}
+
+static const struct rpc_call_ops nfs4_setclientid_ops = {
+ .rpc_call_done = nfs4_setclientid_done,
+};
+
/**
* nfs4_proc_setclientid - Negotiate client ID
* @clp: state data structure
@@ -4907,6 +5004,14 @@ int nfs4_proc_setclientid(struct nfs_client *clp, u32 program,
.rpc_resp = res,
.rpc_cred = cred,
};
+ struct rpc_task *task;
+ struct rpc_task_setup task_setup_data = {
+ .rpc_client = clp->cl_rpcclient,
+ .rpc_message = &msg,
+ .callback_ops = &nfs4_setclientid_ops,
+ .callback_data = &setclientid,
+ .flags = RPC_TASK_TIMEOUT,
+ };
int status;
/* nfs_client_id4 */
@@ -4933,7 +5038,18 @@ int nfs4_proc_setclientid(struct nfs_client *clp, u32 program,
dprintk("NFS call setclientid auth=%s, '%.*s'\n",
clp->cl_rpcclient->cl_auth->au_ops->au_name,
setclientid.sc_name_len, setclientid.sc_name);
- status = rpc_call_sync(clp->cl_rpcclient, &msg, RPC_TASK_TIMEOUT);
+ task = rpc_run_task(&task_setup_data);
+ if (IS_ERR(task)) {
+ status = PTR_ERR(task);
+ goto out;
+ }
+ status = task->tk_status;
+ if (setclientid.sc_cred) {
+ clp->cl_acceptor = rpcauth_stringify_acceptor(setclientid.sc_cred);
+ put_rpccred(setclientid.sc_cred);
+ }
+ rpc_put_task(task);
+out:
trace_nfs4_setclientid(clp, status);
dprintk("NFS reply setclientid: %d\n", status);
return status;
@@ -4975,6 +5091,9 @@ struct nfs4_delegreturndata {
unsigned long timestamp;
struct nfs_fattr fattr;
int rpc_status;
+ struct inode *inode;
+ bool roc;
+ u32 roc_barrier;
};
static void nfs4_delegreturn_done(struct rpc_task *task, void *calldata)
@@ -4988,7 +5107,6 @@ static void nfs4_delegreturn_done(struct rpc_task *task, void *calldata)
switch (task->tk_status) {
case 0:
renew_lease(data->res.server, data->timestamp);
- break;
case -NFS4ERR_ADMIN_REVOKED:
case -NFS4ERR_DELEG_REVOKED:
case -NFS4ERR_BAD_STATEID:
@@ -4996,10 +5114,12 @@ static void nfs4_delegreturn_done(struct rpc_task *task, void *calldata)
case -NFS4ERR_STALE_STATEID:
case -NFS4ERR_EXPIRED:
task->tk_status = 0;
+ if (data->roc)
+ pnfs_roc_set_barrier(data->inode, data->roc_barrier);
break;
default:
- if (nfs4_async_handle_error(task, data->res.server, NULL) ==
- -EAGAIN) {
+ if (nfs4_async_handle_error(task, data->res.server,
+ NULL, NULL) == -EAGAIN) {
rpc_restart_call_prepare(task);
return;
}
@@ -5009,6 +5129,10 @@ static void nfs4_delegreturn_done(struct rpc_task *task, void *calldata)
static void nfs4_delegreturn_release(void *calldata)
{
+ struct nfs4_delegreturndata *data = calldata;
+
+ if (data->roc)
+ pnfs_roc_release(data->inode);
kfree(calldata);
}
@@ -5018,6 +5142,10 @@ static void nfs4_delegreturn_prepare(struct rpc_task *task, void *data)
d_data = (struct nfs4_delegreturndata *)data;
+ if (d_data->roc &&
+ pnfs_roc_drain(d_data->inode, &d_data->roc_barrier, task))
+ return;
+
nfs4_setup_sequence(d_data->res.server,
&d_data->args.seq_args,
&d_data->res.seq_res,
@@ -5061,6 +5189,9 @@ static int _nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, co
nfs_fattr_init(data->res.fattr);
data->timestamp = jiffies;
data->rpc_status = 0;
+ data->inode = inode;
+ data->roc = list_empty(&NFS_I(inode)->open_files) ?
+ pnfs_roc(inode) : false;
task_setup_data.callback_data = data;
msg.rpc_argp = &data->args;
@@ -5252,7 +5383,8 @@ static void nfs4_locku_done(struct rpc_task *task, void *data)
case -NFS4ERR_EXPIRED:
break;
default:
- if (nfs4_async_handle_error(task, calldata->server, NULL) == -EAGAIN)
+ if (nfs4_async_handle_error(task, calldata->server,
+ NULL, NULL) == -EAGAIN)
rpc_restart_call_prepare(task);
}
nfs_release_seqid(calldata->arg.seqid);
@@ -5834,8 +5966,10 @@ struct nfs_release_lockowner_data {
static void nfs4_release_lockowner_prepare(struct rpc_task *task, void *calldata)
{
struct nfs_release_lockowner_data *data = calldata;
- nfs40_setup_sequence(data->server,
- &data->args.seq_args, &data->res.seq_res, task);
+ struct nfs_server *server = data->server;
+ nfs40_setup_sequence(server, &data->args.seq_args,
+ &data->res.seq_res, task);
+ data->args.lock_owner.clientid = server->nfs_client->cl_clientid;
data->timestamp = jiffies;
}
@@ -5852,9 +5986,12 @@ static void nfs4_release_lockowner_done(struct rpc_task *task, void *calldata)
break;
case -NFS4ERR_STALE_CLIENTID:
case -NFS4ERR_EXPIRED:
+ nfs4_schedule_lease_recovery(server->nfs_client);
+ break;
case -NFS4ERR_LEASE_MOVED:
case -NFS4ERR_DELAY:
- if (nfs4_async_handle_error(task, server, NULL) == -EAGAIN)
+ if (nfs4_async_handle_error(task, server,
+ NULL, NULL) == -EAGAIN)
rpc_restart_call_prepare(task);
}
}
@@ -5872,7 +6009,8 @@ static const struct rpc_call_ops nfs4_release_lockowner_ops = {
.rpc_release = nfs4_release_lockowner_release,
};
-static int nfs4_release_lockowner(struct nfs_server *server, struct nfs4_lock_state *lsp)
+static void
+nfs4_release_lockowner(struct nfs_server *server, struct nfs4_lock_state *lsp)
{
struct nfs_release_lockowner_data *data;
struct rpc_message msg = {
@@ -5880,11 +6018,11 @@ static int nfs4_release_lockowner(struct nfs_server *server, struct nfs4_lock_st
};
if (server->nfs_client->cl_mvops->minor_version != 0)
- return -EINVAL;
+ return;
data = kmalloc(sizeof(*data), GFP_NOFS);
if (!data)
- return -ENOMEM;
+ return;
data->lsp = lsp;
data->server = server;
data->args.lock_owner.clientid = server->nfs_client->cl_clientid;
@@ -5895,7 +6033,6 @@ static int nfs4_release_lockowner(struct nfs_server *server, struct nfs4_lock_st
msg.rpc_resp = &data->res;
nfs4_init_sequence(&data->args.seq_args, &data->res.seq_res, 0);
rpc_call_async(server->client, &msg, 0, &nfs4_release_lockowner_ops, data);
- return 0;
}
#define XATTR_NAME_NFSV4_ACL "system.nfs4_acl"
@@ -7229,7 +7366,7 @@ static int nfs41_proc_async_sequence(struct nfs_client *clp, struct rpc_cred *cr
int ret = 0;
if ((renew_flags & NFS4_RENEW_TIMEOUT) == 0)
- return 0;
+ return -EAGAIN;
task = _nfs41_proc_sequence(clp, cred, false);
if (IS_ERR(task))
ret = PTR_ERR(task);
@@ -7459,14 +7596,19 @@ static void nfs4_layoutget_done(struct rpc_task *task, void *calldata)
} else {
LIST_HEAD(head);
+ /*
+ * Mark the bad layout state as invalid, then retry
+ * with the current stateid.
+ */
pnfs_mark_matching_lsegs_invalid(lo, &head, NULL);
spin_unlock(&inode->i_lock);
- /* Mark the bad layout state as invalid, then
- * retry using the open stateid. */
pnfs_free_lseg_list(&head);
+
+ task->tk_status = 0;
+ rpc_restart_call_prepare(task);
}
}
- if (nfs4_async_handle_error(task, server, state) == -EAGAIN)
+ if (nfs4_async_handle_error(task, server, state, NULL) == -EAGAIN)
rpc_restart_call_prepare(task);
out:
dprintk("<-- %s\n", __func__);
@@ -7626,7 +7768,7 @@ static void nfs4_layoutreturn_done(struct rpc_task *task, void *calldata)
case 0:
break;
case -NFS4ERR_DELAY:
- if (nfs4_async_handle_error(task, server, NULL) != -EAGAIN)
+ if (nfs4_async_handle_error(task, server, NULL, NULL) != -EAGAIN)
break;
rpc_restart_call_prepare(task);
return;
@@ -7685,54 +7827,6 @@ int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp)
return status;
}
-/*
- * Retrieve the list of Data Server devices from the MDS.
- */
-static int _nfs4_getdevicelist(struct nfs_server *server,
- const struct nfs_fh *fh,
- struct pnfs_devicelist *devlist)
-{
- struct nfs4_getdevicelist_args args = {
- .fh = fh,
- .layoutclass = server->pnfs_curr_ld->id,
- };
- struct nfs4_getdevicelist_res res = {
- .devlist = devlist,
- };
- struct rpc_message msg = {
- .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_GETDEVICELIST],
- .rpc_argp = &args,
- .rpc_resp = &res,
- };
- int status;
-
- dprintk("--> %s\n", __func__);
- status = nfs4_call_sync(server->client, server, &msg, &args.seq_args,
- &res.seq_res, 0);
- dprintk("<-- %s status=%d\n", __func__, status);
- return status;
-}
-
-int nfs4_proc_getdevicelist(struct nfs_server *server,
- const struct nfs_fh *fh,
- struct pnfs_devicelist *devlist)
-{
- struct nfs4_exception exception = { };
- int err;
-
- do {
- err = nfs4_handle_exception(server,
- _nfs4_getdevicelist(server, fh, devlist),
- &exception);
- } while (exception.retry);
-
- dprintk("%s: err=%d, num_devs=%u\n", __func__,
- err, devlist->num_devs);
-
- return err;
-}
-EXPORT_SYMBOL_GPL(nfs4_proc_getdevicelist);
-
static int
_nfs4_proc_getdeviceinfo(struct nfs_server *server,
struct pnfs_device *pdev,
@@ -7805,7 +7899,7 @@ nfs4_layoutcommit_done(struct rpc_task *task, void *calldata)
case 0:
break;
default:
- if (nfs4_async_handle_error(task, server, NULL) == -EAGAIN) {
+ if (nfs4_async_handle_error(task, server, NULL, NULL) == -EAGAIN) {
rpc_restart_call_prepare(task);
return;
}
@@ -8101,7 +8195,7 @@ static void nfs41_free_stateid_done(struct rpc_task *task, void *calldata)
switch (task->tk_status) {
case -NFS4ERR_DELAY:
- if (nfs4_async_handle_error(task, data->server, NULL) == -EAGAIN)
+ if (nfs4_async_handle_error(task, data->server, NULL, NULL) == -EAGAIN)
rpc_restart_call_prepare(task);
}
}
@@ -8182,7 +8276,8 @@ static int nfs41_free_stateid(struct nfs_server *server,
return ret;
}
-static int nfs41_free_lock_state(struct nfs_server *server, struct nfs4_lock_state *lsp)
+static void
+nfs41_free_lock_state(struct nfs_server *server, struct nfs4_lock_state *lsp)
{
struct rpc_task *task;
struct rpc_cred *cred = lsp->ls_state->owner->so_cred;
@@ -8190,9 +8285,8 @@ static int nfs41_free_lock_state(struct nfs_server *server, struct nfs4_lock_sta
task = _nfs41_free_stateid(server, &lsp->ls_stateid, cred, false);
nfs4_free_lock_state(server, lsp);
if (IS_ERR(task))
- return PTR_ERR(task);
+ return;
rpc_put_task(task);
- return 0;
}
static bool nfs41_match_stateid(const nfs4_stateid *s1,
@@ -8242,7 +8336,7 @@ static const struct nfs4_state_recovery_ops nfs41_reboot_recovery_ops = {
static const struct nfs4_state_recovery_ops nfs40_nograce_recovery_ops = {
.owner_flag_bit = NFS_OWNER_RECLAIM_NOGRACE,
.state_flag_bit = NFS_STATE_RECLAIM_NOGRACE,
- .recover_open = nfs4_open_expired,
+ .recover_open = nfs40_open_expired,
.recover_lock = nfs4_lock_expired,
.establish_clid = nfs4_init_clientid,
};
@@ -8331,7 +8425,8 @@ static const struct nfs4_minor_version_ops nfs_v4_2_minor_ops = {
| NFS_CAP_CHANGE_ATTR
| NFS_CAP_POSIX_LOCK
| NFS_CAP_STATEID_NFSV41
- | NFS_CAP_ATOMIC_OPEN_V1,
+ | NFS_CAP_ATOMIC_OPEN_V1
+ | NFS_CAP_SEEK,
.init_client = nfs41_init_client,
.shutdown_client = nfs41_shutdown_client,
.match_stateid = nfs41_match_stateid,
diff --git a/fs/nfs/nfs4renewd.c b/fs/nfs/nfs4renewd.c
index 1720d32ffa5..e1ba58c3d1a 100644
--- a/fs/nfs/nfs4renewd.c
+++ b/fs/nfs/nfs4renewd.c
@@ -88,10 +88,18 @@ nfs4_renew_state(struct work_struct *work)
}
nfs_expire_all_delegations(clp);
} else {
+ int ret;
+
/* Queue an asynchronous RENEW. */
- ops->sched_state_renewal(clp, cred, renew_flags);
+ ret = ops->sched_state_renewal(clp, cred, renew_flags);
put_rpccred(cred);
- goto out_exp;
+ switch (ret) {
+ default:
+ goto out_exp;
+ case -EAGAIN:
+ case -ENOMEM:
+ break;
+ }
}
} else {
dprintk("%s: failed to call renewd. Reason: lease not expired \n",
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index 42f12118216..5194933ed41 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -787,21 +787,12 @@ void nfs4_close_sync(struct nfs4_state *state, fmode_t fmode)
* that is compatible with current->files
*/
static struct nfs4_lock_state *
-__nfs4_find_lock_state(struct nfs4_state *state, fl_owner_t fl_owner, pid_t fl_pid, unsigned int type)
+__nfs4_find_lock_state(struct nfs4_state *state, fl_owner_t fl_owner)
{
struct nfs4_lock_state *pos;
list_for_each_entry(pos, &state->lock_states, ls_locks) {
- if (type != NFS4_ANY_LOCK_TYPE && pos->ls_owner.lo_type != type)
+ if (pos->ls_owner != fl_owner)
continue;
- switch (pos->ls_owner.lo_type) {
- case NFS4_POSIX_LOCK_TYPE:
- if (pos->ls_owner.lo_u.posix_owner != fl_owner)
- continue;
- break;
- case NFS4_FLOCK_LOCK_TYPE:
- if (pos->ls_owner.lo_u.flock_owner != fl_pid)
- continue;
- }
atomic_inc(&pos->ls_count);
return pos;
}
@@ -813,7 +804,7 @@ __nfs4_find_lock_state(struct nfs4_state *state, fl_owner_t fl_owner, pid_t fl_p
* exists, return an uninitialized one.
*
*/
-static struct nfs4_lock_state *nfs4_alloc_lock_state(struct nfs4_state *state, fl_owner_t fl_owner, pid_t fl_pid, unsigned int type)
+static struct nfs4_lock_state *nfs4_alloc_lock_state(struct nfs4_state *state, fl_owner_t fl_owner)
{
struct nfs4_lock_state *lsp;
struct nfs_server *server = state->owner->so_server;
@@ -824,17 +815,7 @@ static struct nfs4_lock_state *nfs4_alloc_lock_state(struct nfs4_state *state, f
nfs4_init_seqid_counter(&lsp->ls_seqid);
atomic_set(&lsp->ls_count, 1);
lsp->ls_state = state;
- lsp->ls_owner.lo_type = type;
- switch (lsp->ls_owner.lo_type) {
- case NFS4_FLOCK_LOCK_TYPE:
- lsp->ls_owner.lo_u.flock_owner = fl_pid;
- break;
- case NFS4_POSIX_LOCK_TYPE:
- lsp->ls_owner.lo_u.posix_owner = fl_owner;
- break;
- default:
- goto out_free;
- }
+ lsp->ls_owner = fl_owner;
lsp->ls_seqid.owner_id = ida_simple_get(&server->lockowner_id, 0, 0, GFP_NOFS);
if (lsp->ls_seqid.owner_id < 0)
goto out_free;
@@ -857,13 +838,13 @@ void nfs4_free_lock_state(struct nfs_server *server, struct nfs4_lock_state *lsp
* exists, return an uninitialized one.
*
*/
-static struct nfs4_lock_state *nfs4_get_lock_state(struct nfs4_state *state, fl_owner_t owner, pid_t pid, unsigned int type)
+static struct nfs4_lock_state *nfs4_get_lock_state(struct nfs4_state *state, fl_owner_t owner)
{
struct nfs4_lock_state *lsp, *new = NULL;
for(;;) {
spin_lock(&state->state_lock);
- lsp = __nfs4_find_lock_state(state, owner, pid, type);
+ lsp = __nfs4_find_lock_state(state, owner);
if (lsp != NULL)
break;
if (new != NULL) {
@@ -874,7 +855,7 @@ static struct nfs4_lock_state *nfs4_get_lock_state(struct nfs4_state *state, fl_
break;
}
spin_unlock(&state->state_lock);
- new = nfs4_alloc_lock_state(state, owner, pid, type);
+ new = nfs4_alloc_lock_state(state, owner);
if (new == NULL)
return NULL;
}
@@ -935,13 +916,7 @@ int nfs4_set_lock_state(struct nfs4_state *state, struct file_lock *fl)
if (fl->fl_ops != NULL)
return 0;
- if (fl->fl_flags & FL_POSIX)
- lsp = nfs4_get_lock_state(state, fl->fl_owner, 0, NFS4_POSIX_LOCK_TYPE);
- else if (fl->fl_flags & FL_FLOCK)
- lsp = nfs4_get_lock_state(state, NULL, fl->fl_pid,
- NFS4_FLOCK_LOCK_TYPE);
- else
- return -EINVAL;
+ lsp = nfs4_get_lock_state(state, fl->fl_owner);
if (lsp == NULL)
return -ENOMEM;
fl->fl_u.nfs4_fl.owner = lsp;
@@ -955,7 +930,6 @@ static int nfs4_copy_lock_stateid(nfs4_stateid *dst,
{
struct nfs4_lock_state *lsp;
fl_owner_t fl_owner;
- pid_t fl_pid;
int ret = -ENOENT;
@@ -966,9 +940,8 @@ static int nfs4_copy_lock_stateid(nfs4_stateid *dst,
goto out;
fl_owner = lockowner->l_owner;
- fl_pid = lockowner->l_pid;
spin_lock(&state->state_lock);
- lsp = __nfs4_find_lock_state(state, fl_owner, fl_pid, NFS4_ANY_LOCK_TYPE);
+ lsp = __nfs4_find_lock_state(state, fl_owner);
if (lsp && test_bit(NFS_LOCK_LOST, &lsp->ls_flags))
ret = -EIO;
else if (lsp != NULL && test_bit(NFS_LOCK_INITIALIZED, &lsp->ls_flags) != 0) {
@@ -1732,7 +1705,8 @@ restart:
if (status < 0) {
set_bit(ops->owner_flag_bit, &sp->so_flags);
nfs4_put_state_owner(sp);
- return nfs4_recovery_handle_error(clp, status);
+ status = nfs4_recovery_handle_error(clp, status);
+ return (status != 0) ? status : -EAGAIN;
}
nfs4_put_state_owner(sp);
@@ -1741,7 +1715,7 @@ restart:
spin_unlock(&clp->cl_lock);
}
rcu_read_unlock();
- return status;
+ return 0;
}
static int nfs4_check_lease(struct nfs_client *clp)
@@ -1788,7 +1762,6 @@ static int nfs4_handle_reclaim_lease_error(struct nfs_client *clp, int status)
break;
case -NFS4ERR_STALE_CLIENTID:
clear_bit(NFS4CLNT_LEASE_CONFIRM, &clp->cl_state);
- nfs4_state_clear_reclaim_reboot(clp);
nfs4_state_start_reclaim_reboot(clp);
break;
case -NFS4ERR_CLID_INUSE:
@@ -2372,6 +2345,7 @@ static void nfs4_state_manager(struct nfs_client *clp)
status = nfs4_check_lease(clp);
if (status < 0)
goto out_error;
+ continue;
}
if (test_and_clear_bit(NFS4CLNT_MOVED, &clp->cl_state)) {
@@ -2393,14 +2367,11 @@ static void nfs4_state_manager(struct nfs_client *clp)
section = "reclaim reboot";
status = nfs4_do_reclaim(clp,
clp->cl_mvops->reboot_recovery_ops);
- if (test_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state) ||
- test_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state))
- continue;
- nfs4_state_end_reclaim_reboot(clp);
- if (test_bit(NFS4CLNT_RECLAIM_NOGRACE, &clp->cl_state))
+ if (status == -EAGAIN)
continue;
if (status < 0)
goto out_error;
+ nfs4_state_end_reclaim_reboot(clp);
}
/* Now recover expired state... */
@@ -2408,9 +2379,7 @@ static void nfs4_state_manager(struct nfs_client *clp)
section = "reclaim nograce";
status = nfs4_do_reclaim(clp,
clp->cl_mvops->nograce_recovery_ops);
- if (test_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state) ||
- test_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state) ||
- test_bit(NFS4CLNT_RECLAIM_REBOOT, &clp->cl_state))
+ if (status == -EAGAIN)
continue;
if (status < 0)
goto out_error;
diff --git a/fs/nfs/nfs4trace.h b/fs/nfs/nfs4trace.h
index 0a744f3a86f..1c32adbe728 100644
--- a/fs/nfs/nfs4trace.h
+++ b/fs/nfs/nfs4trace.h
@@ -932,11 +932,11 @@ DEFINE_NFS4_IDMAP_EVENT(nfs4_map_gid_to_group);
DECLARE_EVENT_CLASS(nfs4_read_event,
TP_PROTO(
- const struct nfs_pgio_data *data,
+ const struct nfs_pgio_header *hdr,
int error
),
- TP_ARGS(data, error),
+ TP_ARGS(hdr, error),
TP_STRUCT__entry(
__field(dev_t, dev)
@@ -948,12 +948,12 @@ DECLARE_EVENT_CLASS(nfs4_read_event,
),
TP_fast_assign(
- const struct inode *inode = data->header->inode;
+ const struct inode *inode = hdr->inode;
__entry->dev = inode->i_sb->s_dev;
__entry->fileid = NFS_FILEID(inode);
__entry->fhandle = nfs_fhandle_hash(NFS_FH(inode));
- __entry->offset = data->args.offset;
- __entry->count = data->args.count;
+ __entry->offset = hdr->args.offset;
+ __entry->count = hdr->args.count;
__entry->error = error;
),
@@ -972,10 +972,10 @@ DECLARE_EVENT_CLASS(nfs4_read_event,
#define DEFINE_NFS4_READ_EVENT(name) \
DEFINE_EVENT(nfs4_read_event, name, \
TP_PROTO( \
- const struct nfs_pgio_data *data, \
+ const struct nfs_pgio_header *hdr, \
int error \
), \
- TP_ARGS(data, error))
+ TP_ARGS(hdr, error))
DEFINE_NFS4_READ_EVENT(nfs4_read);
#ifdef CONFIG_NFS_V4_1
DEFINE_NFS4_READ_EVENT(nfs4_pnfs_read);
@@ -983,11 +983,11 @@ DEFINE_NFS4_READ_EVENT(nfs4_pnfs_read);
DECLARE_EVENT_CLASS(nfs4_write_event,
TP_PROTO(
- const struct nfs_pgio_data *data,
+ const struct nfs_pgio_header *hdr,
int error
),
- TP_ARGS(data, error),
+ TP_ARGS(hdr, error),
TP_STRUCT__entry(
__field(dev_t, dev)
@@ -999,12 +999,12 @@ DECLARE_EVENT_CLASS(nfs4_write_event,
),
TP_fast_assign(
- const struct inode *inode = data->header->inode;
+ const struct inode *inode = hdr->inode;
__entry->dev = inode->i_sb->s_dev;
__entry->fileid = NFS_FILEID(inode);
__entry->fhandle = nfs_fhandle_hash(NFS_FH(inode));
- __entry->offset = data->args.offset;
- __entry->count = data->args.count;
+ __entry->offset = hdr->args.offset;
+ __entry->count = hdr->args.count;
__entry->error = error;
),
@@ -1024,10 +1024,10 @@ DECLARE_EVENT_CLASS(nfs4_write_event,
#define DEFINE_NFS4_WRITE_EVENT(name) \
DEFINE_EVENT(nfs4_write_event, name, \
TP_PROTO( \
- const struct nfs_pgio_data *data, \
+ const struct nfs_pgio_header *hdr, \
int error \
), \
- TP_ARGS(data, error))
+ TP_ARGS(hdr, error))
DEFINE_NFS4_WRITE_EVENT(nfs4_write);
#ifdef CONFIG_NFS_V4_1
DEFINE_NFS4_WRITE_EVENT(nfs4_pnfs_write);
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 939ae606cfa..206c08a60c7 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -362,25 +362,19 @@ static int nfs4_stat_to_errno(int);
XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN) + 5)
#define encode_reclaim_complete_maxsz (op_encode_hdr_maxsz + 4)
#define decode_reclaim_complete_maxsz (op_decode_hdr_maxsz + 4)
-#define encode_getdevicelist_maxsz (op_encode_hdr_maxsz + 4 + \
- encode_verifier_maxsz)
-#define decode_getdevicelist_maxsz (op_decode_hdr_maxsz + \
- 2 /* nfs_cookie4 gdlr_cookie */ + \
- decode_verifier_maxsz \
- /* verifier4 gdlr_verifier */ + \
- 1 /* gdlr_deviceid_list count */ + \
- XDR_QUADLEN(NFS4_PNFS_GETDEVLIST_MAXNUM * \
- NFS4_DEVICEID4_SIZE) \
- /* gdlr_deviceid_list */ + \
- 1 /* bool gdlr_eof */)
-#define encode_getdeviceinfo_maxsz (op_encode_hdr_maxsz + 4 + \
- XDR_QUADLEN(NFS4_DEVICEID4_SIZE))
+#define encode_getdeviceinfo_maxsz (op_encode_hdr_maxsz + \
+ XDR_QUADLEN(NFS4_DEVICEID4_SIZE) + \
+ 1 /* layout type */ + \
+ 1 /* maxcount */ + \
+ 1 /* bitmap size */ + \
+ 1 /* notification bitmap length */ + \
+ 1 /* notification bitmap, word 0 */)
#define decode_getdeviceinfo_maxsz (op_decode_hdr_maxsz + \
1 /* layout type */ + \
1 /* opaque devaddr4 length */ + \
/* devaddr4 payload is read into page */ \
1 /* notification bitmap length */ + \
- 1 /* notification bitmap */)
+ 1 /* notification bitmap, word 0 */)
#define encode_layoutget_maxsz (op_encode_hdr_maxsz + 10 + \
encode_stateid_maxsz)
#define decode_layoutget_maxsz (op_decode_hdr_maxsz + 8 + \
@@ -395,7 +389,10 @@ static int nfs4_stat_to_errno(int);
2 /* last byte written */ + \
1 /* nt_timechanged (false) */ + \
1 /* layoutupdate4 layout type */ + \
- 1 /* NULL filelayout layoutupdate4 payload */)
+ 1 /* layoutupdate4 opaque length */)
+ /* the actual content of layoutupdate4 should
+ be allocated by drivers and spliced in
+ using xdr_write_pages */
#define decode_layoutcommit_maxsz (op_decode_hdr_maxsz + 3)
#define encode_layoutreturn_maxsz (8 + op_encode_hdr_maxsz + \
encode_stateid_maxsz + \
@@ -809,14 +806,6 @@ static int nfs4_stat_to_errno(int);
#define NFS4_dec_reclaim_complete_sz (compound_decode_hdr_maxsz + \
decode_sequence_maxsz + \
decode_reclaim_complete_maxsz)
-#define NFS4_enc_getdevicelist_sz (compound_encode_hdr_maxsz + \
- encode_sequence_maxsz + \
- encode_putfh_maxsz + \
- encode_getdevicelist_maxsz)
-#define NFS4_dec_getdevicelist_sz (compound_decode_hdr_maxsz + \
- decode_sequence_maxsz + \
- decode_putfh_maxsz + \
- decode_getdevicelist_maxsz)
#define NFS4_enc_getdeviceinfo_sz (compound_encode_hdr_maxsz + \
encode_sequence_maxsz +\
encode_getdeviceinfo_maxsz)
@@ -1927,24 +1916,6 @@ static void encode_sequence(struct xdr_stream *xdr,
#ifdef CONFIG_NFS_V4_1
static void
-encode_getdevicelist(struct xdr_stream *xdr,
- const struct nfs4_getdevicelist_args *args,
- struct compound_hdr *hdr)
-{
- __be32 *p;
- nfs4_verifier dummy = {
- .data = "dummmmmy",
- };
-
- encode_op_hdr(xdr, OP_GETDEVICELIST, decode_getdevicelist_maxsz, hdr);
- p = reserve_space(xdr, 16);
- *p++ = cpu_to_be32(args->layoutclass);
- *p++ = cpu_to_be32(NFS4_PNFS_GETDEVLIST_MAXNUM);
- xdr_encode_hyper(p, 0ULL); /* cookie */
- encode_nfs4_verifier(xdr, &dummy);
-}
-
-static void
encode_getdeviceinfo(struct xdr_stream *xdr,
const struct nfs4_getdeviceinfo_args *args,
struct compound_hdr *hdr)
@@ -1952,12 +1923,15 @@ encode_getdeviceinfo(struct xdr_stream *xdr,
__be32 *p;
encode_op_hdr(xdr, OP_GETDEVICEINFO, decode_getdeviceinfo_maxsz, hdr);
- p = reserve_space(xdr, 12 + NFS4_DEVICEID4_SIZE);
+ p = reserve_space(xdr, NFS4_DEVICEID4_SIZE + 4 + 4);
p = xdr_encode_opaque_fixed(p, args->pdev->dev_id.data,
NFS4_DEVICEID4_SIZE);
*p++ = cpu_to_be32(args->pdev->layout_type);
*p++ = cpu_to_be32(args->pdev->maxcount); /* gdia_maxcount */
- *p++ = cpu_to_be32(0); /* bitmap length 0 */
+
+ p = reserve_space(xdr, 4 + 4);
+ *p++ = cpu_to_be32(1); /* bitmap length */
+ *p++ = cpu_to_be32(NOTIFY_DEVICEID4_CHANGE | NOTIFY_DEVICEID4_DELETE);
}
static void
@@ -1990,7 +1964,7 @@ encode_layoutget(struct xdr_stream *xdr,
static int
encode_layoutcommit(struct xdr_stream *xdr,
struct inode *inode,
- const struct nfs4_layoutcommit_args *args,
+ struct nfs4_layoutcommit_args *args,
struct compound_hdr *hdr)
{
__be32 *p;
@@ -2011,11 +1985,16 @@ encode_layoutcommit(struct xdr_stream *xdr,
*p++ = cpu_to_be32(0); /* Never send time_modify_changed */
*p++ = cpu_to_be32(NFS_SERVER(args->inode)->pnfs_curr_ld->id);/* type */
- if (NFS_SERVER(inode)->pnfs_curr_ld->encode_layoutcommit)
+ if (NFS_SERVER(inode)->pnfs_curr_ld->encode_layoutcommit) {
NFS_SERVER(inode)->pnfs_curr_ld->encode_layoutcommit(
NFS_I(inode)->layout, xdr, args);
- else
- encode_uint32(xdr, 0); /* no layout-type payload */
+ } else {
+ encode_uint32(xdr, args->layoutupdate_len);
+ if (args->layoutupdate_pages) {
+ xdr_write_pages(xdr, args->layoutupdate_pages, 0,
+ args->layoutupdate_len);
+ }
+ }
return 0;
}
@@ -2893,24 +2872,6 @@ static void nfs4_xdr_enc_reclaim_complete(struct rpc_rqst *req,
}
/*
- * Encode GETDEVICELIST request
- */
-static void nfs4_xdr_enc_getdevicelist(struct rpc_rqst *req,
- struct xdr_stream *xdr,
- struct nfs4_getdevicelist_args *args)
-{
- struct compound_hdr hdr = {
- .minorversion = nfs4_xdr_minorversion(&args->seq_args),
- };
-
- encode_compound_hdr(xdr, req, &hdr);
- encode_sequence(xdr, &args->seq_args, &hdr);
- encode_putfh(xdr, args->fh, &hdr);
- encode_getdevicelist(xdr, args, &hdr);
- encode_nops(&hdr);
-}
-
-/*
* Encode GETDEVICEINFO request
*/
static void nfs4_xdr_enc_getdeviceinfo(struct rpc_rqst *req,
@@ -5765,54 +5726,6 @@ out_overflow:
}
#if defined(CONFIG_NFS_V4_1)
-/*
- * TODO: Need to handle case when EOF != true;
- */
-static int decode_getdevicelist(struct xdr_stream *xdr,
- struct pnfs_devicelist *res)
-{
- __be32 *p;
- int status, i;
- nfs4_verifier verftemp;
-
- status = decode_op_hdr(xdr, OP_GETDEVICELIST);
- if (status)
- return status;
-
- p = xdr_inline_decode(xdr, 8 + 8 + 4);
- if (unlikely(!p))
- goto out_overflow;
-
- /* TODO: Skip cookie for now */
- p += 2;
-
- /* Read verifier */
- p = xdr_decode_opaque_fixed(p, verftemp.data, NFS4_VERIFIER_SIZE);
-
- res->num_devs = be32_to_cpup(p);
-
- dprintk("%s: num_dev %d\n", __func__, res->num_devs);
-
- if (res->num_devs > NFS4_PNFS_GETDEVLIST_MAXNUM) {
- printk(KERN_ERR "NFS: %s too many result dev_num %u\n",
- __func__, res->num_devs);
- return -EIO;
- }
-
- p = xdr_inline_decode(xdr,
- res->num_devs * NFS4_DEVICEID4_SIZE + 4);
- if (unlikely(!p))
- goto out_overflow;
- for (i = 0; i < res->num_devs; i++)
- p = xdr_decode_opaque_fixed(p, res->dev_id[i].data,
- NFS4_DEVICEID4_SIZE);
- res->eof = be32_to_cpup(p);
- return 0;
-out_overflow:
- print_overflow_msg(__func__, xdr);
- return -EIO;
-}
-
static int decode_getdeviceinfo(struct xdr_stream *xdr,
struct pnfs_device *pdev)
{
@@ -5862,9 +5775,16 @@ static int decode_getdeviceinfo(struct xdr_stream *xdr,
p = xdr_inline_decode(xdr, 4 * len);
if (unlikely(!p))
goto out_overflow;
- for (i = 0; i < len; i++, p++) {
- if (be32_to_cpup(p)) {
- dprintk("%s: notifications not supported\n",
+
+ if (be32_to_cpup(p++) &
+ ~(NOTIFY_DEVICEID4_CHANGE | NOTIFY_DEVICEID4_DELETE)) {
+ dprintk("%s: unsupported notification\n",
+ __func__);
+ }
+
+ for (i = 1; i < len; i++) {
+ if (be32_to_cpup(p++)) {
+ dprintk("%s: unsupported notification\n",
__func__);
return -EIO;
}
@@ -7092,33 +7012,7 @@ static int nfs4_xdr_dec_reclaim_complete(struct rpc_rqst *rqstp,
if (!status)
status = decode_sequence(xdr, &res->seq_res, rqstp);
if (!status)
- status = decode_reclaim_complete(xdr, (void *)NULL);
- return status;
-}
-
-/*
- * Decode GETDEVICELIST response
- */
-static int nfs4_xdr_dec_getdevicelist(struct rpc_rqst *rqstp,
- struct xdr_stream *xdr,
- struct nfs4_getdevicelist_res *res)
-{
- struct compound_hdr hdr;
- int status;
-
- dprintk("encoding getdevicelist!\n");
-
- status = decode_compound_hdr(xdr, &hdr);
- if (status != 0)
- goto out;
- status = decode_sequence(xdr, &res->seq_res, rqstp);
- if (status != 0)
- goto out;
- status = decode_putfh(xdr);
- if (status != 0)
- goto out;
- status = decode_getdevicelist(xdr, res->devlist);
-out:
+ status = decode_reclaim_complete(xdr, NULL);
return status;
}
@@ -7427,6 +7321,10 @@ nfs4_stat_to_errno(int stat)
return -stat;
}
+#ifdef CONFIG_NFS_V4_2
+#include "nfs42xdr.c"
+#endif /* CONFIG_NFS_V4_2 */
+
#define PROC(proc, argtype, restype) \
[NFSPROC4_CLNT_##proc] = { \
.p_proc = NFSPROC4_COMPOUND, \
@@ -7490,11 +7388,13 @@ struct rpc_procinfo nfs4_procedures[] = {
PROC(SECINFO_NO_NAME, enc_secinfo_no_name, dec_secinfo_no_name),
PROC(TEST_STATEID, enc_test_stateid, dec_test_stateid),
PROC(FREE_STATEID, enc_free_stateid, dec_free_stateid),
- PROC(GETDEVICELIST, enc_getdevicelist, dec_getdevicelist),
PROC(BIND_CONN_TO_SESSION,
enc_bind_conn_to_session, dec_bind_conn_to_session),
PROC(DESTROY_CLIENTID, enc_destroy_clientid, dec_destroy_clientid),
#endif /* CONFIG_NFS_V4_1 */
+#ifdef CONFIG_NFS_V4_2
+ PROC(SEEK, enc_seek, dec_seek),
+#endif /* CONFIG_NFS_V4_2 */
};
const struct rpc_version nfs_version4 = {
diff --git a/fs/nfs/objlayout/objio_osd.c b/fs/nfs/objlayout/objio_osd.c
index 611320753db..9e5bc42180e 100644
--- a/fs/nfs/objlayout/objio_osd.c
+++ b/fs/nfs/objlayout/objio_osd.c
@@ -5,7 +5,7 @@
* All rights reserved.
*
* Benny Halevy <bhalevy@panasas.com>
- * Boaz Harrosh <bharrosh@panasas.com>
+ * Boaz Harrosh <ooo@electrozaur.com>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2
@@ -60,52 +60,6 @@ objio_free_deviceid_node(struct nfs4_deviceid_node *d)
kfree(de);
}
-static struct objio_dev_ent *_dev_list_find(const struct nfs_server *nfss,
- const struct nfs4_deviceid *d_id)
-{
- struct nfs4_deviceid_node *d;
- struct objio_dev_ent *de;
-
- d = nfs4_find_get_deviceid(nfss->pnfs_curr_ld, nfss->nfs_client, d_id);
- if (!d)
- return NULL;
-
- de = container_of(d, struct objio_dev_ent, id_node);
- return de;
-}
-
-static struct objio_dev_ent *
-_dev_list_add(const struct nfs_server *nfss,
- const struct nfs4_deviceid *d_id, struct osd_dev *od,
- gfp_t gfp_flags)
-{
- struct nfs4_deviceid_node *d;
- struct objio_dev_ent *de = kzalloc(sizeof(*de), gfp_flags);
- struct objio_dev_ent *n;
-
- if (!de) {
- dprintk("%s: -ENOMEM od=%p\n", __func__, od);
- return NULL;
- }
-
- dprintk("%s: Adding od=%p\n", __func__, od);
- nfs4_init_deviceid_node(&de->id_node,
- nfss->pnfs_curr_ld,
- nfss->nfs_client,
- d_id);
- de->od.od = od;
-
- d = nfs4_insert_deviceid_node(&de->id_node);
- n = container_of(d, struct objio_dev_ent, id_node);
- if (n != de) {
- dprintk("%s: Race with other n->od=%p\n", __func__, n->od.od);
- objio_free_deviceid_node(&de->id_node);
- de = n;
- }
-
- return de;
-}
-
struct objio_segment {
struct pnfs_layout_segment lseg;
@@ -130,29 +84,24 @@ struct objio_state {
/* Send and wait for a get_device_info of devices in the layout,
then look them up with the osd_initiator library */
-static int objio_devices_lookup(struct pnfs_layout_hdr *pnfslay,
- struct objio_segment *objio_seg, unsigned c, struct nfs4_deviceid *d_id,
- gfp_t gfp_flags)
+struct nfs4_deviceid_node *
+objio_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev,
+ gfp_t gfp_flags)
{
struct pnfs_osd_deviceaddr *deviceaddr;
- struct objio_dev_ent *ode;
+ struct objio_dev_ent *ode = NULL;
struct osd_dev *od;
struct osd_dev_info odi;
bool retry_flag = true;
+ __be32 *p;
int err;
- ode = _dev_list_find(NFS_SERVER(pnfslay->plh_inode), d_id);
- if (ode) {
- objio_seg->oc.ods[c] = &ode->od; /* must use container_of */
- return 0;
- }
+ deviceaddr = kzalloc(sizeof(*deviceaddr), gfp_flags);
+ if (!deviceaddr)
+ return NULL;
- err = objlayout_get_deviceinfo(pnfslay, d_id, &deviceaddr, gfp_flags);
- if (unlikely(err)) {
- dprintk("%s: objlayout_get_deviceinfo dev(%llx:%llx) =>%d\n",
- __func__, _DEVID_LO(d_id), _DEVID_HI(d_id), err);
- return err;
- }
+ p = page_address(pdev->pages[0]);
+ pnfs_osd_xdr_decode_deviceaddr(deviceaddr, p);
odi.systemid_len = deviceaddr->oda_systemid.len;
if (odi.systemid_len > sizeof(odi.systemid)) {
@@ -188,14 +137,24 @@ retry_lookup:
goto out;
}
- ode = _dev_list_add(NFS_SERVER(pnfslay->plh_inode), d_id, od,
- gfp_flags);
- objio_seg->oc.ods[c] = &ode->od; /* must use container_of */
dprintk("Adding new dev_id(%llx:%llx)\n",
- _DEVID_LO(d_id), _DEVID_HI(d_id));
+ _DEVID_LO(&pdev->dev_id), _DEVID_HI(&pdev->dev_id));
+
+ ode = kzalloc(sizeof(*ode), gfp_flags);
+ if (!ode) {
+ dprintk("%s: -ENOMEM od=%p\n", __func__, od);
+ goto out;
+ }
+
+ nfs4_init_deviceid_node(&ode->id_node, server, &pdev->dev_id);
+ kfree(deviceaddr);
+
+ ode->od.od = od;
+ return &ode->id_node;
+
out:
- objlayout_put_deviceinfo(deviceaddr);
- return err;
+ kfree(deviceaddr);
+ return NULL;
}
static void copy_single_comp(struct ore_components *oc, unsigned c,
@@ -254,6 +213,7 @@ int objio_alloc_lseg(struct pnfs_layout_segment **outp,
struct xdr_stream *xdr,
gfp_t gfp_flags)
{
+ struct nfs_server *server = NFS_SERVER(pnfslay->plh_inode);
struct objio_segment *objio_seg;
struct pnfs_osd_xdr_decode_layout_iter iter;
struct pnfs_osd_layout layout;
@@ -283,13 +243,21 @@ int objio_alloc_lseg(struct pnfs_layout_segment **outp,
objio_seg->oc.first_dev = layout.olo_comps_index;
cur_comp = 0;
while (pnfs_osd_xdr_decode_layout_comp(&src_comp, &iter, xdr, &err)) {
+ struct nfs4_deviceid_node *d;
+ struct objio_dev_ent *ode;
+
copy_single_comp(&objio_seg->oc, cur_comp, &src_comp);
- err = objio_devices_lookup(pnfslay, objio_seg, cur_comp,
- &src_comp.oc_object_id.oid_device_id,
- gfp_flags);
- if (err)
+
+ d = nfs4_find_get_deviceid(server,
+ &src_comp.oc_object_id.oid_device_id,
+ pnfslay->plh_lc_cred, gfp_flags);
+ if (!d) {
+ err = -ENXIO;
goto err;
- ++cur_comp;
+ }
+
+ ode = container_of(d, struct objio_dev_ent, id_node);
+ objio_seg->oc.ods[cur_comp++] = &ode->od;
}
/* pnfs_osd_xdr_decode_layout_comp returns false on error */
if (unlikely(err))
@@ -439,22 +407,21 @@ static void _read_done(struct ore_io_state *ios, void *private)
objlayout_read_done(&objios->oir, status, objios->sync);
}
-int objio_read_pagelist(struct nfs_pgio_data *rdata)
+int objio_read_pagelist(struct nfs_pgio_header *hdr)
{
- struct nfs_pgio_header *hdr = rdata->header;
struct objio_state *objios;
int ret;
ret = objio_alloc_io_state(NFS_I(hdr->inode)->layout, true,
- hdr->lseg, rdata->args.pages, rdata->args.pgbase,
- rdata->args.offset, rdata->args.count, rdata,
+ hdr->lseg, hdr->args.pages, hdr->args.pgbase,
+ hdr->args.offset, hdr->args.count, hdr,
GFP_KERNEL, &objios);
if (unlikely(ret))
return ret;
objios->ios->done = _read_done;
dprintk("%s: offset=0x%llx length=0x%x\n", __func__,
- rdata->args.offset, rdata->args.count);
+ hdr->args.offset, hdr->args.count);
ret = ore_read(objios->ios);
if (unlikely(ret))
objio_free_result(&objios->oir);
@@ -487,11 +454,11 @@ static void _write_done(struct ore_io_state *ios, void *private)
static struct page *__r4w_get_page(void *priv, u64 offset, bool *uptodate)
{
struct objio_state *objios = priv;
- struct nfs_pgio_data *wdata = objios->oir.rpcdata;
- struct address_space *mapping = wdata->header->inode->i_mapping;
+ struct nfs_pgio_header *hdr = objios->oir.rpcdata;
+ struct address_space *mapping = hdr->inode->i_mapping;
pgoff_t index = offset / PAGE_SIZE;
struct page *page;
- loff_t i_size = i_size_read(wdata->header->inode);
+ loff_t i_size = i_size_read(hdr->inode);
if (offset >= i_size) {
*uptodate = true;
@@ -531,15 +498,14 @@ static const struct _ore_r4w_op _r4w_op = {
.put_page = &__r4w_put_page,
};
-int objio_write_pagelist(struct nfs_pgio_data *wdata, int how)
+int objio_write_pagelist(struct nfs_pgio_header *hdr, int how)
{
- struct nfs_pgio_header *hdr = wdata->header;
struct objio_state *objios;
int ret;
ret = objio_alloc_io_state(NFS_I(hdr->inode)->layout, false,
- hdr->lseg, wdata->args.pages, wdata->args.pgbase,
- wdata->args.offset, wdata->args.count, wdata, GFP_NOFS,
+ hdr->lseg, hdr->args.pages, hdr->args.pgbase,
+ hdr->args.offset, hdr->args.count, hdr, GFP_NOFS,
&objios);
if (unlikely(ret))
return ret;
@@ -551,7 +517,7 @@ int objio_write_pagelist(struct nfs_pgio_data *wdata, int how)
objios->ios->done = _write_done;
dprintk("%s: offset=0x%llx length=0x%x\n", __func__,
- wdata->args.offset, wdata->args.count);
+ hdr->args.offset, hdr->args.count);
ret = ore_write(objios->ios);
if (unlikely(ret)) {
objio_free_result(&objios->oir);
@@ -655,6 +621,7 @@ static struct pnfs_layoutdriver_type objlayout_type = {
.flags = PNFS_LAYOUTRET_ON_SETATTR |
PNFS_LAYOUTRET_ON_ERROR,
+ .max_deviceinfo_size = PAGE_SIZE,
.owner = THIS_MODULE,
.alloc_layout_hdr = objlayout_alloc_layout_hdr,
.free_layout_hdr = objlayout_free_layout_hdr,
diff --git a/fs/nfs/objlayout/objlayout.c b/fs/nfs/objlayout/objlayout.c
index 765d3f54e98..919efd4a1a2 100644
--- a/fs/nfs/objlayout/objlayout.c
+++ b/fs/nfs/objlayout/objlayout.c
@@ -5,7 +5,7 @@
* All rights reserved.
*
* Benny Halevy <bhalevy@panasas.com>
- * Boaz Harrosh <bharrosh@panasas.com>
+ * Boaz Harrosh <ooo@electrozaur.com>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2
@@ -229,36 +229,36 @@ objlayout_io_set_result(struct objlayout_io_res *oir, unsigned index,
static void _rpc_read_complete(struct work_struct *work)
{
struct rpc_task *task;
- struct nfs_pgio_data *rdata;
+ struct nfs_pgio_header *hdr;
dprintk("%s enter\n", __func__);
task = container_of(work, struct rpc_task, u.tk_work);
- rdata = container_of(task, struct nfs_pgio_data, task);
+ hdr = container_of(task, struct nfs_pgio_header, task);
- pnfs_ld_read_done(rdata);
+ pnfs_ld_read_done(hdr);
}
void
objlayout_read_done(struct objlayout_io_res *oir, ssize_t status, bool sync)
{
- struct nfs_pgio_data *rdata = oir->rpcdata;
+ struct nfs_pgio_header *hdr = oir->rpcdata;
- oir->status = rdata->task.tk_status = status;
+ oir->status = hdr->task.tk_status = status;
if (status >= 0)
- rdata->res.count = status;
+ hdr->res.count = status;
else
- rdata->header->pnfs_error = status;
+ hdr->pnfs_error = status;
objlayout_iodone(oir);
/* must not use oir after this point */
dprintk("%s: Return status=%zd eof=%d sync=%d\n", __func__,
- status, rdata->res.eof, sync);
+ status, hdr->res.eof, sync);
if (sync)
- pnfs_ld_read_done(rdata);
+ pnfs_ld_read_done(hdr);
else {
- INIT_WORK(&rdata->task.u.tk_work, _rpc_read_complete);
- schedule_work(&rdata->task.u.tk_work);
+ INIT_WORK(&hdr->task.u.tk_work, _rpc_read_complete);
+ schedule_work(&hdr->task.u.tk_work);
}
}
@@ -266,12 +266,11 @@ objlayout_read_done(struct objlayout_io_res *oir, ssize_t status, bool sync)
* Perform sync or async reads.
*/
enum pnfs_try_status
-objlayout_read_pagelist(struct nfs_pgio_data *rdata)
+objlayout_read_pagelist(struct nfs_pgio_header *hdr)
{
- struct nfs_pgio_header *hdr = rdata->header;
struct inode *inode = hdr->inode;
- loff_t offset = rdata->args.offset;
- size_t count = rdata->args.count;
+ loff_t offset = hdr->args.offset;
+ size_t count = hdr->args.count;
int err;
loff_t eof;
@@ -279,23 +278,23 @@ objlayout_read_pagelist(struct nfs_pgio_data *rdata)
if (unlikely(offset + count > eof)) {
if (offset >= eof) {
err = 0;
- rdata->res.count = 0;
- rdata->res.eof = 1;
+ hdr->res.count = 0;
+ hdr->res.eof = 1;
/*FIXME: do we need to call pnfs_ld_read_done() */
goto out;
}
count = eof - offset;
}
- rdata->res.eof = (offset + count) >= eof;
- _fix_verify_io_params(hdr->lseg, &rdata->args.pages,
- &rdata->args.pgbase,
- rdata->args.offset, rdata->args.count);
+ hdr->res.eof = (offset + count) >= eof;
+ _fix_verify_io_params(hdr->lseg, &hdr->args.pages,
+ &hdr->args.pgbase,
+ hdr->args.offset, hdr->args.count);
dprintk("%s: inode(%lx) offset 0x%llx count 0x%Zx eof=%d\n",
- __func__, inode->i_ino, offset, count, rdata->res.eof);
+ __func__, inode->i_ino, offset, count, hdr->res.eof);
- err = objio_read_pagelist(rdata);
+ err = objio_read_pagelist(hdr);
out:
if (unlikely(err)) {
hdr->pnfs_error = err;
@@ -312,38 +311,38 @@ objlayout_read_pagelist(struct nfs_pgio_data *rdata)
static void _rpc_write_complete(struct work_struct *work)
{
struct rpc_task *task;
- struct nfs_pgio_data *wdata;
+ struct nfs_pgio_header *hdr;
dprintk("%s enter\n", __func__);
task = container_of(work, struct rpc_task, u.tk_work);
- wdata = container_of(task, struct nfs_pgio_data, task);
+ hdr = container_of(task, struct nfs_pgio_header, task);
- pnfs_ld_write_done(wdata);
+ pnfs_ld_write_done(hdr);
}
void
objlayout_write_done(struct objlayout_io_res *oir, ssize_t status, bool sync)
{
- struct nfs_pgio_data *wdata = oir->rpcdata;
+ struct nfs_pgio_header *hdr = oir->rpcdata;
- oir->status = wdata->task.tk_status = status;
+ oir->status = hdr->task.tk_status = status;
if (status >= 0) {
- wdata->res.count = status;
- wdata->verf.committed = oir->committed;
+ hdr->res.count = status;
+ hdr->verf.committed = oir->committed;
} else {
- wdata->header->pnfs_error = status;
+ hdr->pnfs_error = status;
}
objlayout_iodone(oir);
/* must not use oir after this point */
dprintk("%s: Return status %zd committed %d sync=%d\n", __func__,
- status, wdata->verf.committed, sync);
+ status, hdr->verf.committed, sync);
if (sync)
- pnfs_ld_write_done(wdata);
+ pnfs_ld_write_done(hdr);
else {
- INIT_WORK(&wdata->task.u.tk_work, _rpc_write_complete);
- schedule_work(&wdata->task.u.tk_work);
+ INIT_WORK(&hdr->task.u.tk_work, _rpc_write_complete);
+ schedule_work(&hdr->task.u.tk_work);
}
}
@@ -351,17 +350,15 @@ objlayout_write_done(struct objlayout_io_res *oir, ssize_t status, bool sync)
* Perform sync or async writes.
*/
enum pnfs_try_status
-objlayout_write_pagelist(struct nfs_pgio_data *wdata,
- int how)
+objlayout_write_pagelist(struct nfs_pgio_header *hdr, int how)
{
- struct nfs_pgio_header *hdr = wdata->header;
int err;
- _fix_verify_io_params(hdr->lseg, &wdata->args.pages,
- &wdata->args.pgbase,
- wdata->args.offset, wdata->args.count);
+ _fix_verify_io_params(hdr->lseg, &hdr->args.pages,
+ &hdr->args.pgbase,
+ hdr->args.offset, hdr->args.count);
- err = objio_write_pagelist(wdata, how);
+ err = objio_write_pagelist(hdr, how);
if (unlikely(err)) {
hdr->pnfs_error = err;
dprintk("%s: Returned Error %d\n", __func__, err);
@@ -577,76 +574,6 @@ loop_done:
dprintk("%s: Return\n", __func__);
}
-
-/*
- * Get Device Info API for io engines
- */
-struct objlayout_deviceinfo {
- struct page *page;
- struct pnfs_osd_deviceaddr da; /* This must be last */
-};
-
-/* Initialize and call nfs_getdeviceinfo, then decode and return a
- * "struct pnfs_osd_deviceaddr *" Eventually objlayout_put_deviceinfo()
- * should be called.
- */
-int objlayout_get_deviceinfo(struct pnfs_layout_hdr *pnfslay,
- struct nfs4_deviceid *d_id, struct pnfs_osd_deviceaddr **deviceaddr,
- gfp_t gfp_flags)
-{
- struct objlayout_deviceinfo *odi;
- struct pnfs_device pd;
- struct page *page, **pages;
- u32 *p;
- int err;
-
- page = alloc_page(gfp_flags);
- if (!page)
- return -ENOMEM;
-
- pages = &page;
- pd.pages = pages;
-
- memcpy(&pd.dev_id, d_id, sizeof(*d_id));
- pd.layout_type = LAYOUT_OSD2_OBJECTS;
- pd.pages = &page;
- pd.pgbase = 0;
- pd.pglen = PAGE_SIZE;
- pd.mincount = 0;
- pd.maxcount = PAGE_SIZE;
-
- err = nfs4_proc_getdeviceinfo(NFS_SERVER(pnfslay->plh_inode), &pd,
- pnfslay->plh_lc_cred);
- dprintk("%s nfs_getdeviceinfo returned %d\n", __func__, err);
- if (err)
- goto err_out;
-
- p = page_address(page);
- odi = kzalloc(sizeof(*odi), gfp_flags);
- if (!odi) {
- err = -ENOMEM;
- goto err_out;
- }
- pnfs_osd_xdr_decode_deviceaddr(&odi->da, p);
- odi->page = page;
- *deviceaddr = &odi->da;
- return 0;
-
-err_out:
- __free_page(page);
- return err;
-}
-
-void objlayout_put_deviceinfo(struct pnfs_osd_deviceaddr *deviceaddr)
-{
- struct objlayout_deviceinfo *odi = container_of(deviceaddr,
- struct objlayout_deviceinfo,
- da);
-
- __free_page(odi->page);
- kfree(odi);
-}
-
enum {
OBJLAYOUT_MAX_URI_LEN = 256, OBJLAYOUT_MAX_OSDNAME_LEN = 64,
OBJLAYOUT_MAX_SYSID_HEX_LEN = OSD_SYSTEMID_LEN * 2 + 1,
diff --git a/fs/nfs/objlayout/objlayout.h b/fs/nfs/objlayout/objlayout.h
index 01e041029a6..2641dbad345 100644
--- a/fs/nfs/objlayout/objlayout.h
+++ b/fs/nfs/objlayout/objlayout.h
@@ -6,7 +6,7 @@
* All rights reserved.
*
* Benny Halevy <bhalevy@panasas.com>
- * Boaz Harrosh <bharrosh@panasas.com>
+ * Boaz Harrosh <ooo@electrozaur.com>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2
@@ -119,8 +119,8 @@ extern void objio_free_lseg(struct pnfs_layout_segment *lseg);
*/
extern void objio_free_result(struct objlayout_io_res *oir);
-extern int objio_read_pagelist(struct nfs_pgio_data *rdata);
-extern int objio_write_pagelist(struct nfs_pgio_data *wdata, int how);
+extern int objio_read_pagelist(struct nfs_pgio_header *rdata);
+extern int objio_write_pagelist(struct nfs_pgio_header *wdata, int how);
/*
* callback API
@@ -149,11 +149,6 @@ extern void objlayout_read_done(struct objlayout_io_res *oir,
extern void objlayout_write_done(struct objlayout_io_res *oir,
ssize_t status, bool sync);
-extern int objlayout_get_deviceinfo(struct pnfs_layout_hdr *pnfslay,
- struct nfs4_deviceid *d_id, struct pnfs_osd_deviceaddr **deviceaddr,
- gfp_t gfp_flags);
-extern void objlayout_put_deviceinfo(struct pnfs_osd_deviceaddr *deviceaddr);
-
/*
* exported generic objects function vectors
*/
@@ -168,10 +163,10 @@ extern struct pnfs_layout_segment *objlayout_alloc_lseg(
extern void objlayout_free_lseg(struct pnfs_layout_segment *);
extern enum pnfs_try_status objlayout_read_pagelist(
- struct nfs_pgio_data *);
+ struct nfs_pgio_header *);
extern enum pnfs_try_status objlayout_write_pagelist(
- struct nfs_pgio_data *,
+ struct nfs_pgio_header *,
int how);
extern void objlayout_encode_layoutcommit(
diff --git a/fs/nfs/objlayout/pnfs_osd_xdr_cli.c b/fs/nfs/objlayout/pnfs_osd_xdr_cli.c
index b3918f7ac34..f093c7ec983 100644
--- a/fs/nfs/objlayout/pnfs_osd_xdr_cli.c
+++ b/fs/nfs/objlayout/pnfs_osd_xdr_cli.c
@@ -5,7 +5,7 @@
* All rights reserved.
*
* Benny Halevy <bhalevy@panasas.com>
- * Boaz Harrosh <bharrosh@panasas.com>
+ * Boaz Harrosh <ooo@electrozaur.com>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2
diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
index 0be5050638f..ed0db61f854 100644
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -116,7 +116,7 @@ __nfs_iocounter_wait(struct nfs_io_counter *c)
if (atomic_read(&c->io_count) == 0)
break;
ret = nfs_wait_bit_killable(&q.key);
- } while (atomic_read(&c->io_count) != 0);
+ } while (atomic_read(&c->io_count) != 0 && !ret);
finish_wait(wq, &q.wait);
return ret;
}
@@ -139,18 +139,49 @@ nfs_iocounter_wait(struct nfs_io_counter *c)
/*
* nfs_page_group_lock - lock the head of the page group
* @req - request in group that is to be locked
+ * @nonblock - if true don't block waiting for lock
*
* this lock must be held if modifying the page group list
+ *
+ * return 0 on success, < 0 on error: -EDELAY if nonblocking or the
+ * result from wait_on_bit_lock
+ *
+ * NOTE: calling with nonblock=false should always have set the
+ * lock bit (see fs/buffer.c and other uses of wait_on_bit_lock
+ * with TASK_UNINTERRUPTIBLE), so there is no need to check the result.
+ */
+int
+nfs_page_group_lock(struct nfs_page *req, bool nonblock)
+{
+ struct nfs_page *head = req->wb_head;
+
+ WARN_ON_ONCE(head != head->wb_head);
+
+ if (!test_and_set_bit(PG_HEADLOCK, &head->wb_flags))
+ return 0;
+
+ if (!nonblock)
+ return wait_on_bit_lock(&head->wb_flags, PG_HEADLOCK,
+ TASK_UNINTERRUPTIBLE);
+
+ return -EAGAIN;
+}
+
+/*
+ * nfs_page_group_lock_wait - wait for the lock to clear, but don't grab it
+ * @req - a request in the group
+ *
+ * This is a blocking call to wait for the group lock to be cleared.
*/
void
-nfs_page_group_lock(struct nfs_page *req)
+nfs_page_group_lock_wait(struct nfs_page *req)
{
struct nfs_page *head = req->wb_head;
WARN_ON_ONCE(head != head->wb_head);
- wait_on_bit_lock(&head->wb_flags, PG_HEADLOCK,
- TASK_UNINTERRUPTIBLE);
+ wait_on_bit(&head->wb_flags, PG_HEADLOCK,
+ TASK_UNINTERRUPTIBLE);
}
/*
@@ -211,7 +242,7 @@ bool nfs_page_group_sync_on_bit(struct nfs_page *req, unsigned int bit)
{
bool ret;
- nfs_page_group_lock(req);
+ nfs_page_group_lock(req, false);
ret = nfs_page_group_sync_on_bit_locked(req, bit);
nfs_page_group_unlock(req);
@@ -450,127 +481,85 @@ size_t nfs_generic_pg_test(struct nfs_pageio_descriptor *desc,
return 0;
}
+ /*
+ * Limit the request size so that we can still allocate a page array
+ * for it without upsetting the slab allocator.
+ */
+ if (((desc->pg_count + req->wb_bytes) >> PAGE_SHIFT) *
+ sizeof(struct page) > PAGE_SIZE)
+ return 0;
+
return min(desc->pg_bsize - desc->pg_count, (size_t)req->wb_bytes);
}
EXPORT_SYMBOL_GPL(nfs_generic_pg_test);
-static inline struct nfs_rw_header *NFS_RW_HEADER(struct nfs_pgio_header *hdr)
-{
- return container_of(hdr, struct nfs_rw_header, header);
-}
-
-/**
- * nfs_rw_header_alloc - Allocate a header for a read or write
- * @ops: Read or write function vector
- */
-struct nfs_rw_header *nfs_rw_header_alloc(const struct nfs_rw_ops *ops)
+struct nfs_pgio_header *nfs_pgio_header_alloc(const struct nfs_rw_ops *ops)
{
- struct nfs_rw_header *header = ops->rw_alloc_header();
-
- if (header) {
- struct nfs_pgio_header *hdr = &header->header;
+ struct nfs_pgio_header *hdr = ops->rw_alloc_header();
+ if (hdr) {
INIT_LIST_HEAD(&hdr->pages);
spin_lock_init(&hdr->lock);
- atomic_set(&hdr->refcnt, 0);
hdr->rw_ops = ops;
}
- return header;
+ return hdr;
}
-EXPORT_SYMBOL_GPL(nfs_rw_header_alloc);
+EXPORT_SYMBOL_GPL(nfs_pgio_header_alloc);
/*
- * nfs_rw_header_free - Free a read or write header
+ * nfs_pgio_header_free - Free a read or write header
* @hdr: The header to free
*/
-void nfs_rw_header_free(struct nfs_pgio_header *hdr)
-{
- hdr->rw_ops->rw_free_header(NFS_RW_HEADER(hdr));
-}
-EXPORT_SYMBOL_GPL(nfs_rw_header_free);
-
-/**
- * nfs_pgio_data_alloc - Allocate pageio data
- * @hdr: The header making a request
- * @pagecount: Number of pages to create
- */
-static struct nfs_pgio_data *nfs_pgio_data_alloc(struct nfs_pgio_header *hdr,
- unsigned int pagecount)
+void nfs_pgio_header_free(struct nfs_pgio_header *hdr)
{
- struct nfs_pgio_data *data, *prealloc;
-
- prealloc = &NFS_RW_HEADER(hdr)->rpc_data;
- if (prealloc->header == NULL)
- data = prealloc;
- else
- data = kzalloc(sizeof(*data), GFP_KERNEL);
- if (!data)
- goto out;
-
- if (nfs_pgarray_set(&data->pages, pagecount)) {
- data->header = hdr;
- atomic_inc(&hdr->refcnt);
- } else {
- if (data != prealloc)
- kfree(data);
- data = NULL;
- }
-out:
- return data;
+ hdr->rw_ops->rw_free_header(hdr);
}
+EXPORT_SYMBOL_GPL(nfs_pgio_header_free);
/**
- * nfs_pgio_data_release - Properly free pageio data
- * @data: The data to release
+ * nfs_pgio_data_destroy - make @hdr suitable for reuse
+ *
+ * Frees memory and releases refs from nfs_generic_pgio, so that it may
+ * be called again.
+ *
+ * @hdr: A header that has had nfs_generic_pgio called
*/
-void nfs_pgio_data_release(struct nfs_pgio_data *data)
+void nfs_pgio_data_destroy(struct nfs_pgio_header *hdr)
{
- struct nfs_pgio_header *hdr = data->header;
- struct nfs_rw_header *pageio_header = NFS_RW_HEADER(hdr);
-
- put_nfs_open_context(data->args.context);
- if (data->pages.pagevec != data->pages.page_array)
- kfree(data->pages.pagevec);
- if (data == &pageio_header->rpc_data) {
- data->header = NULL;
- data = NULL;
- }
- if (atomic_dec_and_test(&hdr->refcnt))
- hdr->completion_ops->completion(hdr);
- /* Note: we only free the rpc_task after callbacks are done.
- * See the comment in rpc_free_task() for why
- */
- kfree(data);
+ if (hdr->args.context)
+ put_nfs_open_context(hdr->args.context);
+ if (hdr->page_array.pagevec != hdr->page_array.page_array)
+ kfree(hdr->page_array.pagevec);
}
-EXPORT_SYMBOL_GPL(nfs_pgio_data_release);
+EXPORT_SYMBOL_GPL(nfs_pgio_data_destroy);
/**
* nfs_pgio_rpcsetup - Set up arguments for a pageio call
- * @data: The pageio data
+ * @hdr: The pageio hdr
* @count: Number of bytes to read
* @offset: Initial offset
* @how: How to commit data (writes only)
* @cinfo: Commit information for the call (writes only)
*/
-static void nfs_pgio_rpcsetup(struct nfs_pgio_data *data,
+static void nfs_pgio_rpcsetup(struct nfs_pgio_header *hdr,
unsigned int count, unsigned int offset,
int how, struct nfs_commit_info *cinfo)
{
- struct nfs_page *req = data->header->req;
+ struct nfs_page *req = hdr->req;
/* Set up the RPC argument and reply structs
- * NB: take care not to mess about with data->commit et al. */
+ * NB: take care not to mess about with hdr->commit et al. */
- data->args.fh = NFS_FH(data->header->inode);
- data->args.offset = req_offset(req) + offset;
+ hdr->args.fh = NFS_FH(hdr->inode);
+ hdr->args.offset = req_offset(req) + offset;
/* pnfs_set_layoutcommit needs this */
- data->mds_offset = data->args.offset;
- data->args.pgbase = req->wb_pgbase + offset;
- data->args.pages = data->pages.pagevec;
- data->args.count = count;
- data->args.context = get_nfs_open_context(req->wb_context);
- data->args.lock_context = req->wb_lock_context;
- data->args.stable = NFS_UNSTABLE;
+ hdr->mds_offset = hdr->args.offset;
+ hdr->args.pgbase = req->wb_pgbase + offset;
+ hdr->args.pages = hdr->page_array.pagevec;
+ hdr->args.count = count;
+ hdr->args.context = get_nfs_open_context(req->wb_context);
+ hdr->args.lock_context = req->wb_lock_context;
+ hdr->args.stable = NFS_UNSTABLE;
switch (how & (FLUSH_STABLE | FLUSH_COND_STABLE)) {
case 0:
break;
@@ -578,59 +567,59 @@ static void nfs_pgio_rpcsetup(struct nfs_pgio_data *data,
if (nfs_reqs_to_commit(cinfo))
break;
default:
- data->args.stable = NFS_FILE_SYNC;
+ hdr->args.stable = NFS_FILE_SYNC;
}
- data->res.fattr = &data->fattr;
- data->res.count = count;
- data->res.eof = 0;
- data->res.verf = &data->verf;
- nfs_fattr_init(&data->fattr);
+ hdr->res.fattr = &hdr->fattr;
+ hdr->res.count = count;
+ hdr->res.eof = 0;
+ hdr->res.verf = &hdr->verf;
+ nfs_fattr_init(&hdr->fattr);
}
/**
- * nfs_pgio_prepare - Prepare pageio data to go over the wire
+ * nfs_pgio_prepare - Prepare pageio hdr to go over the wire
* @task: The current task
- * @calldata: pageio data to prepare
+ * @calldata: pageio header to prepare
*/
static void nfs_pgio_prepare(struct rpc_task *task, void *calldata)
{
- struct nfs_pgio_data *data = calldata;
+ struct nfs_pgio_header *hdr = calldata;
int err;
- err = NFS_PROTO(data->header->inode)->pgio_rpc_prepare(task, data);
+ err = NFS_PROTO(hdr->inode)->pgio_rpc_prepare(task, hdr);
if (err)
rpc_exit(task, err);
}
-int nfs_initiate_pgio(struct rpc_clnt *clnt, struct nfs_pgio_data *data,
+int nfs_initiate_pgio(struct rpc_clnt *clnt, struct nfs_pgio_header *hdr,
const struct rpc_call_ops *call_ops, int how, int flags)
{
struct rpc_task *task;
struct rpc_message msg = {
- .rpc_argp = &data->args,
- .rpc_resp = &data->res,
- .rpc_cred = data->header->cred,
+ .rpc_argp = &hdr->args,
+ .rpc_resp = &hdr->res,
+ .rpc_cred = hdr->cred,
};
struct rpc_task_setup task_setup_data = {
.rpc_client = clnt,
- .task = &data->task,
+ .task = &hdr->task,
.rpc_message = &msg,
.callback_ops = call_ops,
- .callback_data = data,
+ .callback_data = hdr,
.workqueue = nfsiod_workqueue,
.flags = RPC_TASK_ASYNC | flags,
};
int ret = 0;
- data->header->rw_ops->rw_initiate(data, &msg, &task_setup_data, how);
+ hdr->rw_ops->rw_initiate(hdr, &msg, &task_setup_data, how);
dprintk("NFS: %5u initiated pgio call "
"(req %s/%llu, %u bytes @ offset %llu)\n",
- data->task.tk_pid,
- data->header->inode->i_sb->s_id,
- (unsigned long long)NFS_FILEID(data->header->inode),
- data->args.count,
- (unsigned long long)data->args.offset);
+ hdr->task.tk_pid,
+ hdr->inode->i_sb->s_id,
+ (unsigned long long)NFS_FILEID(hdr->inode),
+ hdr->args.count,
+ (unsigned long long)hdr->args.offset);
task = rpc_run_task(&task_setup_data);
if (IS_ERR(task)) {
@@ -657,22 +646,23 @@ static int nfs_pgio_error(struct nfs_pageio_descriptor *desc,
struct nfs_pgio_header *hdr)
{
set_bit(NFS_IOHDR_REDO, &hdr->flags);
- nfs_pgio_data_release(hdr->data);
- hdr->data = NULL;
+ nfs_pgio_data_destroy(hdr);
+ hdr->completion_ops->completion(hdr);
desc->pg_completion_ops->error_cleanup(&desc->pg_list);
return -ENOMEM;
}
/**
* nfs_pgio_release - Release pageio data
- * @calldata: The pageio data to release
+ * @calldata: The pageio header to release
*/
static void nfs_pgio_release(void *calldata)
{
- struct nfs_pgio_data *data = calldata;
- if (data->header->rw_ops->rw_release)
- data->header->rw_ops->rw_release(data);
- nfs_pgio_data_release(data);
+ struct nfs_pgio_header *hdr = calldata;
+ if (hdr->rw_ops->rw_release)
+ hdr->rw_ops->rw_release(hdr);
+ nfs_pgio_data_destroy(hdr);
+ hdr->completion_ops->completion(hdr);
}
/**
@@ -713,22 +703,22 @@ EXPORT_SYMBOL_GPL(nfs_pageio_init);
/**
* nfs_pgio_result - Basic pageio error handling
* @task: The task that ran
- * @calldata: Pageio data to check
+ * @calldata: Pageio header to check
*/
static void nfs_pgio_result(struct rpc_task *task, void *calldata)
{
- struct nfs_pgio_data *data = calldata;
- struct inode *inode = data->header->inode;
+ struct nfs_pgio_header *hdr = calldata;
+ struct inode *inode = hdr->inode;
dprintk("NFS: %s: %5u, (status %d)\n", __func__,
task->tk_pid, task->tk_status);
- if (data->header->rw_ops->rw_done(task, data, inode) != 0)
+ if (hdr->rw_ops->rw_done(task, hdr, inode) != 0)
return;
if (task->tk_status < 0)
- nfs_set_pgio_error(data->header, task->tk_status, data->args.offset);
+ nfs_set_pgio_error(hdr, task->tk_status, hdr->args.offset);
else
- data->header->rw_ops->rw_result(task, data);
+ hdr->rw_ops->rw_result(task, hdr);
}
/*
@@ -743,32 +733,41 @@ int nfs_generic_pgio(struct nfs_pageio_descriptor *desc,
struct nfs_pgio_header *hdr)
{
struct nfs_page *req;
- struct page **pages;
- struct nfs_pgio_data *data;
+ struct page **pages,
+ *last_page;
struct list_head *head = &desc->pg_list;
struct nfs_commit_info cinfo;
+ unsigned int pagecount, pageused;
- data = nfs_pgio_data_alloc(hdr, nfs_page_array_len(desc->pg_base,
- desc->pg_count));
- if (!data)
+ pagecount = nfs_page_array_len(desc->pg_base, desc->pg_count);
+ if (!nfs_pgarray_set(&hdr->page_array, pagecount))
return nfs_pgio_error(desc, hdr);
nfs_init_cinfo(&cinfo, desc->pg_inode, desc->pg_dreq);
- pages = data->pages.pagevec;
+ pages = hdr->page_array.pagevec;
+ last_page = NULL;
+ pageused = 0;
while (!list_empty(head)) {
req = nfs_list_entry(head->next);
nfs_list_remove_request(req);
nfs_list_add_request(req, &hdr->pages);
- *pages++ = req->wb_page;
+
+ if (!last_page || last_page != req->wb_page) {
+ pageused++;
+ if (pageused > pagecount)
+ break;
+ *pages++ = last_page = req->wb_page;
+ }
}
+ if (WARN_ON_ONCE(pageused != pagecount))
+ return nfs_pgio_error(desc, hdr);
if ((desc->pg_ioflags & FLUSH_COND_STABLE) &&
(desc->pg_moreio || nfs_reqs_to_commit(&cinfo)))
desc->pg_ioflags &= ~FLUSH_COND_STABLE;
/* Set up the argument struct */
- nfs_pgio_rpcsetup(data, desc->pg_count, 0, desc->pg_ioflags, &cinfo);
- hdr->data = data;
+ nfs_pgio_rpcsetup(hdr, desc->pg_count, 0, desc->pg_ioflags, &cinfo);
desc->pg_rpc_callops = &nfs_pgio_common_ops;
return 0;
}
@@ -776,25 +775,20 @@ EXPORT_SYMBOL_GPL(nfs_generic_pgio);
static int nfs_generic_pg_pgios(struct nfs_pageio_descriptor *desc)
{
- struct nfs_rw_header *rw_hdr;
struct nfs_pgio_header *hdr;
int ret;
- rw_hdr = nfs_rw_header_alloc(desc->pg_rw_ops);
- if (!rw_hdr) {
+ hdr = nfs_pgio_header_alloc(desc->pg_rw_ops);
+ if (!hdr) {
desc->pg_completion_ops->error_cleanup(&desc->pg_list);
return -ENOMEM;
}
- hdr = &rw_hdr->header;
- nfs_pgheader_init(desc, hdr, nfs_rw_header_free);
- atomic_inc(&hdr->refcnt);
+ nfs_pgheader_init(desc, hdr, nfs_pgio_header_free);
ret = nfs_generic_pgio(desc, hdr);
if (ret == 0)
ret = nfs_initiate_pgio(NFS_CLIENT(hdr->inode),
- hdr->data, desc->pg_rpc_callops,
+ hdr, desc->pg_rpc_callops,
desc->pg_ioflags, 0);
- if (atomic_dec_and_test(&hdr->refcnt))
- hdr->completion_ops->completion(hdr);
return ret;
}
@@ -837,6 +831,14 @@ static bool nfs_can_coalesce_requests(struct nfs_page *prev,
return false;
if (req_offset(req) != req_offset(prev) + prev->wb_bytes)
return false;
+ if (req->wb_page == prev->wb_page) {
+ if (req->wb_pgbase != prev->wb_pgbase + prev->wb_bytes)
+ return false;
+ } else {
+ if (req->wb_pgbase != 0 ||
+ prev->wb_pgbase + prev->wb_bytes != PAGE_CACHE_SIZE)
+ return false;
+ }
}
size = pgio->pg_ops->pg_test(pgio, prev, req);
WARN_ON_ONCE(size > req->wb_bytes);
@@ -908,7 +910,7 @@ static int __nfs_pageio_add_request(struct nfs_pageio_descriptor *desc,
unsigned int bytes_left = 0;
unsigned int offset, pgbase;
- nfs_page_group_lock(req);
+ nfs_page_group_lock(req, false);
subreq = req;
bytes_left = subreq->wb_bytes;
@@ -930,7 +932,7 @@ static int __nfs_pageio_add_request(struct nfs_pageio_descriptor *desc,
if (desc->pg_recoalesce)
return 0;
/* retry add_request for this subreq */
- nfs_page_group_lock(req);
+ nfs_page_group_lock(req, false);
continue;
}
@@ -1005,7 +1007,38 @@ int nfs_pageio_add_request(struct nfs_pageio_descriptor *desc,
} while (ret);
return ret;
}
-EXPORT_SYMBOL_GPL(nfs_pageio_add_request);
+
+/*
+ * nfs_pageio_resend - Transfer requests to new descriptor and resend
+ * @hdr - the pgio header to move requests from
+ * @desc - the pageio descriptor to add requests to
+ *
+ * Try to move each request (nfs_page) from @hdr to @desc then attempt
+ * to send them.
+ *
+ * Returns 0 on success and < 0 on error.
+ */
+int nfs_pageio_resend(struct nfs_pageio_descriptor *desc,
+ struct nfs_pgio_header *hdr)
+{
+ LIST_HEAD(failed);
+
+ desc->pg_dreq = hdr->dreq;
+ while (!list_empty(&hdr->pages)) {
+ struct nfs_page *req = nfs_list_entry(hdr->pages.next);
+
+ nfs_list_remove_request(req);
+ if (!nfs_pageio_add_request(desc, req))
+ nfs_list_add_request(req, &failed);
+ }
+ nfs_pageio_complete(desc);
+ if (!list_empty(&failed)) {
+ list_move(&failed, &hdr->pages);
+ return -EIO;
+ }
+ return 0;
+}
+EXPORT_SYMBOL_GPL(nfs_pageio_resend);
/**
* nfs_pageio_complete - Complete I/O on an nfs_pageio_descriptor
@@ -1021,7 +1054,6 @@ void nfs_pageio_complete(struct nfs_pageio_descriptor *desc)
break;
}
}
-EXPORT_SYMBOL_GPL(nfs_pageio_complete);
/**
* nfs_pageio_cond_complete - Conditional I/O completion
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index a8914b33561..0a5dda4d85c 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -361,6 +361,44 @@ pnfs_put_lseg(struct pnfs_layout_segment *lseg)
}
EXPORT_SYMBOL_GPL(pnfs_put_lseg);
+static void pnfs_free_lseg_async_work(struct work_struct *work)
+{
+ struct pnfs_layout_segment *lseg;
+ struct pnfs_layout_hdr *lo;
+
+ lseg = container_of(work, struct pnfs_layout_segment, pls_work);
+ lo = lseg->pls_layout;
+
+ pnfs_free_lseg(lseg);
+ pnfs_put_layout_hdr(lo);
+}
+
+static void pnfs_free_lseg_async(struct pnfs_layout_segment *lseg)
+{
+ INIT_WORK(&lseg->pls_work, pnfs_free_lseg_async_work);
+ schedule_work(&lseg->pls_work);
+}
+
+void
+pnfs_put_lseg_locked(struct pnfs_layout_segment *lseg)
+{
+ if (!lseg)
+ return;
+
+ assert_spin_locked(&lseg->pls_layout->plh_inode->i_lock);
+
+ dprintk("%s: lseg %p ref %d valid %d\n", __func__, lseg,
+ atomic_read(&lseg->pls_refcount),
+ test_bit(NFS_LSEG_VALID, &lseg->pls_flags));
+ if (atomic_dec_and_test(&lseg->pls_refcount)) {
+ struct pnfs_layout_hdr *lo = lseg->pls_layout;
+ pnfs_get_layout_hdr(lo);
+ pnfs_layout_remove_lseg(lo, lseg);
+ pnfs_free_lseg_async(lseg);
+ }
+}
+EXPORT_SYMBOL_GPL(pnfs_put_lseg_locked);
+
static u64
end_offset(u64 start, u64 len)
{
@@ -577,6 +615,9 @@ pnfs_layout_free_bulk_destroy_list(struct list_head *layout_list,
dprintk("%s freeing layout for inode %lu\n", __func__,
lo->plh_inode->i_ino);
inode = lo->plh_inode;
+
+ pnfs_layoutcommit_inode(inode, false);
+
spin_lock(&inode->i_lock);
list_del_init(&lo->plh_bulk_destroy);
lo->plh_block_lgets++; /* permanently block new LAYOUTGETs */
@@ -665,17 +706,6 @@ static bool pnfs_seqid_is_newer(u32 s1, u32 s2)
return (s32)(s1 - s2) > 0;
}
-static void
-pnfs_verify_layout_stateid(struct pnfs_layout_hdr *lo,
- const nfs4_stateid *new,
- struct list_head *free_me_list)
-{
- if (nfs4_stateid_match_other(&lo->plh_stateid, new))
- return;
- /* Layout is new! Kill existing layout segments */
- pnfs_mark_matching_lsegs_invalid(lo, free_me_list, NULL);
-}
-
/* update lo->plh_stateid with new if is more recent */
void
pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo, const nfs4_stateid *new,
@@ -732,7 +762,8 @@ pnfs_choose_layoutget_stateid(nfs4_stateid *dst, struct pnfs_layout_hdr *lo,
status = -EAGAIN;
} else if (!nfs4_valid_open_stateid(open_state)) {
status = -EBADF;
- } else if (list_empty(&lo->plh_segs)) {
+ } else if (list_empty(&lo->plh_segs) ||
+ test_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags)) {
int seq;
do {
@@ -847,6 +878,16 @@ _pnfs_return_layout(struct inode *ino)
empty = list_empty(&lo->plh_segs);
pnfs_clear_layoutcommit(ino, &tmp_list);
pnfs_mark_matching_lsegs_invalid(lo, &tmp_list, NULL);
+
+ if (NFS_SERVER(ino)->pnfs_curr_ld->return_range) {
+ struct pnfs_layout_range range = {
+ .iomode = IOMODE_ANY,
+ .offset = 0,
+ .length = NFS4_MAX_UINT64,
+ };
+ NFS_SERVER(ino)->pnfs_curr_ld->return_range(lo, &range);
+ }
+
/* Don't send a LAYOUTRETURN if list was initially empty */
if (empty) {
spin_unlock(&ino->i_lock);
@@ -854,6 +895,8 @@ _pnfs_return_layout(struct inode *ino)
dprintk("NFS: %s no layout segments to return\n", __func__);
goto out;
}
+
+ set_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags);
lo->plh_block_lgets++;
spin_unlock(&ino->i_lock);
pnfs_free_lseg_list(&tmp_list);
@@ -1341,25 +1384,41 @@ pnfs_layout_process(struct nfs4_layoutget *lgp)
goto out;
}
+ init_lseg(lo, lseg);
+ lseg->pls_range = res->range;
+
spin_lock(&ino->i_lock);
if (test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags)) {
dprintk("%s forget reply due to recall\n", __func__);
goto out_forget_reply;
}
- if (pnfs_layoutgets_blocked(lo, 1) ||
- pnfs_layout_stateid_blocked(lo, &res->stateid)) {
+ if (pnfs_layoutgets_blocked(lo, 1)) {
dprintk("%s forget reply due to state\n", __func__);
goto out_forget_reply;
}
- /* Check that the new stateid matches the old stateid */
- pnfs_verify_layout_stateid(lo, &res->stateid, &free_me);
- /* Done processing layoutget. Set the layout stateid */
- pnfs_set_layout_stateid(lo, &res->stateid, false);
+ if (nfs4_stateid_match_other(&lo->plh_stateid, &res->stateid)) {
+ /* existing state ID, make sure the sequence number matches. */
+ if (pnfs_layout_stateid_blocked(lo, &res->stateid)) {
+ dprintk("%s forget reply due to sequence\n", __func__);
+ goto out_forget_reply;
+ }
+ pnfs_set_layout_stateid(lo, &res->stateid, false);
+ } else {
+ /*
+ * We got an entirely new state ID. Mark all segments for the
+ * inode invalid, and don't bother validating the stateid
+ * sequence number.
+ */
+ pnfs_mark_matching_lsegs_invalid(lo, &free_me, NULL);
+
+ nfs4_stateid_copy(&lo->plh_stateid, &res->stateid);
+ lo->plh_barrier = be32_to_cpu(res->stateid.seqid);
+ }
+
+ clear_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags);
- init_lseg(lo, lseg);
- lseg->pls_range = res->range;
pnfs_get_lseg(lseg);
pnfs_layout_insert_lseg(lo, lseg);
@@ -1470,41 +1529,19 @@ pnfs_generic_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev,
}
EXPORT_SYMBOL_GPL(pnfs_generic_pg_test);
-int pnfs_write_done_resend_to_mds(struct inode *inode,
- struct list_head *head,
- const struct nfs_pgio_completion_ops *compl_ops,
- struct nfs_direct_req *dreq)
+int pnfs_write_done_resend_to_mds(struct nfs_pgio_header *hdr)
{
struct nfs_pageio_descriptor pgio;
- LIST_HEAD(failed);
/* Resend all requests through the MDS */
- nfs_pageio_init_write(&pgio, inode, FLUSH_STABLE, true, compl_ops);
- pgio.pg_dreq = dreq;
- while (!list_empty(head)) {
- struct nfs_page *req = nfs_list_entry(head->next);
-
- nfs_list_remove_request(req);
- if (!nfs_pageio_add_request(&pgio, req))
- nfs_list_add_request(req, &failed);
- }
- nfs_pageio_complete(&pgio);
-
- if (!list_empty(&failed)) {
- /* For some reason our attempt to resend pages. Mark the
- * overall send request as having failed, and let
- * nfs_writeback_release_full deal with the error.
- */
- list_move(&failed, head);
- return -EIO;
- }
- return 0;
+ nfs_pageio_init_write(&pgio, hdr->inode, FLUSH_STABLE, true,
+ hdr->completion_ops);
+ return nfs_pageio_resend(&pgio, hdr);
}
EXPORT_SYMBOL_GPL(pnfs_write_done_resend_to_mds);
-static void pnfs_ld_handle_write_error(struct nfs_pgio_data *data)
+static void pnfs_ld_handle_write_error(struct nfs_pgio_header *hdr)
{
- struct nfs_pgio_header *hdr = data->header;
dprintk("pnfs write error = %d\n", hdr->pnfs_error);
if (NFS_SERVER(hdr->inode)->pnfs_curr_ld->flags &
@@ -1512,50 +1549,42 @@ static void pnfs_ld_handle_write_error(struct nfs_pgio_data *data)
pnfs_return_layout(hdr->inode);
}
if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags))
- data->task.tk_status = pnfs_write_done_resend_to_mds(hdr->inode,
- &hdr->pages,
- hdr->completion_ops,
- hdr->dreq);
+ hdr->task.tk_status = pnfs_write_done_resend_to_mds(hdr);
}
/*
* Called by non rpc-based layout drivers
*/
-void pnfs_ld_write_done(struct nfs_pgio_data *data)
+void pnfs_ld_write_done(struct nfs_pgio_header *hdr)
{
- struct nfs_pgio_header *hdr = data->header;
-
- trace_nfs4_pnfs_write(data, hdr->pnfs_error);
+ trace_nfs4_pnfs_write(hdr, hdr->pnfs_error);
if (!hdr->pnfs_error) {
- pnfs_set_layoutcommit(data);
- hdr->mds_ops->rpc_call_done(&data->task, data);
+ pnfs_set_layoutcommit(hdr);
+ hdr->mds_ops->rpc_call_done(&hdr->task, hdr);
} else
- pnfs_ld_handle_write_error(data);
- hdr->mds_ops->rpc_release(data);
+ pnfs_ld_handle_write_error(hdr);
+ hdr->mds_ops->rpc_release(hdr);
}
EXPORT_SYMBOL_GPL(pnfs_ld_write_done);
static void
pnfs_write_through_mds(struct nfs_pageio_descriptor *desc,
- struct nfs_pgio_data *data)
+ struct nfs_pgio_header *hdr)
{
- struct nfs_pgio_header *hdr = data->header;
-
if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) {
list_splice_tail_init(&hdr->pages, &desc->pg_list);
nfs_pageio_reset_write_mds(desc);
desc->pg_recoalesce = 1;
}
- nfs_pgio_data_release(data);
+ nfs_pgio_data_destroy(hdr);
}
static enum pnfs_try_status
-pnfs_try_to_write_data(struct nfs_pgio_data *wdata,
+pnfs_try_to_write_data(struct nfs_pgio_header *hdr,
const struct rpc_call_ops *call_ops,
struct pnfs_layout_segment *lseg,
int how)
{
- struct nfs_pgio_header *hdr = wdata->header;
struct inode *inode = hdr->inode;
enum pnfs_try_status trypnfs;
struct nfs_server *nfss = NFS_SERVER(inode);
@@ -1563,8 +1592,8 @@ pnfs_try_to_write_data(struct nfs_pgio_data *wdata,
hdr->mds_ops = call_ops;
dprintk("%s: Writing ino:%lu %u@%llu (how %d)\n", __func__,
- inode->i_ino, wdata->args.count, wdata->args.offset, how);
- trypnfs = nfss->pnfs_curr_ld->write_pagelist(wdata, how);
+ inode->i_ino, hdr->args.count, hdr->args.offset, how);
+ trypnfs = nfss->pnfs_curr_ld->write_pagelist(hdr, how);
if (trypnfs != PNFS_NOT_ATTEMPTED)
nfs_inc_stats(inode, NFSIOS_PNFS_WRITE);
dprintk("%s End (trypnfs:%d)\n", __func__, trypnfs);
@@ -1575,139 +1604,105 @@ static void
pnfs_do_write(struct nfs_pageio_descriptor *desc,
struct nfs_pgio_header *hdr, int how)
{
- struct nfs_pgio_data *data = hdr->data;
const struct rpc_call_ops *call_ops = desc->pg_rpc_callops;
struct pnfs_layout_segment *lseg = desc->pg_lseg;
enum pnfs_try_status trypnfs;
desc->pg_lseg = NULL;
- trypnfs = pnfs_try_to_write_data(data, call_ops, lseg, how);
+ trypnfs = pnfs_try_to_write_data(hdr, call_ops, lseg, how);
if (trypnfs == PNFS_NOT_ATTEMPTED)
- pnfs_write_through_mds(desc, data);
+ pnfs_write_through_mds(desc, hdr);
pnfs_put_lseg(lseg);
}
static void pnfs_writehdr_free(struct nfs_pgio_header *hdr)
{
pnfs_put_lseg(hdr->lseg);
- nfs_rw_header_free(hdr);
+ nfs_pgio_header_free(hdr);
}
EXPORT_SYMBOL_GPL(pnfs_writehdr_free);
int
pnfs_generic_pg_writepages(struct nfs_pageio_descriptor *desc)
{
- struct nfs_rw_header *whdr;
struct nfs_pgio_header *hdr;
int ret;
- whdr = nfs_rw_header_alloc(desc->pg_rw_ops);
- if (!whdr) {
+ hdr = nfs_pgio_header_alloc(desc->pg_rw_ops);
+ if (!hdr) {
desc->pg_completion_ops->error_cleanup(&desc->pg_list);
pnfs_put_lseg(desc->pg_lseg);
desc->pg_lseg = NULL;
return -ENOMEM;
}
- hdr = &whdr->header;
nfs_pgheader_init(desc, hdr, pnfs_writehdr_free);
hdr->lseg = pnfs_get_lseg(desc->pg_lseg);
- atomic_inc(&hdr->refcnt);
ret = nfs_generic_pgio(desc, hdr);
if (ret != 0) {
pnfs_put_lseg(desc->pg_lseg);
desc->pg_lseg = NULL;
} else
pnfs_do_write(desc, hdr, desc->pg_ioflags);
- if (atomic_dec_and_test(&hdr->refcnt))
- hdr->completion_ops->completion(hdr);
return ret;
}
EXPORT_SYMBOL_GPL(pnfs_generic_pg_writepages);
-int pnfs_read_done_resend_to_mds(struct inode *inode,
- struct list_head *head,
- const struct nfs_pgio_completion_ops *compl_ops,
- struct nfs_direct_req *dreq)
+int pnfs_read_done_resend_to_mds(struct nfs_pgio_header *hdr)
{
struct nfs_pageio_descriptor pgio;
- LIST_HEAD(failed);
/* Resend all requests through the MDS */
- nfs_pageio_init_read(&pgio, inode, true, compl_ops);
- pgio.pg_dreq = dreq;
- while (!list_empty(head)) {
- struct nfs_page *req = nfs_list_entry(head->next);
-
- nfs_list_remove_request(req);
- if (!nfs_pageio_add_request(&pgio, req))
- nfs_list_add_request(req, &failed);
- }
- nfs_pageio_complete(&pgio);
-
- if (!list_empty(&failed)) {
- list_move(&failed, head);
- return -EIO;
- }
- return 0;
+ nfs_pageio_init_read(&pgio, hdr->inode, true, hdr->completion_ops);
+ return nfs_pageio_resend(&pgio, hdr);
}
EXPORT_SYMBOL_GPL(pnfs_read_done_resend_to_mds);
-static void pnfs_ld_handle_read_error(struct nfs_pgio_data *data)
+static void pnfs_ld_handle_read_error(struct nfs_pgio_header *hdr)
{
- struct nfs_pgio_header *hdr = data->header;
-
dprintk("pnfs read error = %d\n", hdr->pnfs_error);
if (NFS_SERVER(hdr->inode)->pnfs_curr_ld->flags &
PNFS_LAYOUTRET_ON_ERROR) {
pnfs_return_layout(hdr->inode);
}
if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags))
- data->task.tk_status = pnfs_read_done_resend_to_mds(hdr->inode,
- &hdr->pages,
- hdr->completion_ops,
- hdr->dreq);
+ hdr->task.tk_status = pnfs_read_done_resend_to_mds(hdr);
}
/*
* Called by non rpc-based layout drivers
*/
-void pnfs_ld_read_done(struct nfs_pgio_data *data)
+void pnfs_ld_read_done(struct nfs_pgio_header *hdr)
{
- struct nfs_pgio_header *hdr = data->header;
-
- trace_nfs4_pnfs_read(data, hdr->pnfs_error);
+ trace_nfs4_pnfs_read(hdr, hdr->pnfs_error);
if (likely(!hdr->pnfs_error)) {
- __nfs4_read_done_cb(data);
- hdr->mds_ops->rpc_call_done(&data->task, data);
+ __nfs4_read_done_cb(hdr);
+ hdr->mds_ops->rpc_call_done(&hdr->task, hdr);
} else
- pnfs_ld_handle_read_error(data);
- hdr->mds_ops->rpc_release(data);
+ pnfs_ld_handle_read_error(hdr);
+ hdr->mds_ops->rpc_release(hdr);
}
EXPORT_SYMBOL_GPL(pnfs_ld_read_done);
static void
pnfs_read_through_mds(struct nfs_pageio_descriptor *desc,
- struct nfs_pgio_data *data)
+ struct nfs_pgio_header *hdr)
{
- struct nfs_pgio_header *hdr = data->header;
-
if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) {
list_splice_tail_init(&hdr->pages, &desc->pg_list);
nfs_pageio_reset_read_mds(desc);
desc->pg_recoalesce = 1;
}
- nfs_pgio_data_release(data);
+ nfs_pgio_data_destroy(hdr);
}
/*
* Call the appropriate parallel I/O subsystem read function.
*/
static enum pnfs_try_status
-pnfs_try_to_read_data(struct nfs_pgio_data *rdata,
+pnfs_try_to_read_data(struct nfs_pgio_header *hdr,
const struct rpc_call_ops *call_ops,
struct pnfs_layout_segment *lseg)
{
- struct nfs_pgio_header *hdr = rdata->header;
struct inode *inode = hdr->inode;
struct nfs_server *nfss = NFS_SERVER(inode);
enum pnfs_try_status trypnfs;
@@ -1715,9 +1710,9 @@ pnfs_try_to_read_data(struct nfs_pgio_data *rdata,
hdr->mds_ops = call_ops;
dprintk("%s: Reading ino:%lu %u@%llu\n",
- __func__, inode->i_ino, rdata->args.count, rdata->args.offset);
+ __func__, inode->i_ino, hdr->args.count, hdr->args.offset);
- trypnfs = nfss->pnfs_curr_ld->read_pagelist(rdata);
+ trypnfs = nfss->pnfs_curr_ld->read_pagelist(hdr);
if (trypnfs != PNFS_NOT_ATTEMPTED)
nfs_inc_stats(inode, NFSIOS_PNFS_READ);
dprintk("%s End (trypnfs:%d)\n", __func__, trypnfs);
@@ -1727,52 +1722,46 @@ pnfs_try_to_read_data(struct nfs_pgio_data *rdata,
static void
pnfs_do_read(struct nfs_pageio_descriptor *desc, struct nfs_pgio_header *hdr)
{
- struct nfs_pgio_data *data = hdr->data;
const struct rpc_call_ops *call_ops = desc->pg_rpc_callops;
struct pnfs_layout_segment *lseg = desc->pg_lseg;
enum pnfs_try_status trypnfs;
desc->pg_lseg = NULL;
- trypnfs = pnfs_try_to_read_data(data, call_ops, lseg);
+ trypnfs = pnfs_try_to_read_data(hdr, call_ops, lseg);
if (trypnfs == PNFS_NOT_ATTEMPTED)
- pnfs_read_through_mds(desc, data);
+ pnfs_read_through_mds(desc, hdr);
pnfs_put_lseg(lseg);
}
static void pnfs_readhdr_free(struct nfs_pgio_header *hdr)
{
pnfs_put_lseg(hdr->lseg);
- nfs_rw_header_free(hdr);
+ nfs_pgio_header_free(hdr);
}
EXPORT_SYMBOL_GPL(pnfs_readhdr_free);
int
pnfs_generic_pg_readpages(struct nfs_pageio_descriptor *desc)
{
- struct nfs_rw_header *rhdr;
struct nfs_pgio_header *hdr;
int ret;
- rhdr = nfs_rw_header_alloc(desc->pg_rw_ops);
- if (!rhdr) {
+ hdr = nfs_pgio_header_alloc(desc->pg_rw_ops);
+ if (!hdr) {
desc->pg_completion_ops->error_cleanup(&desc->pg_list);
ret = -ENOMEM;
pnfs_put_lseg(desc->pg_lseg);
desc->pg_lseg = NULL;
return ret;
}
- hdr = &rhdr->header;
nfs_pgheader_init(desc, hdr, pnfs_readhdr_free);
hdr->lseg = pnfs_get_lseg(desc->pg_lseg);
- atomic_inc(&hdr->refcnt);
ret = nfs_generic_pgio(desc, hdr);
if (ret != 0) {
pnfs_put_lseg(desc->pg_lseg);
desc->pg_lseg = NULL;
} else
pnfs_do_read(desc, hdr);
- if (atomic_dec_and_test(&hdr->refcnt))
- hdr->completion_ops->completion(hdr);
return ret;
}
EXPORT_SYMBOL_GPL(pnfs_generic_pg_readpages);
@@ -1820,12 +1809,11 @@ void pnfs_set_lo_fail(struct pnfs_layout_segment *lseg)
EXPORT_SYMBOL_GPL(pnfs_set_lo_fail);
void
-pnfs_set_layoutcommit(struct nfs_pgio_data *wdata)
+pnfs_set_layoutcommit(struct nfs_pgio_header *hdr)
{
- struct nfs_pgio_header *hdr = wdata->header;
struct inode *inode = hdr->inode;
struct nfs_inode *nfsi = NFS_I(inode);
- loff_t end_pos = wdata->mds_offset + wdata->res.count;
+ loff_t end_pos = hdr->mds_offset + hdr->res.count;
bool mark_as_dirty = false;
spin_lock(&inode->i_lock);
@@ -1851,6 +1839,35 @@ pnfs_set_layoutcommit(struct nfs_pgio_data *wdata)
}
EXPORT_SYMBOL_GPL(pnfs_set_layoutcommit);
+void pnfs_commit_set_layoutcommit(struct nfs_commit_data *data)
+{
+ struct inode *inode = data->inode;
+ struct nfs_inode *nfsi = NFS_I(inode);
+ bool mark_as_dirty = false;
+
+ spin_lock(&inode->i_lock);
+ if (!test_and_set_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags)) {
+ mark_as_dirty = true;
+ dprintk("%s: Set layoutcommit for inode %lu ",
+ __func__, inode->i_ino);
+ }
+ if (!test_and_set_bit(NFS_LSEG_LAYOUTCOMMIT, &data->lseg->pls_flags)) {
+ /* references matched in nfs4_layoutcommit_release */
+ pnfs_get_lseg(data->lseg);
+ }
+ if (data->lwb > nfsi->layout->plh_lwb)
+ nfsi->layout->plh_lwb = data->lwb;
+ spin_unlock(&inode->i_lock);
+ dprintk("%s: lseg %p end_pos %llu\n",
+ __func__, data->lseg, nfsi->layout->plh_lwb);
+
+ /* if pnfs_layoutcommit_inode() runs between inode locks, the next one
+ * will be a noop because NFS_INO_LAYOUTCOMMIT will not be set */
+ if (mark_as_dirty)
+ mark_inode_dirty_sync(inode);
+}
+EXPORT_SYMBOL_GPL(pnfs_commit_set_layoutcommit);
+
void pnfs_cleanup_layoutcommit(struct nfs4_layoutcommit_data *data)
{
struct nfs_server *nfss = NFS_SERVER(data->args.inode);
@@ -1871,6 +1888,7 @@ void pnfs_cleanup_layoutcommit(struct nfs4_layoutcommit_data *data)
int
pnfs_layoutcommit_inode(struct inode *inode, bool sync)
{
+ struct pnfs_layoutdriver_type *ld = NFS_SERVER(inode)->pnfs_curr_ld;
struct nfs4_layoutcommit_data *data;
struct nfs_inode *nfsi = NFS_I(inode);
loff_t end_pos;
@@ -1921,6 +1939,20 @@ pnfs_layoutcommit_inode(struct inode *inode, bool sync)
data->args.lastbytewritten = end_pos - 1;
data->res.server = NFS_SERVER(inode);
+ if (ld->prepare_layoutcommit) {
+ status = ld->prepare_layoutcommit(&data->args);
+ if (status) {
+ spin_lock(&inode->i_lock);
+ if (end_pos < nfsi->layout->plh_lwb)
+ nfsi->layout->plh_lwb = end_pos;
+ spin_unlock(&inode->i_lock);
+ put_rpccred(data->cred);
+ set_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags);
+ goto clear_layoutcommitting;
+ }
+ }
+
+
status = nfs4_proc_layoutcommit(data, sync);
out:
if (status)
diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
index 4fb309a2b4c..9ae5b765b07 100644
--- a/fs/nfs/pnfs.h
+++ b/fs/nfs/pnfs.h
@@ -32,6 +32,7 @@
#include <linux/nfs_fs.h>
#include <linux/nfs_page.h>
+#include <linux/workqueue.h>
enum {
NFS_LSEG_VALID = 0, /* cleared when lseg is recalled/returned */
@@ -46,6 +47,7 @@ struct pnfs_layout_segment {
atomic_t pls_refcount;
unsigned long pls_flags;
struct pnfs_layout_hdr *pls_layout;
+ struct work_struct pls_work;
};
enum pnfs_try_status {
@@ -63,12 +65,15 @@ enum {
NFS_LAYOUT_BULK_RECALL, /* bulk recall affecting layout */
NFS_LAYOUT_ROC, /* some lseg had roc bit set */
NFS_LAYOUT_RETURN, /* Return this layout ASAP */
+ NFS_LAYOUT_INVALID_STID, /* layout stateid id is invalid */
};
enum layoutdriver_policy_flags {
- /* Should the pNFS client commit and return the layout upon a setattr */
+ /* Should the pNFS client commit and return the layout upon truncate to
+ * a smaller size */
PNFS_LAYOUTRET_ON_SETATTR = 1 << 0,
PNFS_LAYOUTRET_ON_ERROR = 1 << 1,
+ PNFS_READ_WHOLE_PAGE = 1 << 2,
};
struct nfs4_deviceid_node;
@@ -80,6 +85,7 @@ struct pnfs_layoutdriver_type {
const char *name;
struct module *owner;
unsigned flags;
+ unsigned max_deviceinfo_size;
int (*set_layoutdriver) (struct nfs_server *, const struct nfs_fh *);
int (*clear_layoutdriver) (struct nfs_server *);
@@ -90,6 +96,9 @@ struct pnfs_layoutdriver_type {
struct pnfs_layout_segment * (*alloc_lseg) (struct pnfs_layout_hdr *layoutid, struct nfs4_layoutget_res *lgr, gfp_t gfp_flags);
void (*free_lseg) (struct pnfs_layout_segment *lseg);
+ void (*return_range) (struct pnfs_layout_hdr *lo,
+ struct pnfs_layout_range *range);
+
/* test for nfs page cache coalescing */
const struct nfs_pageio_ops *pg_read_ops;
const struct nfs_pageio_ops *pg_write_ops;
@@ -104,6 +113,8 @@ struct pnfs_layoutdriver_type {
int max);
void (*recover_commit_reqs) (struct list_head *list,
struct nfs_commit_info *cinfo);
+ struct nfs_page * (*search_commit_reqs)(struct nfs_commit_info *cinfo,
+ struct page *page);
int (*commit_pagelist)(struct inode *inode,
struct list_head *mds_pages,
int how,
@@ -113,18 +124,21 @@ struct pnfs_layoutdriver_type {
* Return PNFS_ATTEMPTED to indicate the layout code has attempted
* I/O, else return PNFS_NOT_ATTEMPTED to fall back to normal NFS
*/
- enum pnfs_try_status (*read_pagelist) (struct nfs_pgio_data *nfs_data);
- enum pnfs_try_status (*write_pagelist) (struct nfs_pgio_data *nfs_data, int how);
+ enum pnfs_try_status (*read_pagelist)(struct nfs_pgio_header *);
+ enum pnfs_try_status (*write_pagelist)(struct nfs_pgio_header *, int);
void (*free_deviceid_node) (struct nfs4_deviceid_node *);
+ struct nfs4_deviceid_node * (*alloc_deviceid_node)
+ (struct nfs_server *server, struct pnfs_device *pdev,
+ gfp_t gfp_flags);
void (*encode_layoutreturn) (struct pnfs_layout_hdr *layoutid,
struct xdr_stream *xdr,
const struct nfs4_layoutreturn_args *args);
void (*cleanup_layoutcommit) (struct nfs4_layoutcommit_data *data);
-
- void (*encode_layoutcommit) (struct pnfs_layout_hdr *layoutid,
+ int (*prepare_layoutcommit) (struct nfs4_layoutcommit_args *args);
+ void (*encode_layoutcommit) (struct pnfs_layout_hdr *lo,
struct xdr_stream *xdr,
const struct nfs4_layoutcommit_args *args);
};
@@ -167,9 +181,6 @@ extern int pnfs_register_layoutdriver(struct pnfs_layoutdriver_type *);
extern void pnfs_unregister_layoutdriver(struct pnfs_layoutdriver_type *);
/* nfs4proc.c */
-extern int nfs4_proc_getdevicelist(struct nfs_server *server,
- const struct nfs_fh *fh,
- struct pnfs_devicelist *devlist);
extern int nfs4_proc_getdeviceinfo(struct nfs_server *server,
struct pnfs_device *dev,
struct rpc_cred *cred);
@@ -179,6 +190,7 @@ extern int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp);
/* pnfs.c */
void pnfs_get_layout_hdr(struct pnfs_layout_hdr *lo);
void pnfs_put_lseg(struct pnfs_layout_segment *lseg);
+void pnfs_put_lseg_locked(struct pnfs_layout_segment *lseg);
void set_pnfs_layoutdriver(struct nfs_server *, const struct nfs_fh *, u32);
void unset_pnfs_layoutdriver(struct nfs_server *);
@@ -213,13 +225,14 @@ bool pnfs_roc(struct inode *ino);
void pnfs_roc_release(struct inode *ino);
void pnfs_roc_set_barrier(struct inode *ino, u32 barrier);
bool pnfs_roc_drain(struct inode *ino, u32 *barrier, struct rpc_task *task);
-void pnfs_set_layoutcommit(struct nfs_pgio_data *wdata);
+void pnfs_set_layoutcommit(struct nfs_pgio_header *);
+void pnfs_commit_set_layoutcommit(struct nfs_commit_data *data);
void pnfs_cleanup_layoutcommit(struct nfs4_layoutcommit_data *data);
int pnfs_layoutcommit_inode(struct inode *inode, bool sync);
int _pnfs_return_layout(struct inode *);
int pnfs_commit_and_return_layout(struct inode *);
-void pnfs_ld_write_done(struct nfs_pgio_data *);
-void pnfs_ld_read_done(struct nfs_pgio_data *);
+void pnfs_ld_write_done(struct nfs_pgio_header *);
+void pnfs_ld_read_done(struct nfs_pgio_header *);
struct pnfs_layout_segment *pnfs_update_layout(struct inode *ino,
struct nfs_open_context *ctx,
loff_t pos,
@@ -228,12 +241,8 @@ struct pnfs_layout_segment *pnfs_update_layout(struct inode *ino,
gfp_t gfp_flags);
void nfs4_deviceid_mark_client_invalid(struct nfs_client *clp);
-int pnfs_read_done_resend_to_mds(struct inode *inode, struct list_head *head,
- const struct nfs_pgio_completion_ops *compl_ops,
- struct nfs_direct_req *dreq);
-int pnfs_write_done_resend_to_mds(struct inode *inode, struct list_head *head,
- const struct nfs_pgio_completion_ops *compl_ops,
- struct nfs_direct_req *dreq);
+int pnfs_read_done_resend_to_mds(struct nfs_pgio_header *);
+int pnfs_write_done_resend_to_mds(struct nfs_pgio_header *);
struct nfs4_threshold *pnfs_mdsthreshold_alloc(void);
/* nfs4_deviceid_flags */
@@ -254,18 +263,25 @@ struct nfs4_deviceid_node {
atomic_t ref;
};
-struct nfs4_deviceid_node *nfs4_find_get_deviceid(const struct pnfs_layoutdriver_type *, const struct nfs_client *, const struct nfs4_deviceid *);
+struct nfs4_deviceid_node *
+nfs4_find_get_deviceid(struct nfs_server *server,
+ const struct nfs4_deviceid *id, struct rpc_cred *cred,
+ gfp_t gfp_mask);
void nfs4_delete_deviceid(const struct pnfs_layoutdriver_type *, const struct nfs_client *, const struct nfs4_deviceid *);
-void nfs4_init_deviceid_node(struct nfs4_deviceid_node *,
- const struct pnfs_layoutdriver_type *,
- const struct nfs_client *,
+void nfs4_init_deviceid_node(struct nfs4_deviceid_node *, struct nfs_server *,
const struct nfs4_deviceid *);
-struct nfs4_deviceid_node *nfs4_insert_deviceid_node(struct nfs4_deviceid_node *);
bool nfs4_put_deviceid_node(struct nfs4_deviceid_node *);
void nfs4_mark_deviceid_unavailable(struct nfs4_deviceid_node *node);
bool nfs4_test_deviceid_unavailable(struct nfs4_deviceid_node *node);
void nfs4_deviceid_purge_client(const struct nfs_client *);
+/*
+ * Take an additional reference on @d and return @d so the call can be used
+ * inline in an assignment. @d is dereferenced unconditionally, so it must
+ * not be NULL.
+ */
+static inline struct nfs4_deviceid_node *
+nfs4_get_deviceid(struct nfs4_deviceid_node *d)
+{
+ atomic_inc(&d->ref);
+ return d;
+}
+
static inline struct pnfs_layout_segment *
pnfs_get_lseg(struct pnfs_layout_segment *lseg)
{
@@ -345,6 +361,17 @@ pnfs_recover_commit_reqs(struct inode *inode, struct list_head *list,
NFS_SERVER(inode)->pnfs_curr_ld->recover_commit_reqs(list, cinfo);
}
+/*
+ * Ask the inode's current layout driver to look up a commit request for
+ * @page. Returns NULL when the server has no layout driver or the driver
+ * does not implement search_commit_reqs; otherwise returns whatever the
+ * driver's callback returns for (@cinfo, @page).
+ */
+static inline struct nfs_page *
+pnfs_search_commit_reqs(struct inode *inode, struct nfs_commit_info *cinfo,
+ struct page *page)
+{
+ struct pnfs_layoutdriver_type *ld = NFS_SERVER(inode)->pnfs_curr_ld;
+
+ if (ld == NULL || ld->search_commit_reqs == NULL)
+ return NULL;
+ return ld->search_commit_reqs(cinfo, page);
+}
+
/* Should the pNFS client commit and return the layout upon a setattr */
static inline bool
pnfs_ld_layoutret_on_setattr(struct inode *inode)
@@ -356,6 +383,14 @@ pnfs_ld_layoutret_on_setattr(struct inode *inode)
}
static inline bool
+pnfs_ld_read_whole_page(struct inode *inode)
+{
+ if (!pnfs_enabled_sb(NFS_SERVER(inode)))
+ return false;
+ return NFS_SERVER(inode)->pnfs_curr_ld->flags & PNFS_READ_WHOLE_PAGE;
+}
+
+static inline bool
pnfs_layoutcommit_outstanding(struct inode *inode)
{
struct nfs_inode *nfsi = NFS_I(inode);
@@ -427,6 +462,12 @@ pnfs_ld_layoutret_on_setattr(struct inode *inode)
}
static inline bool
+pnfs_ld_read_whole_page(struct inode *inode)
+{
+ return false;
+}
+
+static inline bool
pnfs_roc(struct inode *ino)
{
return false;
@@ -496,6 +537,13 @@ pnfs_recover_commit_reqs(struct inode *inode, struct list_head *list,
{
}
+/*
+ * Stub variant: never finds a request and always returns NULL.
+ * NOTE(review): presumably the build without pNFS support - the enclosing
+ * preprocessor conditional is not visible in this hunk; confirm.
+ */
+static inline struct nfs_page *
+pnfs_search_commit_reqs(struct inode *inode, struct nfs_commit_info *cinfo,
+ struct page *page)
+{
+ return NULL;
+}
+
static inline int pnfs_layoutcommit_inode(struct inode *inode, bool sync)
{
return 0;
diff --git a/fs/nfs/pnfs_dev.c b/fs/nfs/pnfs_dev.c
index 6da209bd940..aa2ec001518 100644
--- a/fs/nfs/pnfs_dev.c
+++ b/fs/nfs/pnfs_dev.c
@@ -29,6 +29,9 @@
*/
#include <linux/export.h>
+#include <linux/nfs_fs.h>
+#include "nfs4session.h"
+#include "internal.h"
#include "pnfs.h"
#define NFSDBG_FACILITY NFSDBG_PNFS
@@ -89,6 +92,74 @@ _lookup_deviceid(const struct pnfs_layoutdriver_type *ld,
return NULL;
}
+/*
+ * Fetch the device description for @dev_id from the server and hand it to
+ * the layout driver for decoding.
+ *
+ * A page array is sized from the session's maximum response size (capped by
+ * the layout driver's max_deviceinfo_size when that is non-zero), the
+ * GETDEVICEINFO request is issued through nfs4_proc_getdeviceinfo() using
+ * @cred, and on success the reply is passed to the layout driver's
+ * alloc_deviceid_node() callback. All allocations use @gfp_flags.
+ *
+ * Returns whatever alloc_deviceid_node() returns, or NULL if an allocation
+ * or the GETDEVICEINFO call failed. The temporary pages and the pnfs_device
+ * are always released before returning.
+ */
+static struct nfs4_deviceid_node *
+nfs4_get_device_info(struct nfs_server *server,
+ const struct nfs4_deviceid *dev_id,
+ struct rpc_cred *cred, gfp_t gfp_flags)
+{
+ struct nfs4_deviceid_node *d = NULL;
+ struct pnfs_device *pdev = NULL;
+ struct page **pages = NULL;
+ u32 max_resp_sz;
+ int max_pages;
+ int rc, i;
+
+ /*
+ * Use the session max response size as the basis for setting
+ * GETDEVICEINFO's maxcount
+ */
+ max_resp_sz = server->nfs_client->cl_session->fc_attrs.max_resp_sz;
+ if (server->pnfs_curr_ld->max_deviceinfo_size &&
+ server->pnfs_curr_ld->max_deviceinfo_size < max_resp_sz)
+ max_resp_sz = server->pnfs_curr_ld->max_deviceinfo_size;
+ max_pages = nfs_page_array_len(0, max_resp_sz);
+ dprintk("%s: server %p max_resp_sz %u max_pages %d\n",
+ __func__, server, max_resp_sz, max_pages);
+
+ pdev = kzalloc(sizeof(*pdev), gfp_flags);
+ if (!pdev)
+ return NULL;
+
+ pages = kcalloc(max_pages, sizeof(struct page *), gfp_flags);
+ if (!pages)
+ goto out_free_pdev;
+
+ for (i = 0; i < max_pages; i++) {
+ pages[i] = alloc_page(gfp_flags);
+ if (!pages[i])
+ goto out_free_pages;
+ }
+
+ memcpy(&pdev->dev_id, dev_id, sizeof(*dev_id));
+ pdev->layout_type = server->pnfs_curr_ld->id;
+ pdev->pages = pages;
+ pdev->pgbase = 0;
+ pdev->pglen = max_resp_sz;
+ pdev->mincount = 0;
+ pdev->maxcount = max_resp_sz - nfs41_maxgetdevinfo_overhead;
+
+ rc = nfs4_proc_getdeviceinfo(server, pdev, cred);
+ dprintk("%s getdevice info returns %d\n", __func__, rc);
+ if (rc)
+ goto out_free_pages;
+
+ /*
+ * Found new device, need to decode it and then add it to the
+ * list of known devices for this mountpoint.
+ */
+ d = server->pnfs_curr_ld->alloc_deviceid_node(server, pdev,
+ gfp_flags);
+
+out_free_pages:
+ /*
+ * i is the number of pages actually allocated: max_pages on the
+ * success and RPC-failure paths, fewer when alloc_page() failed part
+ * way through the loop. Free only those so that __free_page() is
+ * never handed one of the still-NULL slots.
+ */
+ while (--i >= 0)
+ __free_page(pages[i]);
+ kfree(pages);
+out_free_pdev:
+ kfree(pdev);
+ dprintk("<-- %s d %p\n", __func__, d);
+ return d;
+}
+
/*
* Lookup a deviceid in cache and get a reference count on it if found
*
@@ -96,14 +167,14 @@ _lookup_deviceid(const struct pnfs_layoutdriver_type *ld,
* @id deviceid to look up
*/
static struct nfs4_deviceid_node *
-_find_get_deviceid(const struct pnfs_layoutdriver_type *ld,
- const struct nfs_client *clp, const struct nfs4_deviceid *id,
- long hash)
+__nfs4_find_get_deviceid(struct nfs_server *server,
+ const struct nfs4_deviceid *id, long hash)
{
struct nfs4_deviceid_node *d;
rcu_read_lock();
- d = _lookup_deviceid(ld, clp, id, hash);
+ d = _lookup_deviceid(server->pnfs_curr_ld, server->nfs_client, id,
+ hash);
if (d != NULL)
atomic_inc(&d->ref);
rcu_read_unlock();
@@ -111,10 +182,33 @@ _find_get_deviceid(const struct pnfs_layoutdriver_type *ld,
}
struct nfs4_deviceid_node *
-nfs4_find_get_deviceid(const struct pnfs_layoutdriver_type *ld,
- const struct nfs_client *clp, const struct nfs4_deviceid *id)
+nfs4_find_get_deviceid(struct nfs_server *server,
+ const struct nfs4_deviceid *id, struct rpc_cred *cred,
+ gfp_t gfp_mask)
{
- return _find_get_deviceid(ld, clp, id, nfs4_deviceid_hash(id));
+ long hash = nfs4_deviceid_hash(id);
+ struct nfs4_deviceid_node *d, *new;
+
+ d = __nfs4_find_get_deviceid(server, id, hash);
+ if (d)
+ return d;
+
+ new = nfs4_get_device_info(server, id, cred, gfp_mask);
+ if (!new)
+ return new;
+
+ spin_lock(&nfs4_deviceid_lock);
+ d = __nfs4_find_get_deviceid(server, id, hash);
+ if (d) {
+ spin_unlock(&nfs4_deviceid_lock);
+ server->pnfs_curr_ld->free_deviceid_node(new);
+ return d;
+ }
+ hlist_add_head_rcu(&new->node, &nfs4_deviceid_cache[hash]);
+ atomic_inc(&new->ref);
+ spin_unlock(&nfs4_deviceid_lock);
+
+ return new;
}
EXPORT_SYMBOL_GPL(nfs4_find_get_deviceid);
@@ -151,15 +245,13 @@ nfs4_delete_deviceid(const struct pnfs_layoutdriver_type *ld,
EXPORT_SYMBOL_GPL(nfs4_delete_deviceid);
void
-nfs4_init_deviceid_node(struct nfs4_deviceid_node *d,
- const struct pnfs_layoutdriver_type *ld,
- const struct nfs_client *nfs_client,
+nfs4_init_deviceid_node(struct nfs4_deviceid_node *d, struct nfs_server *server,
const struct nfs4_deviceid *id)
{
INIT_HLIST_NODE(&d->node);
INIT_HLIST_NODE(&d->tmpnode);
- d->ld = ld;
- d->nfs_client = nfs_client;
+ d->ld = server->pnfs_curr_ld;
+ d->nfs_client = server->nfs_client;
d->flags = 0;
d->deviceid = *id;
atomic_set(&d->ref, 1);
@@ -167,39 +259,6 @@ nfs4_init_deviceid_node(struct nfs4_deviceid_node *d,
EXPORT_SYMBOL_GPL(nfs4_init_deviceid_node);
/*
- * Uniquely initialize and insert a deviceid node into cache
- *
- * @new new deviceid node
- * Note that the caller must set up the following members:
- * new->ld
- * new->nfs_client
- * new->deviceid
- *
- * @ret the inserted node, if none found, otherwise, the found entry.
- */
-struct nfs4_deviceid_node *
-nfs4_insert_deviceid_node(struct nfs4_deviceid_node *new)
-{
- struct nfs4_deviceid_node *d;
- long hash;
-
- spin_lock(&nfs4_deviceid_lock);
- hash = nfs4_deviceid_hash(&new->deviceid);
- d = _find_get_deviceid(new->ld, new->nfs_client, &new->deviceid, hash);
- if (d) {
- spin_unlock(&nfs4_deviceid_lock);
- return d;
- }
-
- hlist_add_head_rcu(&new->node, &nfs4_deviceid_cache[hash]);
- spin_unlock(&nfs4_deviceid_lock);
- atomic_inc(&new->ref);
-
- return new;
-}
-EXPORT_SYMBOL_GPL(nfs4_insert_deviceid_node);
-
-/*
* Dereference a deviceid node and delete it when its reference count drops
* to zero.
*
@@ -299,4 +358,3 @@ nfs4_deviceid_mark_client_invalid(struct nfs_client *clp)
}
rcu_read_unlock();
}
-
diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c
index c171ce1a8a3..b09cc23d6f4 100644
--- a/fs/nfs/proc.c
+++ b/fs/nfs/proc.c
@@ -578,46 +578,49 @@ nfs_proc_pathconf(struct nfs_server *server, struct nfs_fh *fhandle,
return 0;
}
-static int nfs_read_done(struct rpc_task *task, struct nfs_pgio_data *data)
+static int nfs_read_done(struct rpc_task *task, struct nfs_pgio_header *hdr)
{
- struct inode *inode = data->header->inode;
+ struct inode *inode = hdr->inode;
nfs_invalidate_atime(inode);
if (task->tk_status >= 0) {
- nfs_refresh_inode(inode, data->res.fattr);
+ nfs_refresh_inode(inode, hdr->res.fattr);
/* Emulate the eof flag, which isn't normally needed in NFSv2
* as it is guaranteed to always return the file attributes
*/
- if (data->args.offset + data->res.count >= data->res.fattr->size)
- data->res.eof = 1;
+ if (hdr->args.offset + hdr->res.count >= hdr->res.fattr->size)
+ hdr->res.eof = 1;
}
return 0;
}
-static void nfs_proc_read_setup(struct nfs_pgio_data *data, struct rpc_message *msg)
+static void nfs_proc_read_setup(struct nfs_pgio_header *hdr,
+ struct rpc_message *msg)
{
msg->rpc_proc = &nfs_procedures[NFSPROC_READ];
}
-static int nfs_proc_pgio_rpc_prepare(struct rpc_task *task, struct nfs_pgio_data *data)
+static int nfs_proc_pgio_rpc_prepare(struct rpc_task *task,
+ struct nfs_pgio_header *hdr)
{
rpc_call_start(task);
return 0;
}
-static int nfs_write_done(struct rpc_task *task, struct nfs_pgio_data *data)
+static int nfs_write_done(struct rpc_task *task, struct nfs_pgio_header *hdr)
{
- struct inode *inode = data->header->inode;
+ struct inode *inode = hdr->inode;
if (task->tk_status >= 0)
- nfs_post_op_update_inode_force_wcc(inode, data->res.fattr);
+ nfs_post_op_update_inode_force_wcc(inode, hdr->res.fattr);
return 0;
}
-static void nfs_proc_write_setup(struct nfs_pgio_data *data, struct rpc_message *msg)
+static void nfs_proc_write_setup(struct nfs_pgio_header *hdr,
+ struct rpc_message *msg)
{
/* Note: NFSv2 ignores @stable and always uses NFS_FILE_SYNC */
- data->args.stable = NFS_FILE_SYNC;
+ hdr->args.stable = NFS_FILE_SYNC;
msg->rpc_proc = &nfs_procedures[NFSPROC_WRITE];
}
diff --git a/fs/nfs/read.c b/fs/nfs/read.c
index e818a475ca6..beff2769c5c 100644
--- a/fs/nfs/read.c
+++ b/fs/nfs/read.c
@@ -33,12 +33,12 @@ static const struct nfs_rw_ops nfs_rw_read_ops;
static struct kmem_cache *nfs_rdata_cachep;
-static struct nfs_rw_header *nfs_readhdr_alloc(void)
+static struct nfs_pgio_header *nfs_readhdr_alloc(void)
{
return kmem_cache_zalloc(nfs_rdata_cachep, GFP_KERNEL);
}
-static void nfs_readhdr_free(struct nfs_rw_header *rhdr)
+static void nfs_readhdr_free(struct nfs_pgio_header *rhdr)
{
kmem_cache_free(nfs_rdata_cachep, rhdr);
}
@@ -115,12 +115,6 @@ static void nfs_readpage_release(struct nfs_page *req)
unlock_page(req->wb_page);
}
-
- dprintk("NFS: read done (%s/%Lu %d@%Ld)\n",
- req->wb_context->dentry->d_inode->i_sb->s_id,
- (unsigned long long)NFS_FILEID(req->wb_context->dentry->d_inode),
- req->wb_bytes,
- (long long)req_offset(req));
nfs_release_request(req);
}
@@ -172,14 +166,15 @@ out:
hdr->release(hdr);
}
-static void nfs_initiate_read(struct nfs_pgio_data *data, struct rpc_message *msg,
+static void nfs_initiate_read(struct nfs_pgio_header *hdr,
+ struct rpc_message *msg,
struct rpc_task_setup *task_setup_data, int how)
{
- struct inode *inode = data->header->inode;
+ struct inode *inode = hdr->inode;
int swap_flags = IS_SWAPFILE(inode) ? NFS_RPC_SWAPFLAGS : 0;
task_setup_data->flags |= swap_flags;
- NFS_PROTO(inode)->read_setup(data, msg);
+ NFS_PROTO(inode)->read_setup(hdr, msg);
}
static void
@@ -203,14 +198,15 @@ static const struct nfs_pgio_completion_ops nfs_async_read_completion_ops = {
* This is the callback from RPC telling us whether a reply was
* received or some error occurred (timeout or socket shutdown).
*/
-static int nfs_readpage_done(struct rpc_task *task, struct nfs_pgio_data *data,
+static int nfs_readpage_done(struct rpc_task *task,
+ struct nfs_pgio_header *hdr,
struct inode *inode)
{
- int status = NFS_PROTO(inode)->read_done(task, data);
+ int status = NFS_PROTO(inode)->read_done(task, hdr);
if (status != 0)
return status;
- nfs_add_stats(inode, NFSIOS_SERVERREADBYTES, data->res.count);
+ nfs_add_stats(inode, NFSIOS_SERVERREADBYTES, hdr->res.count);
if (task->tk_status == -ESTALE) {
set_bit(NFS_INO_STALE, &NFS_I(inode)->flags);
@@ -219,34 +215,34 @@ static int nfs_readpage_done(struct rpc_task *task, struct nfs_pgio_data *data,
return 0;
}
-static void nfs_readpage_retry(struct rpc_task *task, struct nfs_pgio_data *data)
+static void nfs_readpage_retry(struct rpc_task *task,
+ struct nfs_pgio_header *hdr)
{
- struct nfs_pgio_args *argp = &data->args;
- struct nfs_pgio_res *resp = &data->res;
+ struct nfs_pgio_args *argp = &hdr->args;
+ struct nfs_pgio_res *resp = &hdr->res;
/* This is a short read! */
- nfs_inc_stats(data->header->inode, NFSIOS_SHORTREAD);
+ nfs_inc_stats(hdr->inode, NFSIOS_SHORTREAD);
/* Has the server at least made some progress? */
if (resp->count == 0) {
- nfs_set_pgio_error(data->header, -EIO, argp->offset);
+ nfs_set_pgio_error(hdr, -EIO, argp->offset);
return;
}
- /* Yes, so retry the read at the end of the data */
- data->mds_offset += resp->count;
+ /* Yes, so retry the read at the end of the data */
+ hdr->mds_offset += resp->count;
argp->offset += resp->count;
argp->pgbase += resp->count;
argp->count -= resp->count;
rpc_restart_call_prepare(task);
}
-static void nfs_readpage_result(struct rpc_task *task, struct nfs_pgio_data *data)
+static void nfs_readpage_result(struct rpc_task *task,
+ struct nfs_pgio_header *hdr)
{
- struct nfs_pgio_header *hdr = data->header;
-
- if (data->res.eof) {
+ if (hdr->res.eof) {
loff_t bound;
- bound = data->args.offset + data->res.count;
+ bound = hdr->args.offset + hdr->res.count;
spin_lock(&hdr->lock);
if (bound < hdr->io_start + hdr->good_bytes) {
set_bit(NFS_IOHDR_EOF, &hdr->flags);
@@ -254,8 +250,8 @@ static void nfs_readpage_result(struct rpc_task *task, struct nfs_pgio_data *dat
hdr->good_bytes = bound - hdr->io_start;
}
spin_unlock(&hdr->lock);
- } else if (data->res.count != data->args.count)
- nfs_readpage_retry(task, data);
+ } else if (hdr->res.count != hdr->args.count)
+ nfs_readpage_retry(task, hdr);
}
/*
@@ -404,7 +400,7 @@ out:
int __init nfs_init_readpagecache(void)
{
nfs_rdata_cachep = kmem_cache_create("nfs_read_data",
- sizeof(struct nfs_rw_header),
+ sizeof(struct nfs_pgio_header),
0, SLAB_HWCACHE_ALIGN,
NULL);
if (nfs_rdata_cachep == NULL)
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 084af1060d7..31a11b0e885 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -1027,8 +1027,7 @@ static bool nfs_auth_info_add(struct nfs_auth_info *auth_info,
rpc_authflavor_t flavor)
{
unsigned int i;
- unsigned int max_flavor_len = (sizeof(auth_info->flavors) /
- sizeof(auth_info->flavors[0]));
+ unsigned int max_flavor_len = ARRAY_SIZE(auth_info->flavors);
/* make sure this flavor isn't already in the list */
for (i = 0; i < auth_info->flavor_len; i++) {
@@ -2066,11 +2065,6 @@ static int nfs23_validate_mount_data(void *options,
return NFS_TEXT_DATA;
}
-#if !IS_ENABLED(CONFIG_NFS_V3)
- if (args->version == 3)
- goto out_v3_not_compiled;
-#endif /* !CONFIG_NFS_V3 */
-
return 0;
out_no_data:
@@ -2086,12 +2080,6 @@ out_no_sec:
dfprintk(MOUNT, "NFS: nfs_mount_data version supports only AUTH_SYS\n");
return -EINVAL;
-#if !IS_ENABLED(CONFIG_NFS_V3)
-out_v3_not_compiled:
- dfprintk(MOUNT, "NFS: NFSv3 is not compiled into kernel\n");
- return -EPROTONOSUPPORT;
-#endif /* !CONFIG_NFS_V3 */
-
out_nomem:
dfprintk(MOUNT, "NFS: not enough memory to handle mount options\n");
return -ENOMEM;
@@ -2180,7 +2168,7 @@ out_no_address:
return -EINVAL;
}
-#define NFS_MOUNT_CMP_FLAGMASK ~(NFS_MOUNT_INTR \
+#define NFS_REMOUNT_CMP_FLAGMASK ~(NFS_MOUNT_INTR \
| NFS_MOUNT_SECURE \
| NFS_MOUNT_TCP \
| NFS_MOUNT_VER3 \
@@ -2188,15 +2176,16 @@ out_no_address:
| NFS_MOUNT_NONLM \
| NFS_MOUNT_BROKEN_SUID \
| NFS_MOUNT_STRICTLOCK \
- | NFS_MOUNT_UNSHARED \
- | NFS_MOUNT_NORESVPORT \
| NFS_MOUNT_LEGACY_INTERFACE)
+#define NFS_MOUNT_CMP_FLAGMASK (NFS_REMOUNT_CMP_FLAGMASK & \
+ ~(NFS_MOUNT_UNSHARED | NFS_MOUNT_NORESVPORT))
+
static int
nfs_compare_remount_data(struct nfs_server *nfss,
struct nfs_parsed_mount_data *data)
{
- if ((data->flags ^ nfss->flags) & NFS_MOUNT_CMP_FLAGMASK ||
+ if ((data->flags ^ nfss->flags) & NFS_REMOUNT_CMP_FLAGMASK ||
data->rsize != nfss->rsize ||
data->wsize != nfss->wsize ||
data->version != nfss->nfs_client->rpc_ops->version ||
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 962c9ee758b..f83b02dc916 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -47,6 +47,11 @@ static const struct nfs_pgio_completion_ops nfs_async_write_completion_ops;
static const struct nfs_commit_completion_ops nfs_commit_completion_ops;
static const struct nfs_rw_ops nfs_rw_write_ops;
static void nfs_clear_request_commit(struct nfs_page *req);
+static void nfs_init_cinfo_from_inode(struct nfs_commit_info *cinfo,
+ struct inode *inode);
+static struct nfs_page *
+nfs_page_search_commits_for_head_request_locked(struct nfs_inode *nfsi,
+ struct page *page);
static struct kmem_cache *nfs_wdata_cachep;
static mempool_t *nfs_wdata_mempool;
@@ -71,18 +76,18 @@ void nfs_commit_free(struct nfs_commit_data *p)
}
EXPORT_SYMBOL_GPL(nfs_commit_free);
-static struct nfs_rw_header *nfs_writehdr_alloc(void)
+static struct nfs_pgio_header *nfs_writehdr_alloc(void)
{
- struct nfs_rw_header *p = mempool_alloc(nfs_wdata_mempool, GFP_NOIO);
+ struct nfs_pgio_header *p = mempool_alloc(nfs_wdata_mempool, GFP_NOIO);
if (p)
memset(p, 0, sizeof(*p));
return p;
}
-static void nfs_writehdr_free(struct nfs_rw_header *whdr)
+static void nfs_writehdr_free(struct nfs_pgio_header *hdr)
{
- mempool_free(whdr, nfs_wdata_mempool);
+ mempool_free(hdr, nfs_wdata_mempool);
}
static void nfs_context_set_write_error(struct nfs_open_context *ctx, int error)
@@ -106,21 +111,12 @@ nfs_page_find_head_request_locked(struct nfs_inode *nfsi, struct page *page)
if (PagePrivate(page))
req = (struct nfs_page *)page_private(page);
- else if (unlikely(PageSwapCache(page))) {
- struct nfs_page *freq, *t;
-
- /* Linearly search the commit list for the correct req */
- list_for_each_entry_safe(freq, t, &nfsi->commit_info.list, wb_list) {
- if (freq->wb_page == page) {
- req = freq->wb_head;
- break;
- }
- }
- }
+ else if (unlikely(PageSwapCache(page)))
+ req = nfs_page_search_commits_for_head_request_locked(nfsi,
+ page);
if (req) {
WARN_ON_ONCE(req->wb_head != req);
-
kref_get(&req->wb_kref);
}
@@ -216,7 +212,7 @@ static bool nfs_page_group_covers_page(struct nfs_page *req)
unsigned int pos = 0;
unsigned int len = nfs_page_length(req->wb_page);
- nfs_page_group_lock(req);
+ nfs_page_group_lock(req, false);
do {
tmp = nfs_page_group_search_locked(req->wb_head, pos);
@@ -246,11 +242,14 @@ static void nfs_mark_uptodate(struct nfs_page *req)
static int wb_priority(struct writeback_control *wbc)
{
+ int ret = 0;
if (wbc->for_reclaim)
return FLUSH_HIGHPRI | FLUSH_STABLE;
+ if (wbc->sync_mode == WB_SYNC_ALL)
+ ret = FLUSH_COND_STABLE;
if (wbc->for_kupdate || wbc->for_background)
- return FLUSH_LOWPRI | FLUSH_COND_STABLE;
- return FLUSH_COND_STABLE;
+ ret |= FLUSH_LOWPRI;
+ return ret;
}
/*
@@ -379,8 +378,6 @@ nfs_destroy_unlinked_subrequests(struct nfs_page *destroy_list,
subreq->wb_head = subreq;
subreq->wb_this_page = subreq;
- nfs_clear_request_commit(subreq);
-
/* subreq is now totally disconnected from page group or any
* write / commit lists. last chance to wake any waiters */
nfs_unlock_request(subreq);
@@ -455,8 +452,23 @@ try_again:
return NULL;
}
+ /* holding inode lock, so always make a non-blocking call to try the
+ * page group lock */
+ ret = nfs_page_group_lock(head, true);
+ if (ret < 0) {
+ spin_unlock(&inode->i_lock);
+
+ if (!nonblock && ret == -EAGAIN) {
+ nfs_page_group_lock_wait(head);
+ nfs_release_request(head);
+ goto try_again;
+ }
+
+ nfs_release_request(head);
+ return ERR_PTR(ret);
+ }
+
/* lock each request in the page group */
- nfs_page_group_lock(head);
subreq = head;
do {
/*
@@ -488,7 +500,7 @@ try_again:
* Commit list removal accounting is done after locks are dropped */
subreq = head;
do {
- nfs_list_remove_request(subreq);
+ nfs_clear_request_commit(subreq);
subreq = subreq->wb_this_page;
} while (subreq != head);
@@ -518,15 +530,11 @@ try_again:
nfs_page_group_unlock(head);
- /* drop lock to clear_request_commit the head req and clean up
- * requests on destroy list */
+ /* drop lock to clean up requests on destroy list */
spin_unlock(&inode->i_lock);
nfs_destroy_unlinked_subrequests(destroy_list, head);
- /* clean up commit list state */
- nfs_clear_request_commit(head);
-
/* still holds ref on head from nfs_page_find_head_request_locked
* and still has lock on head from lock loop */
return head;
@@ -697,6 +705,8 @@ static void nfs_inode_remove_request(struct nfs_page *req)
if (likely(!PageSwapCache(head->wb_page))) {
set_page_private(head->wb_page, 0);
ClearPagePrivate(head->wb_page);
+ smp_mb__after_atomic();
+ wake_up_page(head->wb_page, PG_private);
clear_bit(PG_MAPPED, &head->wb_flags);
}
nfsi->npages--;
@@ -713,7 +723,38 @@ nfs_mark_request_dirty(struct nfs_page *req)
__set_page_dirty_nobuffers(req->wb_page);
}
-#if IS_ENABLED(CONFIG_NFS_V3) || IS_ENABLED(CONFIG_NFS_V4)
+/*
+ * nfs_page_search_commits_for_head_request_locked
+ *
+ * Search through commit lists on @inode for the head request for @page.
+ * Must be called while holding the inode (which is cinfo) lock.
+ *
+ * A commit-info is built for the inode with nfs_init_cinfo_from_inode().
+ * The pNFS commit lists are consulted first via pnfs_search_commit_reqs();
+ * if that finds nothing, cinfo.mds->list is walked linearly comparing each
+ * request's wb_page against @page.
+ *
+ * Returns the head request (wb_head of the matching request) if found, or
+ * NULL if not found. No reference is taken on the returned request here.
+ *
+ * NOTE(review): the _safe list iterator is used although nothing is removed
+ * from the list during the walk.
+ */
+static struct nfs_page *
+nfs_page_search_commits_for_head_request_locked(struct nfs_inode *nfsi,
+ struct page *page)
+{
+ struct nfs_page *freq, *t;
+ struct nfs_commit_info cinfo;
+ struct inode *inode = &nfsi->vfs_inode;
+
+ nfs_init_cinfo_from_inode(&cinfo, inode);
+
+ /* search through pnfs commit lists */
+ freq = pnfs_search_commit_reqs(inode, &cinfo, page);
+ if (freq)
+ return freq->wb_head;
+
+ /* Linearly search the commit list for the correct request */
+ list_for_each_entry_safe(freq, t, &cinfo.mds->list, wb_list) {
+ if (freq->wb_page == page)
+ return freq->wb_head;
+ }
+
+ return NULL;
+}
+
/**
* nfs_request_add_commit_list - add request to a commit list
* @req: pointer to a struct nfs_page
@@ -808,6 +849,7 @@ nfs_clear_page_commit(struct page *page)
dec_bdi_stat(page_file_mapping(page)->backing_dev_info, BDI_RECLAIMABLE);
}
+/* Called holding inode (/cinfo) lock */
static void
nfs_clear_request_commit(struct nfs_page *req)
{
@@ -817,53 +859,19 @@ nfs_clear_request_commit(struct nfs_page *req)
nfs_init_cinfo_from_inode(&cinfo, inode);
if (!pnfs_clear_request_commit(req, &cinfo)) {
- spin_lock(cinfo.lock);
nfs_request_remove_commit_list(req, &cinfo);
- spin_unlock(cinfo.lock);
}
nfs_clear_page_commit(req->wb_page);
}
}
-static inline
-int nfs_write_need_commit(struct nfs_pgio_data *data)
-{
- if (data->verf.committed == NFS_DATA_SYNC)
- return data->header->lseg == NULL;
- return data->verf.committed != NFS_FILE_SYNC;
-}
-
-#else
-static void nfs_init_cinfo_from_inode(struct nfs_commit_info *cinfo,
- struct inode *inode)
-{
-}
-
-void nfs_init_cinfo(struct nfs_commit_info *cinfo,
- struct inode *inode,
- struct nfs_direct_req *dreq)
-{
-}
-
-void
-nfs_mark_request_commit(struct nfs_page *req, struct pnfs_layout_segment *lseg,
- struct nfs_commit_info *cinfo)
-{
-}
-
-static void
-nfs_clear_request_commit(struct nfs_page *req)
+int nfs_write_need_commit(struct nfs_pgio_header *hdr)
{
+ if (hdr->verf.committed == NFS_DATA_SYNC)
+ return hdr->lseg == NULL;
+ return hdr->verf.committed != NFS_FILE_SYNC;
}
-static inline
-int nfs_write_need_commit(struct nfs_pgio_data *data)
-{
- return 0;
-}
-
-#endif
-
static void nfs_write_completion(struct nfs_pgio_header *hdr)
{
struct nfs_commit_info cinfo;
@@ -883,11 +891,7 @@ static void nfs_write_completion(struct nfs_pgio_header *hdr)
nfs_context_set_write_error(req->wb_context, hdr->error);
goto remove_req;
}
- if (test_bit(NFS_IOHDR_NEED_RESCHED, &hdr->flags)) {
- nfs_mark_request_dirty(req);
- goto next;
- }
- if (test_bit(NFS_IOHDR_NEED_COMMIT, &hdr->flags)) {
+ if (nfs_write_need_commit(hdr)) {
memcpy(&req->wb_verf, &hdr->verf.verifier, sizeof(req->wb_verf));
nfs_mark_request_commit(req, hdr->lseg, &cinfo);
goto next;
@@ -903,7 +907,6 @@ out:
hdr->release(hdr);
}
-#if IS_ENABLED(CONFIG_NFS_V3) || IS_ENABLED(CONFIG_NFS_V4)
unsigned long
nfs_reqs_to_commit(struct nfs_commit_info *cinfo)
{
@@ -960,19 +963,6 @@ nfs_scan_commit(struct inode *inode, struct list_head *dst,
return ret;
}
-#else
-unsigned long nfs_reqs_to_commit(struct nfs_commit_info *cinfo)
-{
- return 0;
-}
-
-int nfs_scan_commit(struct inode *inode, struct list_head *dst,
- struct nfs_commit_info *cinfo)
-{
- return 0;
-}
-#endif
-
/*
* Search for an existing write request, and attempt to update
* it to reflect a new dirty region on a given page.
@@ -1038,9 +1028,9 @@ static struct nfs_page *nfs_try_to_update_request(struct inode *inode,
else
req->wb_bytes = rqend - req->wb_offset;
out_unlock:
- spin_unlock(&inode->i_lock);
if (req)
nfs_clear_request_commit(req);
+ spin_unlock(&inode->i_lock);
return req;
out_flushme:
spin_unlock(&inode->i_lock);
@@ -1241,17 +1231,18 @@ static int flush_task_priority(int how)
return RPC_PRIORITY_NORMAL;
}
-static void nfs_initiate_write(struct nfs_pgio_data *data, struct rpc_message *msg,
+static void nfs_initiate_write(struct nfs_pgio_header *hdr,
+ struct rpc_message *msg,
struct rpc_task_setup *task_setup_data, int how)
{
- struct inode *inode = data->header->inode;
+ struct inode *inode = hdr->inode;
int priority = flush_task_priority(how);
task_setup_data->priority = priority;
- NFS_PROTO(inode)->write_setup(data, msg);
+ NFS_PROTO(inode)->write_setup(hdr, msg);
nfs4_state_protect_write(NFS_SERVER(inode)->nfs_client,
- &task_setup_data->rpc_client, msg, data);
+ &task_setup_data->rpc_client, msg, hdr);
}
/* If a nfs_flush_* function fails, it should remove reqs from @head and
@@ -1313,21 +1304,9 @@ void nfs_commit_prepare(struct rpc_task *task, void *calldata)
NFS_PROTO(data->inode)->commit_rpc_prepare(task, data);
}
-static void nfs_writeback_release_common(struct nfs_pgio_data *data)
+static void nfs_writeback_release_common(struct nfs_pgio_header *hdr)
{
- struct nfs_pgio_header *hdr = data->header;
- int status = data->task.tk_status;
-
- if ((status >= 0) && nfs_write_need_commit(data)) {
- spin_lock(&hdr->lock);
- if (test_bit(NFS_IOHDR_NEED_RESCHED, &hdr->flags))
- ; /* Do nothing */
- else if (!test_and_set_bit(NFS_IOHDR_NEED_COMMIT, &hdr->flags))
- memcpy(&hdr->verf, &data->verf, sizeof(hdr->verf));
- else if (memcmp(&hdr->verf, &data->verf, sizeof(hdr->verf)))
- set_bit(NFS_IOHDR_NEED_RESCHED, &hdr->flags);
- spin_unlock(&hdr->lock);
- }
+ /* do nothing! */
}
/*
@@ -1358,7 +1337,8 @@ static int nfs_should_remove_suid(const struct inode *inode)
/*
* This function is called when the WRITE call is complete.
*/
-static int nfs_writeback_done(struct rpc_task *task, struct nfs_pgio_data *data,
+static int nfs_writeback_done(struct rpc_task *task,
+ struct nfs_pgio_header *hdr,
struct inode *inode)
{
int status;
@@ -1370,13 +1350,13 @@ static int nfs_writeback_done(struct rpc_task *task, struct nfs_pgio_data *data,
* another writer had changed the file, but some applications
* depend on tighter cache coherency when writing.
*/
- status = NFS_PROTO(inode)->write_done(task, data);
+ status = NFS_PROTO(inode)->write_done(task, hdr);
if (status != 0)
return status;
- nfs_add_stats(inode, NFSIOS_SERVERWRITTENBYTES, data->res.count);
+ nfs_add_stats(inode, NFSIOS_SERVERWRITTENBYTES, hdr->res.count);
-#if IS_ENABLED(CONFIG_NFS_V3) || IS_ENABLED(CONFIG_NFS_V4)
- if (data->res.verf->committed < data->args.stable && task->tk_status >= 0) {
+ if (hdr->res.verf->committed < hdr->args.stable &&
+ task->tk_status >= 0) {
/* We tried a write call, but the server did not
* commit data to stable storage even though we
* requested it.
@@ -1392,11 +1372,10 @@ static int nfs_writeback_done(struct rpc_task *task, struct nfs_pgio_data *data,
dprintk("NFS: faulty NFS server %s:"
" (committed = %d) != (stable = %d)\n",
NFS_SERVER(inode)->nfs_client->cl_hostname,
- data->res.verf->committed, data->args.stable);
+ hdr->res.verf->committed, hdr->args.stable);
complain = jiffies + 300 * HZ;
}
}
-#endif
/* Deal with the suid/sgid bit corner case */
if (nfs_should_remove_suid(inode))
@@ -1407,16 +1386,17 @@ static int nfs_writeback_done(struct rpc_task *task, struct nfs_pgio_data *data,
/*
* This function is called when the WRITE call is complete.
*/
-static void nfs_writeback_result(struct rpc_task *task, struct nfs_pgio_data *data)
+static void nfs_writeback_result(struct rpc_task *task,
+ struct nfs_pgio_header *hdr)
{
- struct nfs_pgio_args *argp = &data->args;
- struct nfs_pgio_res *resp = &data->res;
+ struct nfs_pgio_args *argp = &hdr->args;
+ struct nfs_pgio_res *resp = &hdr->res;
if (resp->count < argp->count) {
static unsigned long complain;
/* This a short write! */
- nfs_inc_stats(data->header->inode, NFSIOS_SHORTWRITE);
+ nfs_inc_stats(hdr->inode, NFSIOS_SHORTWRITE);
/* Has the server at least made some progress? */
if (resp->count == 0) {
@@ -1426,14 +1406,14 @@ static void nfs_writeback_result(struct rpc_task *task, struct nfs_pgio_data *da
argp->count);
complain = jiffies + 300 * HZ;
}
- nfs_set_pgio_error(data->header, -EIO, argp->offset);
+ nfs_set_pgio_error(hdr, -EIO, argp->offset);
task->tk_status = -EIO;
return;
}
/* Was this an NFSv2 write or an NFSv3 stable write? */
if (resp->verf->committed != NFS_UNSTABLE) {
/* Resend from where the server left off */
- data->mds_offset += resp->count;
+ hdr->mds_offset += resp->count;
argp->offset += resp->count;
argp->pgbase += resp->count;
argp->count -= resp->count;
@@ -1448,7 +1428,6 @@ static void nfs_writeback_result(struct rpc_task *task, struct nfs_pgio_data *da
}
-#if IS_ENABLED(CONFIG_NFS_V3) || IS_ENABLED(CONFIG_NFS_V4)
static int nfs_commit_set_lock(struct nfs_inode *nfsi, int may_wait)
{
int ret;
@@ -1517,6 +1496,18 @@ int nfs_initiate_commit(struct rpc_clnt *clnt, struct nfs_commit_data *data,
}
EXPORT_SYMBOL_GPL(nfs_initiate_commit);
+static loff_t nfs_get_lwb(struct list_head *head)
+{
+ loff_t lwb = 0;
+ struct nfs_page *req;
+
+ list_for_each_entry(req, head, wb_list)
+ if (lwb < (req_offset(req) + req->wb_bytes))
+ lwb = req_offset(req) + req->wb_bytes;
+
+ return lwb;
+}
+
/*
* Set up the argument/result storage required for the RPC call.
*/
@@ -1536,6 +1527,9 @@ void nfs_init_commit(struct nfs_commit_data *data,
data->inode = inode;
data->cred = first->wb_context->cred;
data->lseg = lseg; /* reference transferred */
+ /* only set lwb for pnfs commit */
+ if (lseg)
+ data->lwb = nfs_get_lwb(&data->pages);
data->mds_ops = &nfs_commit_ops;
data->completion_ops = cinfo->completion_ops;
data->dreq = cinfo->dreq;
@@ -1615,6 +1609,7 @@ static void nfs_commit_release_pages(struct nfs_commit_data *data)
struct nfs_page *req;
int status = data->task.tk_status;
struct nfs_commit_info cinfo;
+ struct nfs_server *nfss;
while (!list_empty(&data->pages)) {
req = nfs_list_entry(data->pages.next);
@@ -1648,6 +1643,10 @@ static void nfs_commit_release_pages(struct nfs_commit_data *data)
next:
nfs_unlock_and_release_request(req);
}
+ nfss = NFS_SERVER(data->inode);
+ if (atomic_long_read(&nfss->writeback) < NFS_CONGESTION_OFF_THRESH)
+ clear_bdi_congested(&nfss->backing_dev_info, BLK_RW_ASYNC);
+
nfs_init_cinfo(&cinfo, data->inode, data->dreq);
if (atomic_dec_and_test(&cinfo.mds->rpcs_out))
nfs_commit_clear_lock(NFS_I(data->inode));
@@ -1757,12 +1756,6 @@ out_mark_dirty:
__mark_inode_dirty(inode, I_DIRTY_DATASYNC);
return ret;
}
-#else
-static int nfs_commit_unstable_pages(struct inode *inode, struct writeback_control *wbc)
-{
- return 0;
-}
-#endif
int nfs_write_inode(struct inode *inode, struct writeback_control *wbc)
{
@@ -1884,7 +1877,7 @@ int nfs_migrate_page(struct address_space *mapping, struct page *newpage,
int __init nfs_init_writepagecache(void)
{
nfs_wdata_cachep = kmem_cache_create("nfs_write_data",
- sizeof(struct nfs_rw_header),
+ sizeof(struct nfs_pgio_header),
0, SLAB_HWCACHE_ALIGN,
NULL);
if (nfs_wdata_cachep == NULL)