From eeff66ef6e33925f615d49e6c846263e342ab60e Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Tue, 8 Mar 2011 16:39:47 +0530 Subject: net/9p: Convert the in the 9p rpc call path to GFP_NOFS Without this we can cause reclaim allocation in writepage. [ 3433.448430] ================================= [ 3433.449117] [ INFO: inconsistent lock state ] [ 3433.449117] 2.6.38-rc5+ #84 [ 3433.449117] --------------------------------- [ 3433.449117] inconsistent {RECLAIM_FS-ON-W} -> {IN-RECLAIM_FS-R} usage. [ 3433.449117] kswapd0/505 [HC0[0]:SC0[0]:HE1:SE1] takes: [ 3433.449117] (iprune_sem){+++++-}, at: [] shrink_icache_memory+0x45/0x2b1 [ 3433.449117] {RECLAIM_FS-ON-W} state was registered at: [ 3433.449117] [] mark_held_locks+0x52/0x70 [ 3433.449117] [] lockdep_trace_alloc+0x85/0x9f [ 3433.449117] [] slab_pre_alloc_hook+0x18/0x3c [ 3433.449117] [] kmem_cache_alloc+0x23/0xa2 [ 3433.449117] [] idr_pre_get+0x2d/0x6f [ 3433.449117] [] p9_idpool_get+0x30/0xae [ 3433.449117] [] p9_client_rpc+0xd7/0x9b0 [ 3433.449117] [] p9_client_clunk+0x88/0xdb [ 3433.449117] [] v9fs_evict_inode+0x3c/0x48 [ 3433.449117] [] evict+0x1f/0x87 [ 3433.449117] [] dispose_list+0x47/0xe3 [ 3433.449117] [] evict_inodes+0x138/0x14f [ 3433.449117] [] generic_shutdown_super+0x57/0xe8 [ 3433.449117] [] kill_anon_super+0x11/0x50 [ 3433.449117] [] v9fs_kill_super+0x49/0xab [ 3433.449117] [] deactivate_locked_super+0x21/0x46 [ 3433.449117] [] deactivate_super+0x40/0x44 [ 3433.449117] [] mntput_no_expire+0x100/0x109 [ 3433.449117] [] sys_umount+0x2f1/0x31c [ 3433.449117] [] system_call_fastpath+0x16/0x1b [ 3433.449117] irq event stamp: 192941 [ 3433.449117] hardirqs last enabled at (192941): [] _raw_spin_unlock_irq+0x2b/0x30 [ 3433.449117] hardirqs last disabled at (192940): [] shrink_inactive_list+0x290/0x2f5 [ 3433.449117] softirqs last enabled at (188470): [] __do_softirq+0x133/0x152 [ 3433.449117] softirqs last disabled at (188455): [] call_softirq+0x1c/0x28 [ 3433.449117] [ 3433.449117] other info that might help us debug this: [ 3433.449117] 1 lock held by kswapd0/505: [ 3433.449117] #0: (shrinker_rwsem){++++..}, at: [] shrink_slab+0x38/0x15f [ 3433.449117] [ 3433.449117] stack backtrace: [ 3433.449117] Pid: 505, comm: kswapd0 Not tainted 2.6.38-rc5+ #84 [ 3433.449117] Call Trace: [ 3433.449117] [] ? valid_state+0x17e/0x191 [ 3433.449117] [] ? save_stack_trace+0x28/0x45 [ 3433.449117] [] ? check_usage_forwards+0x0/0x87 [ 3433.449117] [] ? mark_lock+0x113/0x22c [ 3433.449117] [] ? __lock_acquire+0x37a/0xcf7 [ 3433.449117] [] ? mark_lock+0x2d/0x22c [ 3433.449117] [] ? __lock_acquire+0x392/0xcf7 [ 3433.449117] [] ? determine_dirtyable_memory+0x15/0x28 [ 3433.449117] [] ? lock_acquire+0x57/0x6d [ 3433.449117] [] ? shrink_icache_memory+0x45/0x2b1 [ 3433.449117] [] ? down_read+0x47/0x5c [ 3433.449117] [] ? shrink_icache_memory+0x45/0x2b1 [ 3433.449117] [] ? shrink_icache_memory+0x45/0x2b1 [ 3433.449117] [] ? shrink_slab+0xdb/0x15f [ 3433.449117] [] ? kswapd+0x574/0x96a [ 3433.449117] [] ? kswapd+0x0/0x96a [ 3433.449117] [] ? kthread+0x7d/0x85 [ 3433.449117] [] ? kernel_thread_helper+0x4/0x10 [ 3433.449117] [] ? restore_args+0x0/0x30 [ 3433.449117] [] ? kthread+0x0/0x85 [ 3433.449117] [] ? kernel_thread_helper+0x0/0x10 Signed-off-by: Aneesh Kumar K.V Signed-off-by: Venkateswararao Jujjuri Signed-off-by: Eric Van Hensbergen --- net/9p/client.c | 10 +++++----- net/9p/protocol.c | 6 +++--- net/9p/trans_fd.c | 2 +- net/9p/trans_rdma.c | 6 +++--- net/9p/util.c | 2 +- 5 files changed, 13 insertions(+), 13 deletions(-) (limited to 'net') diff --git a/net/9p/client.c b/net/9p/client.c index 347ec0cd271..2ccbf04d37d 100644 --- a/net/9p/client.c +++ b/net/9p/client.c @@ -223,7 +223,7 @@ static struct p9_req_t *p9_tag_alloc(struct p9_client *c, u16 tag) req = &c->reqs[row][col]; if (!req->tc) { - req->wq = kmalloc(sizeof(wait_queue_head_t), GFP_KERNEL); + req->wq = kmalloc(sizeof(wait_queue_head_t), GFP_NOFS); if (!req->wq) { printk(KERN_ERR "Couldn't grow tag array\n"); return ERR_PTR(-ENOMEM); @@ -233,17 +233,17 @@ static struct p9_req_t *p9_tag_alloc(struct p9_client *c, u16 tag) P9_TRANS_PREF_PAYLOAD_SEP) { int alloc_msize = min(c->msize, 4096); req->tc = kmalloc(sizeof(struct p9_fcall)+alloc_msize, - GFP_KERNEL); + GFP_NOFS); req->tc->capacity = alloc_msize; req->rc = kmalloc(sizeof(struct p9_fcall)+alloc_msize, - GFP_KERNEL); + GFP_NOFS); req->rc->capacity = alloc_msize; } else { req->tc = kmalloc(sizeof(struct p9_fcall)+c->msize, - GFP_KERNEL); + GFP_NOFS); req->tc->capacity = c->msize; req->rc = kmalloc(sizeof(struct p9_fcall)+c->msize, - GFP_KERNEL); + GFP_NOFS); req->rc->capacity = c->msize; } if ((!req->tc) || (!req->rc)) { diff --git a/net/9p/protocol.c b/net/9p/protocol.c index 2ce515b859b..8a4084fa8b5 100644 --- a/net/9p/protocol.c +++ b/net/9p/protocol.c @@ -205,7 +205,7 @@ p9pdu_vreadf(struct p9_fcall *pdu, int proto_version, const char *fmt, if (errcode) break; - *sptr = kmalloc(len + 1, GFP_KERNEL); + *sptr = kmalloc(len + 1, GFP_NOFS); if (*sptr == NULL) { errcode = -EFAULT; break; @@ -273,7 +273,7 @@ p9pdu_vreadf(struct p9_fcall *pdu, int proto_version, const char *fmt, if (!errcode) { *wnames = kmalloc(sizeof(char *) * *nwname, - GFP_KERNEL); + GFP_NOFS); if (!*wnames) errcode = -ENOMEM; } @@ -317,7 +317,7 @@ p9pdu_vreadf(struct p9_fcall *pdu, int proto_version, const char *fmt, *wqids = kmalloc(*nwqid * sizeof(struct p9_qid), - GFP_KERNEL); + GFP_NOFS); if (*wqids == NULL) errcode = -ENOMEM; } diff --git a/net/9p/trans_fd.c b/net/9p/trans_fd.c index a30471e5174..aa5672b15ea 100644 --- a/net/9p/trans_fd.c +++ b/net/9p/trans_fd.c @@ -350,7 +350,7 @@ static void p9_read_work(struct work_struct *work) if (m->req->rc == NULL) { m->req->rc = kmalloc(sizeof(struct p9_fcall) + - m->client->msize, GFP_KERNEL); + m->client->msize, GFP_NOFS); if (!m->req->rc) { m->req = NULL; err = -ENOMEM; diff --git a/net/9p/trans_rdma.c b/net/9p/trans_rdma.c index 29a54ccd213..150e0c4bbf4 100644 --- a/net/9p/trans_rdma.c +++ b/net/9p/trans_rdma.c @@ -424,7 +424,7 @@ static int rdma_request(struct p9_client *client, struct p9_req_t *req) struct p9_rdma_context *rpl_context = NULL; /* Allocate an fcall for the reply */ - rpl_context = kmalloc(sizeof *rpl_context, GFP_KERNEL); + rpl_context = kmalloc(sizeof *rpl_context, GFP_NOFS); if (!rpl_context) { err = -ENOMEM; goto err_close; @@ -437,7 +437,7 @@ static int rdma_request(struct p9_client *client, struct p9_req_t *req) */ if (!req->rc) { req->rc = kmalloc(sizeof(struct p9_fcall)+client->msize, - GFP_KERNEL); + GFP_NOFS); if (req->rc) { req->rc->sdata = (char *) req->rc + sizeof(struct p9_fcall); @@ -468,7 +468,7 @@ static int rdma_request(struct p9_client *client, struct p9_req_t *req) req->rc = NULL; /* Post the request */ - c = kmalloc(sizeof *c, GFP_KERNEL); + c = kmalloc(sizeof *c, GFP_NOFS); if (!c) { err = -ENOMEM; goto err_free1; diff --git a/net/9p/util.c b/net/9p/util.c index e048701a72d..b84619b5ba2 100644 --- a/net/9p/util.c +++ b/net/9p/util.c @@ -92,7 +92,7 @@ int p9_idpool_get(struct p9_idpool *p) unsigned long flags; retry: - if (idr_pre_get(&p->pool, GFP_KERNEL) == 0) + if (idr_pre_get(&p->pool, GFP_NOFS) == 0) return 0; spin_lock_irqsave(&p->lock, flags); -- cgit v1.2.3-70-g09d2 From 472e7f9f8b547605ee9670ac803e971c2e3eeac0 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Tue, 8 Mar 2011 16:39:47 +0530 Subject: net/9p: Fix compile warning Signed-off-by: Aneesh Kumar K.V Signed-off-by: Venkateswararao Jujjuri Signed-off-by: Eric Van Hensbergen --- net/9p/trans_common.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/9p/trans_common.c b/net/9p/trans_common.c index d62b9aa58df..9172ab78fcb 100644 --- a/net/9p/trans_common.c +++ b/net/9p/trans_common.c @@ -41,9 +41,9 @@ EXPORT_SYMBOL(p9_release_req_pages); int p9_nr_pages(struct p9_req_t *req) { - int start_page, end_page; - start_page = (unsigned long long)req->tc->pubuf >> PAGE_SHIFT; - end_page = ((unsigned long long)req->tc->pubuf + req->tc->pbuf_size + + unsigned long start_page, end_page; + start_page = (unsigned long)req->tc->pubuf >> PAGE_SHIFT; + end_page = ((unsigned long)req->tc->pubuf + req->tc->pbuf_size + PAGE_SIZE - 1) >> PAGE_SHIFT; return end_page - start_page; } @@ -69,8 +69,8 @@ p9_payload_gup(struct p9_req_t *req, size_t *pdata_off, int *pdata_len, *pdata_off = (size_t)req->tc->pubuf & (PAGE_SIZE-1); if (*pdata_off) - first_page_bytes = min((PAGE_SIZE - *pdata_off), - req->tc->pbuf_size); + first_page_bytes = min(((size_t)PAGE_SIZE - *pdata_off), + req->tc->pbuf_size); rpinfo = req->tc->private; pdata_mapped_pages = get_user_pages_fast((unsigned long)req->tc->pubuf, -- cgit v1.2.3-70-g09d2 From 53bda3e5b4e91763224ecb7d05dab94d281fd41d Mon Sep 17 00:00:00 2001 From: "Venkateswararao Jujjuri (JV)" Date: Tue, 8 Mar 2011 15:34:20 -0800 Subject: [net/9p] unconditional wake_up to proc waiting for space on VirtIO ring Process may wait to get space on VirtIO ring to send a transaction to VirtFS server. Current code just does a conditional wake_up() which means only one process will be woken up even if multiple processes are waiting. This fix makes the wake_up unconditional. Hence we won't have any processes waiting for-ever. Signed-off-by: Venkateswararao Jujjuri Signed-off-by: Eric Van Hensbergen --- net/9p/trans_virtio.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/9p/trans_virtio.c b/net/9p/trans_virtio.c index 9b550ed9c71..961e025957a 100644 --- a/net/9p/trans_virtio.c +++ b/net/9p/trans_virtio.c @@ -146,11 +146,10 @@ static void req_done(struct virtqueue *vq) rc = virtqueue_get_buf(chan->vq, &len); if (rc != NULL) { - if (!chan->ring_bufs_avail) { - chan->ring_bufs_avail = 1; - wake_up(chan->vc_wq); - } + chan->ring_bufs_avail = 1; spin_unlock_irqrestore(&chan->lock, flags); + /* Wakeup if anyone waiting for VirtIO ring space. */ + wake_up(chan->vc_wq); P9_DPRINTK(P9_DEBUG_TRANS, ": rc %p\n", rc); P9_DPRINTK(P9_DEBUG_TRANS, ": lookup tag %d\n", rc->tag); -- cgit v1.2.3-70-g09d2 From a01a984035ea799b14aa5e874dcaeb122f09c4b4 Mon Sep 17 00:00:00 2001 From: "Venkateswararao Jujjuri (JV)" Date: Mon, 14 Mar 2011 14:12:49 -0700 Subject: [net/9p] Set the condition just before waking up. Given that the sprious wake-ups are common, we need to move the condition setting right next to the wake_up(). After setting the condition to req->status = REQ_STATUS_RCVD, sprious wakeups may cause the virtqueue back on the free list for someone else to use. This may result in kernel panic while relasing the pinned pages in p9_release_req_pages(). Also rearranged the while loop in req_done() for better redability. Signed-off-by: Venkateswararao Jujjuri Signed-off-by: Eric Van Hensbergen --- net/9p/trans_virtio.c | 44 ++++++++++++++++++++++---------------------- 1 file changed, 22 insertions(+), 22 deletions(-) (limited to 'net') diff --git a/net/9p/trans_virtio.c b/net/9p/trans_virtio.c index 961e025957a..cb98af9367a 100644 --- a/net/9p/trans_virtio.c +++ b/net/9p/trans_virtio.c @@ -141,33 +141,33 @@ static void req_done(struct virtqueue *vq) P9_DPRINTK(P9_DEBUG_TRANS, ": request done\n"); - do { + while (1) { spin_lock_irqsave(&chan->lock, flags); rc = virtqueue_get_buf(chan->vq, &len); - if (rc != NULL) { - chan->ring_bufs_avail = 1; - spin_unlock_irqrestore(&chan->lock, flags); - /* Wakeup if anyone waiting for VirtIO ring space. */ - wake_up(chan->vc_wq); - P9_DPRINTK(P9_DEBUG_TRANS, ": rc %p\n", rc); - P9_DPRINTK(P9_DEBUG_TRANS, ": lookup tag %d\n", - rc->tag); - req = p9_tag_lookup(chan->client, rc->tag); - req->status = REQ_STATUS_RCVD; - if (req->tc->private) { - struct trans_rpage_info *rp = req->tc->private; - /*Release pages */ - p9_release_req_pages(rp); - if (rp->rp_alloc) - kfree(rp); - req->tc->private = NULL; - } - p9_client_cb(chan->client, req); - } else { + if (rc == NULL) { spin_unlock_irqrestore(&chan->lock, flags); + break; } - } while (rc != NULL); + + chan->ring_bufs_avail = 1; + spin_unlock_irqrestore(&chan->lock, flags); + /* Wakeup if anyone waiting for VirtIO ring space. */ + wake_up(chan->vc_wq); + P9_DPRINTK(P9_DEBUG_TRANS, ": rc %p\n", rc); + P9_DPRINTK(P9_DEBUG_TRANS, ": lookup tag %d\n", rc->tag); + req = p9_tag_lookup(chan->client, rc->tag); + if (req->tc->private) { + struct trans_rpage_info *rp = req->tc->private; + /*Release pages */ + p9_release_req_pages(rp); + if (rp->rp_alloc) + kfree(rp); + req->tc->private = NULL; + } + req->status = REQ_STATUS_RCVD; + p9_client_cb(chan->client, req); + } } /** -- cgit v1.2.3-70-g09d2 From 316ad5501c2098cb2a2a25ed77a0421f1671411c Mon Sep 17 00:00:00 2001 From: "Venkateswararao Jujjuri (JV)" Date: Mon, 14 Mar 2011 14:22:41 -0700 Subject: [net/9p] Don't re-pin pages on retrying virtqueue_add_buf(). Signed-off-by: Venkateswararao Jujjuri Signed-off-by: Eric Van Hensbergen --- net/9p/trans_virtio.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/9p/trans_virtio.c b/net/9p/trans_virtio.c index cb98af9367a..c6e1ae2fb92 100644 --- a/net/9p/trans_virtio.c +++ b/net/9p/trans_virtio.c @@ -262,7 +262,6 @@ p9_virtio_request(struct p9_client *client, struct p9_req_t *req) P9_DPRINTK(P9_DEBUG_TRANS, "9p debug: virtio request\n"); -req_retry: req->status = REQ_STATUS_SENT; if (req->tc->pbuf_size && (req->tc->pubuf && P9_IS_USER_CONTEXT)) { @@ -295,6 +294,7 @@ req_retry: } } +req_retry_pinned: spin_lock_irqsave(&chan->lock, flags); /* Handle out VirtIO ring buffers */ @@ -355,7 +355,7 @@ req_retry: return err; P9_DPRINTK(P9_DEBUG_TRANS, "9p:Retry virtio request\n"); - goto req_retry; + goto req_retry_pinned; } else { spin_unlock_irqrestore(&chan->lock, flags); P9_DPRINTK(P9_DEBUG_TRANS, -- cgit v1.2.3-70-g09d2 From 68da9ba4eeadae86ad42e52b80822fbd56971267 Mon Sep 17 00:00:00 2001 From: "Venkateswararao Jujjuri (JV)" Date: Fri, 18 Mar 2011 15:49:48 -0700 Subject: [net/9p]: Introduce basic flow-control for VirtIO transport. Recent zerocopy work in the 9P VirtIO transport maps and pins user buffers into kernel memory for the server to work on them. Since the user process can initiate this kind of pinning with a simple read/write call, thousands of IO threads initiated by the user process can hog the system resources and could result into denial of service. This patch introduces flow control to avoid that extreme scenario. The ceiling limit to avoid denial of service attacks is set to relatively high (nr_free_pagecache_pages()/4) so that it won't interfere with regular usage, but can step in extreme cases to limit the total system hang. Since we don't have a global structure to accommodate this variable, I choose the virtio_chan as the home for this. Signed-off-by: Venkateswararao Jujjuri Reviewed-by: Badari Pulavarty Signed-off-by: Eric Van Hensbergen --- net/9p/trans_virtio.c | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/9p/trans_virtio.c b/net/9p/trans_virtio.c index c6e1ae2fb92..e8f046b0718 100644 --- a/net/9p/trans_virtio.c +++ b/net/9p/trans_virtio.c @@ -43,6 +43,7 @@ #include #include #include +#include #include #include #include "trans_common.h" @@ -51,6 +52,8 @@ /* a single mutex to manage channel initialization and attachment */ static DEFINE_MUTEX(virtio_9p_lock); +static DECLARE_WAIT_QUEUE_HEAD(vp_wq); +static atomic_t vp_pinned = ATOMIC_INIT(0); /** * struct virtio_chan - per-instance transport information @@ -78,7 +81,10 @@ struct virtio_chan { struct virtqueue *vq; int ring_bufs_avail; wait_queue_head_t *vc_wq; - + /* This is global limit. Since we don't have a global structure, + * will be placing it in each channel. + */ + int p9_max_pages; /* Scatterlist: can be too big for stack. */ struct scatterlist sg[VIRTQUEUE_NUM]; @@ -159,8 +165,11 @@ static void req_done(struct virtqueue *vq) req = p9_tag_lookup(chan->client, rc->tag); if (req->tc->private) { struct trans_rpage_info *rp = req->tc->private; + int p = rp->rp_nr_pages; /*Release pages */ p9_release_req_pages(rp); + atomic_sub(p, &vp_pinned); + wake_up(&vp_wq); if (rp->rp_alloc) kfree(rp); req->tc->private = NULL; @@ -269,6 +278,14 @@ p9_virtio_request(struct p9_client *client, struct p9_req_t *req) int rpinfo_size = sizeof(struct trans_rpage_info) + sizeof(struct page *) * nr_pages; + if (atomic_read(&vp_pinned) >= chan->p9_max_pages) { + err = wait_event_interruptible(vp_wq, + atomic_read(&vp_pinned) < chan->p9_max_pages); + if (err == -ERESTARTSYS) + return err; + P9_DPRINTK(P9_DEBUG_TRANS, "9p: May gup pages now.\n"); + } + if (rpinfo_size <= (req->tc->capacity - req->tc->size)) { /* We can use sdata */ req->tc->private = req->tc->sdata + req->tc->size; @@ -291,6 +308,8 @@ p9_virtio_request(struct p9_client *client, struct p9_req_t *req) if (rpinfo->rp_alloc) kfree(rpinfo); return err; + } else { + atomic_add(rpinfo->rp_nr_pages, &vp_pinned); } } @@ -452,6 +471,8 @@ static int p9_virtio_probe(struct virtio_device *vdev) } init_waitqueue_head(chan->vc_wq); chan->ring_bufs_avail = 1; + /* Ceiling limit to avoid denial of service attacks */ + chan->p9_max_pages = nr_free_buffer_pages()/4; mutex_lock(&virtio_9p_lock); list_add_tail(&chan->chan_list, &virtio_chan_list); -- cgit v1.2.3-70-g09d2