From 48dcc33e5e11de0f76b65b113988dbc930d17395 Mon Sep 17 00:00:00 2001 From: Jianjun Kong Date: Sat, 1 Nov 2008 21:37:27 -0700 Subject: af_unix: netns: fix problem of return value fix problem of return value net/unix/af_unix.c: unix_net_init() when error appears, it should return 'error', not always return 0. Signed-off-by: Jianjun Kong Signed-off-by: David S. Miller --- net/unix/af_unix.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/unix/af_unix.c') diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index dc504d308ec..4d3c6071b9a 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -2213,7 +2213,7 @@ static int unix_net_init(struct net *net) #endif error = 0; out: - return 0; + return error; } static void unix_net_exit(struct net *net) -- cgit v1.2.3-70-g09d2 From 6209344f5a3795d34b7f2c0061f49802283b6bdd Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Sun, 9 Nov 2008 15:23:57 +0100 Subject: net: unix: fix inflight counting bug in garbage collector Previously I assumed that the receive queues of candidates don't change during the GC. This is only half true, nothing can be received from the queues (see comment in unix_gc()), but buffers could be added through the other half of the socket pair, which may still have file descriptors referring to it. This can result in inc_inflight_move_tail() erronously increasing the "inflight" counter for a unix socket for which dec_inflight() wasn't previously called. This in turn can trigger the "BUG_ON(total_refs < inflight_refs)" in a later garbage collection run. Fix this by only manipulating the "inflight" counter for sockets which are candidates themselves. Duplicating the file references in unix_attach_fds() is also needed to prevent a socket becoming a candidate for GC while the skb that contains it is not yet queued. Reported-by: Andrea Bittau Signed-off-by: Miklos Szeredi CC: stable@kernel.org Signed-off-by: Linus Torvalds --- include/net/af_unix.h | 1 + net/unix/af_unix.c | 31 ++++++++++++++++++++++++------- net/unix/garbage.c | 49 +++++++++++++++++++++++++++++++++++++------------ 3 files changed, 62 insertions(+), 19 deletions(-) (limited to 'net/unix/af_unix.c') diff --git a/include/net/af_unix.h b/include/net/af_unix.h index 7dd29b7e461..c29ff1da8a1 100644 --- a/include/net/af_unix.h +++ b/include/net/af_unix.h @@ -54,6 +54,7 @@ struct unix_sock { atomic_long_t inflight; spinlock_t lock; unsigned int gc_candidate : 1; + unsigned int gc_maybe_cycle : 1; wait_queue_head_t peer_wait; }; #define unix_sk(__sk) ((struct unix_sock *)__sk) diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 4d3c6071b9a..eb90f77bb0e 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -1302,14 +1302,23 @@ static void unix_destruct_fds(struct sk_buff *skb) sock_wfree(skb); } -static void unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb) +static int unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb) { int i; + + /* + * Need to duplicate file references for the sake of garbage + * collection. Otherwise a socket in the fps might become a + * candidate for GC while the skb is not yet queued. + */ + UNIXCB(skb).fp = scm_fp_dup(scm->fp); + if (!UNIXCB(skb).fp) + return -ENOMEM; + for (i=scm->fp->count-1; i>=0; i--) unix_inflight(scm->fp->fp[i]); - UNIXCB(skb).fp = scm->fp; skb->destructor = unix_destruct_fds; - scm->fp = NULL; + return 0; } /* @@ -1368,8 +1377,11 @@ static int unix_dgram_sendmsg(struct kiocb *kiocb, struct socket *sock, goto out; memcpy(UNIXCREDS(skb), &siocb->scm->creds, sizeof(struct ucred)); - if (siocb->scm->fp) - unix_attach_fds(siocb->scm, skb); + if (siocb->scm->fp) { + err = unix_attach_fds(siocb->scm, skb); + if (err) + goto out_free; + } unix_get_secdata(siocb->scm, skb); skb_reset_transport_header(skb); @@ -1538,8 +1550,13 @@ static int unix_stream_sendmsg(struct kiocb *kiocb, struct socket *sock, size = min_t(int, size, skb_tailroom(skb)); memcpy(UNIXCREDS(skb), &siocb->scm->creds, sizeof(struct ucred)); - if (siocb->scm->fp) - unix_attach_fds(siocb->scm, skb); + if (siocb->scm->fp) { + err = unix_attach_fds(siocb->scm, skb); + if (err) { + kfree_skb(skb); + goto out_err; + } + } if ((err = memcpy_fromiovec(skb_put(skb,size), msg->msg_iov, size)) != 0) { kfree_skb(skb); diff --git a/net/unix/garbage.c b/net/unix/garbage.c index 2a27b84f740..6d4a9a8de5e 100644 --- a/net/unix/garbage.c +++ b/net/unix/garbage.c @@ -186,8 +186,17 @@ static void scan_inflight(struct sock *x, void (*func)(struct unix_sock *), */ struct sock *sk = unix_get_socket(*fp++); if (sk) { - hit = true; - func(unix_sk(sk)); + struct unix_sock *u = unix_sk(sk); + + /* + * Ignore non-candidates, they could + * have been added to the queues after + * starting the garbage collection + */ + if (u->gc_candidate) { + hit = true; + func(u); + } } } if (hit && hitlist != NULL) { @@ -249,11 +258,11 @@ static void inc_inflight_move_tail(struct unix_sock *u) { atomic_long_inc(&u->inflight); /* - * If this is still a candidate, move it to the end of the - * list, so that it's checked even if it was already passed - * over + * If this still might be part of a cycle, move it to the end + * of the list, so that it's checked even if it was already + * passed over */ - if (u->gc_candidate) + if (u->gc_maybe_cycle) list_move_tail(&u->link, &gc_candidates); } @@ -267,6 +276,7 @@ void unix_gc(void) struct unix_sock *next; struct sk_buff_head hitlist; struct list_head cursor; + LIST_HEAD(not_cycle_list); spin_lock(&unix_gc_lock); @@ -282,10 +292,14 @@ void unix_gc(void) * * Holding unix_gc_lock will protect these candidates from * being detached, and hence from gaining an external - * reference. This also means, that since there are no - * possible receivers, the receive queues of these sockets are - * static during the GC, even though the dequeue is done - * before the detach without atomicity guarantees. + * reference. Since there are no possible receivers, all + * buffers currently on the candidates' queues stay there + * during the garbage collection. + * + * We also know that no new candidate can be added onto the + * receive queues. Other, non candidate sockets _can_ be + * added to queue, so we must make sure only to touch + * candidates. */ list_for_each_entry_safe(u, next, &gc_inflight_list, link) { long total_refs; @@ -299,6 +313,7 @@ void unix_gc(void) if (total_refs == inflight_refs) { list_move_tail(&u->link, &gc_candidates); u->gc_candidate = 1; + u->gc_maybe_cycle = 1; } } @@ -325,13 +340,23 @@ void unix_gc(void) list_move(&cursor, &u->link); if (atomic_long_read(&u->inflight) > 0) { - list_move_tail(&u->link, &gc_inflight_list); - u->gc_candidate = 0; + list_move_tail(&u->link, ¬_cycle_list); + u->gc_maybe_cycle = 0; scan_children(&u->sk, inc_inflight_move_tail, NULL); } } list_del(&cursor); + /* + * not_cycle_list contains those sockets which do not make up a + * cycle. Restore these to the inflight list. + */ + while (!list_empty(¬_cycle_list)) { + u = list_entry(not_cycle_list.next, struct unix_sock, link); + u->gc_candidate = 0; + list_move_tail(&u->link, &gc_inflight_list); + } + /* * Now gc_candidates contains only garbage. Restore original * inflight counters for these as well, and remove the skbuffs -- cgit v1.2.3-70-g09d2 From 5f23b734963ec7eaa3ebcd9050da0c9b7d143dd3 Mon Sep 17 00:00:00 2001 From: dann frazier Date: Wed, 26 Nov 2008 15:32:27 -0800 Subject: net: Fix soft lockups/OOM issues w/ unix garbage collector This is an implementation of David Miller's suggested fix in: https://bugzilla.redhat.com/show_bug.cgi?id=470201 It has been updated to use wait_event() instead of wait_event_interruptible(). Paraphrasing the description from the above report, it makes sendmsg() block while UNIX garbage collection is in progress. This avoids a situation where child processes continue to queue new FDs over a AF_UNIX socket to a parent which is in the exit path and running garbage collection on these FDs. This contention can result in soft lockups and oom-killing of unrelated processes. Signed-off-by: dann frazier Signed-off-by: David S. Miller --- include/net/af_unix.h | 1 + net/unix/af_unix.c | 2 ++ net/unix/garbage.c | 13 ++++++++++--- 3 files changed, 13 insertions(+), 3 deletions(-) (limited to 'net/unix/af_unix.c') diff --git a/include/net/af_unix.h b/include/net/af_unix.h index c29ff1da8a1..1614d78c60e 100644 --- a/include/net/af_unix.h +++ b/include/net/af_unix.h @@ -9,6 +9,7 @@ extern void unix_inflight(struct file *fp); extern void unix_notinflight(struct file *fp); extern void unix_gc(void); +extern void wait_for_unix_gc(void); #define UNIX_HASH_SIZE 256 diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index eb90f77bb0e..66d5ac4773a 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -1343,6 +1343,7 @@ static int unix_dgram_sendmsg(struct kiocb *kiocb, struct socket *sock, if (NULL == siocb->scm) siocb->scm = &tmp_scm; + wait_for_unix_gc(); err = scm_send(sock, msg, siocb->scm); if (err < 0) return err; @@ -1493,6 +1494,7 @@ static int unix_stream_sendmsg(struct kiocb *kiocb, struct socket *sock, if (NULL == siocb->scm) siocb->scm = &tmp_scm; + wait_for_unix_gc(); err = scm_send(sock, msg, siocb->scm); if (err < 0) return err; diff --git a/net/unix/garbage.c b/net/unix/garbage.c index 6d4a9a8de5e..abb3ab34cb1 100644 --- a/net/unix/garbage.c +++ b/net/unix/garbage.c @@ -80,6 +80,7 @@ #include #include #include +#include #include #include @@ -91,6 +92,7 @@ static LIST_HEAD(gc_inflight_list); static LIST_HEAD(gc_candidates); static DEFINE_SPINLOCK(unix_gc_lock); +static DECLARE_WAIT_QUEUE_HEAD(unix_gc_wait); unsigned int unix_tot_inflight; @@ -266,12 +268,16 @@ static void inc_inflight_move_tail(struct unix_sock *u) list_move_tail(&u->link, &gc_candidates); } -/* The external entry point: unix_gc() */ +static bool gc_in_progress = false; -void unix_gc(void) +void wait_for_unix_gc(void) { - static bool gc_in_progress = false; + wait_event(unix_gc_wait, gc_in_progress == false); +} +/* The external entry point: unix_gc() */ +void unix_gc(void) +{ struct unix_sock *u; struct unix_sock *next; struct sk_buff_head hitlist; @@ -376,6 +382,7 @@ void unix_gc(void) /* All candidates should have been detached by now. */ BUG_ON(!list_empty(&gc_candidates)); gc_in_progress = false; + wake_up(&unix_gc_wait); out: spin_unlock(&unix_gc_lock); -- cgit v1.2.3-70-g09d2