From aa395145165cb06a0d0885221bbe0ce4a564391d Mon Sep 17 00:00:00 2001
From: Eric Dumazet
Date: Tue, 20 Apr 2010 13:03:51 +0000
Subject: net: sk_sleep() helper

Define a new function to return the waitqueue of a "struct sock".

static inline wait_queue_head_t *sk_sleep(struct sock *sk)
{
	return sk->sk_sleep;
}

Change all read occurrences of sk_sleep to a call to this function.

Needed for a future RCU conversion: sk_sleep won't be a directly
available field anymore.

Signed-off-by: Eric Dumazet
Signed-off-by: David S. Miller
---
 net/sctp/socket.c | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

(limited to 'net/sctp/socket.c')

diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index c1941276f6e..f34adcca8a8 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -5702,7 +5702,7 @@ unsigned int sctp_poll(struct file *file, struct socket *sock, poll_table *wait)
 	struct sctp_sock *sp = sctp_sk(sk);
 	unsigned int mask;
 
-	poll_wait(file, sk->sk_sleep, wait);
+	poll_wait(file, sk_sleep(sk), wait);
 
 	/* A TCP-style listening socket becomes readable when the accept queue
 	 * is not empty.
@@ -5943,7 +5943,7 @@ static int sctp_wait_for_packet(struct sock * sk, int *err, long *timeo_p)
 	int error;
 	DEFINE_WAIT(wait);
 
-	prepare_to_wait_exclusive(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
+	prepare_to_wait_exclusive(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
 
 	/* Socket errors? */
 	error = sock_error(sk);
@@ -5980,14 +5980,14 @@ static int sctp_wait_for_packet(struct sock * sk, int *err, long *timeo_p)
 	sctp_lock_sock(sk);
 
 ready:
-	finish_wait(sk->sk_sleep, &wait);
+	finish_wait(sk_sleep(sk), &wait);
 	return 0;
 
 interrupted:
 	error = sock_intr_errno(*timeo_p);
 
 out:
-	finish_wait(sk->sk_sleep, &wait);
+	finish_wait(sk_sleep(sk), &wait);
 	*err = error;
 	return error;
 }
@@ -6061,8 +6061,8 @@ static void __sctp_write_space(struct sctp_association *asoc)
 		wake_up_interruptible(&asoc->wait);
 
 	if (sctp_writeable(sk)) {
-		if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
-			wake_up_interruptible(sk->sk_sleep);
+		if (sk_sleep(sk) && waitqueue_active(sk_sleep(sk)))
+			wake_up_interruptible(sk_sleep(sk));
 
 		/* Note that we try to include the Async I/O support
 		 * here by modeling from the current TCP/UDP code.
@@ -6296,7 +6296,7 @@ static int sctp_wait_for_accept(struct sock *sk, long timeo)
 
 	for (;;) {
-		prepare_to_wait_exclusive(sk->sk_sleep, &wait,
+		prepare_to_wait_exclusive(sk_sleep(sk), &wait,
 					  TASK_INTERRUPTIBLE);
 
 		if (list_empty(&ep->asocs)) {
@@ -6322,7 +6322,7 @@ static int sctp_wait_for_accept(struct sock *sk, long timeo)
 		break;
 	}
 
-	finish_wait(sk->sk_sleep, &wait);
+	finish_wait(sk_sleep(sk), &wait);
 
 	return err;
 }
@@ -6332,7 +6332,7 @@ static void sctp_wait_for_close(struct sock *sk, long timeout)
 	DEFINE_WAIT(wait);
 
 	do {
-		prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
+		prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
 		if (list_empty(&sctp_sk(sk)->ep->asocs))
 			break;
 		sctp_release_sock(sk);
@@ -6340,7 +6340,7 @@ static void sctp_wait_for_close(struct sock *sk, long timeout)
 		sctp_lock_sock(sk);
 	} while (!signal_pending(current) && timeout);
 
-	finish_wait(sk->sk_sleep, &wait);
+	finish_wait(sk_sleep(sk), &wait);
}
 
 static void sctp_skb_set_owner_r_frag(struct sk_buff *skb, struct sock *sk)
--
cgit v1.2.3-70-g09d2
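An aside on the pattern: the whole value of the helper is the indirection. Once
every reader goes through sk_sleep(), the representation behind it can change
(as the RCU conversion later in this series does) without another tree-wide
sweep. A minimal userspace sketch of the same idea — struct and field names are
invented for illustration, this is not kernel code:

#include <stdio.h>

struct waitq { int nwaiters; };

struct sock_like {
	struct waitq *sleep;	/* today: a plain pointer; tomorrow: could
				 * hide behind an RCU-protected structure */
};

/* the one place that knows how the wait queue is reached */
static inline struct waitq *sk_sleep(struct sock_like *sk)
{
	return sk->sleep;
}

int main(void)
{
	struct waitq wq = { .nwaiters = 0 };
	struct sock_like sk = { .sleep = &wq };

	/* call sites never dereference sk->sleep directly */
	if (sk_sleep(&sk))
		printf("waiters: %d\n", sk_sleep(&sk)->nwaiters);
	return 0;
}

If the wait queue later moves behind another level of indirection, only the
helper body changes; none of the call sites converted by this patch do.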
From c377411f2494a931ff7facdbb3a6839b1266bcf6 Mon Sep 17 00:00:00 2001
From: Eric Dumazet
Date: Tue, 27 Apr 2010 15:13:20 -0700
Subject: net: sk_add_backlog() take rmem_alloc into account

The current socket backlog limit is not enough to really stop DDOS attacks,
because the user thread spends a long time processing a full backlog each
round, and writers might spin madly on the socket lock.

We should count both the backlog size and the receive_queue size (aka
rmem_alloc) to pace writers, and let the user thread run without being slowed
down too much.

Introduce a sk_rcvqueues_full() helper, to avoid taking the socket lock in
stress situations.

Under huge stress from a multiqueue/RPS-enabled NIC, a single-flow UDP
receiver can now process ~200.000 pps (instead of ~100 pps before the patch)
on an 8-core machine.

Signed-off-by: Eric Dumazet
Signed-off-by: David S. Miller
---
 include/net/sock.h | 13 +++++++++++--
 net/core/sock.c    |  5 ++++-
 net/ipv4/udp.c     |  4 ++++
 net/ipv6/udp.c     |  8 ++++++++
 net/sctp/socket.c  |  3 ---
 5 files changed, 27 insertions(+), 6 deletions(-)

(limited to 'net/sctp/socket.c')

diff --git a/include/net/sock.h b/include/net/sock.h
index 07822280d95..cf12b1e61fa 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -256,7 +256,6 @@ struct sock {
 		struct sk_buff *head;
 		struct sk_buff *tail;
 		int len;
-		int limit;
 	} sk_backlog;
 	wait_queue_head_t	*sk_sleep;
 	struct dst_entry	*sk_dst_cache;
@@ -608,10 +607,20 @@ static inline void __sk_add_backlog(struct sock *sk, struct sk_buff *skb)
 	skb->next = NULL;
 }
 
+/*
+ * Take into account size of receive queue and backlog queue
+ */
+static inline bool sk_rcvqueues_full(const struct sock *sk, const struct sk_buff *skb)
+{
+	unsigned int qsize = sk->sk_backlog.len + atomic_read(&sk->sk_rmem_alloc);
+
+	return qsize + skb->truesize > sk->sk_rcvbuf;
+}
+
 /* The per-socket spinlock must be held here. */
 static inline __must_check int sk_add_backlog(struct sock *sk, struct sk_buff *skb)
 {
-	if (sk->sk_backlog.len >= max(sk->sk_backlog.limit, sk->sk_rcvbuf << 1))
+	if (sk_rcvqueues_full(sk, skb))
 		return -ENOBUFS;
 
 	__sk_add_backlog(sk, skb);
diff --git a/net/core/sock.c b/net/core/sock.c
index 58ebd146ce5..51041759517 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -327,6 +327,10 @@ int sk_receive_skb(struct sock *sk, struct sk_buff *skb, const int nested)
 
 	skb->dev = NULL;
 
+	if (sk_rcvqueues_full(sk, skb)) {
+		atomic_inc(&sk->sk_drops);
+		goto discard_and_relse;
+	}
 	if (nested)
 		bh_lock_sock_nested(sk);
 	else
@@ -1885,7 +1889,6 @@ void sock_init_data(struct socket *sock, struct sock *sk)
 	sk->sk_allocation	=	GFP_KERNEL;
 	sk->sk_rcvbuf		=	sysctl_rmem_default;
 	sk->sk_sndbuf		=	sysctl_wmem_default;
-	sk->sk_backlog.limit	=	sk->sk_rcvbuf << 1;
 	sk->sk_state		=	TCP_CLOSE;
 	sk_set_socket(sk, sock);
 
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index fa3d2874db4..63eb56b2d87 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -1372,6 +1372,10 @@ int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
 			goto drop;
 	}
 
+
+	if (sk_rcvqueues_full(sk, skb))
+		goto drop;
+
 	rc = 0;
 
 	bh_lock_sock(sk);
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index 2850e35cee3..3ead20ad9d0 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -584,6 +584,10 @@ static void flush_stack(struct sock **stack, unsigned int count,
 		sk = stack[i];
 		if (skb1) {
+			if (sk_rcvqueues_full(sk, skb)) {
+				kfree_skb(skb1);
+				goto drop;
+			}
 			bh_lock_sock(sk);
 			if (!sock_owned_by_user(sk))
 				udpv6_queue_rcv_skb(sk, skb1);
@@ -759,6 +763,10 @@ int __udp6_lib_rcv(struct sk_buff *skb, struct udp_table *udptable,
 
 	/* deliver */
 
+	if (sk_rcvqueues_full(sk, skb)) {
+		sock_put(sk);
+		goto discard;
+	}
 	bh_lock_sock(sk);
 	if (!sock_owned_by_user(sk))
 		udpv6_queue_rcv_skb(sk, skb);
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index f34adcca8a8..13d8229f3a9 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -3721,9 +3721,6 @@ SCTP_STATIC int sctp_init_sock(struct sock *sk)
 	SCTP_DBG_OBJCNT_INC(sock);
 	percpu_counter_inc(&sctp_sockets_allocated);
 
-	/* Set socket backlog limit. */
-	sk->sk_backlog.limit = sysctl_sctp_rmem[1];
-
 	local_bh_disable();
 	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
 	local_bh_enable();
--
cgit v1.2.3-70-g09d2
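The accounting the new helper performs is easy to model in isolation. A
userspace sketch — all names and numbers below are invented for illustration;
only the comparison mirrors sk_rcvqueues_full():

#include <stdbool.h>
#include <stdio.h>

struct fake_sock {
	unsigned int backlog_len;	/* bytes parked in the backlog */
	unsigned int rmem_alloc;	/* bytes in the receive queue  */
	unsigned int rcvbuf;		/* configured receive budget   */
};

static bool rcvqueues_full(const struct fake_sock *sk, unsigned int truesize)
{
	/* queue + backlog + the incoming packet must fit in rcvbuf */
	return sk->backlog_len + sk->rmem_alloc + truesize > sk->rcvbuf;
}

int main(void)
{
	struct fake_sock sk = { .backlog_len = 60000, .rmem_alloc = 50000,
				.rcvbuf = 120000 };

	printf("2KB skb refused? %d\n", rcvqueues_full(&sk, 2048));	/* 0 */
	printf("16KB skb refused? %d\n", rcvqueues_full(&sk, 16384));	/* 1 */
	return 0;
}

The point of running this check before bh_lock_sock() is that an over-limit
packet is dropped without the writer ever contending on the socket lock.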
From 561b1733a465cf9677356b40c27653dd45f1ac56 Mon Sep 17 00:00:00 2001
From: Wei Yongjun
Date: Wed, 28 Apr 2010 08:47:18 +0000
Subject: sctp: avoid irq lock inversion while calling sk->sk_data_ready()

An SCTP socket's sk->sk_data_ready() can be called from both BH and non-BH
contexts, but the default sk->sk_data_ready(), sock_def_readable(), cannot be
used in this case. Therefore, we have to add a new function, sctp_data_ready(),
that grabs sk->sk_callback_lock with BH disabled.

=========================================================
[ INFO: possible irq lock inversion dependency detected ]
2.6.33-rc6 #129
---------------------------------------------------------
sctp_darn/1517 just changed the state of lock:
 (clock-AF_INET){++.?..}, at: [] sock_def_readable+0x20/0x80
but this lock took another, SOFTIRQ-unsafe lock in the past:
 (slock-AF_INET){+.-...}

and interrupts could create inverse lock ordering between them.

other info that might help us debug this:
1 lock held by sctp_darn/1517:
 #0:  (sk_lock-AF_INET){+.+.+.}, at: [] sctp_sendmsg+0x23d/0xc00 [sctp]

Signed-off-by: Wei Yongjun
Signed-off-by: Vlad Yasevich
Signed-off-by: David S. Miller
---
 include/net/sctp/sctp.h |  1 +
 net/sctp/endpointola.c  |  1 +
 net/sctp/socket.c       | 10 ++++++++++
 3 files changed, 12 insertions(+)

(limited to 'net/sctp/socket.c')

diff --git a/include/net/sctp/sctp.h b/include/net/sctp/sctp.h
index 78740ec57d5..fa6cde578a1 100644
--- a/include/net/sctp/sctp.h
+++ b/include/net/sctp/sctp.h
@@ -128,6 +128,7 @@ extern int sctp_register_pf(struct sctp_pf *, sa_family_t);
 int sctp_backlog_rcv(struct sock *sk, struct sk_buff *skb);
 int sctp_inet_listen(struct socket *sock, int backlog);
 void sctp_write_space(struct sock *sk);
+void sctp_data_ready(struct sock *sk, int len);
 unsigned int sctp_poll(struct file *file, struct socket *sock,
 		poll_table *wait);
 void sctp_sock_rfree(struct sk_buff *skb);
diff --git a/net/sctp/endpointola.c b/net/sctp/endpointola.c
index 905fda582b9..7ec09ba03a1 100644
--- a/net/sctp/endpointola.c
+++ b/net/sctp/endpointola.c
@@ -144,6 +144,7 @@ static struct sctp_endpoint *sctp_endpoint_init(struct sctp_endpoint *ep,
 	/* Use SCTP specific send buffer space queues.  */
 	ep->sndbuf_policy = sctp_sndbuf_policy;
+	sk->sk_data_ready = sctp_data_ready;
 	sk->sk_write_space = sctp_write_space;
 	sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
 
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index 007e8baba08..efa2bc3f002 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -6189,6 +6189,16 @@ do_nonblock:
 	goto out;
 }
 
+void sctp_data_ready(struct sock *sk, int len)
+{
+	read_lock_bh(&sk->sk_callback_lock);
+	if (sk_has_sleeper(sk))
+		wake_up_interruptible_sync_poll(sk->sk_sleep, POLLIN |
+						POLLRDNORM | POLLRDBAND);
+	sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
+	read_unlock_bh(&sk->sk_callback_lock);
+}
+
 /* If socket sndbuf has changed, wake up all per association waiters.  */
 void sctp_write_space(struct sock *sk)
 {
--
cgit v1.2.3-70-g09d2

From 81419d862db743fe4450a021893f24bab4698c1d Mon Sep 17 00:00:00 2001
From: Vlad Yasevich
Date: Wed, 28 Apr 2010 08:47:20 +0000
Subject: sctp: per_cpu variables should be in bh_disabled section

Since the change from atomics to percpu variables, we now have to
disable BH in process context when touching the percpu variables.

Signed-off-by: Vlad Yasevich
Signed-off-by: David S. Miller
---
 net/sctp/socket.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'net/sctp/socket.c')

diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index efa2bc3f002..44a1ab03a3f 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -3719,12 +3719,12 @@ SCTP_STATIC int sctp_init_sock(struct sock *sk)
 	sp->hmac = NULL;
 
 	SCTP_DBG_OBJCNT_INC(sock);
-	percpu_counter_inc(&sctp_sockets_allocated);
 
 	/* Set socket backlog limit. */
 	sk->sk_backlog.limit = sysctl_sctp_rmem[1];
 
 	local_bh_disable();
+	percpu_counter_inc(&sctp_sockets_allocated);
 	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
 	local_bh_enable();
 
@@ -3741,8 +3741,8 @@ SCTP_STATIC void sctp_destroy_sock(struct sock *sk)
 	/* Release our hold on the endpoint. */
 	ep = sctp_sk(sk)->ep;
 	sctp_endpoint_free(ep);
-	percpu_counter_dec(&sctp_sockets_allocated);
 	local_bh_disable();
+	percpu_counter_dec(&sctp_sockets_allocated);
 	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
 	local_bh_enable();
 }
--
cgit v1.2.3-70-g09d2
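Both sctp fixes above enforce the same discipline: state that is also touched
from an asynchronous context (softirq in the kernel) may only be modified while
that context is masked out. A rough userspace analog — a signal handler stands
in for the softirq and sigprocmask() for local_bh_disable()/local_bh_enable();
this is illustration only, not kernel code:

#include <signal.h>
#include <stdio.h>
#include <string.h>

static volatile long counter;		/* invariant: even outside updates */
static volatile sig_atomic_t violations;

static void on_sigalrm(int sig)
{
	(void)sig;
	if (counter & 1)		/* caught an update mid-flight */
		violations++;
}

int main(void)
{
	struct sigaction sa;
	sigset_t block, old;

	memset(&sa, 0, sizeof(sa));
	sa.sa_handler = on_sigalrm;
	sigaction(SIGALRM, &sa, NULL);

	sigemptyset(&block);
	sigaddset(&block, SIGALRM);

	for (int i = 0; i < 1000000; i++) {
		/* "local_bh_disable()": mask the asynchronous context */
		sigprocmask(SIG_BLOCK, &block, &old);
		counter++;		/* counter is transiently odd here */
		if ((i & 0xffff) == 0)
			raise(SIGALRM);	/* stays pending until unmasked */
		counter++;
		/* "local_bh_enable()": pending signal delivered now */
		sigprocmask(SIG_SETMASK, &old, NULL);
	}
	printf("violations seen: %d\n", (int)violations);	/* expect 0 */
	return 0;
}

Remove the sigprocmask() calls and the handler can observe the counter
mid-update — the userspace flavor of an asynchronous context running inside a
supposedly atomic section, which is what lockdep was warning about above.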
From a5f4cea74f1397bb29d0bbdabeb05bd05a23a741 Mon Sep 17 00:00:00 2001
From: Vlad Yasevich
Date: Fri, 30 Apr 2010 21:42:42 -0400
Subject: sctp: Use correct address family in sctp_getsockopt_peer_addrs()

The function should use the address family of the address when trying
to determine the length of the structure.

Signed-off-by: Vlad Yasevich
---
 net/sctp/socket.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net/sctp/socket.c')

diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index 13d8229f3a9..1282a0ed855 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -4384,7 +4384,7 @@ static int sctp_getsockopt_peer_addrs(struct sock *sk, int len,
 			transports) {
 		memcpy(&temp, &from->ipaddr, sizeof(temp));
 		sctp_get_pf_specific(sk->sk_family)->addr_v4map(sp, &temp);
-		addrlen = sctp_get_af_specific(sk->sk_family)->sockaddr_len;
+		addrlen = sctp_get_af_specific(temp.sa.sa_family)->sockaddr_len;
 		if (space_left < addrlen)
 			return -ENOMEM;
 		if (copy_to_user(to, &temp, addrlen))
--
cgit v1.2.3-70-g09d2
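A sketch of why the address's own family is the right length selector: on an
AF_INET6 SCTP socket, a peer address that remains AF_INET after addr_v4map()
is shorter than a sockaddr_in6, so sizing the copy by sk->sk_family reports
the wrong length. Userspace illustration — sockaddr_len() below is a
hypothetical stand-in for what sctp_get_af_specific(...)->sockaddr_len
provides:

#include <netinet/in.h>
#include <stdio.h>
#include <sys/socket.h>

static socklen_t sockaddr_len(sa_family_t family)
{
	switch (family) {
	case AF_INET:  return sizeof(struct sockaddr_in);
	case AF_INET6: return sizeof(struct sockaddr_in6);
	default:       return 0;
	}
}

int main(void)
{
	union { struct sockaddr sa; struct sockaddr_in v4; } temp;

	temp.v4.sin_family = AF_INET;	/* e.g. a plain IPv4 peer address */

	sa_family_t sock_family = AF_INET6;	/* family of the socket */

	printf("len by socket family:  %u\n",
	       (unsigned)sockaddr_len(sock_family));		/* too long */
	printf("len by address family: %u\n",
	       (unsigned)sockaddr_len(temp.sa.sa_family));	/* correct  */
	return 0;
}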
From 43815482370c510c569fd18edb57afcb0fa8cab6 Mon Sep 17 00:00:00 2001
From: Eric Dumazet
Date: Thu, 29 Apr 2010 11:01:49 +0000
Subject: net: sock_def_readable() and friends RCU conversion

The sk_callback_lock rwlock actually protects the sk->sk_sleep pointer, so we
need two atomic operations (and associated dirtying) per incoming packet.

An RCU conversion is pretty much needed:

1) Add a new structure, called "struct socket_wq", to hold all fields that
   will need rcu_read_lock() protection (currently: a wait_queue_head_t and a
   struct fasync_struct pointer).

   [A future patch will add a list anchor for wakeup coalescing.]

2) Attach one such structure to each "struct socket" created in
   sock_alloc_inode().

3) Respect the RCU grace period when freeing a "struct socket_wq".

4) Replace the sk_sleep pointer in "struct sock" with sk_wq, a pointer to
   "struct socket_wq".

5) Change the sk_sleep() function to use the new sk->sk_wq instead of
   sk->sk_sleep.

6) Change sk_has_sleeper() to wq_has_sleeper(), which must be used inside an
   rcu_read_lock() section.

7) Change all sk_has_sleeper() callers to:
   - use rcu_read_lock() instead of read_lock(&sk->sk_callback_lock)
   - use wq_has_sleeper() to wake up tasks if needed
   - use rcu_read_unlock() instead of read_unlock(&sk->sk_callback_lock)

8) Modify sock_wake_async() to use RCU protection as well.

9) Exceptions: macvtap, drivers/net/tun.c, and af_unix use an embedded
   "struct socket_wq" instead of a dynamically allocated one; they don't need
   RCU freeing.

Some cleanups or follow-ups are probably needed (a possible sk_callback_lock
conversion to a spinlock, for example...).

Signed-off-by: Eric Dumazet
Signed-off-by: David S. Miller
---
 drivers/net/macvtap.c | 13 +++++++++----
 drivers/net/tun.c     | 21 ++++++++++++---------
 include/linux/net.h   | 14 +++++++++-----
 include/net/af_unix.h | 20 +++++++++++---------
 include/net/sock.h    | 38 +++++++++++++++++++-------------------
 net/atm/common.c      | 22 ++++++++++++++--------
 net/core/sock.c       | 50 +++++++++++++++++++++++++++++++-------------------
 net/core/stream.c     | 10 +++++++---
 net/dccp/output.c     | 10 ++++++----
 net/iucv/af_iucv.c    | 11 +++++++----
 net/phonet/pep.c      |  8 ++++----
 net/phonet/socket.c   |  2 +-
 net/rxrpc/af_rxrpc.c  | 10 ++++++----
 net/sctp/socket.c     |  2 +-
 net/socket.c          | 47 ++++++++++++++++++++++++++++++++-----------
 net/unix/af_unix.c    | 17 ++++++++--------
 16 files changed, 181 insertions(+), 114 deletions(-)

(limited to 'net/sctp/socket.c')

diff --git a/drivers/net/macvtap.c b/drivers/net/macvtap.c
index d97e1fd234b..1c4110df343 100644
--- a/drivers/net/macvtap.c
+++ b/drivers/net/macvtap.c
@@ -37,6 +37,7 @@ struct macvtap_queue {
 	struct sock sk;
 	struct socket sock;
+	struct socket_wq wq;
 	struct macvlan_dev *vlan;
 	struct file *file;
 	unsigned int flags;
@@ -242,12 +243,15 @@ static struct rtnl_link_ops macvtap_link_ops __read_mostly = {
 
 static void macvtap_sock_write_space(struct sock *sk)
 {
+	wait_queue_head_t *wqueue;
+
 	if (!sock_writeable(sk) ||
 	    !test_and_clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags))
 		return;
 
-	if (sk_sleep(sk) && waitqueue_active(sk_sleep(sk)))
-		wake_up_interruptible_poll(sk_sleep(sk), POLLOUT | POLLWRNORM | POLLWRBAND);
+	wqueue = sk_sleep(sk);
+	if (wqueue && waitqueue_active(wqueue))
+		wake_up_interruptible_poll(wqueue, POLLOUT | POLLWRNORM | POLLWRBAND);
 }
 
 static int macvtap_open(struct inode *inode, struct file *file)
@@ -272,7 +276,8 @@ static int macvtap_open(struct inode *inode, struct file *file)
 	if (!q)
 		goto out;
 
-	init_waitqueue_head(&q->sock.wait);
+	q->sock.wq = &q->wq;
+	init_waitqueue_head(&q->wq.wait);
 	q->sock.type = SOCK_RAW;
 	q->sock.state = SS_CONNECTED;
 	q->sock.file = file;
@@ -308,7 +313,7 @@ static unsigned int macvtap_poll(struct file *file, poll_table * wait)
 		goto out;
 
 	mask = 0;
-	poll_wait(file, &q->sock.wait, wait);
+	poll_wait(file, &q->wq.wait, wait);
 
 	if (!skb_queue_empty(&q->sk.sk_receive_queue))
 		mask |= POLLIN | POLLRDNORM;
diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index 20a17938c62..e525a6cf558 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -109,7 +109,7 @@ struct tun_struct {
 	struct tap_filter txflt;
 	struct socket socket;
-
+	struct socket_wq wq;
 #ifdef TUN_DEBUG
 	int debug;
 #endif
@@ -323,7 +323,7 @@ static void tun_net_uninit(struct net_device *dev)
 	/* Inform the methods they need to stop using the dev.
 	 */
 	if (tfile) {
-		wake_up_all(&tun->socket.wait);
+		wake_up_all(&tun->wq.wait);
 		if (atomic_dec_and_test(&tfile->count))
 			__tun_detach(tun);
 	}
@@ -398,7 +398,7 @@ static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device *dev)
 	/* Notify and wake up reader process */
 	if (tun->flags & TUN_FASYNC)
 		kill_fasync(&tun->fasync, SIGIO, POLL_IN);
-	wake_up_interruptible_poll(&tun->socket.wait, POLLIN |
+	wake_up_interruptible_poll(&tun->wq.wait, POLLIN |
 				   POLLRDNORM | POLLRDBAND);
 
 	return NETDEV_TX_OK;
@@ -498,7 +498,7 @@ static unsigned int tun_chr_poll(struct file *file, poll_table * wait)
 
 	DBG(KERN_INFO "%s: tun_chr_poll\n", tun->dev->name);
 
-	poll_wait(file, &tun->socket.wait, wait);
+	poll_wait(file, &tun->wq.wait, wait);
 
 	if (!skb_queue_empty(&sk->sk_receive_queue))
 		mask |= POLLIN | POLLRDNORM;
@@ -773,7 +773,7 @@ static ssize_t tun_do_read(struct tun_struct *tun,
 
 	DBG(KERN_INFO "%s: tun_chr_read\n", tun->dev->name);
 
-	add_wait_queue(&tun->socket.wait, &wait);
+	add_wait_queue(&tun->wq.wait, &wait);
 	while (len) {
 		current->state = TASK_INTERRUPTIBLE;
 
@@ -804,7 +804,7 @@ static ssize_t tun_do_read(struct tun_struct *tun,
 	}
 
 	current->state = TASK_RUNNING;
-	remove_wait_queue(&tun->socket.wait, &wait);
+	remove_wait_queue(&tun->wq.wait, &wait);
 
 	return ret;
 }
@@ -861,6 +861,7 @@ static struct rtnl_link_ops tun_link_ops __read_mostly = {
 static void tun_sock_write_space(struct sock *sk)
 {
 	struct tun_struct *tun;
+	wait_queue_head_t *wqueue;
 
 	if (!sock_writeable(sk))
 		return;
@@ -868,8 +869,9 @@ static void tun_sock_write_space(struct sock *sk)
 	if (!test_and_clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags))
 		return;
 
-	if (sk_sleep(sk) && waitqueue_active(sk_sleep(sk)))
-		wake_up_interruptible_sync_poll(sk_sleep(sk), POLLOUT |
+	wqueue = sk_sleep(sk);
+	if (wqueue && waitqueue_active(wqueue))
+		wake_up_interruptible_sync_poll(wqueue, POLLOUT |
 						POLLWRNORM | POLLWRBAND);
 
 	tun = tun_sk(sk)->tun;
@@ -1039,7 +1041,8 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr)
 	if (!sk)
 		goto err_free_dev;
 
-	init_waitqueue_head(&tun->socket.wait);
+	tun->socket.wq = &tun->wq;
+	init_waitqueue_head(&tun->wq.wait);
 	tun->socket.ops = &tun_socket_ops;
 	sock_init_data(&tun->socket, sk);
 	sk->sk_write_space = tun_sock_write_space;
diff --git a/include/linux/net.h b/include/linux/net.h
index 4157b5d42bd..2b4deeeb864 100644
--- a/include/linux/net.h
+++ b/include/linux/net.h
@@ -59,6 +59,7 @@ typedef enum {
 #include <linux/wait.h>
 #include <linux/fcntl.h>	/* For O_CLOEXEC and O_NONBLOCK */
 #include <linux/kmemcheck.h>
+#include <linux/rcupdate.h>
 
 struct poll_table_struct;
 struct pipe_inode_info;
@@ -116,6 +117,12 @@ enum sock_shutdown_cmd {
 	SHUT_RDWR	= 2,
 };
 
+struct socket_wq {
+	wait_queue_head_t	wait;
+	struct fasync_struct	*fasync_list;
+	struct rcu_head		rcu;
+} ____cacheline_aligned_in_smp;
+
 /**
  *  struct socket - general BSD socket
  *  @state: socket state (%SS_CONNECTED, etc)
@@ -135,11 +142,8 @@ struct socket {
 	kmemcheck_bitfield_end(type);
 
 	unsigned long		flags;
-	/*
-	 * Please keep fasync_list & wait fields in the same cache line
-	 */
-	struct fasync_struct	*fasync_list;
-	wait_queue_head_t	wait;
+
+	struct socket_wq	*wq;
 
 	struct file		*file;
 	struct sock		*sk;
diff --git a/include/net/af_unix.h b/include/net/af_unix.h
index 1614d78c60e..20725e213ae 100644
--- a/include/net/af_unix.h
+++ b/include/net/af_unix.h
@@ -30,7 +30,7 @@ struct unix_skb_parms {
 #endif
 };
 
-#define UNIXCB(skb) 	(*(struct unix_skb_parms*)&((skb)->cb))
+#define UNIXCB(skb) 	(*(struct unix_skb_parms *)&((skb)->cb))
 #define UNIXCREDS(skb)	(&UNIXCB((skb)).creds)
 #define UNIXSID(skb)	(&UNIXCB((skb)).secid)
 
@@ -45,21 +45,23 @@ struct unix_skb_parms {
 struct unix_sock {
 	/* WARNING: sk has to be the first member */
 	struct sock sk;
-	struct unix_address *addr;
-	struct dentry *dentry;
-	struct vfsmount *mnt;
+	struct unix_address	*addr;
+	struct dentry		*dentry;
+	struct vfsmount		*mnt;
 	struct mutex readlock;
-	struct sock *peer;
-	struct sock *other;
+	struct sock		*peer;
+	struct sock		*other;
 	struct list_head link;
-	atomic_long_t inflight;
-	spinlock_t lock;
+	atomic_long_t		inflight;
+	spinlock_t		lock;
 	unsigned int gc_candidate : 1;
 	unsigned int gc_maybe_cycle : 1;
-	wait_queue_head_t peer_wait;
+	struct socket_wq	peer_wq;
 };
 #define unix_sk(__sk) ((struct unix_sock *)__sk)
+
+#define peer_wait peer_wq.wait
+
 #ifdef CONFIG_SYSCTL
 extern int unix_sysctl_register(struct net *net);
 extern void unix_sysctl_unregister(struct net *net);
diff --git a/include/net/sock.h b/include/net/sock.h
index e1777db5b9a..cc7f91ec972 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -159,7 +159,7 @@ struct sock_common {
   *	@sk_userlocks: %SO_SNDBUF and %SO_RCVBUF settings
   *	@sk_lock: synchronizer
   *	@sk_rcvbuf: size of receive buffer in bytes
-  *	@sk_sleep: sock wait queue
+  *	@sk_wq: sock wait queue and async head
   *	@sk_dst_cache: destination cache
   *	@sk_dst_lock: destination cache lock
   *	@sk_policy: flow policy
@@ -257,7 +257,7 @@ struct sock {
 		struct sk_buff *tail;
 		int len;
 	} sk_backlog;
-	wait_queue_head_t	*sk_sleep;
+	struct socket_wq	*sk_wq;
 	struct dst_entry	*sk_dst_cache;
 #ifdef CONFIG_XFRM
 	struct xfrm_policy	*sk_policy[2];
@@ -1219,7 +1219,7 @@ static inline void sk_set_socket(struct sock *sk, struct socket *sock)
 
 static inline wait_queue_head_t *sk_sleep(struct sock *sk)
 {
-	return sk->sk_sleep;
+	return &sk->sk_wq->wait;
 }
 /* Detach socket from process context.
  * Announce socket dead, detach it from wait queue and inode.
@@ -1233,14 +1233,14 @@ static inline void sock_orphan(struct sock *sk)
 	write_lock_bh(&sk->sk_callback_lock);
 	sock_set_flag(sk, SOCK_DEAD);
 	sk_set_socket(sk, NULL);
-	sk->sk_sleep = NULL;
+	sk->sk_wq = NULL;
 	write_unlock_bh(&sk->sk_callback_lock);
 }
 
 static inline void sock_graft(struct sock *sk, struct socket *parent)
 {
 	write_lock_bh(&sk->sk_callback_lock);
-	sk->sk_sleep = &parent->wait;
+	rcu_assign_pointer(sk->sk_wq, parent->wq);
 	parent->sk = sk;
 	sk_set_socket(sk, parent);
 	security_sock_graft(sk, parent);
@@ -1392,12 +1392,12 @@ static inline int sk_has_allocations(const struct sock *sk)
 }
 
 /**
- * sk_has_sleeper - check if there are any waiting processes
- * @sk: socket
+ * wq_has_sleeper - check if there are any waiting processes
+ * @sk: struct socket_wq
  *
- * Returns true if socket has waiting processes
+ * Returns true if socket_wq has waiting processes
  *
- * The purpose of the sk_has_sleeper and sock_poll_wait is to wrap the memory
+ * The purpose of the wq_has_sleeper and sock_poll_wait is to wrap the memory
 * barrier call. They were added due to the race found within the tcp code.
 *
 * Consider following tcp code paths:
@@ -1410,9 +1410,10 @@ static inline int sk_has_allocations(const struct sock *sk)
 *   ...                 ...
 *   tp->rcv_nxt check   sock_def_readable
 *   ...                 {
- *   schedule               ...
- *                          if (sk_sleep(sk) && waitqueue_active(sk_sleep(sk)))
- *                              wake_up_interruptible(sk_sleep(sk))
+ *   schedule               rcu_read_lock();
+ *                          wq = rcu_dereference(sk->sk_wq);
+ *                          if (wq && waitqueue_active(&wq->wait))
+ *                              wake_up_interruptible(&wq->wait)
 *                          ...
 *                       }
 *
@@ -1421,19 +1422,18 @@ static inline int sk_has_allocations(const struct sock *sk)
 * could then endup calling schedule and sleep forever if there are no more
 * data on the socket.
 *
- * The sk_has_sleeper is always called right after a call to read_lock, so we
- * can use smp_mb__after_lock barrier.
 */
-static inline int sk_has_sleeper(struct sock *sk)
+static inline bool wq_has_sleeper(struct socket_wq *wq)
 {
+
 	/*
 	 * We need to be sure we are in sync with the
 	 * add_wait_queue modifications to the wait queue.
 	 *
 	 * This memory barrier is paired in the sock_poll_wait.
 	 */
-	smp_mb__after_lock();
-	return sk_sleep(sk) && waitqueue_active(sk_sleep(sk));
+	smp_mb();
+	return wq && waitqueue_active(&wq->wait);
 }
 
 /**
@@ -1442,7 +1442,7 @@ static inline int sk_has_sleeper(struct sock *sk)
 * @wait_address: socket wait queue
 * @p: poll_table
 *
- * See the comments in the sk_has_sleeper function.
+ * See the comments in the wq_has_sleeper function.
 */
 static inline void sock_poll_wait(struct file *filp,
 		wait_queue_head_t *wait_address, poll_table *p)
@@ -1453,7 +1453,7 @@ static inline void sock_poll_wait(struct file *filp,
 		 * We need to be sure we are in sync with the
 		 * socket flags modification.
 		 *
-		 * This memory barrier is paired in the sk_has_sleeper.
+		 * This memory barrier is paired in the wq_has_sleeper.
 		 */
 		smp_mb();
 	}
diff --git a/net/atm/common.c b/net/atm/common.c
index e3e10e6f862..b43feb1a399 100644
--- a/net/atm/common.c
+++ b/net/atm/common.c
@@ -90,10 +90,13 @@ static void vcc_sock_destruct(struct sock *sk)
 
 static void vcc_def_wakeup(struct sock *sk)
 {
-	read_lock(&sk->sk_callback_lock);
-	if (sk_has_sleeper(sk))
-		wake_up(sk_sleep(sk));
-	read_unlock(&sk->sk_callback_lock);
+	struct socket_wq *wq;
+
+	rcu_read_lock();
+	wq = rcu_dereference(sk->sk_wq);
+	if (wq_has_sleeper(wq))
+		wake_up(&wq->wait);
+	rcu_read_unlock();
 }
 
 static inline int vcc_writable(struct sock *sk)
@@ -106,16 +109,19 @@ static inline int vcc_writable(struct sock *sk)
 
 static void vcc_write_space(struct sock *sk)
 {
-	read_lock(&sk->sk_callback_lock);
+	struct socket_wq *wq;
+
+	rcu_read_lock();
 
 	if (vcc_writable(sk)) {
-		if (sk_has_sleeper(sk))
-			wake_up_interruptible(sk_sleep(sk));
+		wq = rcu_dereference(sk->sk_wq);
+		if (wq_has_sleeper(wq))
+			wake_up_interruptible(&wq->wait);
 
 		sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
 	}
 
-	read_unlock(&sk->sk_callback_lock);
+	rcu_read_unlock();
 }
 
 static struct proto vcc_proto = {
diff --git a/net/core/sock.c b/net/core/sock.c
index 51041759517..94c4affdda9 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -1211,7 +1211,7 @@ struct sock *sk_clone(const struct sock *sk, const gfp_t priority)
 		 */
 		sk_refcnt_debug_inc(newsk);
 		sk_set_socket(newsk, NULL);
-		newsk->sk_sleep = NULL;
+		newsk->sk_wq = NULL;
 
 		if (newsk->sk_prot->sockets_allocated)
 			percpu_counter_inc(newsk->sk_prot->sockets_allocated);
@@ -1800,41 +1800,53 @@ EXPORT_SYMBOL(sock_no_sendpage);
 
 static void sock_def_wakeup(struct sock *sk)
 {
-	read_lock(&sk->sk_callback_lock);
-	if (sk_has_sleeper(sk))
-		wake_up_interruptible_all(sk_sleep(sk));
-	read_unlock(&sk->sk_callback_lock);
+	struct socket_wq *wq;
+
+	rcu_read_lock();
+	wq = rcu_dereference(sk->sk_wq);
+	if (wq_has_sleeper(wq))
+		wake_up_interruptible_all(&wq->wait);
+	rcu_read_unlock();
 }
 
 static void sock_def_error_report(struct sock *sk)
 {
-	read_lock(&sk->sk_callback_lock);
-	if (sk_has_sleeper(sk))
-		wake_up_interruptible_poll(sk_sleep(sk), POLLERR);
+	struct socket_wq *wq;
+
+	rcu_read_lock();
+	wq = rcu_dereference(sk->sk_wq);
+	if (wq_has_sleeper(wq))
+		wake_up_interruptible_poll(&wq->wait, POLLERR);
 	sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR);
-	read_unlock(&sk->sk_callback_lock);
+	rcu_read_unlock();
 }
 
 static void sock_def_readable(struct sock *sk, int len)
 {
-	read_lock(&sk->sk_callback_lock);
-	if (sk_has_sleeper(sk))
-		wake_up_interruptible_sync_poll(sk_sleep(sk), POLLIN |
+	struct socket_wq *wq;
+
+	rcu_read_lock();
+	wq = rcu_dereference(sk->sk_wq);
+	if (wq_has_sleeper(wq))
+		wake_up_interruptible_sync_poll(&wq->wait, POLLIN |
 						POLLRDNORM | POLLRDBAND);
 	sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
-	read_unlock(&sk->sk_callback_lock);
+	rcu_read_unlock();
 }
 
 static void sock_def_write_space(struct sock *sk)
 {
-	read_lock(&sk->sk_callback_lock);
+	struct socket_wq *wq;
+
+	rcu_read_lock();
 
 	/* Do not wake up a writer until he can make "significant"
 	 * progress.  --DaveM
 	 */
 	if ((atomic_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) {
-		if (sk_has_sleeper(sk))
-			wake_up_interruptible_sync_poll(sk_sleep(sk), POLLOUT |
+		wq = rcu_dereference(sk->sk_wq);
+		if (wq_has_sleeper(wq))
+			wake_up_interruptible_sync_poll(&wq->wait, POLLOUT |
 						POLLWRNORM | POLLWRBAND);
 
 		/* Should agree with poll, otherwise some programs break */
@@ -1842,7 +1854,7 @@ static void sock_def_write_space(struct sock *sk)
 			sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
 	}
 
-	read_unlock(&sk->sk_callback_lock);
+	rcu_read_unlock();
 }
 
 static void sock_def_destruct(struct sock *sk)
@@ -1896,10 +1908,10 @@ void sock_init_data(struct socket *sock, struct sock *sk)
 
 	if (sock) {
 		sk->sk_type	=	sock->type;
-		sk->sk_sleep	=	&sock->wait;
+		sk->sk_wq	=	sock->wq;
 		sock->sk	=	sk;
 	} else
-		sk->sk_sleep	=	NULL;
+		sk->sk_wq	=	NULL;
 
 	spin_lock_init(&sk->sk_dst_lock);
 	rwlock_init(&sk->sk_callback_lock);
diff --git a/net/core/stream.c b/net/core/stream.c
index 7b3c3f30b10..cc196f42b8d 100644
--- a/net/core/stream.c
+++ b/net/core/stream.c
@@ -28,15 +28,19 @@
 void sk_stream_write_space(struct sock *sk)
 {
 	struct socket *sock = sk->sk_socket;
+	struct socket_wq *wq;
 
 	if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk) && sock) {
 		clear_bit(SOCK_NOSPACE, &sock->flags);
 
-		if (sk_sleep(sk) && waitqueue_active(sk_sleep(sk)))
-			wake_up_interruptible_poll(sk_sleep(sk), POLLOUT |
+		rcu_read_lock();
+		wq = rcu_dereference(sk->sk_wq);
+		if (wq_has_sleeper(wq))
+			wake_up_interruptible_poll(&wq->wait, POLLOUT |
 						POLLWRNORM | POLLWRBAND);
-		if (sock->fasync_list && !(sk->sk_shutdown & SEND_SHUTDOWN))
+		if (wq && wq->fasync_list && !(sk->sk_shutdown & SEND_SHUTDOWN))
 			sock_wake_async(sock, SOCK_WAKE_SPACE, POLL_OUT);
+		rcu_read_unlock();
 	}
 }
 
diff --git a/net/dccp/output.c b/net/dccp/output.c
index 2d3dcb39851..aadbdb58758 100644
--- a/net/dccp/output.c
+++ b/net/dccp/output.c
@@ -195,15 +195,17 @@ EXPORT_SYMBOL_GPL(dccp_sync_mss);
 
 void dccp_write_space(struct sock *sk)
 {
-	read_lock(&sk->sk_callback_lock);
+	struct socket_wq *wq;
 
-	if (sk_has_sleeper(sk))
-		wake_up_interruptible(sk_sleep(sk));
+	rcu_read_lock();
+	wq = rcu_dereference(sk->sk_wq);
+	if (wq_has_sleeper(wq))
+		wake_up_interruptible(&wq->wait);
 	/* Should agree with poll, otherwise some programs break */
 	if (sock_writeable(sk))
 		sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
 
-	read_unlock(&sk->sk_callback_lock);
+	rcu_read_unlock();
 }
 
 /**
diff --git a/net/iucv/af_iucv.c b/net/iucv/af_iucv.c
index 9636b7d27b4..8be324fe08b 100644
--- a/net/iucv/af_iucv.c
+++ b/net/iucv/af_iucv.c
@@ -305,11 +305,14 @@ static inline int iucv_below_msglim(struct sock *sk)
 */
 static void iucv_sock_wake_msglim(struct sock *sk)
 {
-	read_lock(&sk->sk_callback_lock);
-	if (sk_has_sleeper(sk))
-		wake_up_interruptible_all(sk_sleep(sk));
+	struct socket_wq *wq;
+
+	rcu_read_lock();
+	wq = rcu_dereference(sk->sk_wq);
+	if (wq_has_sleeper(wq))
+		wake_up_interruptible_all(&wq->wait);
 	sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
-	read_unlock(&sk->sk_callback_lock);
+	rcu_read_unlock();
 }
 
 /* Timers */
diff --git a/net/phonet/pep.c b/net/phonet/pep.c
index e2a95762abd..af4d38bc3b2 100644
--- a/net/phonet/pep.c
+++ b/net/phonet/pep.c
@@ -664,12 +664,12 @@ static int pep_wait_connreq(struct sock *sk, int noblock)
 		if (signal_pending(tsk))
 			return sock_intr_errno(timeo);
 
-		prepare_to_wait_exclusive(&sk->sk_socket->wait, &wait,
+		prepare_to_wait_exclusive(sk_sleep(sk), &wait,
 						TASK_INTERRUPTIBLE);
 		release_sock(sk);
 		timeo = schedule_timeout(timeo);
 		lock_sock(sk);
-		finish_wait(&sk->sk_socket->wait, &wait);
+		finish_wait(sk_sleep(sk), &wait);
 	}
 
 	return 0;
@@ -910,10 +910,10 @@ disabled:
 			goto out;
 		}
 
-		prepare_to_wait(&sk->sk_socket->wait, &wait,
+		prepare_to_wait(sk_sleep(sk), &wait,
 				TASK_INTERRUPTIBLE);
 		done = sk_wait_event(sk, &timeo, atomic_read(&pn->tx_credits));
-		finish_wait(&sk->sk_socket->wait, &wait);
+		finish_wait(sk_sleep(sk), &wait);
 
 		if (sk->sk_state != TCP_ESTABLISHED)
 			goto disabled;
diff --git a/net/phonet/socket.c b/net/phonet/socket.c
index c785bfd0744..6e9848bf037 100644
--- a/net/phonet/socket.c
+++ b/net/phonet/socket.c
@@ -265,7 +265,7 @@ static unsigned int pn_socket_poll(struct file *file, struct socket *sock,
 	struct pep_sock *pn = pep_sk(sk);
 	unsigned int mask = 0;
 
-	poll_wait(file, &sock->wait, wait);
+	poll_wait(file, sk_sleep(sk), wait);
 
 	switch (sk->sk_state) {
 	case TCP_LISTEN:
diff --git a/net/rxrpc/af_rxrpc.c b/net/rxrpc/af_rxrpc.c
index c432d76f415..0b9bb2085ce 100644
--- a/net/rxrpc/af_rxrpc.c
+++ b/net/rxrpc/af_rxrpc.c
@@ -62,13 +62,15 @@ static inline int rxrpc_writable(struct sock *sk)
 static void rxrpc_write_space(struct sock *sk)
 {
 	_enter("%p", sk);
-	read_lock(&sk->sk_callback_lock);
+	rcu_read_lock();
 	if (rxrpc_writable(sk)) {
-		if (sk_has_sleeper(sk))
-			wake_up_interruptible(sk_sleep(sk));
+		struct socket_wq *wq = rcu_dereference(sk->sk_wq);
+
+		if (wq_has_sleeper(wq))
+			wake_up_interruptible(&wq->wait);
 		sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
 	}
-	read_unlock(&sk->sk_callback_lock);
+	rcu_read_unlock();
 }
 
 /*
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index 13d8229f3a9..d54700af927 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -6065,7 +6065,7 @@ static void __sctp_write_space(struct sctp_association *asoc)
 		 * here by modeling from the current TCP/UDP code.
 		 * We have not tested with it yet.
 		 */
-		if (sock->fasync_list &&
+		if (sock->wq->fasync_list &&
 		    !(sk->sk_shutdown & SEND_SHUTDOWN))
 			sock_wake_async(sock,
 					SOCK_WAKE_SPACE, POLL_OUT);
diff --git a/net/socket.c b/net/socket.c
index cb7c1f6c0d6..dae8c6b84a0 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -252,9 +252,14 @@ static struct inode *sock_alloc_inode(struct super_block *sb)
 	ei = kmem_cache_alloc(sock_inode_cachep, GFP_KERNEL);
 	if (!ei)
 		return NULL;
-	init_waitqueue_head(&ei->socket.wait);
+	ei->socket.wq = kmalloc(sizeof(struct socket_wq), GFP_KERNEL);
+	if (!ei->socket.wq) {
+		kmem_cache_free(sock_inode_cachep, ei);
+		return NULL;
+	}
+	init_waitqueue_head(&ei->socket.wq->wait);
+	ei->socket.wq->fasync_list = NULL;
 
-	ei->socket.fasync_list = NULL;
 	ei->socket.state = SS_UNCONNECTED;
 	ei->socket.flags = 0;
 	ei->socket.ops = NULL;
@@ -264,10 +269,21 @@ static struct inode *sock_alloc_inode(struct super_block *sb)
 	return &ei->vfs_inode;
 }
 
+
+static void wq_free_rcu(struct rcu_head *head)
+{
+	struct socket_wq *wq = container_of(head, struct socket_wq, rcu);
+
+	kfree(wq);
+}
+
 static void sock_destroy_inode(struct inode *inode)
 {
-	kmem_cache_free(sock_inode_cachep,
-			container_of(inode, struct socket_alloc, vfs_inode));
+	struct socket_alloc *ei;
+
+	ei = container_of(inode, struct socket_alloc, vfs_inode);
+	call_rcu(&ei->socket.wq->rcu, wq_free_rcu);
+	kmem_cache_free(sock_inode_cachep, ei);
 }
 
 static void init_once(void *foo)
@@ -513,7 +529,7 @@ void sock_release(struct socket *sock)
 		module_put(owner);
 	}
 
-	if (sock->fasync_list)
+	if (sock->wq->fasync_list)
 		printk(KERN_ERR "sock_release: fasync list not empty!\n");
 
 	percpu_sub(sockets_in_use, 1);
@@ -1080,9 +1096,9 @@ static int sock_fasync(int fd, struct file *filp, int on)
 
 	lock_sock(sk);
 
-	fasync_helper(fd, filp, on, &sock->fasync_list);
+	fasync_helper(fd, filp, on, &sock->wq->fasync_list);
 
-	if (!sock->fasync_list)
+	if (!sock->wq->fasync_list)
 		sock_reset_flag(sk, SOCK_FASYNC);
 	else
 		sock_set_flag(sk, SOCK_FASYNC);
@@ -1091,12 +1107,20 @@ static int sock_fasync(int fd, struct file *filp, int on)
 	return 0;
 }
 
-/* This function may be called only under socket lock or callback_lock */
+/* This function may be called only under socket lock or callback_lock or rcu_lock */
 
 int sock_wake_async(struct socket *sock, int how, int band)
 {
-	if (!sock || !sock->fasync_list)
+	struct socket_wq *wq;
+
+	if (!sock)
 		return -1;
+	rcu_read_lock();
+	wq = rcu_dereference(sock->wq);
+	if (!wq || !wq->fasync_list) {
+		rcu_read_unlock();
+		return -1;
+	}
 	switch (how) {
 	case SOCK_WAKE_WAITD:
 		if (test_bit(SOCK_ASYNC_WAITDATA, &sock->flags))
@@ -1108,11 +1132,12 @@ int sock_wake_async(struct socket *sock, int how, int band)
 		/* fall through */
 	case SOCK_WAKE_IO:
 call_kill:
-		kill_fasync(&sock->fasync_list, SIGIO, band);
+		kill_fasync(&wq->fasync_list, SIGIO, band);
 		break;
 	case SOCK_WAKE_URG:
-		kill_fasync(&sock->fasync_list, SIGURG, band);
+		kill_fasync(&wq->fasync_list, SIGURG, band);
 	}
+	rcu_read_unlock();
 	return 0;
 }
 
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index 87c0360eaa2..fef2cc5e9d2 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -313,13 +313,16 @@ static inline int unix_writable(struct sock *sk)
 
 static void unix_write_space(struct sock *sk)
 {
-	read_lock(&sk->sk_callback_lock);
+	struct socket_wq *wq;
+
+	rcu_read_lock();
 	if (unix_writable(sk)) {
-		if (sk_has_sleeper(sk))
-			wake_up_interruptible_sync(sk_sleep(sk));
+		wq = rcu_dereference(sk->sk_wq);
+		if (wq_has_sleeper(wq))
+			wake_up_interruptible_sync(&wq->wait);
 		sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
 	}
-	read_unlock(&sk->sk_callback_lock);
+	rcu_read_unlock();
 }
 
 /* When dgram socket disconnects (or changes its peer), we clear its receive
@@ -406,9 +409,7 @@ static int unix_release_sock(struct sock *sk, int embrion)
 			skpair->sk_err = ECONNRESET;
 			unix_state_unlock(skpair);
 			skpair->sk_state_change(skpair);
-			read_lock(&skpair->sk_callback_lock);
 			sk_wake_async(skpair, SOCK_WAKE_WAITD, POLL_HUP);
-			read_unlock(&skpair->sk_callback_lock);
 		}
 		sock_put(skpair); /* It may now die */
 		unix_peer(sk) = NULL;
@@ -1142,7 +1143,7 @@ restart:
 	newsk->sk_peercred.pid	= task_tgid_vnr(current);
 	current_euid_egid(&newsk->sk_peercred.uid, &newsk->sk_peercred.gid);
 	newu = unix_sk(newsk);
-	newsk->sk_sleep = &newu->peer_wait;
+	newsk->sk_wq = &newu->peer_wq;
 	otheru = unix_sk(other);
 
 	/* copy address information from listening to new sock*/
@@ -1931,12 +1932,10 @@ static int unix_shutdown(struct socket *sock, int mode)
 			other->sk_shutdown |= peer_mode;
 			unix_state_unlock(other);
 			other->sk_state_change(other);
-			read_lock(&other->sk_callback_lock);
 			if (peer_mode == SHUTDOWN_MASK)
 				sk_wake_async(other, SOCK_WAKE_WAITD, POLL_HUP);
 			else if (peer_mode & RCV_SHUTDOWN)
 				sk_wake_async(other, SOCK_WAKE_WAITD, POLL_IN);
-			read_unlock(&other->sk_callback_lock);
 		}
 		if (other)
 			sock_put(other);
--
cgit v1.2.3-70-g09d2
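To close, the memory-barrier pairing that wq_has_sleeper() preserves across
this conversion can be shown with plain C11 atomics. A self-contained sketch —
the two flags stand in for the wait queue and the receive queue, and this
illustrates the protocol, not the kernel implementation:

#include <stdatomic.h>
#include <stdio.h>

static atomic_int data_ready;	/* the condition (rcv queue non-empty)  */
static atomic_int has_sleeper;	/* the waitqueue_active() analog        */

/* sleeper side: publish "I am waiting", then re-check the condition */
static int sleeper_should_block(void)
{
	atomic_store_explicit(&has_sleeper, 1, memory_order_relaxed);
	atomic_thread_fence(memory_order_seq_cst);	/* pairs with waker */
	return !atomic_load_explicit(&data_ready, memory_order_relaxed);
}

/* waker side: publish the condition, then check for sleepers */
static int waker_must_wake(void)
{
	atomic_store_explicit(&data_ready, 1, memory_order_relaxed);
	atomic_thread_fence(memory_order_seq_cst);	/* pairs with sleeper */
	return atomic_load_explicit(&has_sleeper, memory_order_relaxed);
}

int main(void)
{
	/* single-threaded walk-through of the racy interleaving */
	int blocks = sleeper_should_block();	/* sleeper saw no data   */
	int wakes  = waker_must_wake();		/* then the data arrived */

	/* if the sleeper decided to block, the waker must see it */
	printf("sleeper blocks: %d, waker wakes: %d\n", blocks, wakes);
	return 0;
}

With both fences in place, the losing interleaving — sleeper decides to block
while the waker decides not to wake — cannot happen: at least one side must
observe the other's store. That guarantee used to ride on sk_callback_lock;
after this patch it rides on the fence in wq_has_sleeper() alone.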