From 2b85a34e911bf483c27cfdd124aeb1605145dc80 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 11 Jun 2009 02:55:43 -0700 Subject: net: No more expensive sock_hold()/sock_put() on each tx One of the problem with sock memory accounting is it uses a pair of sock_hold()/sock_put() for each transmitted packet. This slows down bidirectional flows because the receive path also needs to take a refcount on socket and might use a different cpu than transmit path or transmit completion path. So these two atomic operations also trigger cache line bounces. We can see this in tx or tx/rx workloads (media gateways for example), where sock_wfree() can be in top five functions in profiles. We use this sock_hold()/sock_put() so that sock freeing is delayed until all tx packets are completed. As we also update sk_wmem_alloc, we could offset sk_wmem_alloc by one unit at init time, until sk_free() is called. Once sk_free() is called, we atomic_dec_and_test(sk_wmem_alloc) to decrement initial offset and atomicaly check if any packets are in flight. skb_set_owner_w() doesnt call sock_hold() anymore sock_wfree() doesnt call sock_put() anymore, but check if sk_wmem_alloc reached 0 to perform the final freeing. Drawback is that a skb->truesize error could lead to unfreeable sockets, or even worse, prematurely calling __sk_free() on a live socket. Nice speedups on SMP. tbench for example, going from 2691 MB/s to 2711 MB/s on my 8 cpu dev machine, even if tbench was not really hitting sk_refcnt contention point. 5 % speedup on a UDP transmit workload (depends on number of flows), lowering TX completion cpu usage. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/sock.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'include/net/sock.h') diff --git a/include/net/sock.h b/include/net/sock.h index 4bb1ff9fd15..010e14a93c9 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1217,9 +1217,13 @@ static inline int skb_copy_to_page(struct sock *sk, char __user *from, static inline void skb_set_owner_w(struct sk_buff *skb, struct sock *sk) { - sock_hold(sk); skb->sk = sk; skb->destructor = sock_wfree; + /* + * We used to take a refcount on sk, but following operation + * is enough to guarantee sk_free() wont free this sock until + * all in-flight packets are completed + */ atomic_add(skb->truesize, &sk->sk_wmem_alloc); } -- cgit v1.2.3-70-g09d2 From a98b65a3ad71e702e760bc63f57684301628e837 Mon Sep 17 00:00:00 2001 From: Vegard Nossum Date: Thu, 26 Feb 2009 14:46:57 +0100 Subject: net: annotate struct sock bitfield 2009/2/24 Ingo Molnar : > ok, this is the last warning i have from today's overnight -tip > testruns - a 32-bit system warning in sock_init_data(): > > [ 2.610389] NET: Registered protocol family 16 > [ 2.616138] initcall netlink_proto_init+0x0/0x170 returned 0 after 7812 usecs > [ 2.620010] WARNING: kmemcheck: Caught 32-bit read from uninitialized memory (f642c184) > [ 2.624002] 010000000200000000000000604990c000000000000000000000000000000000 > [ 2.634076] i i i i i i u u i i i i i i i i i i i i i i i i i i i i i i i i > [ 2.641038] ^ > [ 2.643376] > [ 2.644004] Pid: 1, comm: swapper Not tainted (2.6.29-rc6-tip-01751-g4d1c22c-dirty #885) > [ 2.648003] EIP: 0060:[] EFLAGS: 00010282 CPU: 0 > [ 2.652008] EIP is at sock_init_data+0xa1/0x190 > [ 2.656003] EAX: 0001a800 EBX: f6836c00 ECX: 00463000 EDX: c0e46fe0 > [ 2.660003] ESI: f642c180 EDI: c0b83088 EBP: f6863ed8 ESP: c0c412ec > [ 2.664003] DS: 007b ES: 007b FS: 00d8 GS: 00e0 SS: 0068 > [ 2.668003] CR0: 8005003b CR2: f682c400 CR3: 00b91000 CR4: 000006f0 > [ 2.672003] DR0: 00000000 DR1: 00000000 DR2: 00000000 DR3: 00000000 > [ 2.676003] DR6: ffff4ff0 DR7: 00000400 > [ 2.680002] [] __netlink_create+0x35/0xa0 > [ 2.684002] [] netlink_kernel_create+0x4c/0x140 > [ 2.688002] [] rtnetlink_net_init+0x1e/0x40 > [ 2.696002] [] register_pernet_operations+0x11/0x30 > [ 2.700002] [] register_pernet_subsys+0x1c/0x30 > [ 2.704002] [] rtnetlink_init+0x4c/0x100 > [ 2.708002] [] netlink_proto_init+0x159/0x170 > [ 2.712002] [] do_one_initcall+0x24/0x150 > [ 2.716002] [] do_initcalls+0x27/0x40 > [ 2.723201] [] do_basic_setup+0x1c/0x20 > [ 2.728002] [] kernel_init+0x5a/0xa0 > [ 2.732002] [] kernel_thread_helper+0x7/0x10 > [ 2.736002] [] 0xffffffff We fix this false positive by annotating the bitfield in struct sock. Reported-by: Ingo Molnar Signed-off-by: Vegard Nossum --- include/net/sock.h | 2 ++ net/core/sock.c | 2 ++ 2 files changed, 4 insertions(+) (limited to 'include/net/sock.h') diff --git a/include/net/sock.h b/include/net/sock.h index 4bb1ff9fd15..d933da00d50 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -218,9 +218,11 @@ struct sock { #define sk_hash __sk_common.skc_hash #define sk_prot __sk_common.skc_prot #define sk_net __sk_common.skc_net + kmemcheck_bitfield_begin(flags); unsigned char sk_shutdown : 2, sk_no_check : 2, sk_userlocks : 4; + kmemcheck_bitfield_end(flags); unsigned char sk_protocol; unsigned short sk_type; int sk_rcvbuf; diff --git a/net/core/sock.c b/net/core/sock.c index 7dbf3ffb35c..ce72c0ae424 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -941,6 +941,8 @@ static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority, sk = kmalloc(prot->obj_size, priority); if (sk != NULL) { + kmemcheck_annotate_bitfield(sk, flags); + if (security_sk_alloc(sk, family, priority)) goto out_free; -- cgit v1.2.3-70-g09d2 From c564039fd83ea16a86a96d52632794b24849e507 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 16 Jun 2009 10:12:03 +0000 Subject: net: sk_wmem_alloc has initial value of one, not zero commit 2b85a34e911bf483c27cfdd124aeb1605145dc80 (net: No more expensive sock_hold()/sock_put() on each tx) changed initial sk_wmem_alloc value. Some protocols check sk_wmem_alloc value to determine if a timer must delay socket deallocation. We must take care of the sk_wmem_alloc value being one instead of zero when no write allocations are pending. Reported by Ingo Molnar, and full diagnostic from David Miller. This patch introduces three helpers to get read/write allocations and a followup patch will use these helpers to report correct write allocations to user. Reported-by: Ingo Molnar Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/sock.h | 33 +++++++++++++++++++++++++++++++++ net/appletalk/ddp.c | 6 ++---- net/ax25/af_ax25.c | 3 +-- net/econet/af_econet.c | 6 ++---- net/netrom/af_netrom.c | 3 +-- net/rose/af_rose.c | 3 +-- net/x25/af_x25.c | 3 +-- 7 files changed, 41 insertions(+), 16 deletions(-) (limited to 'include/net/sock.h') diff --git a/include/net/sock.h b/include/net/sock.h index 010e14a93c9..570c7a12b54 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1206,6 +1206,39 @@ static inline int skb_copy_to_page(struct sock *sk, char __user *from, return 0; } +/** + * sk_wmem_alloc_get - returns write allocations + * @sk: socket + * + * Returns sk_wmem_alloc minus initial offset of one + */ +static inline int sk_wmem_alloc_get(const struct sock *sk) +{ + return atomic_read(&sk->sk_wmem_alloc) - 1; +} + +/** + * sk_rmem_alloc_get - returns read allocations + * @sk: socket + * + * Returns sk_rmem_alloc + */ +static inline int sk_rmem_alloc_get(const struct sock *sk) +{ + return atomic_read(&sk->sk_rmem_alloc); +} + +/** + * sk_has_allocations - check if allocations are outstanding + * @sk: socket + * + * Returns true if socket has write or read allocations + */ +static inline int sk_has_allocations(const struct sock *sk) +{ + return sk_wmem_alloc_get(sk) || sk_rmem_alloc_get(sk); +} + /* * Queue a received datagram if it will fit. Stream and sequenced * protocols can't normally use this as they need to fit buffers in diff --git a/net/appletalk/ddp.c b/net/appletalk/ddp.c index b603cbacdc5..f7a53b219ef 100644 --- a/net/appletalk/ddp.c +++ b/net/appletalk/ddp.c @@ -162,8 +162,7 @@ static void atalk_destroy_timer(unsigned long data) { struct sock *sk = (struct sock *)data; - if (atomic_read(&sk->sk_wmem_alloc) || - atomic_read(&sk->sk_rmem_alloc)) { + if (sk_has_allocations(sk)) { sk->sk_timer.expires = jiffies + SOCK_DESTROY_TIME; add_timer(&sk->sk_timer); } else @@ -175,8 +174,7 @@ static inline void atalk_destroy_socket(struct sock *sk) atalk_remove_socket(sk); skb_queue_purge(&sk->sk_receive_queue); - if (atomic_read(&sk->sk_wmem_alloc) || - atomic_read(&sk->sk_rmem_alloc)) { + if (sk_has_allocations(sk)) { setup_timer(&sk->sk_timer, atalk_destroy_timer, (unsigned long)sk); sk->sk_timer.expires = jiffies + SOCK_DESTROY_TIME; diff --git a/net/ax25/af_ax25.c b/net/ax25/af_ax25.c index fd9d06f291d..61b35b95549 100644 --- a/net/ax25/af_ax25.c +++ b/net/ax25/af_ax25.c @@ -330,8 +330,7 @@ void ax25_destroy_socket(ax25_cb *ax25) } if (ax25->sk != NULL) { - if (atomic_read(&ax25->sk->sk_wmem_alloc) || - atomic_read(&ax25->sk->sk_rmem_alloc)) { + if (sk_has_allocations(ax25->sk)) { /* Defer: outstanding buffers */ setup_timer(&ax25->dtimer, ax25_destroy_timer, (unsigned long)ax25); diff --git a/net/econet/af_econet.c b/net/econet/af_econet.c index 8121bf0029e..2e1f836d424 100644 --- a/net/econet/af_econet.c +++ b/net/econet/af_econet.c @@ -540,8 +540,7 @@ static void econet_destroy_timer(unsigned long data) { struct sock *sk=(struct sock *)data; - if (!atomic_read(&sk->sk_wmem_alloc) && - !atomic_read(&sk->sk_rmem_alloc)) { + if (!sk_has_allocations(sk)) { sk_free(sk); return; } @@ -579,8 +578,7 @@ static int econet_release(struct socket *sock) skb_queue_purge(&sk->sk_receive_queue); - if (atomic_read(&sk->sk_rmem_alloc) || - atomic_read(&sk->sk_wmem_alloc)) { + if (sk_has_allocations(sk)) { sk->sk_timer.data = (unsigned long)sk; sk->sk_timer.expires = jiffies + HZ; sk->sk_timer.function = econet_destroy_timer; diff --git a/net/netrom/af_netrom.c b/net/netrom/af_netrom.c index 3be0e016ab7..cd911904cbe 100644 --- a/net/netrom/af_netrom.c +++ b/net/netrom/af_netrom.c @@ -286,8 +286,7 @@ void nr_destroy_socket(struct sock *sk) kfree_skb(skb); } - if (atomic_read(&sk->sk_wmem_alloc) || - atomic_read(&sk->sk_rmem_alloc)) { + if (sk_has_allocations(sk)) { /* Defer: outstanding buffers */ sk->sk_timer.function = nr_destroy_timer; sk->sk_timer.expires = jiffies + 2 * HZ; diff --git a/net/rose/af_rose.c b/net/rose/af_rose.c index 877a7f65f70..4dd9a7d1894 100644 --- a/net/rose/af_rose.c +++ b/net/rose/af_rose.c @@ -356,8 +356,7 @@ void rose_destroy_socket(struct sock *sk) kfree_skb(skb); } - if (atomic_read(&sk->sk_wmem_alloc) || - atomic_read(&sk->sk_rmem_alloc)) { + if (sk_has_allocations(sk)) { /* Defer: outstanding buffers */ setup_timer(&sk->sk_timer, rose_destroy_timer, (unsigned long)sk); diff --git a/net/x25/af_x25.c b/net/x25/af_x25.c index c51f3095739..8cd2390b0d4 100644 --- a/net/x25/af_x25.c +++ b/net/x25/af_x25.c @@ -372,8 +372,7 @@ static void __x25_destroy_socket(struct sock *sk) kfree_skb(skb); } - if (atomic_read(&sk->sk_wmem_alloc) || - atomic_read(&sk->sk_rmem_alloc)) { + if (sk_has_allocations(sk)) { /* Defer: outstanding buffers */ sk->sk_timer.expires = jiffies + 10 * HZ; sk->sk_timer.function = x25_destroy_timer; -- cgit v1.2.3-70-g09d2 From d55d87fdff8252d0e2f7c28c2d443aee17e9d70f Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Mon, 22 Jun 2009 02:25:25 +0000 Subject: net: Move rx skb_orphan call to where needed In order to get the tun driver to account packets, we need to be able to receive packets with destructors set. To be on the safe side, I added an skb_orphan call for all protocols by default since some of them (IP in particular) cannot handle receiving packets destructors properly. Now it seems that at least one protocol (CAN) expects to be able to pass skb->sk through the rx path without getting clobbered. So this patch attempts to fix this properly by moving the skb_orphan call to where it's actually needed. In particular, I've added it to skb_set_owner_[rw] which is what most users of skb->destructor call. This is actually an improvement for tun too since it means that we only give back the amount charged to the socket when the skb is passed to another socket that will also be charged accordingly. Signed-off-by: Herbert Xu Tested-by: Oliver Hartkopp Signed-off-by: David S. Miller --- include/net/sctp/sctp.h | 1 + include/net/sock.h | 2 ++ net/ax25/ax25_in.c | 3 +-- net/core/dev.c | 2 -- net/irda/af_irda.c | 3 --- net/irda/ircomm/ircomm_lmp.c | 1 + 6 files changed, 5 insertions(+), 7 deletions(-) (limited to 'include/net/sock.h') diff --git a/include/net/sctp/sctp.h b/include/net/sctp/sctp.h index 9f80a766828..d16a304cbed 100644 --- a/include/net/sctp/sctp.h +++ b/include/net/sctp/sctp.h @@ -448,6 +448,7 @@ static inline void sctp_skb_set_owner_r(struct sk_buff *skb, struct sock *sk) { struct sctp_ulpevent *event = sctp_skb2event(skb); + skb_orphan(skb); skb->sk = sk; skb->destructor = sctp_sock_rfree; atomic_add(event->rmem_len, &sk->sk_rmem_alloc); diff --git a/include/net/sock.h b/include/net/sock.h index 570c7a12b54..7f5c41cc45a 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1250,6 +1250,7 @@ static inline int sk_has_allocations(const struct sock *sk) static inline void skb_set_owner_w(struct sk_buff *skb, struct sock *sk) { + skb_orphan(skb); skb->sk = sk; skb->destructor = sock_wfree; /* @@ -1262,6 +1263,7 @@ static inline void skb_set_owner_w(struct sk_buff *skb, struct sock *sk) static inline void skb_set_owner_r(struct sk_buff *skb, struct sock *sk) { + skb_orphan(skb); skb->sk = sk; skb->destructor = sock_rfree; atomic_add(skb->truesize, &sk->sk_rmem_alloc); diff --git a/net/ax25/ax25_in.c b/net/ax25/ax25_in.c index 5f1d2107a1d..de56d3983de 100644 --- a/net/ax25/ax25_in.c +++ b/net/ax25/ax25_in.c @@ -437,8 +437,7 @@ free: int ax25_kiss_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *ptype, struct net_device *orig_dev) { - skb->sk = NULL; /* Initially we don't know who it's for */ - skb->destructor = NULL; /* Who initializes this, dammit?! */ + skb_orphan(skb); if (!net_eq(dev_net(dev), &init_net)) { kfree_skb(skb); diff --git a/net/core/dev.c b/net/core/dev.c index baf2dc13a34..60b57281227 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2310,8 +2310,6 @@ ncls: if (!skb) goto out; - skb_orphan(skb); - type = skb->protocol; list_for_each_entry_rcu(ptype, &ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) { diff --git a/net/irda/af_irda.c b/net/irda/af_irda.c index 5922febe25c..cb762c8723e 100644 --- a/net/irda/af_irda.c +++ b/net/irda/af_irda.c @@ -913,9 +913,6 @@ static int irda_accept(struct socket *sock, struct socket *newsock, int flags) /* Clean up the original one to keep it in listen state */ irttp_listen(self->tsap); - /* Wow ! What is that ? Jean II */ - skb->sk = NULL; - skb->destructor = NULL; kfree_skb(skb); sk->sk_ack_backlog--; diff --git a/net/irda/ircomm/ircomm_lmp.c b/net/irda/ircomm/ircomm_lmp.c index 67c99d20857..7ba96618660 100644 --- a/net/irda/ircomm/ircomm_lmp.c +++ b/net/irda/ircomm/ircomm_lmp.c @@ -196,6 +196,7 @@ static int ircomm_lmp_data_request(struct ircomm_cb *self, /* Don't forget to refcount it - see ircomm_tty_do_softint() */ skb_get(skb); + skb_orphan(skb); skb->destructor = ircomm_lmp_flow_control; if ((self->pkt_count++ > 7) && (self->flow_status == FLOW_START)) { -- cgit v1.2.3-70-g09d2