From 6dcd814bd08bc7989f7f3eac9bbe8b20aec0182a Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 30 Aug 2010 07:04:14 +0000 Subject: net: struct xfrm_tunnel in read_mostly section tunnel4_handlers chain being scanned for each incoming packet, make sure it doesnt share an often dirtied cache line. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/ipip.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/ipv4/ipip.c') diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c index ec036731a70..3c6f8f3968a 100644 --- a/net/ipv4/ipip.c +++ b/net/ipv4/ipip.c @@ -744,7 +744,7 @@ static void __net_init ipip_fb_tunnel_init(struct net_device *dev) ipn->tunnels_wc[0] = tunnel; } -static struct xfrm_tunnel ipip_handler = { +static struct xfrm_tunnel ipip_handler __read_mostly = { .handler = ipip_rcv, .err_handler = ipip_err, .priority = 1, -- cgit v1.2.3-70-g09d2 From b7285b7912776a4492744949c747c88d539006fa Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 15 Sep 2010 11:07:24 +0000 Subject: ipip: get rid of ipip_lock As RTNL is held while doing tunnels inserts and deletes, we can remove ipip_lock spinlock. My initial RCU conversion was conservative and converted the rwlock to spinlock, with no RTNL requirement. Use appropriate rcu annotations and modern lockdep checks as well. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/ipip.c | 67 +++++++++++++++++++++++++++++---------------------------- 1 file changed, 34 insertions(+), 33 deletions(-) (limited to 'net/ipv4/ipip.c') diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c index 3c6f8f3968a..8de8888dc95 100644 --- a/net/ipv4/ipip.c +++ b/net/ipv4/ipip.c @@ -122,11 +122,11 @@ static int ipip_net_id __read_mostly; struct ipip_net { - struct ip_tunnel *tunnels_r_l[HASH_SIZE]; - struct ip_tunnel *tunnels_r[HASH_SIZE]; - struct ip_tunnel *tunnels_l[HASH_SIZE]; - struct ip_tunnel *tunnels_wc[1]; - struct ip_tunnel **tunnels[4]; + struct ip_tunnel __rcu *tunnels_r_l[HASH_SIZE]; + struct ip_tunnel __rcu *tunnels_r[HASH_SIZE]; + struct ip_tunnel __rcu *tunnels_l[HASH_SIZE]; + struct ip_tunnel __rcu *tunnels_wc[1]; + struct ip_tunnel __rcu **tunnels[4]; struct net_device *fb_tunnel_dev; }; @@ -135,9 +135,8 @@ static void ipip_tunnel_init(struct net_device *dev); static void ipip_tunnel_setup(struct net_device *dev); /* - * Locking : hash tables are protected by RCU and a spinlock + * Locking : hash tables are protected by RCU and RTNL */ -static DEFINE_SPINLOCK(ipip_lock); #define for_each_ip_tunnel_rcu(start) \ for (t = rcu_dereference(start); t; t = rcu_dereference(t->next)) @@ -145,8 +144,8 @@ static DEFINE_SPINLOCK(ipip_lock); static struct ip_tunnel * ipip_tunnel_lookup(struct net *net, __be32 remote, __be32 local) { - unsigned h0 = HASH(remote); - unsigned h1 = HASH(local); + unsigned int h0 = HASH(remote); + unsigned int h1 = HASH(local); struct ip_tunnel *t; struct ipip_net *ipn = net_generic(net, ipip_net_id); @@ -169,12 +168,12 @@ static struct ip_tunnel * ipip_tunnel_lookup(struct net *net, return NULL; } -static struct ip_tunnel **__ipip_bucket(struct ipip_net *ipn, +static struct ip_tunnel __rcu **__ipip_bucket(struct ipip_net *ipn, struct ip_tunnel_parm *parms) { __be32 remote = parms->iph.daddr; __be32 local = parms->iph.saddr; - unsigned h = 0; + unsigned int h = 0; int prio = 0; if (remote) { @@ -188,7 +187,7 @@ static struct ip_tunnel **__ipip_bucket(struct ipip_net *ipn, return &ipn->tunnels[prio][h]; } -static inline struct ip_tunnel **ipip_bucket(struct ipip_net *ipn, +static inline struct ip_tunnel __rcu **ipip_bucket(struct ipip_net *ipn, struct ip_tunnel *t) { return __ipip_bucket(ipn, &t->parms); @@ -196,13 +195,14 @@ static inline struct ip_tunnel **ipip_bucket(struct ipip_net *ipn, static void ipip_tunnel_unlink(struct ipip_net *ipn, struct ip_tunnel *t) { - struct ip_tunnel **tp; - - for (tp = ipip_bucket(ipn, t); *tp; tp = &(*tp)->next) { - if (t == *tp) { - spin_lock_bh(&ipip_lock); - *tp = t->next; - spin_unlock_bh(&ipip_lock); + struct ip_tunnel __rcu **tp; + struct ip_tunnel *iter; + + for (tp = ipip_bucket(ipn, t); + (iter = rtnl_dereference(*tp)) != NULL; + tp = &iter->next) { + if (t == iter) { + rcu_assign_pointer(*tp, t->next); break; } } @@ -210,12 +210,10 @@ static void ipip_tunnel_unlink(struct ipip_net *ipn, struct ip_tunnel *t) static void ipip_tunnel_link(struct ipip_net *ipn, struct ip_tunnel *t) { - struct ip_tunnel **tp = ipip_bucket(ipn, t); + struct ip_tunnel __rcu **tp = ipip_bucket(ipn, t); - spin_lock_bh(&ipip_lock); - t->next = *tp; + rcu_assign_pointer(t->next, rtnl_dereference(*tp)); rcu_assign_pointer(*tp, t); - spin_unlock_bh(&ipip_lock); } static struct ip_tunnel * ipip_tunnel_locate(struct net *net, @@ -223,12 +221,15 @@ static struct ip_tunnel * ipip_tunnel_locate(struct net *net, { __be32 remote = parms->iph.daddr; __be32 local = parms->iph.saddr; - struct ip_tunnel *t, **tp, *nt; + struct ip_tunnel *t, *nt; + struct ip_tunnel __rcu **tp; struct net_device *dev; char name[IFNAMSIZ]; struct ipip_net *ipn = net_generic(net, ipip_net_id); - for (tp = __ipip_bucket(ipn, parms); (t = *tp) != NULL; tp = &t->next) { + for (tp = __ipip_bucket(ipn, parms); + (t = rtnl_dereference(*tp)) != NULL; + tp = &t->next) { if (local == t->parms.iph.saddr && remote == t->parms.iph.daddr) return t; } @@ -268,16 +269,15 @@ failed_free: return NULL; } +/* called with RTNL */ static void ipip_tunnel_uninit(struct net_device *dev) { struct net *net = dev_net(dev); struct ipip_net *ipn = net_generic(net, ipip_net_id); - if (dev == ipn->fb_tunnel_dev) { - spin_lock_bh(&ipip_lock); - ipn->tunnels_wc[0] = NULL; - spin_unlock_bh(&ipip_lock); - } else + if (dev == ipn->fb_tunnel_dev) + rcu_assign_pointer(ipn->tunnels_wc[0], NULL); + else ipip_tunnel_unlink(ipn, netdev_priv(dev)); dev_put(dev); } @@ -741,7 +741,7 @@ static void __net_init ipip_fb_tunnel_init(struct net_device *dev) iph->ihl = 5; dev_hold(dev); - ipn->tunnels_wc[0] = tunnel; + rcu_assign_pointer(ipn->tunnels_wc[0], tunnel); } static struct xfrm_tunnel ipip_handler __read_mostly = { @@ -760,11 +760,12 @@ static void ipip_destroy_tunnels(struct ipip_net *ipn, struct list_head *head) for (prio = 1; prio < 4; prio++) { int h; for (h = 0; h < HASH_SIZE; h++) { - struct ip_tunnel *t = ipn->tunnels[prio][h]; + struct ip_tunnel *t; + t = rtnl_dereference(ipn->tunnels[prio][h]); while (t != NULL) { unregister_netdevice_queue(t->dev, head); - t = t->next; + t = rtnl_dereference(t->next); } } } -- cgit v1.2.3-70-g09d2 From 8990f468ae9010ab0af4be8f51bf7ab833a67202 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 20 Sep 2010 00:12:11 +0000 Subject: net: rx_dropped accounting Under load, netif_rx() can drop incoming packets but administrators dont have a chance to spot which device needs some tuning (RPS activation for example) This patch adds rx_dropped accounting in vlans and tunnels. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/8021q/vlan.h | 2 ++ net/8021q/vlan_dev.c | 9 +++++++-- net/ipv4/ip_gre.c | 6 ++++-- net/ipv4/ipip.c | 5 ++++- net/ipv6/ip6_tunnel.c | 5 ++++- net/ipv6/ip6mr.c | 4 +++- net/ipv6/sit.c | 5 ++++- 7 files changed, 28 insertions(+), 8 deletions(-) (limited to 'net/ipv4/ipip.c') diff --git a/net/8021q/vlan.h b/net/8021q/vlan.h index 8d9503ad01d..b26ce343072 100644 --- a/net/8021q/vlan.h +++ b/net/8021q/vlan.h @@ -25,6 +25,7 @@ struct vlan_priority_tci_mapping { * @rx_multicast: number of received multicast packets * @syncp: synchronization point for 64bit counters * @rx_errors: number of errors + * @rx_dropped: number of dropped packets */ struct vlan_rx_stats { u64 rx_packets; @@ -32,6 +33,7 @@ struct vlan_rx_stats { u64 rx_multicast; struct u64_stats_sync syncp; unsigned long rx_errors; + unsigned long rx_dropped; }; /** diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c index 3bccdd12a26..94a1feddeb4 100644 --- a/net/8021q/vlan_dev.c +++ b/net/8021q/vlan_dev.c @@ -225,7 +225,10 @@ int vlan_skb_recv(struct sk_buff *skb, struct net_device *dev, } } - netif_rx(skb); + if (unlikely(netif_rx(skb) == NET_RX_DROP)) { + if (rx_stats) + rx_stats->rx_dropped++; + } rcu_read_unlock(); return NET_RX_SUCCESS; @@ -843,13 +846,15 @@ static struct rtnl_link_stats64 *vlan_dev_get_stats64(struct net_device *dev, st accum.rx_packets += rxpackets; accum.rx_bytes += rxbytes; accum.rx_multicast += rxmulticast; - /* rx_errors is an ulong, not protected by syncp */ + /* rx_errors, rx_dropped are ulong, not protected by syncp */ accum.rx_errors += p->rx_errors; + accum.rx_dropped += p->rx_dropped; } stats->rx_packets = accum.rx_packets; stats->rx_bytes = accum.rx_bytes; stats->rx_errors = accum.rx_errors; stats->multicast = accum.rx_multicast; + stats->rx_dropped = accum.rx_dropped; } return stats; } diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index fc20e687e93..714b6a80361 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -647,9 +647,11 @@ static int ipgre_rcv(struct sk_buff *skb) skb_reset_network_header(skb); ipgre_ecn_decapsulate(iph, skb); - netif_rx(skb); + if (netif_rx(skb) == NET_RX_DROP) + stats->rx_dropped++; + rcu_read_unlock(); - return(0); + return 0; } icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0); diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c index 8de8888dc95..babd2527810 100644 --- a/net/ipv4/ipip.c +++ b/net/ipv4/ipip.c @@ -377,7 +377,10 @@ static int ipip_rcv(struct sk_buff *skb) skb_tunnel_rx(skb, tunnel->dev); ipip_ecn_decapsulate(iph, skb); - netif_rx(skb); + + if (netif_rx(skb) == NET_RX_DROP) + tunnel->dev->stats.rx_dropped++; + rcu_read_unlock(); return 0; } diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index 9289cecac4d..f6d9f683543 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -725,7 +725,10 @@ static int ip6_tnl_rcv(struct sk_buff *skb, __u16 protocol, skb_tunnel_rx(skb, t->dev); dscp_ecn_decapsulate(t, ipv6h, skb); - netif_rx(skb); + + if (netif_rx(skb) == NET_RX_DROP) + t->dev->stats.rx_dropped++; + rcu_read_unlock(); return 0; } diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c index 66078dad7fe..2640c9be589 100644 --- a/net/ipv6/ip6mr.c +++ b/net/ipv6/ip6mr.c @@ -666,7 +666,9 @@ static int pim6_rcv(struct sk_buff *skb) skb_tunnel_rx(skb, reg_dev); - netif_rx(skb); + if (netif_rx(skb) == NET_RX_DROP) + reg_dev->stats.rx_dropped++; + dev_put(reg_dev); return 0; drop: diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c index 6822481ff76..8a039982223 100644 --- a/net/ipv6/sit.c +++ b/net/ipv6/sit.c @@ -564,7 +564,10 @@ static int ipip6_rcv(struct sk_buff *skb) skb_tunnel_rx(skb, tunnel->dev); ipip6_ecn_decapsulate(iph, skb); - netif_rx(skb); + + if (netif_rx(skb) == NET_RX_DROP) + tunnel->dev->stats.rx_dropped++; + rcu_read_unlock(); return 0; } -- cgit v1.2.3-70-g09d2 From 3c97af99a5aa17feaebb4eb0f85f51ab6c055797 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 27 Sep 2010 00:35:50 +0000 Subject: ipip: percpu stats accounting Maintain per_cpu tx_bytes, tx_packets, rx_bytes, rx_packets. Other seldom used fields are kept in netdev->stats structure, possibly unsafe. This is a preliminary work to support lockless transmit path, and correct RX stats, that are already unsafe. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/ipip.c | 127 +++++++++++++++++++++++++++++++++++++++++--------------- 1 file changed, 93 insertions(+), 34 deletions(-) (limited to 'net/ipv4/ipip.c') diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c index babd2527810..12b6fde6f65 100644 --- a/net/ipv4/ipip.c +++ b/net/ipv4/ipip.c @@ -131,8 +131,9 @@ struct ipip_net { struct net_device *fb_tunnel_dev; }; -static void ipip_tunnel_init(struct net_device *dev); +static int ipip_tunnel_init(struct net_device *dev); static void ipip_tunnel_setup(struct net_device *dev); +static void ipip_dev_free(struct net_device *dev); /* * Locking : hash tables are protected by RCU and RTNL @@ -141,6 +142,34 @@ static void ipip_tunnel_setup(struct net_device *dev); #define for_each_ip_tunnel_rcu(start) \ for (t = rcu_dereference(start); t; t = rcu_dereference(t->next)) +/* often modified stats are per cpu, other are shared (netdev->stats) */ +struct pcpu_tstats { + unsigned long rx_packets; + unsigned long rx_bytes; + unsigned long tx_packets; + unsigned long tx_bytes; +}; + +static struct net_device_stats *ipip_get_stats(struct net_device *dev) +{ + struct pcpu_tstats sum = { 0 }; + int i; + + for_each_possible_cpu(i) { + const struct pcpu_tstats *tstats = per_cpu_ptr(dev->tstats, i); + + sum.rx_packets += tstats->rx_packets; + sum.rx_bytes += tstats->rx_bytes; + sum.tx_packets += tstats->tx_packets; + sum.tx_bytes += tstats->tx_bytes; + } + dev->stats.rx_packets = sum.rx_packets; + dev->stats.rx_bytes = sum.rx_bytes; + dev->stats.tx_packets = sum.tx_packets; + dev->stats.tx_bytes = sum.tx_bytes; + return &dev->stats; +} + static struct ip_tunnel * ipip_tunnel_lookup(struct net *net, __be32 remote, __be32 local) { @@ -239,7 +268,7 @@ static struct ip_tunnel * ipip_tunnel_locate(struct net *net, if (parms->name[0]) strlcpy(name, parms->name, IFNAMSIZ); else - sprintf(name, "tunl%%d"); + strcpy(name, "tunl%d"); dev = alloc_netdev(sizeof(*t), name, ipip_tunnel_setup); if (dev == NULL) @@ -255,7 +284,8 @@ static struct ip_tunnel * ipip_tunnel_locate(struct net *net, nt = netdev_priv(dev); nt->parms = *parms; - ipip_tunnel_init(dev); + if (ipip_tunnel_init(dev) < 0) + goto failed_free; if (register_netdevice(dev) < 0) goto failed_free; @@ -265,7 +295,7 @@ static struct ip_tunnel * ipip_tunnel_locate(struct net *net, return nt; failed_free: - free_netdev(dev); + ipip_dev_free(dev); return NULL; } @@ -359,8 +389,10 @@ static int ipip_rcv(struct sk_buff *skb) const struct iphdr *iph = ip_hdr(skb); rcu_read_lock(); - if ((tunnel = ipip_tunnel_lookup(dev_net(skb->dev), - iph->saddr, iph->daddr)) != NULL) { + tunnel = ipip_tunnel_lookup(dev_net(skb->dev), iph->saddr, iph->daddr); + if (tunnel != NULL) { + struct pcpu_tstats *tstats; + if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) { rcu_read_unlock(); kfree_skb(skb); @@ -374,7 +406,11 @@ static int ipip_rcv(struct sk_buff *skb) skb->protocol = htons(ETH_P_IP); skb->pkt_type = PACKET_HOST; - skb_tunnel_rx(skb, tunnel->dev); + tstats = this_cpu_ptr(tunnel->dev->tstats); + tstats->rx_packets++; + tstats->rx_bytes += skb->len; + + __skb_tunnel_rx(skb, tunnel->dev); ipip_ecn_decapsulate(iph, skb); @@ -397,13 +433,12 @@ static int ipip_rcv(struct sk_buff *skb) static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) { struct ip_tunnel *tunnel = netdev_priv(dev); - struct net_device_stats *stats = &dev->stats; - struct netdev_queue *txq = netdev_get_tx_queue(dev, 0); + struct pcpu_tstats *tstats; struct iphdr *tiph = &tunnel->parms.iph; u8 tos = tunnel->parms.iph.tos; __be16 df = tiph->frag_off; struct rtable *rt; /* Route to the other host */ - struct net_device *tdev; /* Device to other host */ + struct net_device *tdev; /* Device to other host */ struct iphdr *old_iph = ip_hdr(skb); struct iphdr *iph; /* Our new IP header */ unsigned int max_headroom; /* The extra header space needed */ @@ -413,13 +448,13 @@ static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) if (skb->protocol != htons(ETH_P_IP)) goto tx_error; - if (tos&1) + if (tos & 1) tos = old_iph->tos; if (!dst) { /* NBMA tunnel */ if ((rt = skb_rtable(skb)) == NULL) { - stats->tx_fifo_errors++; + dev->stats.tx_fifo_errors++; goto tx_error; } if ((dst = rt->rt_gateway) == 0) @@ -427,14 +462,20 @@ static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) } { - struct flowi fl = { .oif = tunnel->parms.link, - .nl_u = { .ip4_u = - { .daddr = dst, - .saddr = tiph->saddr, - .tos = RT_TOS(tos) } }, - .proto = IPPROTO_IPIP }; + struct flowi fl = { + .oif = tunnel->parms.link, + .nl_u = { + .ip4_u = { + .daddr = dst, + .saddr = tiph->saddr, + .tos = RT_TOS(tos) + } + }, + .proto = IPPROTO_IPIP + }; + if (ip_route_output_key(dev_net(dev), &rt, &fl)) { - stats->tx_carrier_errors++; + dev->stats.tx_carrier_errors++; goto tx_error_icmp; } } @@ -442,7 +483,7 @@ static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) if (tdev == dev) { ip_rt_put(rt); - stats->collisions++; + dev->stats.collisions++; goto tx_error; } @@ -452,7 +493,7 @@ static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) mtu = dst_mtu(&rt->dst) - sizeof(struct iphdr); if (mtu < 68) { - stats->collisions++; + dev->stats.collisions++; ip_rt_put(rt); goto tx_error; } @@ -488,7 +529,7 @@ static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) struct sk_buff *new_skb = skb_realloc_headroom(skb, max_headroom); if (!new_skb) { ip_rt_put(rt); - txq->tx_dropped++; + dev->stats.tx_dropped++; dev_kfree_skb(skb); return NETDEV_TX_OK; } @@ -525,14 +566,14 @@ static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) iph->ttl = old_iph->ttl; nf_reset(skb); - - IPTUNNEL_XMIT(); + tstats = this_cpu_ptr(dev->tstats); + __IPTUNNEL_XMIT(tstats, &dev->stats); return NETDEV_TX_OK; tx_error_icmp: dst_link_failure(skb); tx_error: - stats->tx_errors++; + dev->stats.tx_errors++; dev_kfree_skb(skb); return NETDEV_TX_OK; } @@ -547,13 +588,19 @@ static void ipip_tunnel_bind_dev(struct net_device *dev) iph = &tunnel->parms.iph; if (iph->daddr) { - struct flowi fl = { .oif = tunnel->parms.link, - .nl_u = { .ip4_u = - { .daddr = iph->daddr, - .saddr = iph->saddr, - .tos = RT_TOS(iph->tos) } }, - .proto = IPPROTO_IPIP }; + struct flowi fl = { + .oif = tunnel->parms.link, + .nl_u = { + .ip4_u = { + .daddr = iph->daddr, + .saddr = iph->saddr, + .tos = RT_TOS(iph->tos) + } + }, + .proto = IPPROTO_IPIP + }; struct rtable *rt; + if (!ip_route_output_key(dev_net(dev), &rt, &fl)) { tdev = rt->dst.dev; ip_rt_put(rt); @@ -699,13 +746,19 @@ static const struct net_device_ops ipip_netdev_ops = { .ndo_start_xmit = ipip_tunnel_xmit, .ndo_do_ioctl = ipip_tunnel_ioctl, .ndo_change_mtu = ipip_tunnel_change_mtu, - + .ndo_get_stats = ipip_get_stats, }; +static void ipip_dev_free(struct net_device *dev) +{ + free_percpu(dev->tstats); + free_netdev(dev); +} + static void ipip_tunnel_setup(struct net_device *dev) { dev->netdev_ops = &ipip_netdev_ops; - dev->destructor = free_netdev; + dev->destructor = ipip_dev_free; dev->type = ARPHRD_TUNNEL; dev->hard_header_len = LL_MAX_HEADER + sizeof(struct iphdr); @@ -717,7 +770,7 @@ static void ipip_tunnel_setup(struct net_device *dev) dev->priv_flags &= ~IFF_XMIT_DST_RELEASE; } -static void ipip_tunnel_init(struct net_device *dev) +static int ipip_tunnel_init(struct net_device *dev) { struct ip_tunnel *tunnel = netdev_priv(dev); @@ -728,6 +781,12 @@ static void ipip_tunnel_init(struct net_device *dev) memcpy(dev->broadcast, &tunnel->parms.iph.daddr, 4); ipip_tunnel_bind_dev(dev); + + dev->tstats = alloc_percpu(struct pcpu_tstats); + if (!dev->tstats) + return -ENOMEM; + + return 0; } static void __net_init ipip_fb_tunnel_init(struct net_device *dev) -- cgit v1.2.3-70-g09d2 From fada5636fe41fd1423fe4e6af7b9f609378acde6 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 27 Sep 2010 23:56:46 +0000 Subject: ipip: fix percpu stats accounting commit 3c97af99a5aa1 (ipip: percpu stats accounting) forgot the fallback tunnel case (tunl0), and can crash pretty fast. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/ipip.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) (limited to 'net/ipv4/ipip.c') diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c index 12b6fde6f65..9e78f11da78 100644 --- a/net/ipv4/ipip.c +++ b/net/ipv4/ipip.c @@ -789,7 +789,7 @@ static int ipip_tunnel_init(struct net_device *dev) return 0; } -static void __net_init ipip_fb_tunnel_init(struct net_device *dev) +static int __net_init ipip_fb_tunnel_init(struct net_device *dev) { struct ip_tunnel *tunnel = netdev_priv(dev); struct iphdr *iph = &tunnel->parms.iph; @@ -802,8 +802,13 @@ static void __net_init ipip_fb_tunnel_init(struct net_device *dev) iph->protocol = IPPROTO_IPIP; iph->ihl = 5; + dev->tstats = alloc_percpu(struct pcpu_tstats); + if (!dev->tstats) + return -ENOMEM; + dev_hold(dev); rcu_assign_pointer(ipn->tunnels_wc[0], tunnel); + return 0; } static struct xfrm_tunnel ipip_handler __read_mostly = { @@ -852,7 +857,9 @@ static int __net_init ipip_init_net(struct net *net) } dev_net_set(ipn->fb_tunnel_dev, net); - ipip_fb_tunnel_init(ipn->fb_tunnel_dev); + err = ipip_fb_tunnel_init(ipn->fb_tunnel_dev); + if (err) + goto err_reg_dev; if ((err = register_netdev(ipn->fb_tunnel_dev))) goto err_reg_dev; @@ -860,7 +867,7 @@ static int __net_init ipip_init_net(struct net *net) return 0; err_reg_dev: - free_netdev(ipn->fb_tunnel_dev); + ipip_dev_free(ipn->fb_tunnel_dev); err_alloc_dev: /* nothing */ return err; -- cgit v1.2.3-70-g09d2 From 153f0943382e9ae0bff7caa110a1a4656088d0d4 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 28 Sep 2010 00:17:17 +0000 Subject: ipip: enable lockless xmits IPIP tunnels can benefit from lockless xmits, using NETIF_F_LLTX Bench on a 16 cpus machine (dual E5540 cpus), 16 threads sending 10000000 UDP frames via one ipip tunnel (size:200 bytes per frame) Before patch : real 2m53.321s user 0m10.277s sys 46m0.597s After patch: real 0m32.063s user 0m9.237s sys 8m16.255s Last problem to solve is the contention on dst : 16118.00 28.3% __ip_route_output_key vmlinux 6135.00 10.8% dst_release vmlinux 3220.00 5.6% ip_finish_output vmlinux 2149.00 3.8% ip_route_output_flow vmlinux 1575.00 2.8% ip_append_data vmlinux 1481.00 2.6% ip_push_pending_frames vmlinux 1349.00 2.4% __xfrm_lookup vmlinux 1216.00 2.1% csum_partial_copy_generic vmlinux 1208.00 2.1% udp_sendmsg vmlinux Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/ipip.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net/ipv4/ipip.c') diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c index 9e78f11da78..6ad46c28ede 100644 --- a/net/ipv4/ipip.c +++ b/net/ipv4/ipip.c @@ -767,6 +767,7 @@ static void ipip_tunnel_setup(struct net_device *dev) dev->iflink = 0; dev->addr_len = 4; dev->features |= NETIF_F_NETNS_LOCAL; + dev->features |= NETIF_F_LLTX; dev->priv_flags &= ~IFF_XMIT_DST_RELEASE; } -- cgit v1.2.3-70-g09d2 From caf586e5f23cebb2a68cbaf288d59dbbf2d74052 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 30 Sep 2010 21:06:55 +0000 Subject: net: add a core netdev->rx_dropped counter In various situations, a device provides a packet to our stack and we drop it before it enters protocol stack : - softnet backlog full (accounted in /proc/net/softnet_stat) - bad vlan tag (not accounted) - unknown/unregistered protocol (not accounted) We can handle a per-device counter of such dropped frames at core level, and automatically adds it to the device provided stats (rx_dropped), so that standard tools can be used (ifconfig, ip link, cat /proc/net/dev) This is a generalization of commit 8990f468a (net: rx_dropped accounting), thus reverting it. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- drivers/net/loopback.c | 8 +------- include/linux/netdevice.h | 3 +++ net/8021q/vlan.h | 2 -- net/8021q/vlan_core.c | 2 ++ net/8021q/vlan_dev.c | 11 ++++------- net/core/dev.c | 19 +++++++++++-------- net/ipv4/ip_gre.c | 3 +-- net/ipv4/ipip.c | 3 +-- net/ipv6/ip6_tunnel.c | 3 +-- net/ipv6/ip6mr.c | 3 +-- net/ipv6/sit.c | 3 +-- 11 files changed, 26 insertions(+), 34 deletions(-) (limited to 'net/ipv4/ipip.c') diff --git a/drivers/net/loopback.c b/drivers/net/loopback.c index 4b0e30b564e..2d9663a1c54 100644 --- a/drivers/net/loopback.c +++ b/drivers/net/loopback.c @@ -64,7 +64,6 @@ struct pcpu_lstats { u64 packets; u64 bytes; struct u64_stats_sync syncp; - unsigned long drops; }; /* @@ -90,8 +89,7 @@ static netdev_tx_t loopback_xmit(struct sk_buff *skb, lb_stats->bytes += len; lb_stats->packets++; u64_stats_update_end(&lb_stats->syncp); - } else - lb_stats->drops++; + } return NETDEV_TX_OK; } @@ -101,7 +99,6 @@ static struct rtnl_link_stats64 *loopback_get_stats64(struct net_device *dev, { u64 bytes = 0; u64 packets = 0; - u64 drops = 0; int i; for_each_possible_cpu(i) { @@ -115,14 +112,11 @@ static struct rtnl_link_stats64 *loopback_get_stats64(struct net_device *dev, tbytes = lb_stats->bytes; tpackets = lb_stats->packets; } while (u64_stats_fetch_retry(&lb_stats->syncp, start)); - drops += lb_stats->drops; bytes += tbytes; packets += tpackets; } stats->rx_packets = packets; stats->tx_packets = packets; - stats->rx_dropped = drops; - stats->rx_errors = drops; stats->rx_bytes = bytes; stats->tx_bytes = bytes; return stats; diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 92d81edd580..6abcef67b17 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -884,6 +884,9 @@ struct net_device { int iflink; struct net_device_stats stats; + atomic_long_t rx_dropped; /* dropped packets by core network + * Do not use this in drivers. + */ #ifdef CONFIG_WIRELESS_EXT /* List of functions to handle Wireless Extensions (instead of ioctl). diff --git a/net/8021q/vlan.h b/net/8021q/vlan.h index b26ce343072..8d9503ad01d 100644 --- a/net/8021q/vlan.h +++ b/net/8021q/vlan.h @@ -25,7 +25,6 @@ struct vlan_priority_tci_mapping { * @rx_multicast: number of received multicast packets * @syncp: synchronization point for 64bit counters * @rx_errors: number of errors - * @rx_dropped: number of dropped packets */ struct vlan_rx_stats { u64 rx_packets; @@ -33,7 +32,6 @@ struct vlan_rx_stats { u64 rx_multicast; struct u64_stats_sync syncp; unsigned long rx_errors; - unsigned long rx_dropped; }; /** diff --git a/net/8021q/vlan_core.c b/net/8021q/vlan_core.c index b6d55a9304f..dee727ce029 100644 --- a/net/8021q/vlan_core.c +++ b/net/8021q/vlan_core.c @@ -33,6 +33,7 @@ int __vlan_hwaccel_rx(struct sk_buff *skb, struct vlan_group *grp, return polling ? netif_receive_skb(skb) : netif_rx(skb); drop: + atomic_long_inc(&skb->dev->rx_dropped); dev_kfree_skb_any(skb); return NET_RX_DROP; } @@ -123,6 +124,7 @@ vlan_gro_common(struct napi_struct *napi, struct vlan_group *grp, return dev_gro_receive(napi, skb); drop: + atomic_long_inc(&skb->dev->rx_dropped); return GRO_DROP; } diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c index f6fbcc0f1af..f54251edd40 100644 --- a/net/8021q/vlan_dev.c +++ b/net/8021q/vlan_dev.c @@ -225,16 +225,15 @@ int vlan_skb_recv(struct sk_buff *skb, struct net_device *dev, } } - if (unlikely(netif_rx(skb) == NET_RX_DROP)) { - if (rx_stats) - rx_stats->rx_dropped++; - } + netif_rx(skb); + rcu_read_unlock(); return NET_RX_SUCCESS; err_unlock: rcu_read_unlock(); err_free: + atomic_long_inc(&dev->rx_dropped); kfree_skb(skb); return NET_RX_DROP; } @@ -846,15 +845,13 @@ static struct rtnl_link_stats64 *vlan_dev_get_stats64(struct net_device *dev, st accum.rx_packets += rxpackets; accum.rx_bytes += rxbytes; accum.rx_multicast += rxmulticast; - /* rx_errors, rx_dropped are ulong, not protected by syncp */ + /* rx_errors is ulong, not protected by syncp */ accum.rx_errors += p->rx_errors; - accum.rx_dropped += p->rx_dropped; } stats->rx_packets = accum.rx_packets; stats->rx_bytes = accum.rx_bytes; stats->rx_errors = accum.rx_errors; stats->multicast = accum.rx_multicast; - stats->rx_dropped = accum.rx_dropped; } return stats; } diff --git a/net/core/dev.c b/net/core/dev.c index ce6ad88c980..7d149550e8d 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1483,8 +1483,9 @@ int dev_forward_skb(struct net_device *dev, struct sk_buff *skb) skb_orphan(skb); nf_reset(skb); - if (!(dev->flags & IFF_UP) || - (skb->len > (dev->mtu + dev->hard_header_len))) { + if (unlikely(!(dev->flags & IFF_UP) || + (skb->len > (dev->mtu + dev->hard_header_len)))) { + atomic_long_inc(&dev->rx_dropped); kfree_skb(skb); return NET_RX_DROP; } @@ -2548,6 +2549,7 @@ enqueue: local_irq_restore(flags); + atomic_long_inc(&skb->dev->rx_dropped); kfree_skb(skb); return NET_RX_DROP; } @@ -2995,6 +2997,7 @@ ncls: if (pt_prev) { ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev); } else { + atomic_long_inc(&skb->dev->rx_dropped); kfree_skb(skb); /* Jamal, now you will not able to escape explaining * me how you were going to use this. :-) @@ -5429,14 +5432,14 @@ struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev, if (ops->ndo_get_stats64) { memset(storage, 0, sizeof(*storage)); - return ops->ndo_get_stats64(dev, storage); - } - if (ops->ndo_get_stats) { + ops->ndo_get_stats64(dev, storage); + } else if (ops->ndo_get_stats) { netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev)); - return storage; + } else { + netdev_stats_to_stats64(storage, &dev->stats); + dev_txq_stats_fold(dev, storage); } - netdev_stats_to_stats64(storage, &dev->stats); - dev_txq_stats_fold(dev, storage); + storage->rx_dropped += atomic_long_read(&dev->rx_dropped); return storage; } EXPORT_SYMBOL(dev_get_stats); diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index fbe2c473a06..9d421f4cf3e 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -679,8 +679,7 @@ static int ipgre_rcv(struct sk_buff *skb) skb_reset_network_header(skb); ipgre_ecn_decapsulate(iph, skb); - if (netif_rx(skb) == NET_RX_DROP) - tunnel->dev->stats.rx_dropped++; + netif_rx(skb); rcu_read_unlock(); return 0; diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c index 6ad46c28ede..e9b816e6cd7 100644 --- a/net/ipv4/ipip.c +++ b/net/ipv4/ipip.c @@ -414,8 +414,7 @@ static int ipip_rcv(struct sk_buff *skb) ipip_ecn_decapsulate(iph, skb); - if (netif_rx(skb) == NET_RX_DROP) - tunnel->dev->stats.rx_dropped++; + netif_rx(skb); rcu_read_unlock(); return 0; diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index 8be3c452af9..c2c0f89397b 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -768,8 +768,7 @@ static int ip6_tnl_rcv(struct sk_buff *skb, __u16 protocol, dscp_ecn_decapsulate(t, ipv6h, skb); - if (netif_rx(skb) == NET_RX_DROP) - t->dev->stats.rx_dropped++; + netif_rx(skb); rcu_read_unlock(); return 0; diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c index 2640c9be589..6f32ffce702 100644 --- a/net/ipv6/ip6mr.c +++ b/net/ipv6/ip6mr.c @@ -666,8 +666,7 @@ static int pim6_rcv(struct sk_buff *skb) skb_tunnel_rx(skb, reg_dev); - if (netif_rx(skb) == NET_RX_DROP) - reg_dev->stats.rx_dropped++; + netif_rx(skb); dev_put(reg_dev); return 0; diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c index d7701782b63..367a6cc584c 100644 --- a/net/ipv6/sit.c +++ b/net/ipv6/sit.c @@ -600,8 +600,7 @@ static int ipip6_rcv(struct sk_buff *skb) ipip6_ecn_decapsulate(iph, skb); - if (netif_rx(skb) == NET_RX_DROP) - tunnel->dev->stats.rx_dropped++; + netif_rx(skb); rcu_read_unlock(); return 0; -- cgit v1.2.3-70-g09d2 From 74b0b85b88aaa952023762e0280799aaae849841 Mon Sep 17 00:00:00 2001 From: Pavel Emelyanov Date: Wed, 27 Oct 2010 05:43:53 +0000 Subject: tunnels: Fix tunnels change rcu protection After making rcu protection for tunnels (ipip, gre, sit and ip6) a bug was introduced into the SIOCCHGTUNNEL code. The tunnel is first unlinked, then addresses change, then it is linked back probably into another bucket. But while changing the parms, the hash table is unlocked to readers and they can lookup the improper tunnel. Respective commits are b7285b79 (ipip: get rid of ipip_lock), 1507850b (gre: get rid of ipgre_lock), 3a43be3c (sit: get rid of ipip6_lock) and 94767632 (ip6tnl: get rid of ip6_tnl_lock). The quick fix is to wait for quiescent state to pass after unlinking, but if it is inappropriate I can invent something better, just let me know. Signed-off-by: Pavel Emelyanov Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/ip_gre.c | 1 + net/ipv4/ipip.c | 1 + net/ipv6/ip6_tunnel.c | 1 + net/ipv6/sit.c | 1 + 4 files changed, 4 insertions(+) (limited to 'net/ipv4/ipip.c') diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index d0ffcbe369b..01087e035b7 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -1072,6 +1072,7 @@ ipgre_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd) break; } ipgre_tunnel_unlink(ign, t); + synchronize_net(); t->parms.iph.saddr = p.iph.saddr; t->parms.iph.daddr = p.iph.daddr; t->parms.i_key = p.i_key; diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c index e9b816e6cd7..cd300aaee78 100644 --- a/net/ipv4/ipip.c +++ b/net/ipv4/ipip.c @@ -676,6 +676,7 @@ ipip_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd) } t = netdev_priv(dev); ipip_tunnel_unlink(ipn, t); + synchronize_net(); t->parms.iph.saddr = p.iph.saddr; t->parms.iph.daddr = p.iph.daddr; memcpy(dev->dev_addr, &p.iph.saddr, 4); diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index 38b9a56c173..2a59610c2a5 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -1284,6 +1284,7 @@ ip6_tnl_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) t = netdev_priv(dev); ip6_tnl_unlink(ip6n, t); + synchronize_net(); err = ip6_tnl_change(t, &p); ip6_tnl_link(ip6n, t); netdev_state_change(dev); diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c index 367a6cc584c..d6bfaec3bbb 100644 --- a/net/ipv6/sit.c +++ b/net/ipv6/sit.c @@ -963,6 +963,7 @@ ipip6_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd) } t = netdev_priv(dev); ipip6_tunnel_unlink(sitn, t); + synchronize_net(); t->parms.iph.saddr = p.iph.saddr; t->parms.iph.daddr = p.iph.daddr; memcpy(dev->dev_addr, &p.iph.saddr, 4); -- cgit v1.2.3-70-g09d2