From fec5e652e58fa6017b2c9e06466cb2a6538de5b4 Mon Sep 17 00:00:00 2001
From: Tom Herbert
Date: Fri, 16 Apr 2010 16:01:27 -0700
Subject: rfs: Receive Flow Steering

This patch implements receive flow steering (RFS). RFS steers received packets for layer 3 and 4 processing to the CPU where the application for the corresponding flow is running. RFS is an extension of Receive Packet Steering (RPS).

The basic idea of RFS is that when an application calls recvmsg (or sendmsg) the application's running CPU is stored in a hash table that is indexed by the connection's rxhash, which is stored in the socket structure. The rxhash is passed in skbs received on the connection from netif_receive_skb. For each received packet, the associated rxhash is used to look up the CPU in the hash table; if a valid CPU is set, the packet is steered to that CPU using the RPS mechanisms.

The complication with this simple approach is that it would potentially allow OOO (out-of-order) packets. If threads are thrashing around CPUs or multiple threads are trying to read from the same sockets, a quickly changing CPU value in the hash table could cause rampant OOO packets -- we consider this a non-starter.

To avoid OOO packets, this solution implements two types of hash tables: rps_sock_flow_table and rps_dev_flow_table.

rps_sock_flow_table is a global hash table. Each entry is just a CPU number and it is populated in recvmsg and sendmsg as described above. This table contains the "desired" CPUs for flows.

rps_dev_flow_table is specific to each device queue. Each entry contains a CPU and a tail queue counter. The CPU is the "current" CPU for a matching flow. The tail queue counter holds the value of the tail queue counter for the associated CPU's backlog queue at the time of the last enqueue for a flow matching the entry. Each backlog queue has a queue head counter which is incremented on dequeue, so a queue tail counter is computed as queue head count + queue length. When a packet is enqueued on a backlog queue, the current value of the queue tail counter is saved in the hash entry of the rps_dev_flow_table.

And now the trick: when selecting the CPU for RPS (get_rps_cpu), the rps_sock_flow table and the rps_dev_flow table for the RX queue are consulted. When the desired CPU for the flow (found in the rps_sock_flow table) does not match the current CPU (found in the rps_dev_flow table), the current CPU is changed to the desired CPU if one of the following is true:

- The current CPU is unset (equal to RPS_NO_CPU)
- The current CPU is offline
- The current CPU's queue head counter >= the queue tail counter in the rps_dev_flow table. This checks whether the queue tail has advanced beyond the last packet that was enqueued using this table entry, which guarantees that all packets queued using this entry have been dequeued, thus preserving in-order delivery.

Making each queue have its own rps_dev_flow table has two advantages: 1) the tail queue counters will be written on each receive, so keeping the table local to the interrupting CPU is good for locality; 2) it allows lockless access to the table -- the CPU number and queue tail counter need to be accessed together under mutual exclusion, which is provided by netif_receive_skb since we assume it is only called from the device's napi_poll, which is non-reentrant.

This patch implements RFS for TCP and connected UDP sockets. It should be usable for other flow-oriented protocols.
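To make the steering rule above concrete, here is a minimal C sketch of the CPU-selection logic. This is an illustration only, under simplified assumptions: the table layouts, the NR_CPUS bound, and the stub helpers are hypothetical stand-ins, not the kernel's get_rps_cpu() implementation in net/core/dev.c.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define RPS_NO_CPU	0xffff
#define NR_CPUS		8

/* Global table entry: "desired" CPU, written by recvmsg()/sendmsg(). */
struct sock_flow_entry {
	uint16_t cpu;
};

/*
 * Per-device-queue entry: "current" CPU for the flow, plus the backlog
 * tail counter recorded at the last enqueue through this entry.
 */
struct dev_flow_entry {
	uint16_t cpu;
	unsigned int last_qtail;
};

/* Stub per-CPU backlog queue head counters (incremented on dequeue). */
static unsigned int backlog_head[NR_CPUS];

static bool cpu_is_online(uint16_t cpu)
{
	return cpu < NR_CPUS;	/* stub: every configured CPU is online */
}

static uint16_t rfs_select_cpu(const struct sock_flow_entry *sock_ent,
			       struct dev_flow_entry *dev_ent)
{
	uint16_t desired = sock_ent->cpu;

	if (desired != RPS_NO_CPU && desired != dev_ent->cpu) {
		/*
		 * Move the flow to the desired CPU only when reordering
		 * is impossible: the current CPU is unset or offline, or
		 * its head counter has reached the tail counter recorded
		 * at the last enqueue, i.e. every packet queued through
		 * this entry has already been dequeued.
		 */
		if (dev_ent->cpu == RPS_NO_CPU ||
		    !cpu_is_online(dev_ent->cpu) ||
		    (int)(backlog_head[dev_ent->cpu] -
			  dev_ent->last_qtail) >= 0)
			dev_ent->cpu = desired;
	}
	return dev_ent->cpu;
}

int main(void)
{
	struct sock_flow_entry s = { .cpu = 2 };	/* app runs on CPU 2 */
	struct dev_flow_entry d = { .cpu = 5, .last_qtail = 10 };

	backlog_head[5] = 10;	/* CPU 5 has drained everything we enqueued */
	printf("steer to CPU %u\n", rfs_select_cpu(&s, &d));	/* -> 2 */
	return 0;
}

On enqueue, the real code would then record the chosen CPU's queue tail counter (head count + queue length) into the dev-flow entry, so that the next steering decision can tell whether all previously enqueued packets have drained.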
There are two configuration parameters for RFS. The "rps_flow_entries" kernel init parameter sets the number of entries in the rps_sock_flow_table, and the per-rxqueue sysfs entry "rps_flow_cnt" contains the number of entries in the rps_dev_flow table for that rxqueue. Both are rounded up to a power of two.

The obvious benefit of RFS (over just RPS) is that it achieves CPU locality between the receive processing for a flow and the application's processing; this can result in increased performance (higher pps, lower latency). The benefits of RFS depend on the cache hierarchy, application load, and other factors. On simple benchmarks, we don't necessarily see improvement and sometimes see degradation. However, for more complex benchmarks and for applications where cache pressure is much higher, this technique seems to perform very well.

Below are some benchmark results which show the potential benefit of this patch. The netperf test has 500 instances of the netperf TCP_RR test with 1-byte requests and responses. The RPC test is a request/response test similar in structure to the netperf RR test, with 100 threads on each host, but it does more work in userspace than netperf.

e1000e on 8 core Intel
  No RFS or RPS               104K tps at 30% CPU
  No RFS (best RPS config):   290K tps at 63% CPU
  RFS                         303K tps at 61% CPU

RPC test        tps     CPU%    50/90/99% usec latency    Latency StdDev
  No RFS/RPS    103K    48%     757/900/3185              4472.35
  RPS only:     174K    73%     415/993/2468              491.66
  RFS           223K    73%     379/651/1382              315.61

Signed-off-by: Tom Herbert
Signed-off-by: Eric Dumazet
Signed-off-by: David S. Miller

--- net/ipv4/udp.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'net/ipv4/udp.c') diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 8fef859db35..666b963496f 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1217,6 +1217,7 @@ int udp_disconnect(struct sock *sk, int flags) sk->sk_state = TCP_CLOSE; inet->inet_daddr = 0; inet->inet_dport = 0; + inet_rps_save_rxhash(sk, 0); sk->sk_bound_dev_if = 0; if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK)) inet_reset_saddr(sk); @@ -1258,8 +1259,12 @@ EXPORT_SYMBOL(udp_lib_unhash); static int __udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) { - int rc = sock_queue_rcv_skb(sk, skb); + int rc; + + if (inet_sk(sk)->inet_daddr) + inet_rps_save_rxhash(sk, skb->rxhash); + rc = sock_queue_rcv_skb(sk, skb); if (rc < 0) { int is_udplite = IS_UDPLITE(sk); -- cgit v1.2.3-70-g09d2

From 0eae88f31ca2b88911ce843452054139e028771f Mon Sep 17 00:00:00 2001
From: Eric Dumazet
Date: Tue, 20 Apr 2010 19:06:52 -0700
Subject: net: Fix various endianness glitches

Sparse can help us find endianness bugs, but we need to make some cleanups to be able to more easily spot real bugs.

Signed-off-by: Eric Dumazet Signed-off-by: David S.
Miller --- net/bridge/br_multicast.c | 2 +- net/bridge/br_private.h | 15 ++++++++------- net/ethernet/eth.c | 2 +- net/ipv4/af_inet.c | 8 ++++---- net/ipv4/ipmr.c | 10 +++++----- net/ipv4/route.c | 29 ++++++++++++++--------------- net/ipv4/tcp.c | 15 ++++++++------- net/ipv4/tcp_ipv4.c | 4 ++-- net/ipv4/tcp_output.c | 4 ++-- net/ipv4/udp.c | 8 ++++---- net/ipv6/addrconf.c | 3 ++- net/ipv6/ip6_fib.c | 3 ++- net/ipv6/tcp_ipv6.c | 4 ++-- net/ipv6/udp.c | 4 ++-- net/sched/sch_sfq.c | 10 +++++----- net/sunrpc/xprt.c | 2 +- net/xfrm/xfrm_hash.h | 3 ++- 17 files changed, 65 insertions(+), 61 deletions(-) (limited to 'net/ipv4/udp.c') diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c index 3fe86ffc069..61e1d1094b8 100644 --- a/net/bridge/br_multicast.c +++ b/net/bridge/br_multicast.c @@ -29,7 +29,7 @@ static inline int br_ip_hash(struct net_bridge_mdb_htable *mdb, __be32 ip) { - return jhash_1word(mdb->secret, (u32)ip) & (mdb->max - 1); + return jhash_1word(mdb->secret, (__force u32)ip) & (mdb->max - 1); } static struct net_bridge_mdb_entry *__br_mdb_ip_get( diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index 791d4ab0fd4..63181e4a2a6 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -130,19 +130,20 @@ struct net_bridge_port #endif }; +struct br_cpu_netstats { + unsigned long rx_packets; + unsigned long rx_bytes; + unsigned long tx_packets; + unsigned long tx_bytes; +}; + struct net_bridge { spinlock_t lock; struct list_head port_list; struct net_device *dev; - struct br_cpu_netstats __percpu { - unsigned long rx_packets; - unsigned long rx_bytes; - unsigned long tx_packets; - unsigned long tx_bytes; - } *stats; - + struct br_cpu_netstats __percpu *stats; spinlock_t hash_lock; struct hlist_head hash[BR_HASH_SIZE]; unsigned long feature_mask; diff --git a/net/ethernet/eth.c b/net/ethernet/eth.c index 205a1c12f3c..35846964082 100644 --- a/net/ethernet/eth.c +++ b/net/ethernet/eth.c @@ -136,7 +136,7 @@ int eth_rebuild_header(struct sk_buff *skb) default: printk(KERN_DEBUG "%s: unable to resolve type %X addresses.\n", - dev->name, (int)eth->h_proto); + dev->name, (__force int)eth->h_proto); memcpy(eth->h_source, dev->dev_addr, ETH_ALEN); break; diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 5ca7290c2e6..9f52880fae1 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -1323,8 +1323,8 @@ static struct sk_buff **inet_gro_receive(struct sk_buff **head, if (unlikely(ip_fast_csum((u8 *)iph, iph->ihl))) goto out_unlock; - id = ntohl(*(u32 *)&iph->id); - flush = (u16)((ntohl(*(u32 *)iph) ^ skb_gro_len(skb)) | (id ^ IP_DF)); + id = ntohl(*(__be32 *)&iph->id); + flush = (u16)((ntohl(*(__be32 *)iph) ^ skb_gro_len(skb)) | (id ^ IP_DF)); id >>= 16; for (p = *head; p; p = p->next) { @@ -1337,8 +1337,8 @@ static struct sk_buff **inet_gro_receive(struct sk_buff **head, if ((iph->protocol ^ iph2->protocol) | (iph->tos ^ iph2->tos) | - (iph->saddr ^ iph2->saddr) | - (iph->daddr ^ iph2->daddr)) { + ((__force u32)iph->saddr ^ (__force u32)iph2->saddr) | + ((__force u32)iph->daddr ^ (__force u32)iph2->daddr)) { NAPI_GRO_CB(p)->same_flow = 0; continue; } diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index 7d8a2bcecb7..a2df5012a1d 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -1772,10 +1772,10 @@ int ip_mr_input(struct sk_buff *skb) vif = ipmr_find_vif(mrt, skb->dev); if (vif >= 0) { - int err = ipmr_cache_unresolved(mrt, vif, skb); + int err2 = ipmr_cache_unresolved(mrt, vif, skb); read_unlock(&mrt_lock); - return err; + return err2; } 
read_unlock(&mrt_lock); kfree_skb(skb); @@ -2227,9 +2227,9 @@ static int ipmr_mfc_seq_show(struct seq_file *seq, void *v) const struct ipmr_mfc_iter *it = seq->private; const struct mr_table *mrt = it->mrt; - seq_printf(seq, "%08lX %08lX %-3hd", - (unsigned long) mfc->mfc_mcastgrp, - (unsigned long) mfc->mfc_origin, + seq_printf(seq, "%08X %08X %-3hd", + (__force u32) mfc->mfc_mcastgrp, + (__force u32) mfc->mfc_origin, mfc->mfc_parent); if (it->cache != &mrt->mfc_unres_queue) { diff --git a/net/ipv4/route.c b/net/ipv4/route.c index cb562fdd9b9..a947428ef0a 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -258,10 +258,9 @@ static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat); (__raw_get_cpu_var(rt_cache_stat).field++) static inline unsigned int rt_hash(__be32 daddr, __be32 saddr, int idx, - int genid) + int genid) { - return jhash_3words((__force u32)(__be32)(daddr), - (__force u32)(__be32)(saddr), + return jhash_3words((__force u32)daddr, (__force u32)saddr, idx, genid) & rt_hash_mask; } @@ -378,12 +377,13 @@ static int rt_cache_seq_show(struct seq_file *seq, void *v) struct rtable *r = v; int len; - seq_printf(seq, "%s\t%08lX\t%08lX\t%8X\t%d\t%u\t%d\t" - "%08lX\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X%n", + seq_printf(seq, "%s\t%08X\t%08X\t%8X\t%d\t%u\t%d\t" + "%08X\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X%n", r->u.dst.dev ? r->u.dst.dev->name : "*", - (unsigned long)r->rt_dst, (unsigned long)r->rt_gateway, + (__force u32)r->rt_dst, + (__force u32)r->rt_gateway, r->rt_flags, atomic_read(&r->u.dst.__refcnt), - r->u.dst.__use, 0, (unsigned long)r->rt_src, + r->u.dst.__use, 0, (__force u32)r->rt_src, (dst_metric(&r->u.dst, RTAX_ADVMSS) ? (int)dst_metric(&r->u.dst, RTAX_ADVMSS) + 40 : 0), dst_metric(&r->u.dst, RTAX_WINDOW), @@ -685,18 +685,17 @@ static inline bool rt_caching(const struct net *net) static inline bool compare_hash_inputs(const struct flowi *fl1, const struct flowi *fl2) { - return (__force u32)(((fl1->nl_u.ip4_u.daddr ^ fl2->nl_u.ip4_u.daddr) | - (fl1->nl_u.ip4_u.saddr ^ fl2->nl_u.ip4_u.saddr) | + return ((((__force u32)fl1->nl_u.ip4_u.daddr ^ (__force u32)fl2->nl_u.ip4_u.daddr) | + ((__force u32)fl1->nl_u.ip4_u.saddr ^ (__force u32)fl2->nl_u.ip4_u.saddr) | (fl1->iif ^ fl2->iif)) == 0); } static inline int compare_keys(struct flowi *fl1, struct flowi *fl2) { - return ((__force u32)((fl1->nl_u.ip4_u.daddr ^ fl2->nl_u.ip4_u.daddr) | - (fl1->nl_u.ip4_u.saddr ^ fl2->nl_u.ip4_u.saddr)) | + return (((__force u32)fl1->nl_u.ip4_u.daddr ^ (__force u32)fl2->nl_u.ip4_u.daddr) | + ((__force u32)fl1->nl_u.ip4_u.saddr ^ (__force u32)fl2->nl_u.ip4_u.saddr) | (fl1->mark ^ fl2->mark) | - (*(u16 *)&fl1->nl_u.ip4_u.tos ^ - *(u16 *)&fl2->nl_u.ip4_u.tos) | + (*(u16 *)&fl1->nl_u.ip4_u.tos ^ *(u16 *)&fl2->nl_u.ip4_u.tos) | (fl1->oif ^ fl2->oif) | (fl1->iif ^ fl2->iif)) == 0; } @@ -2319,8 +2318,8 @@ int ip_route_input(struct sk_buff *skb, __be32 daddr, __be32 saddr, rcu_read_lock(); for (rth = rcu_dereference(rt_hash_table[hash].chain); rth; rth = rcu_dereference(rth->u.dst.rt_next)) { - if (((rth->fl.fl4_dst ^ daddr) | - (rth->fl.fl4_src ^ saddr) | + if ((((__force u32)rth->fl.fl4_dst ^ (__force u32)daddr) | + ((__force u32)rth->fl.fl4_src ^ (__force u32)saddr) | (rth->fl.iif ^ iif) | rth->fl.oif | (rth->fl.fl4_tos ^ tos)) == 0 && diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 77208334a61..6689c61cab4 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2721,7 +2721,7 @@ struct sk_buff **tcp_gro_receive(struct sk_buff **head, struct sk_buff *skb) struct tcphdr *th2; unsigned int len; unsigned int 
thlen; - unsigned int flags; + __be32 flags; unsigned int mss = 1; unsigned int hlen; unsigned int off; @@ -2771,10 +2771,10 @@ struct sk_buff **tcp_gro_receive(struct sk_buff **head, struct sk_buff *skb) found: flush = NAPI_GRO_CB(p)->flush; - flush |= flags & TCP_FLAG_CWR; - flush |= (flags ^ tcp_flag_word(th2)) & - ~(TCP_FLAG_CWR | TCP_FLAG_FIN | TCP_FLAG_PSH); - flush |= th->ack_seq ^ th2->ack_seq; + flush |= (__force int)(flags & TCP_FLAG_CWR); + flush |= (__force int)((flags ^ tcp_flag_word(th2)) & + ~(TCP_FLAG_CWR | TCP_FLAG_FIN | TCP_FLAG_PSH)); + flush |= (__force int)(th->ack_seq ^ th2->ack_seq); for (i = sizeof(*th); i < thlen; i += 4) flush |= *(u32 *)((u8 *)th + i) ^ *(u32 *)((u8 *)th2 + i); @@ -2795,8 +2795,9 @@ found: out_check_final: flush = len < mss; - flush |= flags & (TCP_FLAG_URG | TCP_FLAG_PSH | TCP_FLAG_RST | - TCP_FLAG_SYN | TCP_FLAG_FIN); + flush |= (__force int)(flags & (TCP_FLAG_URG | TCP_FLAG_PSH | + TCP_FLAG_RST | TCP_FLAG_SYN | + TCP_FLAG_FIN)); if (p && (!NAPI_GRO_CB(skb)->same_flow || flush)) pp = head; diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index ad08392a738..4d6717d1e61 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1286,8 +1286,8 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) goto drop_and_release; /* Secret recipe starts with IP addresses */ - *mess++ ^= daddr; - *mess++ ^= saddr; + *mess++ ^= (__force u32)daddr; + *mess++ ^= (__force u32)saddr; /* plus variable length Initiator Cookie */ c = (u8 *)mess; diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 2b7d71fb843..429ad9286ef 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -861,7 +861,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, th->urg_ptr = htons(tp->snd_up - tcb->seq); th->urg = 1; } else if (after(tcb->seq + 0xFFFF, tp->snd_nxt)) { - th->urg_ptr = 0xFFFF; + th->urg_ptr = htons(0xFFFF); th->urg = 1; } } @@ -2485,7 +2485,7 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst, *tail-- ^= TCP_SKB_CB(skb)->seq + 1; /* recommended */ - *tail-- ^= ((th->dest << 16) | th->source); + *tail-- ^= (((__force u32)th->dest << 16) | (__force u32)th->source); *tail-- ^= (u32)(unsigned long)cvp; /* per sockopt */ sha_transform((__u32 *)&xvp->cookie_bakery[0], diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 666b963496f..1e18f9cc924 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -307,13 +307,13 @@ static int ipv4_rcv_saddr_equal(const struct sock *sk1, const struct sock *sk2) static unsigned int udp4_portaddr_hash(struct net *net, __be32 saddr, unsigned int port) { - return jhash_1word(saddr, net_hash_mix(net)) ^ port; + return jhash_1word((__force u32)saddr, net_hash_mix(net)) ^ port; } int udp_v4_get_port(struct sock *sk, unsigned short snum) { unsigned int hash2_nulladdr = - udp4_portaddr_hash(sock_net(sk), INADDR_ANY, snum); + udp4_portaddr_hash(sock_net(sk), htonl(INADDR_ANY), snum); unsigned int hash2_partial = udp4_portaddr_hash(sock_net(sk), inet_sk(sk)->inet_rcv_saddr, 0); @@ -466,14 +466,14 @@ static struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr, daddr, hnum, dif, hslot2, slot2); if (!result) { - hash2 = udp4_portaddr_hash(net, INADDR_ANY, hnum); + hash2 = udp4_portaddr_hash(net, htonl(INADDR_ANY), hnum); slot2 = hash2 & udptable->mask; hslot2 = &udptable->hash2[slot2]; if (hslot->count < hslot2->count) goto begin; result = udp4_lib_lookup2(net, saddr, sport, - INADDR_ANY, hnum, dif, + htonl(INADDR_ANY), hnum, dif, hslot2, slot2); } 
rcu_read_unlock(); diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 7cba8845242..34d2d649e39 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -588,7 +588,8 @@ static u32 ipv6_addr_hash(const struct in6_addr *addr) * We perform the hash function over the last 64 bits of the address * This will include the IEEE address token on links that support it. */ - return jhash_2words(addr->s6_addr32[2], addr->s6_addr32[3], 0) + return jhash_2words((__force u32)addr->s6_addr32[2], + (__force u32)addr->s6_addr32[3], 0) & (IN6_ADDR_HSIZE - 1); } diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index dc6e0b8f260..92a122b7795 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -144,7 +144,8 @@ static __inline__ __be32 addr_bit_set(void *token, int fn_bit) * htonl(1 << ((~fn_bit)&0x1F)) * See include/asm-generic/bitops/le.h. */ - return (1 << ((~fn_bit ^ BITOP_BE32_SWIZZLE) & 0x1f)) & addr[fn_bit >> 5]; + return (__force __be32)(1 << ((~fn_bit ^ BITOP_BE32_SWIZZLE) & 0x1f)) & + addr[fn_bit >> 5]; } static __inline__ struct fib6_node * node_alloc(void) diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index bd5ef7b6e48..a92b4a5cd8b 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -1234,12 +1234,12 @@ static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb) goto drop_and_free; /* Secret recipe starts with IP addresses */ - d = &ipv6_hdr(skb)->daddr.s6_addr32[0]; + d = (__force u32 *)&ipv6_hdr(skb)->daddr.s6_addr32[0]; *mess++ ^= *d++; *mess++ ^= *d++; *mess++ ^= *d++; *mess++ ^= *d++; - d = &ipv6_hdr(skb)->saddr.s6_addr32[0]; + d = (__force u32 *)&ipv6_hdr(skb)->saddr.s6_addr32[0]; *mess++ ^= *d++; *mess++ ^= *d++; *mess++ ^= *d++; diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 90824852f59..92bf9033e24 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -91,9 +91,9 @@ static unsigned int udp6_portaddr_hash(struct net *net, if (ipv6_addr_any(addr6)) hash = jhash_1word(0, mix); else if (ipv6_addr_v4mapped(addr6)) - hash = jhash_1word(addr6->s6_addr32[3], mix); + hash = jhash_1word((__force u32)addr6->s6_addr32[3], mix); else - hash = jhash2(addr6->s6_addr32, 4, mix); + hash = jhash2((__force u32 *)addr6->s6_addr32, 4, mix); return hash ^ port; } diff --git a/net/sched/sch_sfq.c b/net/sched/sch_sfq.c index c5a9ac56600..c65762823f5 100644 --- a/net/sched/sch_sfq.c +++ b/net/sched/sch_sfq.c @@ -123,8 +123,8 @@ static unsigned sfq_hash(struct sfq_sched_data *q, struct sk_buff *skb) case htons(ETH_P_IP): { const struct iphdr *iph = ip_hdr(skb); - h = iph->daddr; - h2 = iph->saddr ^ iph->protocol; + h = (__force u32)iph->daddr; + h2 = (__force u32)iph->saddr ^ iph->protocol; if (!(iph->frag_off&htons(IP_MF|IP_OFFSET)) && (iph->protocol == IPPROTO_TCP || iph->protocol == IPPROTO_UDP || @@ -138,8 +138,8 @@ static unsigned sfq_hash(struct sfq_sched_data *q, struct sk_buff *skb) case htons(ETH_P_IPV6): { struct ipv6hdr *iph = ipv6_hdr(skb); - h = iph->daddr.s6_addr32[3]; - h2 = iph->saddr.s6_addr32[3] ^ iph->nexthdr; + h = (__force u32)iph->daddr.s6_addr32[3]; + h2 = (__force u32)iph->saddr.s6_addr32[3] ^ iph->nexthdr; if (iph->nexthdr == IPPROTO_TCP || iph->nexthdr == IPPROTO_UDP || iph->nexthdr == IPPROTO_UDPLITE || @@ -150,7 +150,7 @@ static unsigned sfq_hash(struct sfq_sched_data *q, struct sk_buff *skb) break; } default: - h = (unsigned long)skb_dst(skb) ^ skb->protocol; + h = (unsigned long)skb_dst(skb) ^ (__force u32)skb->protocol; h2 = (unsigned long)skb->sk; } diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index 42f09ade004..699ade68aac 
100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -974,7 +974,7 @@ void xprt_reserve(struct rpc_task *task) static inline __be32 xprt_alloc_xid(struct rpc_xprt *xprt) { - return xprt->xid++; + return (__force __be32)xprt->xid++; } static inline void xprt_init_xid(struct rpc_xprt *xprt) diff --git a/net/xfrm/xfrm_hash.h b/net/xfrm/xfrm_hash.h index e5195c99f71..1396572d2ad 100644 --- a/net/xfrm/xfrm_hash.h +++ b/net/xfrm/xfrm_hash.h @@ -16,7 +16,8 @@ static inline unsigned int __xfrm6_addr_hash(xfrm_address_t *addr) static inline unsigned int __xfrm4_daddr_saddr_hash(xfrm_address_t *daddr, xfrm_address_t *saddr) { - return ntohl(daddr->a4 + saddr->a4); + u32 sum = (__force u32)daddr->a4 + (__force u32)saddr->a4; + return ntohl((__force __be32)sum); } static inline unsigned int __xfrm6_daddr_saddr_hash(xfrm_address_t *daddr, xfrm_address_t *saddr) -- cgit v1.2.3-70-g09d2 From c58dc01babfd58ec9e71a6ce080150dc27755d88 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Tue, 27 Apr 2010 15:05:31 -0700 Subject: net: Make RFS socket operations not be inet specific. Idea from Eric Dumazet. As for placement inside of struct sock, I tried to choose a place that otherwise has a 32-bit hole on 64-bit systems. Signed-off-by: David S. Miller Acked-by: Eric Dumazet --- include/net/inet_sock.h | 37 ------------------------------------- include/net/sock.h | 38 ++++++++++++++++++++++++++++++++++++++ net/ipv4/af_inet.c | 8 ++++---- net/ipv4/tcp_ipv4.c | 2 +- net/ipv4/udp.c | 4 ++-- 5 files changed, 45 insertions(+), 44 deletions(-) (limited to 'net/ipv4/udp.c') diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h index c1d42957b86..1653de515ce 100644 --- a/include/net/inet_sock.h +++ b/include/net/inet_sock.h @@ -102,7 +102,6 @@ struct rtable; * @uc_ttl - Unicast TTL * @inet_sport - Source port * @inet_id - ID counter for DF pkts - * @rxhash - flow hash received from netif layer * @tos - TOS * @mc_ttl - Multicasting TTL * @is_icsk - is this an inet_connection_sock? @@ -126,9 +125,6 @@ struct inet_sock { __u16 cmsg_flags; __be16 inet_sport; __u16 inet_id; -#ifdef CONFIG_RPS - __u32 rxhash; -#endif struct ip_options *opt; __u8 tos; @@ -224,37 +220,4 @@ static inline __u8 inet_sk_flowi_flags(const struct sock *sk) return inet_sk(sk)->transparent ? 
FLOWI_FLAG_ANYSRC : 0; } -static inline void inet_rps_record_flow(const struct sock *sk) -{ -#ifdef CONFIG_RPS - struct rps_sock_flow_table *sock_flow_table; - - rcu_read_lock(); - sock_flow_table = rcu_dereference(rps_sock_flow_table); - rps_record_sock_flow(sock_flow_table, inet_sk(sk)->rxhash); - rcu_read_unlock(); -#endif -} - -static inline void inet_rps_reset_flow(const struct sock *sk) -{ -#ifdef CONFIG_RPS - struct rps_sock_flow_table *sock_flow_table; - - rcu_read_lock(); - sock_flow_table = rcu_dereference(rps_sock_flow_table); - rps_reset_sock_flow(sock_flow_table, inet_sk(sk)->rxhash); - rcu_read_unlock(); -#endif -} - -static inline void inet_rps_save_rxhash(struct sock *sk, u32 rxhash) -{ -#ifdef CONFIG_RPS - if (unlikely(inet_sk(sk)->rxhash != rxhash)) { - inet_rps_reset_flow(sk); - inet_sk(sk)->rxhash = rxhash; - } -#endif -} #endif /* _INET_SOCK_H */ diff --git a/include/net/sock.h b/include/net/sock.h index 4081db86a35..07822280d95 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -198,6 +198,7 @@ struct sock_common { * @sk_rcvlowat: %SO_RCVLOWAT setting * @sk_rcvtimeo: %SO_RCVTIMEO setting * @sk_sndtimeo: %SO_SNDTIMEO setting + * @sk_rxhash: flow hash received from netif layer * @sk_filter: socket filtering instructions * @sk_protinfo: private area, net family specific, when not using slab * @sk_timer: sock cleanup timer @@ -279,6 +280,9 @@ struct sock { int sk_gso_type; unsigned int sk_gso_max_size; int sk_rcvlowat; +#ifdef CONFIG_RPS + __u32 sk_rxhash; +#endif unsigned long sk_flags; unsigned long sk_lingertime; struct sk_buff_head sk_error_queue; @@ -620,6 +624,40 @@ static inline int sk_backlog_rcv(struct sock *sk, struct sk_buff *skb) return sk->sk_backlog_rcv(sk, skb); } +static inline void sock_rps_record_flow(const struct sock *sk) +{ +#ifdef CONFIG_RPS + struct rps_sock_flow_table *sock_flow_table; + + rcu_read_lock(); + sock_flow_table = rcu_dereference(rps_sock_flow_table); + rps_record_sock_flow(sock_flow_table, sk->sk_rxhash); + rcu_read_unlock(); +#endif +} + +static inline void sock_rps_reset_flow(const struct sock *sk) +{ +#ifdef CONFIG_RPS + struct rps_sock_flow_table *sock_flow_table; + + rcu_read_lock(); + sock_flow_table = rcu_dereference(rps_sock_flow_table); + rps_reset_sock_flow(sock_flow_table, sk->sk_rxhash); + rcu_read_unlock(); +#endif +} + +static inline void sock_rps_save_rxhash(struct sock *sk, u32 rxhash) +{ +#ifdef CONFIG_RPS + if (unlikely(sk->sk_rxhash != rxhash)) { + sock_rps_reset_flow(sk); + sk->sk_rxhash = rxhash; + } +#endif +} + #define sk_wait_event(__sk, __timeo, __condition) \ ({ int __rc; \ release_sock(__sk); \ diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 9f52880fae1..c6c43bcd1c6 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -419,7 +419,7 @@ int inet_release(struct socket *sock) if (sk) { long timeout; - inet_rps_reset_flow(sk); + sock_rps_reset_flow(sk); /* Applications forget to leave groups before exiting */ ip_mc_drop_socket(sk); @@ -722,7 +722,7 @@ int inet_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg, { struct sock *sk = sock->sk; - inet_rps_record_flow(sk); + sock_rps_record_flow(sk); /* We may need to bind the socket. */ if (!inet_sk(sk)->inet_num && inet_autobind(sk)) @@ -737,7 +737,7 @@ static ssize_t inet_sendpage(struct socket *sock, struct page *page, int offset, { struct sock *sk = sock->sk; - inet_rps_record_flow(sk); + sock_rps_record_flow(sk); /* We may need to bind the socket. 
*/ if (!inet_sk(sk)->inet_num && inet_autobind(sk)) @@ -755,7 +755,7 @@ int inet_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg, int addr_len = 0; int err; - inet_rps_record_flow(sk); + sock_rps_record_flow(sk); err = sk->sk_prot->recvmsg(iocb, sk, msg, size, flags & MSG_DONTWAIT, flags & ~MSG_DONTWAIT, &addr_len); diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 4d6717d1e61..771f8146a2e 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1672,7 +1672,7 @@ process: skb->dev = NULL; - inet_rps_save_rxhash(sk, skb->rxhash); + sock_rps_save_rxhash(sk, skb->rxhash); bh_lock_sock_nested(sk); ret = 0; diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 1e18f9cc924..fa3d2874db4 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1217,7 +1217,7 @@ int udp_disconnect(struct sock *sk, int flags) sk->sk_state = TCP_CLOSE; inet->inet_daddr = 0; inet->inet_dport = 0; - inet_rps_save_rxhash(sk, 0); + sock_rps_save_rxhash(sk, 0); sk->sk_bound_dev_if = 0; if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK)) inet_reset_saddr(sk); @@ -1262,7 +1262,7 @@ static int __udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) int rc; if (inet_sk(sk)->inet_daddr) - inet_rps_save_rxhash(sk, skb->rxhash); + sock_rps_save_rxhash(sk, skb->rxhash); rc = sock_queue_rcv_skb(sk, skb); if (rc < 0) { -- cgit v1.2.3-70-g09d2

From c377411f2494a931ff7facdbb3a6839b1266bcf6 Mon Sep 17 00:00:00 2001
From: Eric Dumazet
Date: Tue, 27 Apr 2010 15:13:20 -0700
Subject: net: sk_add_backlog() take rmem_alloc into account

The current socket backlog limit is not enough to really stop DDOS attacks, because the user thread spends a lot of time processing a full backlog each round, and users might spin madly on the socket lock. We should account for both the backlog size and the receive_queue size (aka rmem_alloc) to pace writers, and let the user run without being slowed down too much.

Introduce a sk_rcvqueues_full() helper, to avoid taking the socket lock in stress situations.

Under huge stress from a multiqueue/RPS-enabled NIC, a single-flow UDP receiver can now process ~200,000 pps (instead of ~100 pps before the patch) on an 8-core machine.

Signed-off-by: Eric Dumazet
Signed-off-by: David S. Miller

--- include/net/sock.h | 13 +++++++++++-- net/core/sock.c | 5 ++++- net/ipv4/udp.c | 4 ++++ net/ipv6/udp.c | 8 ++++++++ net/sctp/socket.c | 3 --- 5 files changed, 27 insertions(+), 6 deletions(-) (limited to 'net/ipv4/udp.c') diff --git a/include/net/sock.h b/include/net/sock.h index 07822280d95..cf12b1e61fa 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -256,7 +256,6 @@ struct sock { struct sk_buff *head; struct sk_buff *tail; int len; - int limit; } sk_backlog; wait_queue_head_t *sk_sleep; struct dst_entry *sk_dst_cache; @@ -608,10 +607,20 @@ static inline void __sk_add_backlog(struct sock *sk, struct sk_buff *skb) skb->next = NULL; } +/* + * Take into account size of receive queue and backlog queue + */ +static inline bool sk_rcvqueues_full(const struct sock *sk, const struct sk_buff *skb) +{ + unsigned int qsize = sk->sk_backlog.len + atomic_read(&sk->sk_rmem_alloc); + + return qsize + skb->truesize > sk->sk_rcvbuf; +} + /* The per-socket spinlock must be held here. */
static inline __must_check int sk_add_backlog(struct sock *sk, struct sk_buff *skb) { - if (sk->sk_backlog.len >= max(sk->sk_backlog.limit, sk->sk_rcvbuf << 1)) + if (sk_rcvqueues_full(sk, skb)) return -ENOBUFS; __sk_add_backlog(sk, skb); diff --git a/net/core/sock.c b/net/core/sock.c index 58ebd146ce5..51041759517 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -327,6 +327,10 @@ int sk_receive_skb(struct sock *sk, struct sk_buff *skb, const int nested) skb->dev = NULL; + if (sk_rcvqueues_full(sk, skb)) { + atomic_inc(&sk->sk_drops); + goto discard_and_relse; + } if (nested) bh_lock_sock_nested(sk); else @@ -1885,7 +1889,6 @@ void sock_init_data(struct socket *sock, struct sock *sk) sk->sk_allocation = GFP_KERNEL; sk->sk_rcvbuf = sysctl_rmem_default; sk->sk_sndbuf = sysctl_wmem_default; - sk->sk_backlog.limit = sk->sk_rcvbuf << 1; sk->sk_state = TCP_CLOSE; sk_set_socket(sk, sock); diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index fa3d2874db4..63eb56b2d87 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1372,6 +1372,10 @@ int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) goto drop; } + + if (sk_rcvqueues_full(sk, skb)) + goto drop; + rc = 0; bh_lock_sock(sk); diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 2850e35cee3..3ead20ad9d0 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -584,6 +584,10 @@ static void flush_stack(struct sock **stack, unsigned int count, sk = stack[i]; if (skb1) { + if (sk_rcvqueues_full(sk, skb)) { + kfree_skb(skb1); + goto drop; + } bh_lock_sock(sk); if (!sock_owned_by_user(sk)) udpv6_queue_rcv_skb(sk, skb1); @@ -759,6 +763,10 @@ int __udp6_lib_rcv(struct sk_buff *skb, struct udp_table *udptable, /* deliver */ + if (sk_rcvqueues_full(sk, skb)) { + sock_put(sk); + goto discard; + } bh_lock_sock(sk); if (!sock_owned_by_user(sk)) udpv6_queue_rcv_skb(sk, skb); diff --git a/net/sctp/socket.c b/net/sctp/socket.c index f34adcca8a8..13d8229f3a9 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -3721,9 +3721,6 @@ SCTP_STATIC int sctp_init_sock(struct sock *sk) SCTP_DBG_OBJCNT_INC(sock); percpu_counter_inc(&sctp_sockets_allocated); - /* Set socket backlog limit. */ - sk->sk_backlog.limit = sysctl_sctp_rmem[1]; - local_bh_disable(); sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); local_bh_enable(); -- cgit v1.2.3-70-g09d2

From 4b0b72f7dd617b13abd1b04c947e15873e011a24 Mon Sep 17 00:00:00 2001
From: Eric Dumazet
Date: Wed, 28 Apr 2010 14:35:48 -0700
Subject: net: speedup udp receive path

Since commit 95766fff ([UDP]: Add memory accounting.), each received packet needs one extra lock_sock()/release_sock() pair. This added latency because of possible backlog handling. Then later, ticket spinlocks added yet another latency source in case of DDOS.

This patch introduces lock_sock_bh() and unlock_sock_bh() synchronization primitives, avoiding one atomic operation and backlog processing. skb_free_datagram_locked() uses them instead of full-blown lock_sock()/release_sock(). The skb is orphaned inside the locked section for proper socket memory reclaim, and finally freed outside of it.

The UDP receive path now takes the socket spinlock only once.

Signed-off-by: Eric Dumazet
Signed-off-by: David S. Miller
--- include/net/sock.h | 10 ++++++++++ net/core/datagram.c | 10 +++++++--- net/ipv4/udp.c | 12 ++++++------ net/ipv6/udp.c | 4 ++-- 4 files changed, 25 insertions(+), 11 deletions(-) (limited to 'net/ipv4/udp.c') diff --git a/include/net/sock.h b/include/net/sock.h index cf12b1e61fa..d361c7769fe 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1021,6 +1021,16 @@ extern void release_sock(struct sock *sk); SINGLE_DEPTH_NESTING) #define bh_unlock_sock(__sk) spin_unlock(&((__sk)->sk_lock.slock)) +static inline void lock_sock_bh(struct sock *sk) +{ + spin_lock_bh(&sk->sk_lock.slock); +} + +static inline void unlock_sock_bh(struct sock *sk) +{ + spin_unlock_bh(&sk->sk_lock.slock); +} + extern struct sock *sk_alloc(struct net *net, int family, gfp_t priority, struct proto *prot); diff --git a/net/core/datagram.c b/net/core/datagram.c index 5574a5ddf90..95b851f3d71 100644 --- a/net/core/datagram.c +++ b/net/core/datagram.c @@ -229,9 +229,13 @@ EXPORT_SYMBOL(skb_free_datagram); void skb_free_datagram_locked(struct sock *sk, struct sk_buff *skb) { - lock_sock(sk); - skb_free_datagram(sk, skb); - release_sock(sk); + lock_sock_bh(sk); + skb_orphan(skb); + sk_mem_reclaim_partial(sk); + unlock_sock_bh(sk); + + /* skb is now orphaned, might be freed outside of locked section */ + consume_skb(skb); } EXPORT_SYMBOL(skb_free_datagram_locked); diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 63eb56b2d87..1f86965ba7d 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1062,10 +1062,10 @@ static unsigned int first_packet_length(struct sock *sk) spin_unlock_bh(&rcvq->lock); if (!skb_queue_empty(&list_kill)) { - lock_sock(sk); + lock_sock_bh(sk); __skb_queue_purge(&list_kill); sk_mem_reclaim_partial(sk); - release_sock(sk); + unlock_sock_bh(sk); } return res; } @@ -1196,10 +1196,10 @@ out: return err; csum_copy_err: - lock_sock(sk); + lock_sock_bh(sk); if (!skb_kill_datagram(sk, skb, flags)) UDP_INC_STATS_USER(sock_net(sk), UDP_MIB_INERRORS, is_udplite); - release_sock(sk); + unlock_sock_bh(sk); if (noblock) return -EAGAIN; @@ -1624,9 +1624,9 @@ int udp_rcv(struct sk_buff *skb) void udp_destroy_sock(struct sock *sk) { - lock_sock(sk); + lock_sock_bh(sk); udp_flush_pending_frames(sk); - release_sock(sk); + unlock_sock_bh(sk); } /* diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 3ead20ad9d0..91c60f0090a 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -424,7 +424,7 @@ out: return err; csum_copy_err: - lock_sock(sk); + lock_sock_bh(sk); if (!skb_kill_datagram(sk, skb, flags)) { if (is_udp4) UDP_INC_STATS_USER(sock_net(sk), @@ -433,7 +433,7 @@ csum_copy_err: UDP6_INC_STATS_USER(sock_net(sk), UDP_MIB_INERRORS, is_udplite); } - release_sock(sk); + unlock_sock_bh(sk); if (flags & MSG_DONTWAIT) return -EAGAIN; -- cgit v1.2.3-70-g09d2

From f84af32cbca70a3c6d30463dc08c7984af11c277 Mon Sep 17 00:00:00 2001
From: Eric Dumazet
Date: Wed, 28 Apr 2010 15:31:51 -0700
Subject: net: ip_queue_rcv_skb() helper

When queueing an skb to a socket, we can immediately release its dst if the target socket does not use IP_CMSG_PKTINFO. tcp_data_queue() can drop the dst too. This benefits from a hot cache line and avoids having the receiver, possibly on another CPU, dirty the cache line itself.

Signed-off-by: Eric Dumazet Signed-off-by: David S.
Miller --- include/net/ip.h | 1 + net/ipv4/ip_sockglue.c | 16 ++++++++++++++++ net/ipv4/raw.c | 2 +- net/ipv4/tcp_input.c | 1 + net/ipv4/udp.c | 2 +- net/ipv6/raw.c | 2 +- net/ipv6/udp.c | 2 +- 7 files changed, 22 insertions(+), 4 deletions(-) (limited to 'net/ipv4/udp.c') diff --git a/include/net/ip.h b/include/net/ip.h index a84ceb69268..8149b77cea9 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -393,6 +393,7 @@ extern int ip_options_rcv_srr(struct sk_buff *skb); * Functions provided by ip_sockglue.c */ +extern int ip_queue_rcv_skb(struct sock *sk, struct sk_buff *skb); extern void ip_cmsg_recv(struct msghdr *msg, struct sk_buff *skb); extern int ip_cmsg_send(struct net *net, struct msghdr *msg, struct ipcm_cookie *ipc); diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index b0aa0546a3b..ce231780a2b 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -954,6 +954,22 @@ e_inval: return -EINVAL; } +/** + * ip_queue_rcv_skb - Queue an skb into sock receive queue + * @sk: socket + * @skb: buffer + * + * Queues an skb into socket receive queue. If IP_CMSG_PKTINFO option + * is not set, we drop skb dst entry now, while dst cache line is hot. + */ +int ip_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) +{ + if (!(inet_sk(sk)->cmsg_flags & IP_CMSG_PKTINFO)) + skb_dst_drop(skb); + return sock_queue_rcv_skb(sk, skb); +} +EXPORT_SYMBOL(ip_queue_rcv_skb); + int ip_setsockopt(struct sock *sk, int level, int optname, char __user *optval, unsigned int optlen) { diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index cc6f097fbd5..52ef5af78a4 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -290,7 +290,7 @@ static int raw_rcv_skb(struct sock * sk, struct sk_buff * skb) { /* Charge it to the socket. */ - if (sock_queue_rcv_skb(sk, skb) < 0) { + if (ip_queue_rcv_skb(sk, skb) < 0) { kfree_skb(skb); return NET_RX_DROP; } diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index ae3ec15fb63..e82162c211b 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -4367,6 +4367,7 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb) if (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq) goto drop; + skb_dst_drop(skb); __skb_pull(skb, th->doff * 4); TCP_ECN_accept_cwr(tp, skb); diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 1f86965ba7d..4560b291180 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1264,7 +1264,7 @@ static int __udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) if (inet_sk(sk)->inet_daddr) sock_rps_save_rxhash(sk, skb->rxhash); - rc = sock_queue_rcv_skb(sk, skb); + rc = ip_queue_rcv_skb(sk, skb); if (rc < 0) { int is_udplite = IS_UDPLITE(sk); diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c index 85627386cb0..0e3d2dd9207 100644 --- a/net/ipv6/raw.c +++ b/net/ipv6/raw.c @@ -381,7 +381,7 @@ static inline int rawv6_rcv_skb(struct sock * sk, struct sk_buff * skb) } /* Charge it to the socket. 
*/ - if (sock_queue_rcv_skb(sk, skb) < 0) { + if (ip_queue_rcv_skb(sk, skb) < 0) { kfree_skb(skb); return NET_RX_DROP; } diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 91c60f0090a..79359c8380b 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -514,7 +514,7 @@ int udpv6_queue_rcv_skb(struct sock * sk, struct sk_buff *skb) goto drop; } - if ((rc = sock_queue_rcv_skb(sk, skb)) < 0) { + if ((rc = ip_queue_rcv_skb(sk, skb)) < 0) { /* Note that an ENOMEM error is charged twice */ if (rc == -ENOMEM) UDP6_INC_STATS_BH(sock_net(sk), -- cgit v1.2.3-70-g09d2

From e3826f1e946e7d2354943232f1457be1455a29e2 Mon Sep 17 00:00:00 2001
From: Amerigo Wang
Date: Wed, 5 May 2010 00:27:06 +0000
Subject: net: reserve ports for applications using fixed port numbers

(Dropped the infiniband part because Tetsuo modified the related code; I will send a separate patch for it once this is accepted.)

This patch introduces /proc/sys/net/ipv4/ip_local_reserved_ports which allows users to reserve ports for third-party applications. The reserved ports will not be used by automatic port assignments (e.g. when calling connect() or bind() with port number 0). Explicit port allocation behavior is unchanged.

Signed-off-by: Octavian Purdila
Signed-off-by: WANG Cong
Cc: Neil Horman
Cc: Eric Dumazet
Cc: Eric W. Biederman
Signed-off-by: David S. Miller

--- Documentation/networking/ip-sysctl.txt | 31 +++++++++++++++++++++++++++++++ include/net/ip.h | 6 ++++++ net/ipv4/af_inet.c | 8 +++++++- net/ipv4/inet_connection_sock.c | 6 ++++++ net/ipv4/inet_hashtables.c | 2 ++ net/ipv4/sysctl_net_ipv4.c | 17 +++++++++++++++++ net/ipv4/udp.c | 3 ++- net/sctp/socket.c | 2 ++ 8 files changed, 73 insertions(+), 2 deletions(-) (limited to 'net/ipv4/udp.c') diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt index 8b72c88ba21..d0536b5a4e0 100644 --- a/Documentation/networking/ip-sysctl.txt +++ b/Documentation/networking/ip-sysctl.txt @@ -588,6 +588,37 @@ ip_local_port_range - 2 INTEGERS (i.e. by default) range 1024-4999 is enough to issue up to 2000 connections per second to systems supporting timestamps. +ip_local_reserved_ports - list of comma separated ranges + Specify the ports which are reserved for known third-party + applications. These ports will not be used by automatic port + assignments (e.g. when calling connect() or bind() with port + number 0). Explicit port allocation behavior is unchanged. + + The format used for both input and output is a comma separated + list of ranges (e.g. "1,2-4,10-10" for ports 1, 2, 3, 4 and + 10). Writing to the file will clear all previously reserved + ports and update the current list with the one given in the + input. + + Note that ip_local_port_range and ip_local_reserved_ports + settings are independent and both are considered by the kernel + when determining which ports are available for automatic port + assignments. + + You can reserve ports which are not in the current + ip_local_port_range, e.g.: + + $ cat /proc/sys/net/ipv4/ip_local_port_range + 32000 61000 + $ cat /proc/sys/net/ipv4/ip_local_reserved_ports + 8080,9148 + + although this is redundant. However such a setting is useful + if later the port range is changed to a value that will + include the reserved ports. + + Default: Empty + ip_nonlocal_bind - BOOLEAN If set, allows processes to bind() to non-local IP addresses, which can be quite useful - but may break some applications.
diff --git a/include/net/ip.h b/include/net/ip.h index 8149b77cea9..63548f0a44b 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -184,6 +184,12 @@ extern struct local_ports { } sysctl_local_ports; extern void inet_get_local_port_range(int *low, int *high); +extern unsigned long *sysctl_local_reserved_ports; +static inline int inet_is_reserved_local_port(int port) +{ + return test_bit(port, sysctl_local_reserved_ports); +} + extern int sysctl_ip_default_ttl; extern int sysctl_ip_nonlocal_bind; diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index c6c43bcd1c6..551ce564b03 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -1573,9 +1573,13 @@ static int __init inet_init(void) BUILD_BUG_ON(sizeof(struct inet_skb_parm) > sizeof(dummy_skb->cb)); + sysctl_local_reserved_ports = kzalloc(65536 / 8, GFP_KERNEL); + if (!sysctl_local_reserved_ports) + goto out; + rc = proto_register(&tcp_prot, 1); if (rc) - goto out; + goto out_free_reserved_ports; rc = proto_register(&udp_prot, 1); if (rc) @@ -1674,6 +1678,8 @@ out_unregister_udp_proto: proto_unregister(&udp_prot); out_unregister_tcp_proto: proto_unregister(&tcp_prot); +out_free_reserved_ports: + kfree(sysctl_local_reserved_ports); goto out; } diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index e0a3e3537b1..70eb3507c40 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -37,6 +37,9 @@ struct local_ports sysctl_local_ports __read_mostly = { .range = { 32768, 61000 }, }; +unsigned long *sysctl_local_reserved_ports; +EXPORT_SYMBOL(sysctl_local_reserved_ports); + void inet_get_local_port_range(int *low, int *high) { unsigned seq; @@ -108,6 +111,8 @@ again: smallest_size = -1; do { + if (inet_is_reserved_local_port(rover)) + goto next_nolock; head = &hashinfo->bhash[inet_bhashfn(net, rover, hashinfo->bhash_size)]; spin_lock(&head->lock); @@ -130,6 +135,7 @@ again: break; next: spin_unlock(&head->lock); + next_nolock: if (++rover > high) rover = low; } while (--remaining > 0); diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index 2b79377b468..d3e160a8821 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -456,6 +456,8 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row, local_bh_disable(); for (i = 1; i <= remaining; i++) { port = low + (i + offset) % remaining; + if (inet_is_reserved_local_port(port)) + continue; head = &hinfo->bhash[inet_bhashfn(net, port, hinfo->bhash_size)]; spin_lock(&head->lock); diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 1cd5c15174b..d96c1da4b17 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -299,6 +299,13 @@ static struct ctl_table ipv4_table[] = { .mode = 0644, .proc_handler = ipv4_local_port_range, }, + { + .procname = "ip_local_reserved_ports", + .data = NULL, /* initialized in sysctl_ipv4_init */ + .maxlen = 65536, + .mode = 0644, + .proc_handler = proc_do_large_bitmap, + }, #ifdef CONFIG_IP_MULTICAST { .procname = "igmp_max_memberships", @@ -736,6 +743,16 @@ static __net_initdata struct pernet_operations ipv4_sysctl_ops = { static __init int sysctl_ipv4_init(void) { struct ctl_table_header *hdr; + struct ctl_table *i; + + for (i = ipv4_table; i->procname; i++) { + if (strcmp(i->procname, "ip_local_reserved_ports") == 0) { + i->data = sysctl_local_reserved_ports; + break; + } + } + if (!i->procname) + return -EINVAL; hdr = register_sysctl_paths(net_ipv4_ctl_path, ipv4_table); if (hdr == NULL) diff --git a/net/ipv4/udp.c 
b/net/ipv4/udp.c index f3e00c5cd1e..9de6a698f91 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -233,7 +233,8 @@ int udp_lib_get_port(struct sock *sk, unsigned short snum, */ do { if (low <= snum && snum <= high && - !test_bit(snum >> udptable->log, bitmap)) + !test_bit(snum >> udptable->log, bitmap) && + !inet_is_reserved_local_port(snum)) goto found; snum += rand; } while (snum != first); diff --git a/net/sctp/socket.c b/net/sctp/socket.c index ba1add0b13c..ca44917872d 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -5433,6 +5433,8 @@ static long sctp_get_port_local(struct sock *sk, union sctp_addr *addr) rover++; if ((rover < low) || (rover > high)) rover = low; + if (inet_is_reserved_local_port(rover)) + continue; index = sctp_phashfn(rover); head = &sctp_port_hashtable[index]; sctp_spin_lock(&head->lock); -- cgit v1.2.3-70-g09d2
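As a closing illustration of the ip_local_reserved_ports format documented above, here is a hedged userspace C sketch that parses the comma-separated range list (e.g. "1,2-4,10-10") into a 65536-bit port bitmap. It illustrates the documented syntax only; it is not the kernel's proc_do_large_bitmap() or inet_is_reserved_local_port() implementation, and all names below are local to the example.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define PORT_BITS 65536

/* One bit per port, mirroring the kernel's reserved-ports bitmap idea. */
static unsigned char reserved[PORT_BITS / 8];

static void set_port(unsigned int port)
{
	reserved[port / 8] |= 1u << (port % 8);
}

static int is_reserved(unsigned int port)
{
	return reserved[port / 8] & (1u << (port % 8));
}

/* Parse a list like "1,2-4,10-10"; returns 0 on success, -1 on error. */
static int parse_reserved_ports(const char *list)
{
	char *dup = strdup(list), *tok, *save = NULL;

	if (!dup)
		return -1;
	for (tok = strtok_r(dup, ",", &save); tok;
	     tok = strtok_r(NULL, ",", &save)) {
		unsigned long lo, hi;
		char *end;

		lo = hi = strtoul(tok, &end, 10);
		if (*end == '-')		/* "2-4" style range */
			hi = strtoul(end + 1, &end, 10);
		if (*end != '\0' || lo > hi || hi >= PORT_BITS) {
			free(dup);
			return -1;		/* malformed token */
		}
		while (lo <= hi)
			set_port(lo++);
	}
	free(dup);
	return 0;
}

int main(void)
{
	if (parse_reserved_ports("8080,9148") == 0)
		printf("8080 reserved: %d, 8081 reserved: %d\n",
		       is_reserved(8080), is_reserved(8081));
	return 0;
}

Per the documentation above, writing a new list to /proc/sys/net/ipv4/ip_local_reserved_ports replaces the previously reserved set; during automatic port assignment the kernel then skips any port found in the bitmap, as the inet_connection_sock.c and inet_hashtables.c hunks show.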