diff options
Diffstat (limited to 'net/ipv4')
-rw-r--r-- | net/ipv4/af_inet.c | 12 | ||||
-rw-r--r-- | net/ipv4/fib_trie.c | 50 | ||||
-rw-r--r-- | net/ipv4/ip_gre.c | 6 | ||||
-rw-r--r-- | net/ipv4/ipip.c | 6 | ||||
-rw-r--r-- | net/ipv4/ipmr.c | 2 | ||||
-rw-r--r-- | net/ipv4/route.c | 2 | ||||
-rw-r--r-- | net/ipv4/tcp_output.c | 58 | ||||
-rw-r--r-- | net/ipv4/udp.c | 144 | ||||
-rw-r--r-- | net/ipv4/xfrm4_policy.c | 31 |
9 files changed, 235 insertions, 76 deletions
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 566ea6c4321..197d024b253 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -1187,6 +1187,7 @@ static struct sk_buff *inet_gso_segment(struct sk_buff *skb, int features) int proto; int ihl; int id; + unsigned int offset = 0; if (!(features & NETIF_F_V4_CSUM)) features &= ~NETIF_F_SG; @@ -1229,7 +1230,14 @@ static struct sk_buff *inet_gso_segment(struct sk_buff *skb, int features) skb = segs; do { iph = ip_hdr(skb); - iph->id = htons(id++); + if (proto == IPPROTO_UDP) { + iph->id = htons(id); + iph->frag_off = htons(offset >> 3); + if (skb->next != NULL) + iph->frag_off |= htons(IP_MF); + offset += (skb->len - skb->mac_len - iph->ihl * 4); + } else + iph->id = htons(id++); iph->tot_len = htons(skb->len - skb->mac_len); iph->check = 0; iph->check = ip_fast_csum(skb_network_header(skb), iph->ihl); @@ -1425,6 +1433,8 @@ static struct net_protocol tcp_protocol = { static struct net_protocol udp_protocol = { .handler = udp_rcv, .err_handler = udp_err, + .gso_send_check = udp4_ufo_send_check, + .gso_segment = udp4_ufo_fragment, .no_policy = 1, .netns_ok = 1, }; diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index 63c2fa7b68c..d58b4911538 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -164,6 +164,14 @@ static struct tnode *inflate(struct trie *t, struct tnode *tn); static struct tnode *halve(struct trie *t, struct tnode *tn); /* tnodes to free after resize(); protected by RTNL */ static struct tnode *tnode_free_head; +static size_t tnode_free_size; + +/* + * synchronize_rcu after call_rcu for that many pages; it should be especially + * useful before resizing the root node with PREEMPT_NONE configs; the value was + * obtained experimentally, aiming to avoid visible slowdown. + */ +static const int sync_pages = 128; static struct kmem_cache *fn_alias_kmem __read_mostly; static struct kmem_cache *trie_leaf_kmem __read_mostly; @@ -319,6 +327,8 @@ static const int inflate_threshold = 50; static const int halve_threshold_root = 15; static const int inflate_threshold_root = 25; +static int inflate_threshold_root_fix; +#define INFLATE_FIX_MAX 10 /* a comment in resize() */ static void __alias_free_mem(struct rcu_head *head) { @@ -393,6 +403,8 @@ static void tnode_free_safe(struct tnode *tn) BUG_ON(IS_LEAF(tn)); tn->tnode_free = tnode_free_head; tnode_free_head = tn; + tnode_free_size += sizeof(struct tnode) + + (sizeof(struct node *) << tn->bits); } static void tnode_free_flush(void) @@ -404,6 +416,11 @@ static void tnode_free_flush(void) tn->tnode_free = NULL; tnode_free(tn); } + + if (tnode_free_size >= PAGE_SIZE * sync_pages) { + tnode_free_size = 0; + synchronize_rcu(); + } } static struct leaf *leaf_new(void) @@ -602,7 +619,8 @@ static struct node *resize(struct trie *t, struct tnode *tn) /* Keep root node larger */ if (!tn->parent) - inflate_threshold_use = inflate_threshold_root; + inflate_threshold_use = inflate_threshold_root + + inflate_threshold_root_fix; else inflate_threshold_use = inflate_threshold; @@ -626,15 +644,27 @@ static struct node *resize(struct trie *t, struct tnode *tn) } if (max_resize < 0) { - if (!tn->parent) - pr_warning("Fix inflate_threshold_root." - " Now=%d size=%d bits\n", - inflate_threshold_root, tn->bits); - else + if (!tn->parent) { + /* + * It was observed that during large updates even + * inflate_threshold_root = 35 might be needed to avoid + * this warning; but it should be temporary, so let's + * try to handle this automatically. + */ + if (inflate_threshold_root_fix < INFLATE_FIX_MAX) + inflate_threshold_root_fix++; + else + pr_warning("Fix inflate_threshold_root." + " Now=%d size=%d bits fix=%d\n", + inflate_threshold_root, tn->bits, + inflate_threshold_root_fix); + } else { pr_warning("Fix inflate_threshold." " Now=%d size=%d bits\n", inflate_threshold, tn->bits); - } + } + } else if (max_resize > 3 && !tn->parent && inflate_threshold_root_fix) + inflate_threshold_root_fix--; check_tnode(tn); @@ -1435,7 +1465,7 @@ static int fn_trie_lookup(struct fib_table *tb, const struct flowi *flp, cindex = tkey_extract_bits(mask_pfx(key, current_prefix_length), pos, bits); - n = tnode_get_child(pn, cindex); + n = tnode_get_child_rcu(pn, cindex); if (n == NULL) { #ifdef CONFIG_IP_FIB_TRIE_STATS @@ -1570,7 +1600,7 @@ backtrace: if (chopped_off <= pn->bits) { cindex &= ~(1 << (chopped_off-1)); } else { - struct tnode *parent = node_parent((struct node *) pn); + struct tnode *parent = node_parent_rcu((struct node *) pn); if (!parent) goto failed; @@ -1783,7 +1813,7 @@ static struct leaf *trie_firstleaf(struct trie *t) static struct leaf *trie_nextleaf(struct leaf *l) { struct node *c = (struct node *) l; - struct tnode *p = node_parent(c); + struct tnode *p = node_parent_rcu(c); if (!p) return NULL; /* trie with just one leaf */ diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index cb4a0f4bd5e..b902ef55be7 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -821,7 +821,7 @@ static int ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) stats->tx_dropped++; dev_kfree_skb(skb); tunnel->recursion--; - return 0; + return NETDEV_TX_OK; } if (skb->sk) skb_set_owner_w(new_skb, skb->sk); @@ -889,7 +889,7 @@ static int ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) IPTUNNEL_XMIT(); tunnel->recursion--; - return 0; + return NETDEV_TX_OK; tx_error_icmp: dst_link_failure(skb); @@ -898,7 +898,7 @@ tx_error: stats->tx_errors++; dev_kfree_skb(skb); tunnel->recursion--; - return 0; + return NETDEV_TX_OK; } static int ipgre_tunnel_bind_dev(struct net_device *dev) diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c index 93e2b787da2..98075b6d619 100644 --- a/net/ipv4/ipip.c +++ b/net/ipv4/ipip.c @@ -486,7 +486,7 @@ static int ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) stats->tx_dropped++; dev_kfree_skb(skb); tunnel->recursion--; - return 0; + return NETDEV_TX_OK; } if (skb->sk) skb_set_owner_w(new_skb, skb->sk); @@ -524,7 +524,7 @@ static int ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) IPTUNNEL_XMIT(); tunnel->recursion--; - return 0; + return NETDEV_TX_OK; tx_error_icmp: dst_link_failure(skb); @@ -532,7 +532,7 @@ tx_error: stats->tx_errors++; dev_kfree_skb(skb); tunnel->recursion--; - return 0; + return NETDEV_TX_OK; } static void ipip_tunnel_bind_dev(struct net_device *dev) diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index 9a8da5ed92b..06c33fb6b32 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -212,7 +212,7 @@ static int reg_vif_xmit(struct sk_buff *skb, struct net_device *dev) IGMPMSG_WHOLEPKT); read_unlock(&mrt_lock); kfree_skb(skb); - return 0; + return NETDEV_TX_OK; } static const struct net_device_ops reg_vif_netdev_ops = { diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 278f46f5011..fafbe163e2b 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -3442,7 +3442,7 @@ int __init ip_rt_init(void) printk(KERN_ERR "Unable to create route proc files\n"); #ifdef CONFIG_XFRM xfrm_init(); - xfrm4_init(); + xfrm4_init(ip_rt_max_size); #endif rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL); diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index bd62712848f..4e004424d40 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -59,6 +59,7 @@ int sysctl_tcp_base_mss __read_mostly = 512; /* By default, RFC2861 behavior. */ int sysctl_tcp_slow_start_after_idle __read_mostly = 1; +/* Account for new data that has been sent to the network. */ static void tcp_event_new_data_sent(struct sock *sk, struct sk_buff *skb) { struct tcp_sock *tp = tcp_sk(sk); @@ -142,6 +143,7 @@ static void tcp_cwnd_restart(struct sock *sk, struct dst_entry *dst) tp->snd_cwnd_used = 0; } +/* Congestion state accounting after a packet has been sent. */ static void tcp_event_data_sent(struct tcp_sock *tp, struct sk_buff *skb, struct sock *sk) { @@ -161,6 +163,7 @@ static void tcp_event_data_sent(struct tcp_sock *tp, icsk->icsk_ack.pingpong = 1; } +/* Account for an ACK we sent. */ static inline void tcp_event_ack_sent(struct sock *sk, unsigned int pkts) { tcp_dec_quickack_mode(sk, pkts); @@ -276,6 +279,7 @@ static u16 tcp_select_window(struct sock *sk) return new_win; } +/* Packet ECN state for a SYN-ACK */ static inline void TCP_ECN_send_synack(struct tcp_sock *tp, struct sk_buff *skb) { TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_CWR; @@ -283,6 +287,7 @@ static inline void TCP_ECN_send_synack(struct tcp_sock *tp, struct sk_buff *skb) TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_ECE; } +/* Packet ECN state for a SYN. */ static inline void TCP_ECN_send_syn(struct sock *sk, struct sk_buff *skb) { struct tcp_sock *tp = tcp_sk(sk); @@ -301,6 +306,9 @@ TCP_ECN_make_synack(struct request_sock *req, struct tcphdr *th) th->ece = 1; } +/* Set up ECN state for a packet on a ESTABLISHED socket that is about to + * be sent. + */ static inline void TCP_ECN_send(struct sock *sk, struct sk_buff *skb, int tcp_header_len) { @@ -362,7 +370,9 @@ struct tcp_out_options { __u32 tsval, tsecr; /* need to include OPTION_TS */ }; -/* Beware: Something in the Internet is very sensitive to the ordering of +/* Write previously computed TCP options to the packet. + * + * Beware: Something in the Internet is very sensitive to the ordering of * TCP options, we learned this through the hard way, so be careful here. * Luckily we can at least blame others for their non-compliance but from * inter-operatibility perspective it seems that we're somewhat stuck with @@ -445,6 +455,9 @@ static void tcp_options_write(__be32 *ptr, struct tcp_sock *tp, } } +/* Compute TCP options for SYN packets. This is not the final + * network wire format yet. + */ static unsigned tcp_syn_options(struct sock *sk, struct sk_buff *skb, struct tcp_out_options *opts, struct tcp_md5sig_key **md5) { @@ -493,6 +506,7 @@ static unsigned tcp_syn_options(struct sock *sk, struct sk_buff *skb, return size; } +/* Set up TCP options for SYN-ACKs. */ static unsigned tcp_synack_options(struct sock *sk, struct request_sock *req, unsigned mss, struct sk_buff *skb, @@ -541,6 +555,9 @@ static unsigned tcp_synack_options(struct sock *sk, return size; } +/* Compute TCP options for ESTABLISHED sockets. This is not the + * final wire format yet. + */ static unsigned tcp_established_options(struct sock *sk, struct sk_buff *skb, struct tcp_out_options *opts, struct tcp_md5sig_key **md5) { @@ -705,7 +722,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, return net_xmit_eval(err); } -/* This routine just queue's the buffer +/* This routine just queues the buffer for sending. * * NOTE: probe0 timer is not checked, do not forget tcp_push_pending_frames, * otherwise socket can stall. @@ -722,6 +739,7 @@ static void tcp_queue_skb(struct sock *sk, struct sk_buff *skb) sk_mem_charge(sk, skb->truesize); } +/* Initialize TSO segments for a packet. */ static void tcp_set_skb_tso_segs(struct sock *sk, struct sk_buff *skb, unsigned int mss_now) { @@ -909,6 +927,7 @@ static void __pskb_trim_head(struct sk_buff *skb, int len) skb->len = skb->data_len; } +/* Remove acked data from a packet in the transmit queue. */ int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len) { if (skb_cloned(skb) && pskb_expand_head(skb, 0, 0, GFP_ATOMIC)) @@ -937,7 +956,7 @@ int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len) return 0; } -/* Not accounting for SACKs here. */ +/* Calculate MSS. Not accounting for SACKs here. */ int tcp_mtu_to_mss(struct sock *sk, int pmtu) { struct tcp_sock *tp = tcp_sk(sk); @@ -981,6 +1000,7 @@ int tcp_mss_to_mtu(struct sock *sk, int mss) return mtu; } +/* MTU probing init per socket */ void tcp_mtup_init(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); @@ -1143,7 +1163,8 @@ static inline unsigned int tcp_cwnd_test(struct tcp_sock *tp, return 0; } -/* This must be invoked the first time we consider transmitting +/* Intialize TSO state of a skb. + * This must be invoked the first time we consider transmitting * SKB onto the wire. */ static int tcp_init_tso_segs(struct sock *sk, struct sk_buff *skb, @@ -1158,6 +1179,7 @@ static int tcp_init_tso_segs(struct sock *sk, struct sk_buff *skb, return tso_segs; } +/* Minshall's variant of the Nagle send check. */ static inline int tcp_minshall_check(const struct tcp_sock *tp) { return after(tp->snd_sml, tp->snd_una) && @@ -1242,6 +1264,7 @@ static unsigned int tcp_snd_test(struct sock *sk, struct sk_buff *skb, return cwnd_quota; } +/* Test if sending is allowed right now. */ int tcp_may_send_now(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); @@ -1378,6 +1401,10 @@ send_now: } /* Create a new MTU probe if we are ready. + * MTU probe is regularly attempting to increase the path MTU by + * deliberately sending larger packets. This discovers routing + * changes resulting in larger path MTUs. + * * Returns 0 if we should wait to probe (no cwnd available), * 1 if a probe was sent, * -1 otherwise @@ -1790,6 +1817,7 @@ static void tcp_collapse_retrans(struct sock *sk, struct sk_buff *skb) sk_wmem_free_skb(sk, next_skb); } +/* Check if coalescing SKBs is legal. */ static int tcp_can_collapse(struct sock *sk, struct sk_buff *skb) { if (tcp_skb_pcount(skb) > 1) @@ -1808,6 +1836,9 @@ static int tcp_can_collapse(struct sock *sk, struct sk_buff *skb) return 1; } +/* Collapse packets in the retransmit queue to make to create + * less packets on the wire. This is only done on retransmission. + */ static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *to, int space) { @@ -1957,6 +1988,9 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) return err; } +/* Check if we forward retransmits are possible in the current + * window/congestion state. + */ static int tcp_can_forward_retransmit(struct sock *sk) { const struct inet_connection_sock *icsk = inet_csk(sk); @@ -2145,7 +2179,8 @@ void tcp_send_active_reset(struct sock *sk, gfp_t priority) TCP_INC_STATS(sock_net(sk), TCP_MIB_OUTRSTS); } -/* WARNING: This routine must only be called when we have already sent +/* Send a crossed SYN-ACK during socket establishment. + * WARNING: This routine must only be called when we have already sent * a SYN packet that crossed the incoming SYN that caused this routine * to get called. If this assumption fails then the initial rcv_wnd * and rcv_wscale values will not be correct. @@ -2180,9 +2215,7 @@ int tcp_send_synack(struct sock *sk) return tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC); } -/* - * Prepare a SYN-ACK. - */ +/* Prepare a SYN-ACK. */ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst, struct request_sock *req) { @@ -2269,9 +2302,7 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst, return skb; } -/* - * Do all connect socket setups that can be done AF independent. - */ +/* Do all connect socket setups that can be done AF independent. */ static void tcp_connect_init(struct sock *sk) { struct dst_entry *dst = __sk_dst_get(sk); @@ -2330,9 +2361,7 @@ static void tcp_connect_init(struct sock *sk) tcp_clear_retrans(tp); } -/* - * Build a SYN and send it off. - */ +/* Build a SYN and send it off. */ int tcp_connect(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); @@ -2493,6 +2522,7 @@ static int tcp_xmit_probe_skb(struct sock *sk, int urgent) return tcp_transmit_skb(sk, skb, 0, GFP_ATOMIC); } +/* Initiate keepalive or window probe from timer. */ int tcp_write_wakeup(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 80e3812837a..29ebb0d27a1 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -110,11 +110,12 @@ struct udp_table udp_table; EXPORT_SYMBOL(udp_table); int sysctl_udp_mem[3] __read_mostly; -int sysctl_udp_rmem_min __read_mostly; -int sysctl_udp_wmem_min __read_mostly; - EXPORT_SYMBOL(sysctl_udp_mem); + +int sysctl_udp_rmem_min __read_mostly; EXPORT_SYMBOL(sysctl_udp_rmem_min); + +int sysctl_udp_wmem_min __read_mostly; EXPORT_SYMBOL(sysctl_udp_wmem_min); atomic_t udp_memory_allocated; @@ -158,7 +159,7 @@ static int udp_lib_lport_inuse(struct net *net, __u16 num, */ int udp_lib_get_port(struct sock *sk, unsigned short snum, int (*saddr_comp)(const struct sock *sk1, - const struct sock *sk2 ) ) + const struct sock *sk2)) { struct udp_hslot *hslot; struct udp_table *udptable = sk->sk_prot->h.udp_table; @@ -221,14 +222,15 @@ fail_unlock: fail: return error; } +EXPORT_SYMBOL(udp_lib_get_port); static int ipv4_rcv_saddr_equal(const struct sock *sk1, const struct sock *sk2) { struct inet_sock *inet1 = inet_sk(sk1), *inet2 = inet_sk(sk2); - return ( !ipv6_only_sock(sk2) && - (!inet1->rcv_saddr || !inet2->rcv_saddr || - inet1->rcv_saddr == inet2->rcv_saddr )); + return (!ipv6_only_sock(sk2) && + (!inet1->rcv_saddr || !inet2->rcv_saddr || + inet1->rcv_saddr == inet2->rcv_saddr)); } int udp_v4_get_port(struct sock *sk, unsigned short snum) @@ -383,8 +385,8 @@ found: void __udp4_lib_err(struct sk_buff *skb, u32 info, struct udp_table *udptable) { struct inet_sock *inet; - struct iphdr *iph = (struct iphdr*)skb->data; - struct udphdr *uh = (struct udphdr*)(skb->data+(iph->ihl<<2)); + struct iphdr *iph = (struct iphdr *)skb->data; + struct udphdr *uh = (struct udphdr *)(skb->data+(iph->ihl<<2)); const int type = icmp_hdr(skb)->type; const int code = icmp_hdr(skb)->code; struct sock *sk; @@ -439,7 +441,7 @@ void __udp4_lib_err(struct sk_buff *skb, u32 info, struct udp_table *udptable) if (!harderr || sk->sk_state != TCP_ESTABLISHED) goto out; } else { - ip_icmp_error(sk, skb, err, uh->dest, info, (u8*)(uh+1)); + ip_icmp_error(sk, skb, err, uh->dest, info, (u8 *)(uh+1)); } sk->sk_err = err; sk->sk_error_report(sk); @@ -474,7 +476,7 @@ EXPORT_SYMBOL(udp_flush_pending_frames); * (checksum field must be zeroed out) */ static void udp4_hwcsum_outgoing(struct sock *sk, struct sk_buff *skb, - __be32 src, __be32 dst, int len ) + __be32 src, __be32 dst, int len) { unsigned int offset; struct udphdr *uh = udp_hdr(skb); @@ -545,7 +547,7 @@ static int udp_push_pending_frames(struct sock *sk) } else if (skb->ip_summed == CHECKSUM_PARTIAL) { /* UDP hardware csum */ - udp4_hwcsum_outgoing(sk, skb, fl->fl4_src,fl->fl4_dst, up->len); + udp4_hwcsum_outgoing(sk, skb, fl->fl4_src, fl->fl4_dst, up->len); goto send; } else /* `normal' UDP */ @@ -553,7 +555,7 @@ static int udp_push_pending_frames(struct sock *sk) /* add protocol-dependent pseudo-header */ uh->check = csum_tcpudp_magic(fl->fl4_src, fl->fl4_dst, up->len, - sk->sk_protocol, csum ); + sk->sk_protocol, csum); if (uh->check == 0) uh->check = CSUM_MANGLED_0; @@ -592,7 +594,7 @@ int udp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, * Check the flags. */ - if (msg->msg_flags&MSG_OOB) /* Mirror BSD error message compatibility */ + if (msg->msg_flags & MSG_OOB) /* Mirror BSD error message compatibility */ return -EOPNOTSUPP; ipc.opt = NULL; @@ -619,7 +621,7 @@ int udp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, * Get and verify the address. */ if (msg->msg_name) { - struct sockaddr_in * usin = (struct sockaddr_in*)msg->msg_name; + struct sockaddr_in * usin = (struct sockaddr_in *)msg->msg_name; if (msg->msg_namelen < sizeof(*usin)) return -EINVAL; if (usin->sin_family != AF_INET) { @@ -684,7 +686,7 @@ int udp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, } if (connected) - rt = (struct rtable*)sk_dst_check(sk, 0); + rt = (struct rtable *)sk_dst_check(sk, 0); if (rt == NULL) { struct flowi fl = { .oif = ipc.oif, @@ -782,6 +784,7 @@ do_confirm: err = 0; goto out; } +EXPORT_SYMBOL(udp_sendmsg); int udp_sendpage(struct sock *sk, struct page *page, int offset, size_t size, int flags) @@ -871,6 +874,7 @@ int udp_ioctl(struct sock *sk, int cmd, unsigned long arg) return 0; } +EXPORT_SYMBOL(udp_ioctl); /* * This should be easy, if there is something there we @@ -892,7 +896,7 @@ int udp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, * Check any passed addresses */ if (addr_len) - *addr_len=sizeof(*sin); + *addr_len = sizeof(*sin); if (flags & MSG_ERRQUEUE) return ip_recv_error(sk, msg, len); @@ -923,9 +927,11 @@ try_again: if (skb_csum_unnecessary(skb)) err = skb_copy_datagram_iovec(skb, sizeof(struct udphdr), - msg->msg_iov, copied ); + msg->msg_iov, copied); else { - err = skb_copy_and_csum_datagram_iovec(skb, sizeof(struct udphdr), msg->msg_iov); + err = skb_copy_and_csum_datagram_iovec(skb, + sizeof(struct udphdr), + msg->msg_iov); if (err == -EINVAL) goto csum_copy_err; @@ -941,8 +947,7 @@ try_again: sock_recv_timestamp(msg, sk, skb); /* Copy the address. */ - if (sin) - { + if (sin) { sin->sin_family = AF_INET; sin->sin_port = udp_hdr(skb)->source; sin->sin_addr.s_addr = ip_hdr(skb)->saddr; @@ -995,6 +1000,7 @@ int udp_disconnect(struct sock *sk, int flags) sk_dst_reset(sk); return 0; } +EXPORT_SYMBOL(udp_disconnect); void udp_lib_unhash(struct sock *sk) { @@ -1044,7 +1050,7 @@ drop: * Note that in the success and error cases, the skb is assumed to * have either been requeued or freed. */ -int udp_queue_rcv_skb(struct sock * sk, struct sk_buff *skb) +int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) { struct udp_sock *up = udp_sk(sk); int rc; @@ -1214,7 +1220,7 @@ static inline int udp4_csum_init(struct sk_buff *skb, struct udphdr *uh, if (uh->check == 0) { skb->ip_summed = CHECKSUM_UNNECESSARY; } else if (skb->ip_summed == CHECKSUM_COMPLETE) { - if (!csum_tcpudp_magic(iph->saddr, iph->daddr, skb->len, + if (!csum_tcpudp_magic(iph->saddr, iph->daddr, skb->len, proto, skb->csum)) skb->ip_summed = CHECKSUM_UNNECESSARY; } @@ -1355,7 +1361,7 @@ int udp_lib_setsockopt(struct sock *sk, int level, int optname, int err = 0; int is_udplite = IS_UDPLITE(sk); - if (optlen<sizeof(int)) + if (optlen < sizeof(int)) return -EINVAL; if (get_user(val, (int __user *)optval)) @@ -1426,6 +1432,7 @@ int udp_lib_setsockopt(struct sock *sk, int level, int optname, return err; } +EXPORT_SYMBOL(udp_lib_setsockopt); int udp_setsockopt(struct sock *sk, int level, int optname, char __user *optval, int optlen) @@ -1453,7 +1460,7 @@ int udp_lib_getsockopt(struct sock *sk, int level, int optname, struct udp_sock *up = udp_sk(sk); int val, len; - if (get_user(len,optlen)) + if (get_user(len, optlen)) return -EFAULT; len = min_t(unsigned int, len, sizeof(int)); @@ -1486,10 +1493,11 @@ int udp_lib_getsockopt(struct sock *sk, int level, int optname, if (put_user(len, optlen)) return -EFAULT; - if (copy_to_user(optval, &val,len)) + if (copy_to_user(optval, &val, len)) return -EFAULT; return 0; } +EXPORT_SYMBOL(udp_lib_getsockopt); int udp_getsockopt(struct sock *sk, int level, int optname, char __user *optval, int __user *optlen) @@ -1528,9 +1536,9 @@ unsigned int udp_poll(struct file *file, struct socket *sock, poll_table *wait) int is_lite = IS_UDPLITE(sk); /* Check for false positives due to checksum errors */ - if ( (mask & POLLRDNORM) && - !(file->f_flags & O_NONBLOCK) && - !(sk->sk_shutdown & RCV_SHUTDOWN)){ + if ((mask & POLLRDNORM) && + !(file->f_flags & O_NONBLOCK) && + !(sk->sk_shutdown & RCV_SHUTDOWN)) { struct sk_buff_head *rcvq = &sk->sk_receive_queue; struct sk_buff *skb; @@ -1552,6 +1560,7 @@ unsigned int udp_poll(struct file *file, struct socket *sock, poll_table *wait) return mask; } +EXPORT_SYMBOL(udp_poll); struct proto udp_prot = { .name = "UDP", @@ -1582,6 +1591,7 @@ struct proto udp_prot = { .compat_getsockopt = compat_udp_getsockopt, #endif }; +EXPORT_SYMBOL(udp_prot); /* ------------------------------------------------------------------------ */ #ifdef CONFIG_PROC_FS @@ -1703,11 +1713,13 @@ int udp_proc_register(struct net *net, struct udp_seq_afinfo *afinfo) rc = -ENOMEM; return rc; } +EXPORT_SYMBOL(udp_proc_register); void udp_proc_unregister(struct net *net, struct udp_seq_afinfo *afinfo) { proc_net_remove(net, afinfo->name); } +EXPORT_SYMBOL(udp_proc_unregister); /* ------------------------------------------------------------------------ */ static void udp4_format_sock(struct sock *sp, struct seq_file *f, @@ -1741,7 +1753,7 @@ int udp4_seq_show(struct seq_file *seq, void *v) int len; udp4_format_sock(v, seq, state->bucket, &len); - seq_printf(seq, "%*s\n", 127 - len ,""); + seq_printf(seq, "%*s\n", 127 - len, ""); } return 0; } @@ -1816,16 +1828,64 @@ void __init udp_init(void) sysctl_udp_wmem_min = SK_MEM_QUANTUM; } -EXPORT_SYMBOL(udp_disconnect); -EXPORT_SYMBOL(udp_ioctl); -EXPORT_SYMBOL(udp_prot); -EXPORT_SYMBOL(udp_sendmsg); -EXPORT_SYMBOL(udp_lib_getsockopt); -EXPORT_SYMBOL(udp_lib_setsockopt); -EXPORT_SYMBOL(udp_poll); -EXPORT_SYMBOL(udp_lib_get_port); +int udp4_ufo_send_check(struct sk_buff *skb) +{ + const struct iphdr *iph; + struct udphdr *uh; + + if (!pskb_may_pull(skb, sizeof(*uh))) + return -EINVAL; + + iph = ip_hdr(skb); + uh = udp_hdr(skb); + + uh->check = ~csum_tcpudp_magic(iph->saddr, iph->daddr, skb->len, + IPPROTO_UDP, 0); + skb->csum_start = skb_transport_header(skb) - skb->head; + skb->csum_offset = offsetof(struct udphdr, check); + skb->ip_summed = CHECKSUM_PARTIAL; + return 0; +} + +struct sk_buff *udp4_ufo_fragment(struct sk_buff *skb, int features) +{ + struct sk_buff *segs = ERR_PTR(-EINVAL); + unsigned int mss; + int offset; + __wsum csum; + + mss = skb_shinfo(skb)->gso_size; + if (unlikely(skb->len <= mss)) + goto out; + + if (skb_gso_ok(skb, features | NETIF_F_GSO_ROBUST)) { + /* Packet is from an untrusted source, reset gso_segs. */ + int type = skb_shinfo(skb)->gso_type; + + if (unlikely(type & ~(SKB_GSO_UDP | SKB_GSO_DODGY) || + !(type & (SKB_GSO_UDP)))) + goto out; + + skb_shinfo(skb)->gso_segs = DIV_ROUND_UP(skb->len, mss); + + segs = NULL; + goto out; + } + + /* Do software UFO. Complete and fill in the UDP checksum as HW cannot + * do checksum of UDP packets sent as multiple IP fragments. + */ + offset = skb->csum_start - skb_headroom(skb); + csum = skb_checksum(skb, offset, skb->len - offset, 0); + offset += skb->csum_offset; + *(__sum16 *)(skb->data + offset) = csum_fold(csum); + skb->ip_summed = CHECKSUM_NONE; + + /* Fragment the skb. IP headers of the fragments are updated in + * inet_gso_segment() + */ + segs = skb_segment(skb, features); +out: + return segs; +} -#ifdef CONFIG_PROC_FS -EXPORT_SYMBOL(udp_proc_register); -EXPORT_SYMBOL(udp_proc_unregister); -#endif diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c index 0071ee6f441..1ba44742ebb 100644 --- a/net/ipv4/xfrm4_policy.c +++ b/net/ipv4/xfrm4_policy.c @@ -264,6 +264,20 @@ static struct xfrm_policy_afinfo xfrm4_policy_afinfo = { .fill_dst = xfrm4_fill_dst, }; +static struct ctl_table xfrm4_policy_table[] = { + { + .ctl_name = CTL_UNNUMBERED, + .procname = "xfrm4_gc_thresh", + .data = &xfrm4_dst_ops.gc_thresh, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { } +}; + +static struct ctl_table_header *sysctl_hdr; + static void __init xfrm4_policy_init(void) { xfrm_policy_register_afinfo(&xfrm4_policy_afinfo); @@ -271,12 +285,27 @@ static void __init xfrm4_policy_init(void) static void __exit xfrm4_policy_fini(void) { + if (sysctl_hdr) + unregister_net_sysctl_table(sysctl_hdr); xfrm_policy_unregister_afinfo(&xfrm4_policy_afinfo); } -void __init xfrm4_init(void) +void __init xfrm4_init(int rt_max_size) { xfrm4_state_init(); xfrm4_policy_init(); + /* + * Select a default value for the gc_thresh based on the main route + * table hash size. It seems to me the worst case scenario is when + * we have ipsec operating in transport mode, in which we create a + * dst_entry per socket. The xfrm gc algorithm starts trying to remove + * entries at gc_thresh, and prevents new allocations as 2*gc_thresh + * so lets set an initial xfrm gc_thresh value at the rt_max_size/2. + * That will let us store an ipsec connection per route table entry, + * and start cleaning when were 1/2 full + */ + xfrm4_dst_ops.gc_thresh = rt_max_size/2; + sysctl_hdr = register_net_sysctl_table(&init_net, net_ipv4_ctl_path, + xfrm4_policy_table); } |