diff options
Diffstat (limited to 'net/ipv4')
53 files changed, 1128 insertions, 597 deletions
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile index f8c49ce5b28..f032688d20d 100644 --- a/net/ipv4/Makefile +++ b/net/ipv4/Makefile @@ -55,4 +55,4 @@ obj-$(CONFIG_MEMCG_KMEM) += tcp_memcontrol.o obj-$(CONFIG_NETLABEL) += cipso_ipv4.o obj-$(CONFIG_XFRM) += xfrm4_policy.o xfrm4_state.o xfrm4_input.o \ - xfrm4_output.o + xfrm4_output.o xfrm4_protocol.o diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 19ab78aca54..6d6dd345bc4 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -1505,9 +1505,9 @@ u64 snmp_fold_field64(void __percpu *mib[], int offt, size_t syncp_offset) bhptr = per_cpu_ptr(mib[0], cpu); syncp = (struct u64_stats_sync *)(bhptr + syncp_offset); do { - start = u64_stats_fetch_begin_bh(syncp); + start = u64_stats_fetch_begin_irq(syncp); v = *(((u64 *) bhptr) + offt); - } while (u64_stats_fetch_retry_bh(syncp, start)); + } while (u64_stats_fetch_retry_irq(syncp, start)); res += v; } @@ -1650,6 +1650,39 @@ static int __init init_ipv4_mibs(void) return register_pernet_subsys(&ipv4_mib_ops); } +static __net_init int inet_init_net(struct net *net) +{ + /* + * Set defaults for local port range + */ + seqlock_init(&net->ipv4.ip_local_ports.lock); + net->ipv4.ip_local_ports.range[0] = 32768; + net->ipv4.ip_local_ports.range[1] = 61000; + + seqlock_init(&net->ipv4.ping_group_range.lock); + /* + * Sane defaults - nobody may create ping sockets. + * Boot scripts should set this to distro-specific group. + */ + net->ipv4.ping_group_range.range[0] = make_kgid(&init_user_ns, 1); + net->ipv4.ping_group_range.range[1] = make_kgid(&init_user_ns, 0); + return 0; +} + +static __net_exit void inet_exit_net(struct net *net) +{ +} + +static __net_initdata struct pernet_operations af_inet_ops = { + .init = inet_init_net, + .exit = inet_exit_net, +}; + +static int __init init_inet_pernet_ops(void) +{ + return register_pernet_subsys(&af_inet_ops); +} + static int ipv4_proc_init(void); /* @@ -1794,6 +1827,9 @@ static int __init inet_init(void) if (ip_mr_init()) pr_crit("%s: Cannot init ipv4 mroute\n", __func__); #endif + + if (init_inet_pernet_ops()) + pr_crit("%s: Cannot init ipv4 inet pernet ops\n", __func__); /* * Initialise per-cpu ipv4 mibs */ diff --git a/net/ipv4/ah4.c b/net/ipv4/ah4.c index 717902669d2..a2afa89513a 100644 --- a/net/ipv4/ah4.c +++ b/net/ipv4/ah4.c @@ -155,6 +155,10 @@ static int ah_output(struct xfrm_state *x, struct sk_buff *skb) struct iphdr *iph, *top_iph; struct ip_auth_hdr *ah; struct ah_data *ahp; + int seqhi_len = 0; + __be32 *seqhi; + int sglists = 0; + struct scatterlist *seqhisg; ahp = x->data; ahash = ahp->ahash; @@ -167,14 +171,19 @@ static int ah_output(struct xfrm_state *x, struct sk_buff *skb) ah = ip_auth_hdr(skb); ihl = ip_hdrlen(skb); + if (x->props.flags & XFRM_STATE_ESN) { + sglists = 1; + seqhi_len = sizeof(*seqhi); + } err = -ENOMEM; - iph = ah_alloc_tmp(ahash, nfrags, ihl); + iph = ah_alloc_tmp(ahash, nfrags + sglists, ihl + seqhi_len); if (!iph) goto out; - - icv = ah_tmp_icv(ahash, iph, ihl); + seqhi = (__be32 *)((char *)iph + ihl); + icv = ah_tmp_icv(ahash, seqhi, seqhi_len); req = ah_tmp_req(ahash, icv); sg = ah_req_sg(ahash, req); + seqhisg = sg + nfrags; memset(ah->auth_data, 0, ahp->icv_trunc_len); @@ -210,10 +219,15 @@ static int ah_output(struct xfrm_state *x, struct sk_buff *skb) ah->spi = x->id.spi; ah->seq_no = htonl(XFRM_SKB_CB(skb)->seq.output.low); - sg_init_table(sg, nfrags); - skb_to_sgvec(skb, sg, 0, skb->len); + sg_init_table(sg, nfrags + sglists); + skb_to_sgvec_nomark(skb, sg, 0, skb->len); - ahash_request_set_crypt(req, sg, icv, skb->len); + if (x->props.flags & XFRM_STATE_ESN) { + /* Attach seqhi sg right after packet payload */ + *seqhi = htonl(XFRM_SKB_CB(skb)->seq.output.hi); + sg_set_buf(seqhisg, seqhi, seqhi_len); + } + ahash_request_set_crypt(req, sg, icv, skb->len + seqhi_len); ahash_request_set_callback(req, 0, ah_output_done, skb); AH_SKB_CB(skb)->tmp = iph; @@ -295,6 +309,10 @@ static int ah_input(struct xfrm_state *x, struct sk_buff *skb) struct ip_auth_hdr *ah; struct ah_data *ahp; int err = -ENOMEM; + int seqhi_len = 0; + __be32 *seqhi; + int sglists = 0; + struct scatterlist *seqhisg; if (!pskb_may_pull(skb, sizeof(*ah))) goto out; @@ -335,14 +353,22 @@ static int ah_input(struct xfrm_state *x, struct sk_buff *skb) iph = ip_hdr(skb); ihl = ip_hdrlen(skb); - work_iph = ah_alloc_tmp(ahash, nfrags, ihl + ahp->icv_trunc_len); + if (x->props.flags & XFRM_STATE_ESN) { + sglists = 1; + seqhi_len = sizeof(*seqhi); + } + + work_iph = ah_alloc_tmp(ahash, nfrags + sglists, ihl + + ahp->icv_trunc_len + seqhi_len); if (!work_iph) goto out; - auth_data = ah_tmp_auth(work_iph, ihl); + seqhi = (__be32 *)((char *)work_iph + ihl); + auth_data = ah_tmp_auth(seqhi, seqhi_len); icv = ah_tmp_icv(ahash, auth_data, ahp->icv_trunc_len); req = ah_tmp_req(ahash, icv); sg = ah_req_sg(ahash, req); + seqhisg = sg + nfrags; memcpy(work_iph, iph, ihl); memcpy(auth_data, ah->auth_data, ahp->icv_trunc_len); @@ -361,10 +387,15 @@ static int ah_input(struct xfrm_state *x, struct sk_buff *skb) skb_push(skb, ihl); - sg_init_table(sg, nfrags); - skb_to_sgvec(skb, sg, 0, skb->len); + sg_init_table(sg, nfrags + sglists); + skb_to_sgvec_nomark(skb, sg, 0, skb->len); - ahash_request_set_crypt(req, sg, icv, skb->len); + if (x->props.flags & XFRM_STATE_ESN) { + /* Attach seqhi sg right after packet payload */ + *seqhi = XFRM_SKB_CB(skb)->seq.input.hi; + sg_set_buf(seqhisg, seqhi, seqhi_len); + } + ahash_request_set_crypt(req, sg, icv, skb->len + seqhi_len); ahash_request_set_callback(req, 0, ah_input_done, skb); AH_SKB_CB(skb)->tmp = work_iph; @@ -397,7 +428,7 @@ out: return err; } -static void ah4_err(struct sk_buff *skb, u32 info) +static int ah4_err(struct sk_buff *skb, u32 info) { struct net *net = dev_net(skb->dev); const struct iphdr *iph = (const struct iphdr *)skb->data; @@ -407,23 +438,25 @@ static void ah4_err(struct sk_buff *skb, u32 info) switch (icmp_hdr(skb)->type) { case ICMP_DEST_UNREACH: if (icmp_hdr(skb)->code != ICMP_FRAG_NEEDED) - return; + return 0; case ICMP_REDIRECT: break; default: - return; + return 0; } x = xfrm_state_lookup(net, skb->mark, (const xfrm_address_t *)&iph->daddr, ah->spi, IPPROTO_AH, AF_INET); if (!x) - return; + return 0; if (icmp_hdr(skb)->type == ICMP_DEST_UNREACH) ipv4_update_pmtu(skb, net, info, 0, 0, IPPROTO_AH, 0); else ipv4_redirect(skb, net, 0, 0, IPPROTO_AH, 0); xfrm_state_put(x); + + return 0; } static int ah_init_state(struct xfrm_state *x) @@ -505,6 +538,10 @@ static void ah_destroy(struct xfrm_state *x) kfree(ahp); } +static int ah4_rcv_cb(struct sk_buff *skb, int err) +{ + return 0; +} static const struct xfrm_type ah_type = { @@ -518,11 +555,12 @@ static const struct xfrm_type ah_type = .output = ah_output }; -static const struct net_protocol ah4_protocol = { +static struct xfrm4_protocol ah4_protocol = { .handler = xfrm4_rcv, + .input_handler = xfrm_input, + .cb_handler = ah4_rcv_cb, .err_handler = ah4_err, - .no_policy = 1, - .netns_ok = 1, + .priority = 0, }; static int __init ah4_init(void) @@ -531,7 +569,7 @@ static int __init ah4_init(void) pr_info("%s: can't add xfrm type\n", __func__); return -EAGAIN; } - if (inet_add_protocol(&ah4_protocol, IPPROTO_AH) < 0) { + if (xfrm4_protocol_register(&ah4_protocol, IPPROTO_AH) < 0) { pr_info("%s: can't add protocol\n", __func__); xfrm_unregister_type(&ah_type, AF_INET); return -EAGAIN; @@ -541,7 +579,7 @@ static int __init ah4_init(void) static void __exit ah4_fini(void) { - if (inet_del_protocol(&ah4_protocol, IPPROTO_AH) < 0) + if (xfrm4_protocol_deregister(&ah4_protocol, IPPROTO_AH) < 0) pr_info("%s: can't remove protocol\n", __func__); if (xfrm_unregister_type(&ah_type, AF_INET) < 0) pr_info("%s: can't remove xfrm type\n", __func__); diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c index 7785b28061a..360b565918c 100644 --- a/net/ipv4/esp4.c +++ b/net/ipv4/esp4.c @@ -473,7 +473,7 @@ static u32 esp4_get_mtu(struct xfrm_state *x, int mtu) net_adj) & ~(blksize - 1)) + net_adj - 2; } -static void esp4_err(struct sk_buff *skb, u32 info) +static int esp4_err(struct sk_buff *skb, u32 info) { struct net *net = dev_net(skb->dev); const struct iphdr *iph = (const struct iphdr *)skb->data; @@ -483,23 +483,25 @@ static void esp4_err(struct sk_buff *skb, u32 info) switch (icmp_hdr(skb)->type) { case ICMP_DEST_UNREACH: if (icmp_hdr(skb)->code != ICMP_FRAG_NEEDED) - return; + return 0; case ICMP_REDIRECT: break; default: - return; + return 0; } x = xfrm_state_lookup(net, skb->mark, (const xfrm_address_t *)&iph->daddr, esph->spi, IPPROTO_ESP, AF_INET); if (!x) - return; + return 0; if (icmp_hdr(skb)->type == ICMP_DEST_UNREACH) ipv4_update_pmtu(skb, net, info, 0, 0, IPPROTO_ESP, 0); else ipv4_redirect(skb, net, 0, 0, IPPROTO_ESP, 0); xfrm_state_put(x); + + return 0; } static void esp_destroy(struct xfrm_state *x) @@ -672,6 +674,11 @@ error: return err; } +static int esp4_rcv_cb(struct sk_buff *skb, int err) +{ + return 0; +} + static const struct xfrm_type esp_type = { .description = "ESP4", @@ -685,11 +692,12 @@ static const struct xfrm_type esp_type = .output = esp_output }; -static const struct net_protocol esp4_protocol = { +static struct xfrm4_protocol esp4_protocol = { .handler = xfrm4_rcv, + .input_handler = xfrm_input, + .cb_handler = esp4_rcv_cb, .err_handler = esp4_err, - .no_policy = 1, - .netns_ok = 1, + .priority = 0, }; static int __init esp4_init(void) @@ -698,7 +706,7 @@ static int __init esp4_init(void) pr_info("%s: can't add xfrm type\n", __func__); return -EAGAIN; } - if (inet_add_protocol(&esp4_protocol, IPPROTO_ESP) < 0) { + if (xfrm4_protocol_register(&esp4_protocol, IPPROTO_ESP) < 0) { pr_info("%s: can't add protocol\n", __func__); xfrm_unregister_type(&esp_type, AF_INET); return -EAGAIN; @@ -708,7 +716,7 @@ static int __init esp4_init(void) static void __exit esp4_fini(void) { - if (inet_del_protocol(&esp4_protocol, IPPROTO_ESP) < 0) + if (xfrm4_protocol_deregister(&esp4_protocol, IPPROTO_ESP) < 0) pr_info("%s: can't remove protocol\n", __func__); if (xfrm_unregister_type(&esp_type, AF_INET) < 0) pr_info("%s: can't remove xfrm type\n", __func__); diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index c7539e22868..255aa9946fe 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -250,7 +250,7 @@ static int __fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst, bool dev_match; fl4.flowi4_oif = 0; - fl4.flowi4_iif = oif; + fl4.flowi4_iif = oif ? : LOOPBACK_IFINDEX; fl4.daddr = src; fl4.saddr = dst; fl4.flowi4_tos = tos; @@ -659,7 +659,7 @@ static int inet_dump_fib(struct sk_buff *skb, struct netlink_callback *cb) if (nlmsg_len(cb->nlh) >= sizeof(struct rtmsg) && ((struct rtmsg *) nlmsg_data(cb->nlh))->rtm_flags & RTM_F_CLONED) - return ip_rt_dump(skb, cb); + return skb->len; s_h = cb->args[0]; s_e = cb->args[1]; diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index b53f0bf84dc..b10cd43a472 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -631,6 +631,7 @@ static int fib_check_nh(struct fib_config *cfg, struct fib_info *fi, .daddr = nh->nh_gw, .flowi4_scope = cfg->fc_scope + 1, .flowi4_oif = nh->nh_oif, + .flowi4_iif = LOOPBACK_IFINDEX, }; /* It is not necessary, but requires a bit of thinking */ @@ -820,13 +821,13 @@ struct fib_info *fib_create_info(struct fib_config *cfg) fi = kzalloc(sizeof(*fi)+nhs*sizeof(struct fib_nh), GFP_KERNEL); if (fi == NULL) goto failure; + fib_info_cnt++; if (cfg->fc_mx) { fi->fib_metrics = kzalloc(sizeof(u32) * RTAX_MAX, GFP_KERNEL); if (!fi->fib_metrics) goto failure; } else fi->fib_metrics = (u32 *) dst_default_metrics; - fib_info_cnt++; fi->fib_net = hold_net(net); fi->fib_protocol = cfg->fc_protocol; diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 0d1e2cb877e..a56b8e6e866 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -37,11 +37,11 @@ void inet_get_local_port_range(struct net *net, int *low, int *high) unsigned int seq; do { - seq = read_seqbegin(&net->ipv4.sysctl_local_ports.lock); + seq = read_seqbegin(&net->ipv4.ip_local_ports.lock); - *low = net->ipv4.sysctl_local_ports.range[0]; - *high = net->ipv4.sysctl_local_ports.range[1]; - } while (read_seqretry(&net->ipv4.sysctl_local_ports.lock, seq)); + *low = net->ipv4.ip_local_ports.range[0]; + *high = net->ipv4.ip_local_ports.range[1]; + } while (read_seqretry(&net->ipv4.ip_local_ports.lock, seq)); } EXPORT_SYMBOL(inet_get_local_port_range); diff --git a/net/ipv4/ip_forward.c b/net/ipv4/ip_forward.c index f3869c186d9..6f111e48e11 100644 --- a/net/ipv4/ip_forward.c +++ b/net/ipv4/ip_forward.c @@ -42,12 +42,12 @@ static bool ip_may_fragment(const struct sk_buff *skb) { return unlikely((ip_hdr(skb)->frag_off & htons(IP_DF)) == 0) || - !skb->local_df; + skb->local_df; } static bool ip_exceeds_mtu(const struct sk_buff *skb, unsigned int mtu) { - if (skb->len <= mtu || skb->local_df) + if (skb->len <= mtu) return false; if (skb_is_gso(skb) && skb_gso_network_seglen(skb) <= mtu) @@ -56,53 +56,6 @@ static bool ip_exceeds_mtu(const struct sk_buff *skb, unsigned int mtu) return true; } -static bool ip_gso_exceeds_dst_mtu(const struct sk_buff *skb) -{ - unsigned int mtu; - - if (skb->local_df || !skb_is_gso(skb)) - return false; - - mtu = ip_dst_mtu_maybe_forward(skb_dst(skb), true); - - /* if seglen > mtu, do software segmentation for IP fragmentation on - * output. DF bit cannot be set since ip_forward would have sent - * icmp error. - */ - return skb_gso_network_seglen(skb) > mtu; -} - -/* called if GSO skb needs to be fragmented on forward */ -static int ip_forward_finish_gso(struct sk_buff *skb) -{ - struct dst_entry *dst = skb_dst(skb); - netdev_features_t features; - struct sk_buff *segs; - int ret = 0; - - features = netif_skb_dev_features(skb, dst->dev); - segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK); - if (IS_ERR(segs)) { - kfree_skb(skb); - return -ENOMEM; - } - - consume_skb(skb); - - do { - struct sk_buff *nskb = segs->next; - int err; - - segs->next = NULL; - err = dst_output(segs); - - if (err && ret == 0) - ret = err; - segs = nskb; - } while (segs); - - return ret; -} static int ip_forward_finish(struct sk_buff *skb) { @@ -114,9 +67,6 @@ static int ip_forward_finish(struct sk_buff *skb) if (unlikely(opt->optlen)) ip_forward_options(skb); - if (ip_gso_exceeds_dst_mtu(skb)) - return ip_forward_finish_gso(skb); - return dst_output(skb); } @@ -127,6 +77,10 @@ int ip_forward(struct sk_buff *skb) struct rtable *rt; /* Route we use */ struct ip_options *opt = &(IPCB(skb)->opt); + /* that should never happen */ + if (skb->pkt_type != PACKET_HOST) + goto drop; + if (skb_warn_if_lro(skb)) goto drop; @@ -136,9 +90,6 @@ int ip_forward(struct sk_buff *skb) if (IPCB(skb)->opt.router_alert && ip_call_ra_chain(skb)) return NET_RX_SUCCESS; - if (skb->pkt_type != PACKET_HOST) - goto drop; - skb_forward_csum(skb); /* diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index c10a3ce5cbf..ed32313e307 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -232,8 +232,9 @@ static void ip_expire(unsigned long arg) * "Fragment Reassembly Timeout" message, per RFC792. */ if (qp->user == IP_DEFRAG_AF_PACKET || - (qp->user == IP_DEFRAG_CONNTRACK_IN && - skb_rtable(head)->rt_type != RTN_LOCAL)) + ((qp->user >= IP_DEFRAG_CONNTRACK_IN) && + (qp->user <= __IP_DEFRAG_CONNTRACK_IN_END) && + (skb_rtable(head)->rt_type != RTN_LOCAL))) goto out_rcu_unlock; diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index ec4f762efda..94213c89156 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -463,6 +463,7 @@ static const struct net_device_ops ipgre_netdev_ops = { static void ipgre_tunnel_setup(struct net_device *dev) { dev->netdev_ops = &ipgre_netdev_ops; + dev->type = ARPHRD_IPGRE; ip_tunnel_setup(dev, ipgre_net_id); } @@ -501,7 +502,6 @@ static int ipgre_tunnel_init(struct net_device *dev) memcpy(dev->dev_addr, &iph->saddr, 4); memcpy(dev->broadcast, &iph->daddr, 4); - dev->type = ARPHRD_IPGRE; dev->flags = IFF_NOARP; dev->priv_flags &= ~IFF_XMIT_DST_RELEASE; dev->addr_len = 4; diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 73c6b63bba7..a52f50187b5 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -101,17 +101,17 @@ int __ip_local_out(struct sk_buff *skb) skb_dst(skb)->dev, dst_output); } -int ip_local_out(struct sk_buff *skb) +int ip_local_out_sk(struct sock *sk, struct sk_buff *skb) { int err; err = __ip_local_out(skb); if (likely(err == 1)) - err = dst_output(skb); + err = dst_output_sk(sk, skb); return err; } -EXPORT_SYMBOL_GPL(ip_local_out); +EXPORT_SYMBOL_GPL(ip_local_out_sk); static inline int ip_select_ttl(struct inet_sock *inet, struct dst_entry *dst) { @@ -211,6 +211,48 @@ static inline int ip_finish_output2(struct sk_buff *skb) return -EINVAL; } +static int ip_finish_output_gso(struct sk_buff *skb) +{ + netdev_features_t features; + struct sk_buff *segs; + int ret = 0; + + /* common case: locally created skb or seglen is <= mtu */ + if (((IPCB(skb)->flags & IPSKB_FORWARDED) == 0) || + skb_gso_network_seglen(skb) <= ip_skb_dst_mtu(skb)) + return ip_finish_output2(skb); + + /* Slowpath - GSO segment length is exceeding the dst MTU. + * + * This can happen in two cases: + * 1) TCP GRO packet, DF bit not set + * 2) skb arrived via virtio-net, we thus get TSO/GSO skbs directly + * from host network stack. + */ + features = netif_skb_features(skb); + segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK); + if (IS_ERR(segs)) { + kfree_skb(skb); + return -ENOMEM; + } + + consume_skb(skb); + + do { + struct sk_buff *nskb = segs->next; + int err; + + segs->next = NULL; + err = ip_fragment(segs, ip_finish_output2); + + if (err && ret == 0) + ret = err; + segs = nskb; + } while (segs); + + return ret; +} + static int ip_finish_output(struct sk_buff *skb) { #if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM) @@ -220,15 +262,17 @@ static int ip_finish_output(struct sk_buff *skb) return dst_output(skb); } #endif - if (skb->len > ip_skb_dst_mtu(skb) && !skb_is_gso(skb)) + if (skb_is_gso(skb)) + return ip_finish_output_gso(skb); + + if (skb->len > ip_skb_dst_mtu(skb)) return ip_fragment(skb, ip_finish_output2); - else - return ip_finish_output2(skb); + + return ip_finish_output2(skb); } -int ip_mc_output(struct sk_buff *skb) +int ip_mc_output(struct sock *sk, struct sk_buff *skb) { - struct sock *sk = skb->sk; struct rtable *rt = skb_rtable(skb); struct net_device *dev = rt->dst.dev; @@ -287,7 +331,7 @@ int ip_mc_output(struct sk_buff *skb) !(IPCB(skb)->flags & IPSKB_REROUTED)); } -int ip_output(struct sk_buff *skb) +int ip_output(struct sock *sk, struct sk_buff *skb) { struct net_device *dev = skb_dst(skb)->dev; @@ -315,9 +359,9 @@ static void ip_copy_addrs(struct iphdr *iph, const struct flowi4 *fl4) sizeof(fl4->saddr) + sizeof(fl4->daddr)); } -int ip_queue_xmit(struct sk_buff *skb, struct flowi *fl) +/* Note: skb->sk can be different from sk, in case of tunnels */ +int ip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl) { - struct sock *sk = skb->sk; struct inet_sock *inet = inet_sk(sk); struct ip_options_rcu *inet_opt; struct flowi4 *fl4; @@ -389,6 +433,7 @@ packet_routed: ip_select_ident_more(skb, &rt->dst, sk, (skb_shinfo(skb)->gso_segs ?: 1) - 1); + /* TODO : should we use skb->sk here instead of sk ? */ skb->priority = sk->sk_priority; skb->mark = sk->sk_mark; @@ -446,7 +491,6 @@ int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *)) __be16 not_last_frag; struct rtable *rt = skb_rtable(skb); int err = 0; - bool forwarding = IPCB(skb)->flags & IPSKB_FORWARDED; dev = rt->dst.dev; @@ -456,7 +500,7 @@ int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *)) iph = ip_hdr(skb); - mtu = ip_dst_mtu_maybe_forward(&rt->dst, forwarding); + mtu = ip_skb_dst_mtu(skb); if (unlikely(((iph->frag_off & htons(IP_DF)) && !skb->local_df) || (IPCB(skb)->frag_max_size && IPCB(skb)->frag_max_size > mtu))) { @@ -822,8 +866,7 @@ static int __ip_append_data(struct sock *sk, fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0); maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen; - maxnonfragsize = (inet->pmtudisc >= IP_PMTUDISC_DO) ? - mtu : 0xFFFF; + maxnonfragsize = ip_sk_local_df(sk) ? 0xFFFF : mtu; if (cork->length + length > maxnonfragsize - fragheaderlen) { ip_local_error(sk, EMSGSIZE, fl4->daddr, inet->inet_dport, @@ -1146,8 +1189,7 @@ ssize_t ip_append_page(struct sock *sk, struct flowi4 *fl4, struct page *page, fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0); maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen; - maxnonfragsize = (inet->pmtudisc >= IP_PMTUDISC_DO) ? - mtu : 0xFFFF; + maxnonfragsize = ip_sk_local_df(sk) ? 0xFFFF : mtu; if (cork->length + size > maxnonfragsize - fragheaderlen) { ip_local_error(sk, EMSGSIZE, fl4->daddr, inet->inet_dport, @@ -1308,8 +1350,7 @@ struct sk_buff *__ip_make_skb(struct sock *sk, * to fragment the frame generated here. No matter, what transforms * how transforms change size of the packet, it will come out. */ - if (inet->pmtudisc < IP_PMTUDISC_DO) - skb->local_df = 1; + skb->local_df = ip_sk_local_df(sk); /* DF bit is set when we want to see DF on outgoing frames. * If local_df is set too, we still allow to fragment this frame diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index 580dd96666e..64741b93863 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -186,7 +186,8 @@ void ip_cmsg_recv(struct msghdr *msg, struct sk_buff *skb) } EXPORT_SYMBOL(ip_cmsg_recv); -int ip_cmsg_send(struct net *net, struct msghdr *msg, struct ipcm_cookie *ipc) +int ip_cmsg_send(struct net *net, struct msghdr *msg, struct ipcm_cookie *ipc, + bool allow_ipv6) { int err, val; struct cmsghdr *cmsg; @@ -194,6 +195,22 @@ int ip_cmsg_send(struct net *net, struct msghdr *msg, struct ipcm_cookie *ipc) for (cmsg = CMSG_FIRSTHDR(msg); cmsg; cmsg = CMSG_NXTHDR(msg, cmsg)) { if (!CMSG_OK(msg, cmsg)) return -EINVAL; +#if defined(CONFIG_IPV6) + if (allow_ipv6 && + cmsg->cmsg_level == SOL_IPV6 && + cmsg->cmsg_type == IPV6_PKTINFO) { + struct in6_pktinfo *src_info; + + if (cmsg->cmsg_len < CMSG_LEN(sizeof(*src_info))) + return -EINVAL; + src_info = (struct in6_pktinfo *)CMSG_DATA(cmsg); + if (!ipv6_addr_v4mapped(&src_info->ipi6_addr)) + return -EINVAL; + ipc->oif = src_info->ipi6_ifindex; + ipc->addr = src_info->ipi6_addr.s6_addr32[3]; + continue; + } +#endif if (cmsg->cmsg_level != SOL_IP) continue; switch (cmsg->cmsg_type) { @@ -626,7 +643,7 @@ static int do_ip_setsockopt(struct sock *sk, int level, inet->nodefrag = val ? 1 : 0; break; case IP_MTU_DISCOVER: - if (val < IP_PMTUDISC_DONT || val > IP_PMTUDISC_INTERFACE) + if (val < IP_PMTUDISC_DONT || val > IP_PMTUDISC_OMIT) goto e_inval; inet->pmtudisc = val; break; diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c index a82a22d8f77..2acc2337d38 100644 --- a/net/ipv4/ip_tunnel.c +++ b/net/ipv4/ip_tunnel.c @@ -235,13 +235,17 @@ static struct hlist_head *ip_bucket(struct ip_tunnel_net *itn, { unsigned int h; __be32 remote; + __be32 i_key = parms->i_key; if (parms->iph.daddr && !ipv4_is_multicast(parms->iph.daddr)) remote = parms->iph.daddr; else remote = 0; - h = ip_tunnel_hash(parms->i_key, remote); + if (!(parms->i_flags & TUNNEL_KEY) && (parms->i_flags & VTI_ISVTI)) + i_key = 0; + + h = ip_tunnel_hash(i_key, remote); return &itn->tunnels[h]; } @@ -398,7 +402,7 @@ static struct ip_tunnel *ip_tunnel_create(struct net *net, fbt = netdev_priv(itn->fb_tunnel_dev); dev = __ip_tunnel_create(net, itn->fb_tunnel_dev->rtnl_link_ops, parms); if (IS_ERR(dev)) - return NULL; + return ERR_CAST(dev); dev->mtu = ip_tunnel_bind_dev(dev); @@ -438,6 +442,8 @@ int ip_tunnel_rcv(struct ip_tunnel *tunnel, struct sk_buff *skb, tunnel->i_seqno = ntohl(tpi->seq) + 1; } + skb_reset_network_header(skb); + err = IP_ECN_decapsulate(iph, skb); if (unlikely(err)) { if (log_ecn_error) @@ -534,9 +540,10 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, unsigned int max_headroom; /* The extra header space needed */ __be32 dst; int err; - bool connected = true; + bool connected; inner_iph = (const struct iphdr *)skb_inner_network_header(skb); + connected = (tunnel->parms.iph.daddr != 0); dst = tnl_params->daddr; if (dst == 0) { @@ -666,7 +673,7 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, return; } - err = iptunnel_xmit(rt, skb, fl4.saddr, fl4.daddr, protocol, + err = iptunnel_xmit(skb->sk, rt, skb, fl4.saddr, fl4.daddr, protocol, tos, ttl, df, !net_eq(tunnel->net, dev_net(dev))); iptunnel_xmit_stats(err, &dev->stats, dev->tstats); @@ -718,19 +725,18 @@ static void ip_tunnel_update(struct ip_tunnel_net *itn, int ip_tunnel_ioctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd) { int err = 0; - struct ip_tunnel *t; - struct net *net = dev_net(dev); - struct ip_tunnel *tunnel = netdev_priv(dev); - struct ip_tunnel_net *itn = net_generic(net, tunnel->ip_tnl_net_id); + struct ip_tunnel *t = netdev_priv(dev); + struct net *net = t->net; + struct ip_tunnel_net *itn = net_generic(net, t->ip_tnl_net_id); BUG_ON(!itn->fb_tunnel_dev); switch (cmd) { case SIOCGETTUNNEL: - t = NULL; - if (dev == itn->fb_tunnel_dev) + if (dev == itn->fb_tunnel_dev) { t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type); - if (t == NULL) - t = netdev_priv(dev); + if (t == NULL) + t = netdev_priv(dev); + } memcpy(p, &t->parms, sizeof(*p)); break; @@ -748,9 +754,13 @@ int ip_tunnel_ioctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd) t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type); - if (!t && (cmd == SIOCADDTUNNEL)) + if (!t && (cmd == SIOCADDTUNNEL)) { t = ip_tunnel_create(net, itn, p); - + if (IS_ERR(t)) { + err = PTR_ERR(t); + break; + } + } if (dev != itn->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) { if (t != NULL) { if (t->dev != dev) { @@ -777,8 +787,9 @@ int ip_tunnel_ioctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd) if (t) { err = 0; ip_tunnel_update(itn, t, dev, p, true); - } else - err = (cmd == SIOCADDTUNNEL ? -ENOBUFS : -ENOENT); + } else { + err = -ENOENT; + } break; case SIOCDELTUNNEL: @@ -872,6 +883,7 @@ int ip_tunnel_init_net(struct net *net, int ip_tnl_net_id, */ if (!IS_ERR(itn->fb_tunnel_dev)) { itn->fb_tunnel_dev->features |= NETIF_F_NETNS_LOCAL; + itn->fb_tunnel_dev->mtu = ip_tunnel_bind_dev(itn->fb_tunnel_dev); ip_tunnel_add(itn, netdev_priv(itn->fb_tunnel_dev)); } rtnl_unlock(); @@ -993,19 +1005,13 @@ int ip_tunnel_init(struct net_device *dev) { struct ip_tunnel *tunnel = netdev_priv(dev); struct iphdr *iph = &tunnel->parms.iph; - int i, err; + int err; dev->destructor = ip_tunnel_dev_free; - dev->tstats = alloc_percpu(struct pcpu_sw_netstats); + dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats); if (!dev->tstats) return -ENOMEM; - for_each_possible_cpu(i) { - struct pcpu_sw_netstats *ipt_stats; - ipt_stats = per_cpu_ptr(dev->tstats, i); - u64_stats_init(&ipt_stats->syncp); - } - tunnel->dst_cache = alloc_percpu(struct ip_tunnel_dst); if (!tunnel->dst_cache) { free_percpu(dev->tstats); diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c index 8d69626f220..bcf206c7900 100644 --- a/net/ipv4/ip_tunnel_core.c +++ b/net/ipv4/ip_tunnel_core.c @@ -46,7 +46,7 @@ #include <net/netns/generic.h> #include <net/rtnetlink.h> -int iptunnel_xmit(struct rtable *rt, struct sk_buff *skb, +int iptunnel_xmit(struct sock *sk, struct rtable *rt, struct sk_buff *skb, __be32 src, __be32 dst, __u8 proto, __u8 tos, __u8 ttl, __be16 df, bool xnet) { @@ -76,7 +76,7 @@ int iptunnel_xmit(struct rtable *rt, struct sk_buff *skb, iph->ttl = ttl; __ip_select_ident(iph, &rt->dst, (skb_shinfo(skb)->gso_segs ?: 1) - 1); - err = ip_local_out(skb); + err = ip_local_out_sk(sk, skb); if (unlikely(net_xmit_eval(err))) pkt_len = 0; return pkt_len; @@ -162,12 +162,12 @@ struct rtnl_link_stats64 *ip_tunnel_get_stats64(struct net_device *dev, unsigned int start; do { - start = u64_stats_fetch_begin_bh(&tstats->syncp); + start = u64_stats_fetch_begin_irq(&tstats->syncp); rx_packets = tstats->rx_packets; tx_packets = tstats->tx_packets; rx_bytes = tstats->rx_bytes; tx_bytes = tstats->tx_bytes; - } while (u64_stats_fetch_retry_bh(&tstats->syncp, start)); + } while (u64_stats_fetch_retry_irq(&tstats->syncp, start)); tot->rx_packets += rx_packets; tot->tx_packets += tx_packets; diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c index 48eafae5176..13ef00f1e17 100644 --- a/net/ipv4/ip_vti.c +++ b/net/ipv4/ip_vti.c @@ -34,6 +34,7 @@ #include <linux/init.h> #include <linux/netfilter_ipv4.h> #include <linux/if_ether.h> +#include <linux/icmpv6.h> #include <net/sock.h> #include <net/ip.h> @@ -49,8 +50,8 @@ static struct rtnl_link_ops vti_link_ops __read_mostly; static int vti_net_id __read_mostly; static int vti_tunnel_init(struct net_device *dev); -/* We dont digest the packet therefore let the packet pass */ -static int vti_rcv(struct sk_buff *skb) +static int vti_input(struct sk_buff *skb, int nexthdr, __be32 spi, + int encap_type) { struct ip_tunnel *tunnel; const struct iphdr *iph = ip_hdr(skb); @@ -60,79 +61,120 @@ static int vti_rcv(struct sk_buff *skb) tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, TUNNEL_NO_KEY, iph->saddr, iph->daddr, 0); if (tunnel != NULL) { - struct pcpu_sw_netstats *tstats; - u32 oldmark = skb->mark; - int ret; - - - /* temporarily mark the skb with the tunnel o_key, to - * only match policies with this mark. - */ - skb->mark = be32_to_cpu(tunnel->parms.o_key); - ret = xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb); - skb->mark = oldmark; - if (!ret) - return -1; - - tstats = this_cpu_ptr(tunnel->dev->tstats); - u64_stats_update_begin(&tstats->syncp); - tstats->rx_packets++; - tstats->rx_bytes += skb->len; - u64_stats_update_end(&tstats->syncp); - - secpath_reset(skb); - skb->dev = tunnel->dev; + if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) + goto drop; + + XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip4 = tunnel; + skb->mark = be32_to_cpu(tunnel->parms.i_key); + + return xfrm_input(skb, nexthdr, spi, encap_type); + } + + return -EINVAL; +drop: + kfree_skb(skb); + return 0; +} + +static int vti_rcv(struct sk_buff *skb) +{ + XFRM_SPI_SKB_CB(skb)->family = AF_INET; + XFRM_SPI_SKB_CB(skb)->daddroff = offsetof(struct iphdr, daddr); + + return vti_input(skb, ip_hdr(skb)->protocol, 0, 0); +} + +static int vti_rcv_cb(struct sk_buff *skb, int err) +{ + unsigned short family; + struct net_device *dev; + struct pcpu_sw_netstats *tstats; + struct xfrm_state *x; + struct ip_tunnel *tunnel = XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip4; + + if (!tunnel) return 1; + + dev = tunnel->dev; + + if (err) { + dev->stats.rx_errors++; + dev->stats.rx_dropped++; + + return 0; } - return -1; + x = xfrm_input_state(skb); + family = x->inner_mode->afinfo->family; + + if (!xfrm_policy_check(NULL, XFRM_POLICY_IN, skb, family)) + return -EPERM; + + skb_scrub_packet(skb, !net_eq(tunnel->net, dev_net(skb->dev))); + skb->dev = dev; + + tstats = this_cpu_ptr(dev->tstats); + + u64_stats_update_begin(&tstats->syncp); + tstats->rx_packets++; + tstats->rx_bytes += skb->len; + u64_stats_update_end(&tstats->syncp); + + return 0; } -/* This function assumes it is being called from dev_queue_xmit() - * and that skb is filled properly by that function. - */ +static bool vti_state_check(const struct xfrm_state *x, __be32 dst, __be32 src) +{ + xfrm_address_t *daddr = (xfrm_address_t *)&dst; + xfrm_address_t *saddr = (xfrm_address_t *)&src; -static netdev_tx_t vti_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) + /* if there is no transform then this tunnel is not functional. + * Or if the xfrm is not mode tunnel. + */ + if (!x || x->props.mode != XFRM_MODE_TUNNEL || + x->props.family != AF_INET) + return false; + + if (!dst) + return xfrm_addr_equal(saddr, &x->props.saddr, AF_INET); + + if (!xfrm_state_addr_check(x, daddr, saddr, AF_INET)) + return false; + + return true; +} + +static netdev_tx_t vti_xmit(struct sk_buff *skb, struct net_device *dev, + struct flowi *fl) { struct ip_tunnel *tunnel = netdev_priv(dev); - struct iphdr *tiph = &tunnel->parms.iph; - u8 tos; - struct rtable *rt; /* Route to the other host */ + struct ip_tunnel_parm *parms = &tunnel->parms; + struct dst_entry *dst = skb_dst(skb); struct net_device *tdev; /* Device to other host */ - struct iphdr *old_iph = ip_hdr(skb); - __be32 dst = tiph->daddr; - struct flowi4 fl4; int err; - if (skb->protocol != htons(ETH_P_IP)) - goto tx_error; - - tos = old_iph->tos; + if (!dst) { + dev->stats.tx_carrier_errors++; + goto tx_error_icmp; + } - memset(&fl4, 0, sizeof(fl4)); - flowi4_init_output(&fl4, tunnel->parms.link, - be32_to_cpu(tunnel->parms.o_key), RT_TOS(tos), - RT_SCOPE_UNIVERSE, - IPPROTO_IPIP, 0, - dst, tiph->saddr, 0, 0); - rt = ip_route_output_key(dev_net(dev), &fl4); - if (IS_ERR(rt)) { + dst_hold(dst); + dst = xfrm_lookup(tunnel->net, dst, fl, NULL, 0); + if (IS_ERR(dst)) { dev->stats.tx_carrier_errors++; goto tx_error_icmp; } - /* if there is no transform then this tunnel is not functional. - * Or if the xfrm is not mode tunnel. - */ - if (!rt->dst.xfrm || - rt->dst.xfrm->props.mode != XFRM_MODE_TUNNEL) { + + if (!vti_state_check(dst->xfrm, parms->iph.daddr, parms->iph.saddr)) { dev->stats.tx_carrier_errors++; - ip_rt_put(rt); + dst_release(dst); goto tx_error_icmp; } - tdev = rt->dst.dev; + + tdev = dst->dev; if (tdev == dev) { - ip_rt_put(rt); + dst_release(dst); dev->stats.collisions++; goto tx_error; } @@ -146,10 +188,8 @@ static netdev_tx_t vti_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) tunnel->err_count = 0; } - memset(IPCB(skb), 0, sizeof(*IPCB(skb))); - skb_dst_drop(skb); - skb_dst_set(skb, &rt->dst); - nf_reset(skb); + skb_scrub_packet(skb, !net_eq(tunnel->net, dev_net(dev))); + skb_dst_set(skb, dst); skb->dev = skb_dst(skb)->dev; err = dst_output(skb); @@ -166,6 +206,98 @@ tx_error: return NETDEV_TX_OK; } +/* This function assumes it is being called from dev_queue_xmit() + * and that skb is filled properly by that function. + */ +static netdev_tx_t vti_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) +{ + struct ip_tunnel *tunnel = netdev_priv(dev); + struct flowi fl; + + memset(&fl, 0, sizeof(fl)); + + skb->mark = be32_to_cpu(tunnel->parms.o_key); + + switch (skb->protocol) { + case htons(ETH_P_IP): + xfrm_decode_session(skb, &fl, AF_INET); + memset(IPCB(skb), 0, sizeof(*IPCB(skb))); + break; + case htons(ETH_P_IPV6): + xfrm_decode_session(skb, &fl, AF_INET6); + memset(IP6CB(skb), 0, sizeof(*IP6CB(skb))); + break; + default: + dev->stats.tx_errors++; + dev_kfree_skb(skb); + return NETDEV_TX_OK; + } + + return vti_xmit(skb, dev, &fl); +} + +static int vti4_err(struct sk_buff *skb, u32 info) +{ + __be32 spi; + __u32 mark; + struct xfrm_state *x; + struct ip_tunnel *tunnel; + struct ip_esp_hdr *esph; + struct ip_auth_hdr *ah ; + struct ip_comp_hdr *ipch; + struct net *net = dev_net(skb->dev); + const struct iphdr *iph = (const struct iphdr *)skb->data; + int protocol = iph->protocol; + struct ip_tunnel_net *itn = net_generic(net, vti_net_id); + + tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, TUNNEL_NO_KEY, + iph->daddr, iph->saddr, 0); + if (!tunnel) + return -1; + + mark = be32_to_cpu(tunnel->parms.o_key); + + switch (protocol) { + case IPPROTO_ESP: + esph = (struct ip_esp_hdr *)(skb->data+(iph->ihl<<2)); + spi = esph->spi; + break; + case IPPROTO_AH: + ah = (struct ip_auth_hdr *)(skb->data+(iph->ihl<<2)); + spi = ah->spi; + break; + case IPPROTO_COMP: + ipch = (struct ip_comp_hdr *)(skb->data+(iph->ihl<<2)); + spi = htonl(ntohs(ipch->cpi)); + break; + default: + return 0; + } + + switch (icmp_hdr(skb)->type) { + case ICMP_DEST_UNREACH: + if (icmp_hdr(skb)->code != ICMP_FRAG_NEEDED) + return 0; + case ICMP_REDIRECT: + break; + default: + return 0; + } + + x = xfrm_state_lookup(net, mark, (const xfrm_address_t *)&iph->daddr, + spi, protocol, AF_INET); + if (!x) + return 0; + + if (icmp_hdr(skb)->type == ICMP_DEST_UNREACH) + ipv4_update_pmtu(skb, net, info, 0, 0, protocol, 0); + else + ipv4_redirect(skb, net, 0, 0, protocol, 0); + xfrm_state_put(x); + + return 0; +} + static int vti_tunnel_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) { @@ -181,12 +313,13 @@ vti_tunnel_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) return -EINVAL; } + p.i_flags |= VTI_ISVTI; err = ip_tunnel_ioctl(dev, &p, cmd); if (err) return err; if (cmd != SIOCDELTUNNEL) { - p.i_flags |= GRE_KEY | VTI_ISVTI; + p.i_flags |= GRE_KEY; p.o_flags |= GRE_KEY; } @@ -207,6 +340,7 @@ static const struct net_device_ops vti_netdev_ops = { static void vti_tunnel_setup(struct net_device *dev) { dev->netdev_ops = &vti_netdev_ops; + dev->type = ARPHRD_TUNNEL; ip_tunnel_setup(dev, vti_net_id); } @@ -218,13 +352,11 @@ static int vti_tunnel_init(struct net_device *dev) memcpy(dev->dev_addr, &iph->saddr, 4); memcpy(dev->broadcast, &iph->daddr, 4); - dev->type = ARPHRD_TUNNEL; dev->hard_header_len = LL_MAX_HEADER + sizeof(struct iphdr); dev->mtu = ETH_DATA_LEN; dev->flags = IFF_NOARP; dev->iflink = 0; dev->addr_len = 4; - dev->features |= NETIF_F_NETNS_LOCAL; dev->features |= NETIF_F_LLTX; dev->priv_flags &= ~IFF_XMIT_DST_RELEASE; @@ -241,9 +373,28 @@ static void __net_init vti_fb_tunnel_init(struct net_device *dev) iph->ihl = 5; } -static struct xfrm_tunnel_notifier vti_handler __read_mostly = { +static struct xfrm4_protocol vti_esp4_protocol __read_mostly = { + .handler = vti_rcv, + .input_handler = vti_input, + .cb_handler = vti_rcv_cb, + .err_handler = vti4_err, + .priority = 100, +}; + +static struct xfrm4_protocol vti_ah4_protocol __read_mostly = { .handler = vti_rcv, - .priority = 1, + .input_handler = vti_input, + .cb_handler = vti_rcv_cb, + .err_handler = vti4_err, + .priority = 100, +}; + +static struct xfrm4_protocol vti_ipcomp4_protocol __read_mostly = { + .handler = vti_rcv, + .input_handler = vti_input, + .cb_handler = vti_rcv_cb, + .err_handler = vti4_err, + .priority = 100, }; static int __net_init vti_init_net(struct net *net) @@ -287,6 +438,8 @@ static void vti_netlink_parms(struct nlattr *data[], if (!data) return; + parms->i_flags = VTI_ISVTI; + if (data[IFLA_VTI_LINK]) parms->link = nla_get_u32(data[IFLA_VTI_LINK]); @@ -382,10 +535,31 @@ static int __init vti_init(void) err = register_pernet_device(&vti_net_ops); if (err < 0) return err; - err = xfrm4_mode_tunnel_input_register(&vti_handler); + err = xfrm4_protocol_register(&vti_esp4_protocol, IPPROTO_ESP); if (err < 0) { unregister_pernet_device(&vti_net_ops); pr_info("vti init: can't register tunnel\n"); + + return err; + } + + err = xfrm4_protocol_register(&vti_ah4_protocol, IPPROTO_AH); + if (err < 0) { + xfrm4_protocol_deregister(&vti_esp4_protocol, IPPROTO_ESP); + unregister_pernet_device(&vti_net_ops); + pr_info("vti init: can't register tunnel\n"); + + return err; + } + + err = xfrm4_protocol_register(&vti_ipcomp4_protocol, IPPROTO_COMP); + if (err < 0) { + xfrm4_protocol_deregister(&vti_ah4_protocol, IPPROTO_AH); + xfrm4_protocol_deregister(&vti_esp4_protocol, IPPROTO_ESP); + unregister_pernet_device(&vti_net_ops); + pr_info("vti init: can't register tunnel\n"); + + return err; } err = rtnl_link_register(&vti_link_ops); @@ -395,7 +569,9 @@ static int __init vti_init(void) return err; rtnl_link_failed: - xfrm4_mode_tunnel_input_deregister(&vti_handler); + xfrm4_protocol_deregister(&vti_ipcomp4_protocol, IPPROTO_COMP); + xfrm4_protocol_deregister(&vti_ah4_protocol, IPPROTO_AH); + xfrm4_protocol_deregister(&vti_esp4_protocol, IPPROTO_ESP); unregister_pernet_device(&vti_net_ops); return err; } @@ -403,8 +579,13 @@ rtnl_link_failed: static void __exit vti_fini(void) { rtnl_link_unregister(&vti_link_ops); - if (xfrm4_mode_tunnel_input_deregister(&vti_handler)) + if (xfrm4_protocol_deregister(&vti_ipcomp4_protocol, IPPROTO_COMP)) pr_info("vti close: can't deregister tunnel\n"); + if (xfrm4_protocol_deregister(&vti_ah4_protocol, IPPROTO_AH)) + pr_info("vti close: can't deregister tunnel\n"); + if (xfrm4_protocol_deregister(&vti_esp4_protocol, IPPROTO_ESP)) + pr_info("vti close: can't deregister tunnel\n"); + unregister_pernet_device(&vti_net_ops); } diff --git a/net/ipv4/ipcomp.c b/net/ipv4/ipcomp.c index 826be4cb482..c0855d50a3f 100644 --- a/net/ipv4/ipcomp.c +++ b/net/ipv4/ipcomp.c @@ -23,7 +23,7 @@ #include <net/protocol.h> #include <net/sock.h> -static void ipcomp4_err(struct sk_buff *skb, u32 info) +static int ipcomp4_err(struct sk_buff *skb, u32 info) { struct net *net = dev_net(skb->dev); __be32 spi; @@ -34,24 +34,26 @@ static void ipcomp4_err(struct sk_buff *skb, u32 info) switch (icmp_hdr(skb)->type) { case ICMP_DEST_UNREACH: if (icmp_hdr(skb)->code != ICMP_FRAG_NEEDED) - return; + return 0; case ICMP_REDIRECT: break; default: - return; + return 0; } spi = htonl(ntohs(ipch->cpi)); x = xfrm_state_lookup(net, skb->mark, (const xfrm_address_t *)&iph->daddr, spi, IPPROTO_COMP, AF_INET); if (!x) - return; + return 0; if (icmp_hdr(skb)->type == ICMP_DEST_UNREACH) ipv4_update_pmtu(skb, net, info, 0, 0, IPPROTO_COMP, 0); else ipv4_redirect(skb, net, 0, 0, IPPROTO_COMP, 0); xfrm_state_put(x); + + return 0; } /* We always hold one tunnel user reference to indicate a tunnel */ @@ -147,6 +149,11 @@ out: return err; } +static int ipcomp4_rcv_cb(struct sk_buff *skb, int err) +{ + return 0; +} + static const struct xfrm_type ipcomp_type = { .description = "IPCOMP4", .owner = THIS_MODULE, @@ -157,11 +164,12 @@ static const struct xfrm_type ipcomp_type = { .output = ipcomp_output }; -static const struct net_protocol ipcomp4_protocol = { +static struct xfrm4_protocol ipcomp4_protocol = { .handler = xfrm4_rcv, + .input_handler = xfrm_input, + .cb_handler = ipcomp4_rcv_cb, .err_handler = ipcomp4_err, - .no_policy = 1, - .netns_ok = 1, + .priority = 0, }; static int __init ipcomp4_init(void) @@ -170,7 +178,7 @@ static int __init ipcomp4_init(void) pr_info("%s: can't add xfrm type\n", __func__); return -EAGAIN; } - if (inet_add_protocol(&ipcomp4_protocol, IPPROTO_COMP) < 0) { + if (xfrm4_protocol_register(&ipcomp4_protocol, IPPROTO_COMP) < 0) { pr_info("%s: can't add protocol\n", __func__); xfrm_unregister_type(&ipcomp_type, AF_INET); return -EAGAIN; @@ -180,7 +188,7 @@ static int __init ipcomp4_init(void) static void __exit ipcomp4_fini(void) { - if (inet_del_protocol(&ipcomp4_protocol, IPPROTO_COMP) < 0) + if (xfrm4_protocol_deregister(&ipcomp4_protocol, IPPROTO_COMP) < 0) pr_info("%s: can't remove protocol\n", __func__); if (xfrm_unregister_type(&ipcomp_type, AF_INET) < 0) pr_info("%s: can't remove xfrm type\n", __func__); diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index 28863570dd6..d84dc8d4c91 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -455,7 +455,7 @@ static netdev_tx_t reg_vif_xmit(struct sk_buff *skb, struct net_device *dev) struct mr_table *mrt; struct flowi4 fl4 = { .flowi4_oif = dev->ifindex, - .flowi4_iif = skb->skb_iif, + .flowi4_iif = skb->skb_iif ? : LOOPBACK_IFINDEX, .flowi4_mark = skb->mark, }; int err; diff --git a/net/ipv4/netfilter.c b/net/ipv4/netfilter.c index c3e0adea9c2..7ebd6e37875 100644 --- a/net/ipv4/netfilter.c +++ b/net/ipv4/netfilter.c @@ -61,7 +61,7 @@ int ip_route_me_harder(struct sk_buff *skb, unsigned int addr_type) skb_dst_set(skb, NULL); dst = xfrm_lookup(net, dst, flowi4_to_flowi(&fl4), skb->sk, 0); if (IS_ERR(dst)) - return PTR_ERR(dst);; + return PTR_ERR(dst); skb_dst_set(skb, dst); } #endif diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c index 59da7cde072..f95b6f93814 100644 --- a/net/ipv4/netfilter/arp_tables.c +++ b/net/ipv4/netfilter/arp_tables.c @@ -1044,8 +1044,10 @@ static int __do_replace(struct net *net, const char *name, xt_free_table_info(oldinfo); if (copy_to_user(counters_ptr, counters, - sizeof(struct xt_counters) * num_counters) != 0) - ret = -EFAULT; + sizeof(struct xt_counters) * num_counters) != 0) { + /* Silent error, can't fail, new table is already in place */ + net_warn_ratelimited("arptables: counters copy to user failed while replacing table\n"); + } vfree(counters); xt_table_unlock(t); return ret; diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c index 718dfbd30cb..99e810f8467 100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c @@ -1231,8 +1231,10 @@ __do_replace(struct net *net, const char *name, unsigned int valid_hooks, xt_free_table_info(oldinfo); if (copy_to_user(counters_ptr, counters, - sizeof(struct xt_counters) * num_counters) != 0) - ret = -EFAULT; + sizeof(struct xt_counters) * num_counters) != 0) { + /* Silent error, can't fail, new table is already in place */ + net_warn_ratelimited("iptables: counters copy to user failed while replacing table\n"); + } vfree(counters); xt_table_unlock(t); return ret; diff --git a/net/ipv4/netfilter/ipt_rpfilter.c b/net/ipv4/netfilter/ipt_rpfilter.c index c49dcd0284a..4bfaedf9b34 100644 --- a/net/ipv4/netfilter/ipt_rpfilter.c +++ b/net/ipv4/netfilter/ipt_rpfilter.c @@ -89,11 +89,8 @@ static bool rpfilter_mt(const struct sk_buff *skb, struct xt_action_param *par) if (ipv4_is_multicast(iph->daddr)) { if (ipv4_is_zeronet(iph->saddr)) return ipv4_is_local_multicast(iph->daddr) ^ invert; - flow.flowi4_iif = 0; - } else { - flow.flowi4_iif = LOOPBACK_IFINDEX; } - + flow.flowi4_iif = LOOPBACK_IFINDEX; flow.daddr = iph->saddr; flow.saddr = rpfilter_get_saddr(iph->daddr); flow.flowi4_oif = 0; diff --git a/net/ipv4/netfilter/nf_defrag_ipv4.c b/net/ipv4/netfilter/nf_defrag_ipv4.c index 12e13bd82b5..f40f321b41f 100644 --- a/net/ipv4/netfilter/nf_defrag_ipv4.c +++ b/net/ipv4/netfilter/nf_defrag_ipv4.c @@ -22,7 +22,6 @@ #endif #include <net/netfilter/nf_conntrack_zones.h> -/* Returns new sk_buff, or NULL */ static int nf_ct_ipv4_gather_frags(struct sk_buff *skb, u_int32_t user) { int err; @@ -33,8 +32,10 @@ static int nf_ct_ipv4_gather_frags(struct sk_buff *skb, u_int32_t user) err = ip_defrag(skb, user); local_bh_enable(); - if (!err) + if (!err) { ip_send_check(ip_hdr(skb)); + skb->local_df = 1; + } return err; } diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c index 2d11c094296..044a0ddf6a7 100644 --- a/net/ipv4/ping.c +++ b/net/ipv4/ping.c @@ -236,15 +236,15 @@ exit: static void inet_get_ping_group_range_net(struct net *net, kgid_t *low, kgid_t *high) { - kgid_t *data = net->ipv4.sysctl_ping_group_range; + kgid_t *data = net->ipv4.ping_group_range.range; unsigned int seq; do { - seq = read_seqbegin(&net->ipv4.sysctl_local_ports.lock); + seq = read_seqbegin(&net->ipv4.ping_group_range.lock); *low = data[0]; *high = data[1]; - } while (read_seqretry(&net->ipv4.sysctl_local_ports.lock, seq)); + } while (read_seqretry(&net->ipv4.ping_group_range.lock, seq)); } @@ -252,26 +252,33 @@ int ping_init_sock(struct sock *sk) { struct net *net = sock_net(sk); kgid_t group = current_egid(); - struct group_info *group_info = get_current_groups(); - int i, j, count = group_info->ngroups; + struct group_info *group_info; + int i, j, count; kgid_t low, high; + int ret = 0; inet_get_ping_group_range_net(net, &low, &high); if (gid_lte(low, group) && gid_lte(group, high)) return 0; + group_info = get_current_groups(); + count = group_info->ngroups; for (i = 0; i < group_info->nblocks; i++) { int cp_count = min_t(int, NGROUPS_PER_BLOCK, count); for (j = 0; j < cp_count; j++) { kgid_t gid = group_info->blocks[i][j]; if (gid_lte(low, gid) && gid_lte(gid, high)) - return 0; + goto out_release_group; } count -= cp_count; } - return -EACCES; + ret = -EACCES; + +out_release_group: + put_group_info(group_info); + return ret; } EXPORT_SYMBOL_GPL(ping_init_sock); @@ -727,7 +734,7 @@ static int ping_v4_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *m sock_tx_timestamp(sk, &ipc.tx_flags); if (msg->msg_controllen) { - err = ip_cmsg_send(sock_net(sk), msg, &ipc); + err = ip_cmsg_send(sock_net(sk), msg, &ipc, false); if (err) return err; if (ipc.opt) diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index a6c8a80ec9d..ad737fad6d8 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -273,6 +273,7 @@ static const struct snmp_mib snmp4_net_list[] = { SNMP_MIB_ITEM("TCPChallengeACK", LINUX_MIB_TCPCHALLENGEACK), SNMP_MIB_ITEM("TCPSYNChallenge", LINUX_MIB_TCPSYNCHALLENGE), SNMP_MIB_ITEM("TCPFastOpenActive", LINUX_MIB_TCPFASTOPENACTIVE), + SNMP_MIB_ITEM("TCPFastOpenActiveFail", LINUX_MIB_TCPFASTOPENACTIVEFAIL), SNMP_MIB_ITEM("TCPFastOpenPassive", LINUX_MIB_TCPFASTOPENPASSIVE), SNMP_MIB_ITEM("TCPFastOpenPassiveFail", LINUX_MIB_TCPFASTOPENPASSIVEFAIL), SNMP_MIB_ITEM("TCPFastOpenListenOverflow", LINUX_MIB_TCPFASTOPENLISTENOVERFLOW), @@ -280,6 +281,11 @@ static const struct snmp_mib snmp4_net_list[] = { SNMP_MIB_ITEM("TCPSpuriousRtxHostQueues", LINUX_MIB_TCPSPURIOUS_RTX_HOSTQUEUES), SNMP_MIB_ITEM("BusyPollRxPackets", LINUX_MIB_BUSYPOLLRXPACKETS), SNMP_MIB_ITEM("TCPAutoCorking", LINUX_MIB_TCPAUTOCORKING), + SNMP_MIB_ITEM("TCPFromZeroWindowAdv", LINUX_MIB_TCPFROMZEROWINDOWADV), + SNMP_MIB_ITEM("TCPToZeroWindowAdv", LINUX_MIB_TCPTOZEROWINDOWADV), + SNMP_MIB_ITEM("TCPWantZeroWindowAdv", LINUX_MIB_TCPWANTZEROWINDOWADV), + SNMP_MIB_ITEM("TCPSynRetrans", LINUX_MIB_TCPSYNRETRANS), + SNMP_MIB_ITEM("TCPOrigDataSent", LINUX_MIB_TCPORIGDATASENT), SNMP_MIB_SENTINEL }; diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index c04518f4850..a9dbe58bdfe 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -524,7 +524,7 @@ static int raw_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, ipc.oif = sk->sk_bound_dev_if; if (msg->msg_controllen) { - err = ip_cmsg_send(sock_net(sk), msg, &ipc); + err = ip_cmsg_send(sock_net(sk), msg, &ipc, false); if (err) goto out; if (ipc.opt) diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 4c011ec69ed..5e676be3dae 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -139,11 +139,6 @@ static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb); static void ipv4_dst_destroy(struct dst_entry *dst); -static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev, - int how) -{ -} - static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old) { WARN_ON(1); @@ -162,7 +157,6 @@ static struct dst_ops ipv4_dst_ops = { .mtu = ipv4_mtu, .cow_metrics = ipv4_cow_metrics, .destroy = ipv4_dst_destroy, - .ifdown = ipv4_dst_ifdown, .negative_advice = ipv4_negative_advice, .link_failure = ipv4_link_failure, .update_pmtu = ip_rt_update_pmtu, @@ -194,7 +188,7 @@ const __u8 ip_tos2prio[16] = { EXPORT_SYMBOL(ip_tos2prio); static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat); -#define RT_CACHE_STAT_INC(field) __this_cpu_inc(rt_cache_stat.field) +#define RT_CACHE_STAT_INC(field) raw_cpu_inc(rt_cache_stat.field) #ifdef CONFIG_PROC_FS static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos) @@ -697,7 +691,6 @@ static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw, out_unlock: spin_unlock_bh(&fnhe_lock); - return; } static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4, @@ -1136,7 +1129,7 @@ static void ipv4_link_failure(struct sk_buff *skb) dst_set_expires(&rt->dst, 0); } -static int ip_rt_bug(struct sk_buff *skb) +static int ip_rt_bug(struct sock *sk, struct sk_buff *skb) { pr_debug("%s: %pI4 -> %pI4, %s\n", __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr, @@ -1526,7 +1519,7 @@ static int __mkroute_input(struct sk_buff *skb, struct in_device *out_dev; unsigned int flags = 0; bool do_cache; - u32 itag; + u32 itag = 0; /* get a working reference to the output device */ out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res)); @@ -1707,8 +1700,7 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, if (res.type == RTN_LOCAL) { err = fib_validate_source(skb, saddr, daddr, tos, - LOOPBACK_IFINDEX, - dev, in_dev, &itag); + 0, dev, in_dev, &itag); if (err < 0) goto martian_source_keep_err; goto local_input; @@ -2225,7 +2217,7 @@ struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_or new->__use = 1; new->input = dst_discard; - new->output = dst_discard; + new->output = dst_discard_sk; new->dev = ort->dst.dev; if (new->dev) @@ -2364,7 +2356,7 @@ static int rt_fill_info(struct net *net, __be32 dst, __be32 src, } } else #endif - if (nla_put_u32(skb, RTA_IIF, rt->rt_iif)) + if (nla_put_u32(skb, RTA_IIF, skb->dev->ifindex)) goto nla_put_failure; } @@ -2475,11 +2467,6 @@ errout_free: goto errout; } -int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb) -{ - return skb->len; -} - void ip_rt_multicast_event(struct in_device *in_dev) { rt_cache_flush(dev_net(in_dev->dev)); diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 44eba052b43..5cde8f263d4 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -45,10 +45,10 @@ static int ip_ping_group_range_max[] = { GID_T_MAX, GID_T_MAX }; /* Update system visible IP port range */ static void set_local_port_range(struct net *net, int range[2]) { - write_seqlock(&net->ipv4.sysctl_local_ports.lock); - net->ipv4.sysctl_local_ports.range[0] = range[0]; - net->ipv4.sysctl_local_ports.range[1] = range[1]; - write_sequnlock(&net->ipv4.sysctl_local_ports.lock); + write_seqlock(&net->ipv4.ip_local_ports.lock); + net->ipv4.ip_local_ports.range[0] = range[0]; + net->ipv4.ip_local_ports.range[1] = range[1]; + write_sequnlock(&net->ipv4.ip_local_ports.lock); } /* Validate changes from /proc interface. */ @@ -57,7 +57,7 @@ static int ipv4_local_port_range(struct ctl_table *table, int write, size_t *lenp, loff_t *ppos) { struct net *net = - container_of(table->data, struct net, ipv4.sysctl_local_ports.range); + container_of(table->data, struct net, ipv4.ip_local_ports.range); int ret; int range[2]; struct ctl_table tmp = { @@ -87,14 +87,14 @@ static void inet_get_ping_group_range_table(struct ctl_table *table, kgid_t *low { kgid_t *data = table->data; struct net *net = - container_of(table->data, struct net, ipv4.sysctl_ping_group_range); + container_of(table->data, struct net, ipv4.ping_group_range.range); unsigned int seq; do { - seq = read_seqbegin(&net->ipv4.sysctl_local_ports.lock); + seq = read_seqbegin(&net->ipv4.ip_local_ports.lock); *low = data[0]; *high = data[1]; - } while (read_seqretry(&net->ipv4.sysctl_local_ports.lock, seq)); + } while (read_seqretry(&net->ipv4.ip_local_ports.lock, seq)); } /* Update system visible IP port range */ @@ -102,11 +102,11 @@ static void set_ping_group_range(struct ctl_table *table, kgid_t low, kgid_t hig { kgid_t *data = table->data; struct net *net = - container_of(table->data, struct net, ipv4.sysctl_ping_group_range); - write_seqlock(&net->ipv4.sysctl_local_ports.lock); + container_of(table->data, struct net, ipv4.ping_group_range.range); + write_seqlock(&net->ipv4.ip_local_ports.lock); data[0] = low; data[1] = high; - write_sequnlock(&net->ipv4.sysctl_local_ports.lock); + write_sequnlock(&net->ipv4.ip_local_ports.lock); } /* Validate changes from /proc interface. */ @@ -805,7 +805,7 @@ static struct ctl_table ipv4_net_table[] = { }, { .procname = "ping_group_range", - .data = &init_net.ipv4.sysctl_ping_group_range, + .data = &init_net.ipv4.ping_group_range.range, .maxlen = sizeof(gid_t)*2, .mode = 0644, .proc_handler = ipv4_ping_group_range, @@ -819,8 +819,8 @@ static struct ctl_table ipv4_net_table[] = { }, { .procname = "ip_local_port_range", - .maxlen = sizeof(init_net.ipv4.sysctl_local_ports.range), - .data = &init_net.ipv4.sysctl_local_ports.range, + .maxlen = sizeof(init_net.ipv4.ip_local_ports.range), + .data = &init_net.ipv4.ip_local_ports.range, .mode = 0644, .proc_handler = ipv4_local_port_range, }, @@ -858,20 +858,6 @@ static __net_init int ipv4_sysctl_init_net(struct net *net) table[i].data += (void *)net - (void *)&init_net; } - /* - * Sane defaults - nobody may create ping sockets. - * Boot scripts should set this to distro-specific group. - */ - net->ipv4.sysctl_ping_group_range[0] = make_kgid(&init_user_ns, 1); - net->ipv4.sysctl_ping_group_range[1] = make_kgid(&init_user_ns, 0); - - /* - * Set defaults for local port range - */ - seqlock_init(&net->ipv4.sysctl_local_ports.lock); - net->ipv4.sysctl_local_ports.range[0] = 32768; - net->ipv4.sysctl_local_ports.range[1] = 61000; - net->ipv4.ipv4_hdr = register_net_sysctl(net, "net/ipv4", table); if (net->ipv4.ipv4_hdr == NULL) goto err_reg; diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 97c8f5620c4..4bd6d52eeff 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -387,7 +387,7 @@ void tcp_init_sock(struct sock *sk) INIT_LIST_HEAD(&tp->tsq_node); icsk->icsk_rto = TCP_TIMEOUT_INIT; - tp->mdev = TCP_TIMEOUT_INIT; + tp->mdev_us = jiffies_to_usecs(TCP_TIMEOUT_INIT); /* So many TCP implementations out there (incorrectly) count the * initial SYN frame in their delayed-ACK and congestion control @@ -2341,7 +2341,7 @@ int tcp_disconnect(struct sock *sk, int flags) sk->sk_shutdown = 0; sock_reset_flag(sk, SOCK_DONE); - tp->srtt = 0; + tp->srtt_us = 0; if ((tp->write_seq += tp->max_window + 2) == 0) tp->write_seq = 1; icsk->icsk_backoff = 0; @@ -2785,8 +2785,8 @@ void tcp_get_info(const struct sock *sk, struct tcp_info *info) info->tcpi_pmtu = icsk->icsk_pmtu_cookie; info->tcpi_rcv_ssthresh = tp->rcv_ssthresh; - info->tcpi_rtt = jiffies_to_usecs(tp->srtt)>>3; - info->tcpi_rttvar = jiffies_to_usecs(tp->mdev)>>2; + info->tcpi_rtt = tp->srtt_us >> 3; + info->tcpi_rttvar = tp->mdev_us >> 2; info->tcpi_snd_ssthresh = tp->snd_ssthresh; info->tcpi_snd_cwnd = tp->snd_cwnd; info->tcpi_advmss = tp->advmss; @@ -2796,6 +2796,11 @@ void tcp_get_info(const struct sock *sk, struct tcp_info *info) info->tcpi_rcv_space = tp->rcvq_space.space; info->tcpi_total_retrans = tp->total_retrans; + + info->tcpi_pacing_rate = sk->sk_pacing_rate != ~0U ? + sk->sk_pacing_rate : ~0ULL; + info->tcpi_max_pacing_rate = sk->sk_max_pacing_rate != ~0U ? + sk->sk_max_pacing_rate : ~0ULL; } EXPORT_SYMBOL_GPL(tcp_get_info); diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c index 2388275adb9..2b9464c93b8 100644 --- a/net/ipv4/tcp_cong.c +++ b/net/ipv4/tcp_cong.c @@ -361,21 +361,12 @@ u32 tcp_reno_ssthresh(struct sock *sk) } EXPORT_SYMBOL_GPL(tcp_reno_ssthresh); -/* Lower bound on congestion window with halving. */ -u32 tcp_reno_min_cwnd(const struct sock *sk) -{ - const struct tcp_sock *tp = tcp_sk(sk); - return tp->snd_ssthresh/2; -} -EXPORT_SYMBOL_GPL(tcp_reno_min_cwnd); - struct tcp_congestion_ops tcp_reno = { .flags = TCP_CONG_NON_RESTRICTED, .name = "reno", .owner = THIS_MODULE, .ssthresh = tcp_reno_ssthresh, .cong_avoid = tcp_reno_cong_avoid, - .min_cwnd = tcp_reno_min_cwnd, }; /* Initial congestion control used (until SYN) @@ -387,6 +378,5 @@ struct tcp_congestion_ops tcp_init_congestion_ops = { .owner = THIS_MODULE, .ssthresh = tcp_reno_ssthresh, .cong_avoid = tcp_reno_cong_avoid, - .min_cwnd = tcp_reno_min_cwnd, }; EXPORT_SYMBOL_GPL(tcp_init_congestion_ops); diff --git a/net/ipv4/tcp_cubic.c b/net/ipv4/tcp_cubic.c index 828e4c3ffba..b4f1b29b08b 100644 --- a/net/ipv4/tcp_cubic.c +++ b/net/ipv4/tcp_cubic.c @@ -409,7 +409,7 @@ static void bictcp_acked(struct sock *sk, u32 cnt, s32 rtt_us) ratio -= ca->delayed_ack >> ACK_RATIO_SHIFT; ratio += cnt; - ca->delayed_ack = min(ratio, ACK_RATIO_LIMIT); + ca->delayed_ack = clamp(ratio, 1U, ACK_RATIO_LIMIT); } /* Some calls are for duplicates without timetamps */ @@ -476,10 +476,6 @@ static int __init cubictcp_register(void) /* divide by bic_scale and by constant Srtt (100ms) */ do_div(cube_factor, bic_scale * 10); - /* hystart needs ms clock resolution */ - if (hystart && HZ < 1000) - cubictcp.flags |= TCP_CONG_RTT_STAMP; - return tcp_register_congestion_control(&cubictcp); } diff --git a/net/ipv4/tcp_highspeed.c b/net/ipv4/tcp_highspeed.c index 8ed9305dfdf..8b9e7bad77c 100644 --- a/net/ipv4/tcp_highspeed.c +++ b/net/ipv4/tcp_highspeed.c @@ -162,7 +162,6 @@ static struct tcp_congestion_ops tcp_highspeed __read_mostly = { .init = hstcp_init, .ssthresh = hstcp_ssthresh, .cong_avoid = hstcp_cong_avoid, - .min_cwnd = tcp_reno_min_cwnd, .owner = THIS_MODULE, .name = "highspeed" diff --git a/net/ipv4/tcp_hybla.c b/net/ipv4/tcp_hybla.c index 478fe82611b..a15a799bf76 100644 --- a/net/ipv4/tcp_hybla.c +++ b/net/ipv4/tcp_hybla.c @@ -21,7 +21,7 @@ struct hybla { u32 rho2; /* Rho * Rho, integer part */ u32 rho_3ls; /* Rho parameter, <<3 */ u32 rho2_7ls; /* Rho^2, <<7 */ - u32 minrtt; /* Minimum smoothed round trip time value seen */ + u32 minrtt_us; /* Minimum smoothed round trip time value seen */ }; /* Hybla reference round trip time (default= 1/40 sec = 25 ms), in ms */ @@ -35,7 +35,9 @@ static inline void hybla_recalc_param (struct sock *sk) { struct hybla *ca = inet_csk_ca(sk); - ca->rho_3ls = max_t(u32, tcp_sk(sk)->srtt / msecs_to_jiffies(rtt0), 8); + ca->rho_3ls = max_t(u32, + tcp_sk(sk)->srtt_us / (rtt0 * USEC_PER_MSEC), + 8U); ca->rho = ca->rho_3ls >> 3; ca->rho2_7ls = (ca->rho_3ls * ca->rho_3ls) << 1; ca->rho2 = ca->rho2_7ls >> 7; @@ -59,7 +61,7 @@ static void hybla_init(struct sock *sk) hybla_recalc_param(sk); /* set minimum rtt as this is the 1st ever seen */ - ca->minrtt = tp->srtt; + ca->minrtt_us = tp->srtt_us; tp->snd_cwnd = ca->rho; } @@ -94,9 +96,9 @@ static void hybla_cong_avoid(struct sock *sk, u32 ack, u32 acked, int is_slowstart = 0; /* Recalculate rho only if this srtt is the lowest */ - if (tp->srtt < ca->minrtt){ + if (tp->srtt_us < ca->minrtt_us) { hybla_recalc_param(sk); - ca->minrtt = tp->srtt; + ca->minrtt_us = tp->srtt_us; } if (!tcp_is_cwnd_limited(sk, in_flight)) @@ -166,7 +168,6 @@ static void hybla_cong_avoid(struct sock *sk, u32 ack, u32 acked, static struct tcp_congestion_ops tcp_hybla __read_mostly = { .init = hybla_init, .ssthresh = tcp_reno_ssthresh, - .min_cwnd = tcp_reno_min_cwnd, .cong_avoid = hybla_cong_avoid, .set_state = hybla_state, diff --git a/net/ipv4/tcp_illinois.c b/net/ipv4/tcp_illinois.c index e498a62b8f9..863d105e301 100644 --- a/net/ipv4/tcp_illinois.c +++ b/net/ipv4/tcp_illinois.c @@ -325,10 +325,8 @@ static void tcp_illinois_info(struct sock *sk, u32 ext, } static struct tcp_congestion_ops tcp_illinois __read_mostly = { - .flags = TCP_CONG_RTT_STAMP, .init = tcp_illinois_init, .ssthresh = tcp_illinois_ssthresh, - .min_cwnd = tcp_reno_min_cwnd, .cong_avoid = tcp_illinois_cong_avoid, .set_state = tcp_illinois_state, .get_info = tcp_illinois_info, diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index eeaac399420..3a26b3b23f1 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -667,11 +667,11 @@ static void tcp_event_data_recv(struct sock *sk, struct sk_buff *skb) * To save cycles in the RFC 1323 implementation it was better to break * it up into three procedures. -- erics */ -static void tcp_rtt_estimator(struct sock *sk, const __u32 mrtt) +static void tcp_rtt_estimator(struct sock *sk, long mrtt_us) { struct tcp_sock *tp = tcp_sk(sk); - long m = mrtt; /* RTT */ - u32 srtt = tp->srtt; + long m = mrtt_us; /* RTT */ + u32 srtt = tp->srtt_us; /* The following amusing code comes from Jacobson's * article in SIGCOMM '88. Note that rtt and mdev @@ -694,7 +694,7 @@ static void tcp_rtt_estimator(struct sock *sk, const __u32 mrtt) srtt += m; /* rtt = 7/8 rtt + 1/8 new */ if (m < 0) { m = -m; /* m is now abs(error) */ - m -= (tp->mdev >> 2); /* similar update on mdev */ + m -= (tp->mdev_us >> 2); /* similar update on mdev */ /* This is similar to one of Eifel findings. * Eifel blocks mdev updates when rtt decreases. * This solution is a bit different: we use finer gain @@ -706,28 +706,29 @@ static void tcp_rtt_estimator(struct sock *sk, const __u32 mrtt) if (m > 0) m >>= 3; } else { - m -= (tp->mdev >> 2); /* similar update on mdev */ + m -= (tp->mdev_us >> 2); /* similar update on mdev */ } - tp->mdev += m; /* mdev = 3/4 mdev + 1/4 new */ - if (tp->mdev > tp->mdev_max) { - tp->mdev_max = tp->mdev; - if (tp->mdev_max > tp->rttvar) - tp->rttvar = tp->mdev_max; + tp->mdev_us += m; /* mdev = 3/4 mdev + 1/4 new */ + if (tp->mdev_us > tp->mdev_max_us) { + tp->mdev_max_us = tp->mdev_us; + if (tp->mdev_max_us > tp->rttvar_us) + tp->rttvar_us = tp->mdev_max_us; } if (after(tp->snd_una, tp->rtt_seq)) { - if (tp->mdev_max < tp->rttvar) - tp->rttvar -= (tp->rttvar - tp->mdev_max) >> 2; + if (tp->mdev_max_us < tp->rttvar_us) + tp->rttvar_us -= (tp->rttvar_us - tp->mdev_max_us) >> 2; tp->rtt_seq = tp->snd_nxt; - tp->mdev_max = tcp_rto_min(sk); + tp->mdev_max_us = tcp_rto_min_us(sk); } } else { /* no previous measure. */ srtt = m << 3; /* take the measured time to be rtt */ - tp->mdev = m << 1; /* make sure rto = 3*rtt */ - tp->mdev_max = tp->rttvar = max(tp->mdev, tcp_rto_min(sk)); + tp->mdev_us = m << 1; /* make sure rto = 3*rtt */ + tp->rttvar_us = max(tp->mdev_us, tcp_rto_min_us(sk)); + tp->mdev_max_us = tp->rttvar_us; tp->rtt_seq = tp->snd_nxt; } - tp->srtt = max(1U, srtt); + tp->srtt_us = max(1U, srtt); } /* Set the sk_pacing_rate to allow proper sizing of TSO packets. @@ -742,20 +743,12 @@ static void tcp_update_pacing_rate(struct sock *sk) u64 rate; /* set sk_pacing_rate to 200 % of current rate (mss * cwnd / srtt) */ - rate = (u64)tp->mss_cache * 2 * (HZ << 3); + rate = (u64)tp->mss_cache * 2 * (USEC_PER_SEC << 3); rate *= max(tp->snd_cwnd, tp->packets_out); - /* Correction for small srtt and scheduling constraints. - * For small rtt, consider noise is too high, and use - * the minimal value (srtt = 1 -> 125 us for HZ=1000) - * - * We probably need usec resolution in the future. - * Note: This also takes care of possible srtt=0 case, - * when tcp_rtt_estimator() was not yet called. - */ - if (tp->srtt > 8 + 2) - do_div(rate, tp->srtt); + if (likely(tp->srtt_us)) + do_div(rate, tp->srtt_us); /* ACCESS_ONCE() is needed because sch_fq fetches sk_pacing_rate * without any lock. We want to make sure compiler wont store @@ -1122,10 +1115,10 @@ static bool tcp_check_dsack(struct sock *sk, const struct sk_buff *ack_skb, } struct tcp_sacktag_state { - int reord; - int fack_count; - int flag; - s32 rtt; /* RTT measured by SACKing never-retransmitted data */ + int reord; + int fack_count; + long rtt_us; /* RTT measured by SACKing never-retransmitted data */ + int flag; }; /* Check if skb is fully within the SACK block. In presence of GSO skbs, @@ -1186,7 +1179,8 @@ static int tcp_match_skb_to_sack(struct sock *sk, struct sk_buff *skb, static u8 tcp_sacktag_one(struct sock *sk, struct tcp_sacktag_state *state, u8 sacked, u32 start_seq, u32 end_seq, - int dup_sack, int pcount, u32 xmit_time) + int dup_sack, int pcount, + const struct skb_mstamp *xmit_time) { struct tcp_sock *tp = tcp_sk(sk); int fack_count = state->fack_count; @@ -1227,8 +1221,13 @@ static u8 tcp_sacktag_one(struct sock *sk, if (!after(end_seq, tp->high_seq)) state->flag |= FLAG_ORIG_SACK_ACKED; /* Pick the earliest sequence sacked for RTT */ - if (state->rtt < 0) - state->rtt = tcp_time_stamp - xmit_time; + if (state->rtt_us < 0) { + struct skb_mstamp now; + + skb_mstamp_get(&now); + state->rtt_us = skb_mstamp_us_delta(&now, + xmit_time); + } } if (sacked & TCPCB_LOST) { @@ -1287,7 +1286,7 @@ static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *skb, */ tcp_sacktag_one(sk, state, TCP_SKB_CB(skb)->sacked, start_seq, end_seq, dup_sack, pcount, - TCP_SKB_CB(skb)->when); + &skb->skb_mstamp); if (skb == tp->lost_skb_hint) tp->lost_cnt_hint += pcount; @@ -1565,7 +1564,7 @@ static struct sk_buff *tcp_sacktag_walk(struct sk_buff *skb, struct sock *sk, TCP_SKB_CB(skb)->end_seq, dup_sack, tcp_skb_pcount(skb), - TCP_SKB_CB(skb)->when); + &skb->skb_mstamp); if (!before(TCP_SKB_CB(skb)->seq, tcp_highest_sack_seq(tp))) @@ -1622,7 +1621,7 @@ static int tcp_sack_cache_ok(const struct tcp_sock *tp, const struct tcp_sack_bl static int tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb, - u32 prior_snd_una, s32 *sack_rtt) + u32 prior_snd_una, long *sack_rtt_us) { struct tcp_sock *tp = tcp_sk(sk); const unsigned char *ptr = (skb_transport_header(ack_skb) + @@ -1640,7 +1639,7 @@ tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb, state.flag = 0; state.reord = tp->packets_out; - state.rtt = -1; + state.rtt_us = -1L; if (!tp->sacked_out) { if (WARN_ON(tp->fackets_out)) @@ -1824,7 +1823,7 @@ out: WARN_ON((int)tp->retrans_out < 0); WARN_ON((int)tcp_packets_in_flight(tp) < 0); #endif - *sack_rtt = state.rtt; + *sack_rtt_us = state.rtt_us; return state.flag; } @@ -2035,10 +2034,12 @@ static bool tcp_pause_early_retransmit(struct sock *sk, int flag) * available, or RTO is scheduled to fire first. */ if (sysctl_tcp_early_retrans < 2 || sysctl_tcp_early_retrans > 3 || - (flag & FLAG_ECE) || !tp->srtt) + (flag & FLAG_ECE) || !tp->srtt_us) return false; - delay = max_t(unsigned long, (tp->srtt >> 5), msecs_to_jiffies(2)); + delay = max(usecs_to_jiffies(tp->srtt_us >> 5), + msecs_to_jiffies(2)); + if (!time_after(inet_csk(sk)->icsk_timeout, (jiffies + delay))) return false; @@ -2683,13 +2684,12 @@ static void tcp_process_loss(struct sock *sk, int flag, bool is_dupack) bool recovered = !before(tp->snd_una, tp->high_seq); if (tp->frto) { /* F-RTO RFC5682 sec 3.1 (sack enhanced version). */ - if (flag & FLAG_ORIG_SACK_ACKED) { - /* Step 3.b. A timeout is spurious if not all data are - * lost, i.e., never-retransmitted data are (s)acked. - */ - tcp_try_undo_loss(sk, true); + /* Step 3.b. A timeout is spurious if not all data are + * lost, i.e., never-retransmitted data are (s)acked. + */ + if (tcp_try_undo_loss(sk, flag & FLAG_ORIG_SACK_ACKED)) return; - } + if (after(tp->snd_nxt, tp->high_seq) && (flag & FLAG_DATA_SACKED || is_dupack)) { tp->frto = 0; /* Loss was real: 2nd part of step 3.a */ @@ -2885,7 +2885,7 @@ static void tcp_fastretrans_alert(struct sock *sk, const int acked, } static inline bool tcp_ack_update_rtt(struct sock *sk, const int flag, - s32 seq_rtt, s32 sack_rtt) + long seq_rtt_us, long sack_rtt_us) { const struct tcp_sock *tp = tcp_sk(sk); @@ -2895,10 +2895,10 @@ static inline bool tcp_ack_update_rtt(struct sock *sk, const int flag, * is acked (RFC6298). */ if (flag & FLAG_RETRANS_DATA_ACKED) - seq_rtt = -1; + seq_rtt_us = -1L; - if (seq_rtt < 0) - seq_rtt = sack_rtt; + if (seq_rtt_us < 0) + seq_rtt_us = sack_rtt_us; /* RTTM Rule: A TSecr value received in a segment is used to * update the averaged RTT measurement only if the segment @@ -2906,14 +2906,14 @@ static inline bool tcp_ack_update_rtt(struct sock *sk, const int flag, * left edge of the send window. * See draft-ietf-tcplw-high-performance-00, section 3.3. */ - if (seq_rtt < 0 && tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr && + if (seq_rtt_us < 0 && tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr && flag & FLAG_ACKED) - seq_rtt = tcp_time_stamp - tp->rx_opt.rcv_tsecr; + seq_rtt_us = jiffies_to_usecs(tcp_time_stamp - tp->rx_opt.rcv_tsecr); - if (seq_rtt < 0) + if (seq_rtt_us < 0) return false; - tcp_rtt_estimator(sk, seq_rtt); + tcp_rtt_estimator(sk, seq_rtt_us); tcp_set_rto(sk); /* RFC6298: only reset backoff on valid RTT measurement. */ @@ -2925,16 +2925,16 @@ static inline bool tcp_ack_update_rtt(struct sock *sk, const int flag, static void tcp_synack_rtt_meas(struct sock *sk, const u32 synack_stamp) { struct tcp_sock *tp = tcp_sk(sk); - s32 seq_rtt = -1; + long seq_rtt_us = -1L; if (synack_stamp && !tp->total_retrans) - seq_rtt = tcp_time_stamp - synack_stamp; + seq_rtt_us = jiffies_to_usecs(tcp_time_stamp - synack_stamp); /* If the ACK acks both the SYNACK and the (Fast Open'd) data packets * sent in SYN_RECV, SYNACK RTT is the smooth RTT computed in tcp_ack() */ - if (!tp->srtt) - tcp_ack_update_rtt(sk, FLAG_SYN_ACKED, seq_rtt, -1); + if (!tp->srtt_us) + tcp_ack_update_rtt(sk, FLAG_SYN_ACKED, seq_rtt_us, -1L); } static void tcp_cong_avoid(struct sock *sk, u32 ack, u32 acked, u32 in_flight) @@ -3023,26 +3023,27 @@ static u32 tcp_tso_acked(struct sock *sk, struct sk_buff *skb) * arrived at the other end. */ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, - u32 prior_snd_una, s32 sack_rtt) + u32 prior_snd_una, long sack_rtt_us) { - struct tcp_sock *tp = tcp_sk(sk); const struct inet_connection_sock *icsk = inet_csk(sk); - struct sk_buff *skb; - u32 now = tcp_time_stamp; + struct skb_mstamp first_ackt, last_ackt, now; + struct tcp_sock *tp = tcp_sk(sk); + u32 prior_sacked = tp->sacked_out; + u32 reord = tp->packets_out; bool fully_acked = true; - int flag = 0; + long ca_seq_rtt_us = -1L; + long seq_rtt_us = -1L; + struct sk_buff *skb; u32 pkts_acked = 0; - u32 reord = tp->packets_out; - u32 prior_sacked = tp->sacked_out; - s32 seq_rtt = -1; - s32 ca_seq_rtt = -1; - ktime_t last_ackt = net_invalid_timestamp(); bool rtt_update; + int flag = 0; + + first_ackt.v64 = 0; while ((skb = tcp_write_queue_head(sk)) && skb != tcp_send_head(sk)) { struct tcp_skb_cb *scb = TCP_SKB_CB(skb); - u32 acked_pcount; u8 sacked = scb->sacked; + u32 acked_pcount; /* Determine how many packets and what bytes were acked, tso and else */ if (after(scb->end_seq, tp->snd_una)) { @@ -3064,11 +3065,11 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, tp->retrans_out -= acked_pcount; flag |= FLAG_RETRANS_DATA_ACKED; } else { - ca_seq_rtt = now - scb->when; - last_ackt = skb->tstamp; - if (seq_rtt < 0) { - seq_rtt = ca_seq_rtt; - } + last_ackt = skb->skb_mstamp; + WARN_ON_ONCE(last_ackt.v64 == 0); + if (!first_ackt.v64) + first_ackt = last_ackt; + if (!(sacked & TCPCB_SACKED_ACKED)) reord = min(pkts_acked, reord); if (!after(scb->end_seq, tp->high_seq)) @@ -3114,7 +3115,13 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, if (skb && (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) flag |= FLAG_SACK_RENEGING; - rtt_update = tcp_ack_update_rtt(sk, flag, seq_rtt, sack_rtt); + skb_mstamp_get(&now); + if (first_ackt.v64) { + seq_rtt_us = skb_mstamp_us_delta(&now, &first_ackt); + ca_seq_rtt_us = skb_mstamp_us_delta(&now, &last_ackt); + } + + rtt_update = tcp_ack_update_rtt(sk, flag, seq_rtt_us, sack_rtt_us); if (flag & FLAG_ACKED) { const struct tcp_congestion_ops *ca_ops @@ -3142,25 +3149,11 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, tp->fackets_out -= min(pkts_acked, tp->fackets_out); - if (ca_ops->pkts_acked) { - s32 rtt_us = -1; - - /* Is the ACK triggering packet unambiguous? */ - if (!(flag & FLAG_RETRANS_DATA_ACKED)) { - /* High resolution needed and available? */ - if (ca_ops->flags & TCP_CONG_RTT_STAMP && - !ktime_equal(last_ackt, - net_invalid_timestamp())) - rtt_us = ktime_us_delta(ktime_get_real(), - last_ackt); - else if (ca_seq_rtt >= 0) - rtt_us = jiffies_to_usecs(ca_seq_rtt); - } + if (ca_ops->pkts_acked) + ca_ops->pkts_acked(sk, pkts_acked, ca_seq_rtt_us); - ca_ops->pkts_acked(sk, pkts_acked, rtt_us); - } - } else if (skb && rtt_update && sack_rtt >= 0 && - sack_rtt > (s32)(now - TCP_SKB_CB(skb)->when)) { + } else if (skb && rtt_update && sack_rtt_us >= 0 && + sack_rtt_us > skb_mstamp_us_delta(&now, &skb->skb_mstamp)) { /* Do not re-arm RTO if the sack RTT is measured from data sent * after when the head was last (re)transmitted. Otherwise the * timeout may continue to extend in loss recovery. @@ -3370,12 +3363,12 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) u32 ack_seq = TCP_SKB_CB(skb)->seq; u32 ack = TCP_SKB_CB(skb)->ack_seq; bool is_dupack = false; - u32 prior_in_flight, prior_cwnd = tp->snd_cwnd, prior_rtt = tp->srtt; + u32 prior_in_flight; u32 prior_fackets; int prior_packets = tp->packets_out; const int prior_unsacked = tp->packets_out - tp->sacked_out; int acked = 0; /* Number of packets newly acked */ - s32 sack_rtt = -1; + long sack_rtt_us = -1L; /* If the ack is older than previous acks * then we can probably ignore it. @@ -3433,7 +3426,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) if (TCP_SKB_CB(skb)->sacked) flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una, - &sack_rtt); + &sack_rtt_us); if (TCP_ECN_rcv_ecn_echo(tp, tcp_hdr(skb))) flag |= FLAG_ECE; @@ -3452,7 +3445,8 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) /* See if we can take anything off of the retransmit queue. */ acked = tp->packets_out; - flag |= tcp_clean_rtx_queue(sk, prior_fackets, prior_snd_una, sack_rtt); + flag |= tcp_clean_rtx_queue(sk, prior_fackets, prior_snd_una, + sack_rtt_us); acked -= tp->packets_out; /* Advance cwnd if state allows */ @@ -3475,8 +3469,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) if (icsk->icsk_pending == ICSK_TIME_RETRANS) tcp_schedule_loss_probe(sk); - if (tp->srtt != prior_rtt || tp->snd_cwnd != prior_cwnd) - tcp_update_pacing_rate(sk); + tcp_update_pacing_rate(sk); return 1; no_queue: @@ -3505,7 +3498,7 @@ old_ack: */ if (TCP_SKB_CB(skb)->sacked) { flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una, - &sack_rtt); + &sack_rtt_us); tcp_fastretrans_alert(sk, acked, prior_unsacked, is_dupack, flag); } @@ -4419,7 +4412,7 @@ queue_and_out: if (eaten > 0) kfree_skb_partial(skb, fragstolen); if (!sock_flag(sk, SOCK_DEAD)) - sk->sk_data_ready(sk, 0); + sk->sk_data_ready(sk); return; } @@ -4920,7 +4913,7 @@ static void tcp_urg(struct sock *sk, struct sk_buff *skb, const struct tcphdr *t BUG(); tp->urg_data = TCP_URG_VALID | tmp; if (!sock_flag(sk, SOCK_DEAD)) - sk->sk_data_ready(sk, 0); + sk->sk_data_ready(sk); } } } @@ -5006,11 +4999,11 @@ static bool tcp_dma_try_early_copy(struct sock *sk, struct sk_buff *skb, (tcp_flag_word(tcp_hdr(skb)) & TCP_FLAG_PSH) || (atomic_read(&sk->sk_rmem_alloc) > (sk->sk_rcvbuf >> 1))) { tp->ucopy.wakeup = 1; - sk->sk_data_ready(sk, 0); + sk->sk_data_ready(sk); } } else if (chunk > 0) { tp->ucopy.wakeup = 1; - sk->sk_data_ready(sk, 0); + sk->sk_data_ready(sk); } out: return copied_early; @@ -5281,7 +5274,7 @@ no_ack: #endif if (eaten) kfree_skb_partial(skb, fragstolen); - sk->sk_data_ready(sk, 0); + sk->sk_data_ready(sk); return; } } @@ -5401,9 +5394,12 @@ static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack, break; } tcp_rearm_rto(sk); + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPFASTOPENACTIVEFAIL); return true; } tp->syn_data_acked = tp->syn_data; + if (tp->syn_data_acked) + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPFASTOPENACTIVE); return false; } diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 1e4eac779f5..438f3b95143 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -435,7 +435,7 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info) break; icsk->icsk_backoff--; - inet_csk(sk)->icsk_rto = (tp->srtt ? __tcp_set_rto(tp) : + inet_csk(sk)->icsk_rto = (tp->srtt_us ? __tcp_set_rto(tp) : TCP_TIMEOUT_INIT) << icsk->icsk_backoff; tcp_bound_rto(sk); @@ -854,8 +854,10 @@ static int tcp_v4_rtx_synack(struct sock *sk, struct request_sock *req) { int res = tcp_v4_send_synack(sk, NULL, req, 0); - if (!res) + if (!res) { TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_RETRANSSEGS); + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSYNRETRANS); + } return res; } @@ -878,8 +880,6 @@ bool tcp_syn_flood_action(struct sock *sk, bool want_cookie = false; struct listen_sock *lopt; - - #ifdef CONFIG_SYN_COOKIES if (sysctl_tcp_syncookies) { msg = "Sending cookies"; @@ -1434,7 +1434,7 @@ static int tcp_v4_conn_req_fastopen(struct sock *sk, tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq; tp->syn_data_acked = 1; } - sk->sk_data_ready(sk, 0); + sk->sk_data_ready(sk); bh_unlock_sock(child); sock_put(child); WARN_ON(req->sk == NULL); diff --git a/net/ipv4/tcp_lp.c b/net/ipv4/tcp_lp.c index 991d62a2f9b..c9aecae3132 100644 --- a/net/ipv4/tcp_lp.c +++ b/net/ipv4/tcp_lp.c @@ -315,11 +315,9 @@ static void tcp_lp_pkts_acked(struct sock *sk, u32 num_acked, s32 rtt_us) } static struct tcp_congestion_ops tcp_lp __read_mostly = { - .flags = TCP_CONG_RTT_STAMP, .init = tcp_lp_init, .ssthresh = tcp_reno_ssthresh, .cong_avoid = tcp_lp_cong_avoid, - .min_cwnd = tcp_reno_min_cwnd, .pkts_acked = tcp_lp_pkts_acked, .owner = THIS_MODULE, diff --git a/net/ipv4/tcp_memcontrol.c b/net/ipv4/tcp_memcontrol.c index f7e522c558b..d4f015ad6c8 100644 --- a/net/ipv4/tcp_memcontrol.c +++ b/net/ipv4/tcp_memcontrol.c @@ -103,7 +103,7 @@ static int tcp_update_limit(struct mem_cgroup *memcg, u64 val) } static int tcp_cgroup_write(struct cgroup_subsys_state *css, struct cftype *cft, - const char *buffer) + char *buffer) { struct mem_cgroup *memcg = mem_cgroup_from_css(css); unsigned long long val; @@ -219,7 +219,7 @@ static struct cftype tcp_files[] = { static int __init tcp_memcontrol_init(void) { - WARN_ON(cgroup_add_cftypes(&mem_cgroup_subsys, tcp_files)); + WARN_ON(cgroup_add_cftypes(&memory_cgrp_subsys, tcp_files)); return 0; } __initcall(tcp_memcontrol_init); diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c index d547075d830..dcaf72f1021 100644 --- a/net/ipv4/tcp_metrics.c +++ b/net/ipv4/tcp_metrics.c @@ -33,6 +33,11 @@ struct tcp_fastopen_metrics { struct tcp_fastopen_cookie cookie; }; +/* TCP_METRIC_MAX includes 2 extra fields for userspace compatibility + * Kernel only stores RTT and RTTVAR in usec resolution + */ +#define TCP_METRIC_MAX_KERNEL (TCP_METRIC_MAX - 2) + struct tcp_metrics_block { struct tcp_metrics_block __rcu *tcpm_next; struct inetpeer_addr tcpm_saddr; @@ -41,7 +46,7 @@ struct tcp_metrics_block { u32 tcpm_ts; u32 tcpm_ts_stamp; u32 tcpm_lock; - u32 tcpm_vals[TCP_METRIC_MAX + 1]; + u32 tcpm_vals[TCP_METRIC_MAX_KERNEL + 1]; struct tcp_fastopen_metrics tcpm_fastopen; struct rcu_head rcu_head; @@ -59,12 +64,6 @@ static u32 tcp_metric_get(struct tcp_metrics_block *tm, return tm->tcpm_vals[idx]; } -static u32 tcp_metric_get_jiffies(struct tcp_metrics_block *tm, - enum tcp_metric_index idx) -{ - return msecs_to_jiffies(tm->tcpm_vals[idx]); -} - static void tcp_metric_set(struct tcp_metrics_block *tm, enum tcp_metric_index idx, u32 val) @@ -72,13 +71,6 @@ static void tcp_metric_set(struct tcp_metrics_block *tm, tm->tcpm_vals[idx] = val; } -static void tcp_metric_set_msecs(struct tcp_metrics_block *tm, - enum tcp_metric_index idx, - u32 val) -{ - tm->tcpm_vals[idx] = jiffies_to_msecs(val); -} - static bool addr_same(const struct inetpeer_addr *a, const struct inetpeer_addr *b) { @@ -101,9 +93,11 @@ struct tcpm_hash_bucket { static DEFINE_SPINLOCK(tcp_metrics_lock); -static void tcpm_suck_dst(struct tcp_metrics_block *tm, struct dst_entry *dst, +static void tcpm_suck_dst(struct tcp_metrics_block *tm, + const struct dst_entry *dst, bool fastopen_clear) { + u32 msval; u32 val; tm->tcpm_stamp = jiffies; @@ -121,8 +115,11 @@ static void tcpm_suck_dst(struct tcp_metrics_block *tm, struct dst_entry *dst, val |= 1 << TCP_METRIC_REORDERING; tm->tcpm_lock = val; - tm->tcpm_vals[TCP_METRIC_RTT] = dst_metric_raw(dst, RTAX_RTT); - tm->tcpm_vals[TCP_METRIC_RTTVAR] = dst_metric_raw(dst, RTAX_RTTVAR); + msval = dst_metric_raw(dst, RTAX_RTT); + tm->tcpm_vals[TCP_METRIC_RTT] = msval * USEC_PER_MSEC; + + msval = dst_metric_raw(dst, RTAX_RTTVAR); + tm->tcpm_vals[TCP_METRIC_RTTVAR] = msval * USEC_PER_MSEC; tm->tcpm_vals[TCP_METRIC_SSTHRESH] = dst_metric_raw(dst, RTAX_SSTHRESH); tm->tcpm_vals[TCP_METRIC_CWND] = dst_metric_raw(dst, RTAX_CWND); tm->tcpm_vals[TCP_METRIC_REORDERING] = dst_metric_raw(dst, RTAX_REORDERING); @@ -384,7 +381,7 @@ void tcp_update_metrics(struct sock *sk) dst_confirm(dst); rcu_read_lock(); - if (icsk->icsk_backoff || !tp->srtt) { + if (icsk->icsk_backoff || !tp->srtt_us) { /* This session failed to estimate rtt. Why? * Probably, no packets returned in time. Reset our * results. @@ -399,8 +396,8 @@ void tcp_update_metrics(struct sock *sk) if (!tm) goto out_unlock; - rtt = tcp_metric_get_jiffies(tm, TCP_METRIC_RTT); - m = rtt - tp->srtt; + rtt = tcp_metric_get(tm, TCP_METRIC_RTT); + m = rtt - tp->srtt_us; /* If newly calculated rtt larger than stored one, store new * one. Otherwise, use EWMA. Remember, rtt overestimation is @@ -408,10 +405,10 @@ void tcp_update_metrics(struct sock *sk) */ if (!tcp_metric_locked(tm, TCP_METRIC_RTT)) { if (m <= 0) - rtt = tp->srtt; + rtt = tp->srtt_us; else rtt -= (m >> 3); - tcp_metric_set_msecs(tm, TCP_METRIC_RTT, rtt); + tcp_metric_set(tm, TCP_METRIC_RTT, rtt); } if (!tcp_metric_locked(tm, TCP_METRIC_RTTVAR)) { @@ -422,16 +419,16 @@ void tcp_update_metrics(struct sock *sk) /* Scale deviation to rttvar fixed point */ m >>= 1; - if (m < tp->mdev) - m = tp->mdev; + if (m < tp->mdev_us) + m = tp->mdev_us; - var = tcp_metric_get_jiffies(tm, TCP_METRIC_RTTVAR); + var = tcp_metric_get(tm, TCP_METRIC_RTTVAR); if (m >= var) var = m; else var -= (var - m) >> 2; - tcp_metric_set_msecs(tm, TCP_METRIC_RTTVAR, var); + tcp_metric_set(tm, TCP_METRIC_RTTVAR, var); } if (tcp_in_initial_slowstart(tp)) { @@ -528,7 +525,7 @@ void tcp_init_metrics(struct sock *sk) tp->reordering = val; } - crtt = tcp_metric_get_jiffies(tm, TCP_METRIC_RTT); + crtt = tcp_metric_get(tm, TCP_METRIC_RTT); rcu_read_unlock(); reset: /* The initial RTT measurement from the SYN/SYN-ACK is not ideal @@ -551,18 +548,20 @@ reset: * to low value, and then abruptly stops to do it and starts to delay * ACKs, wait for troubles. */ - if (crtt > tp->srtt) { + if (crtt > tp->srtt_us) { /* Set RTO like tcp_rtt_estimator(), but from cached RTT. */ - crtt >>= 3; + crtt /= 8 * USEC_PER_MSEC; inet_csk(sk)->icsk_rto = crtt + max(2 * crtt, tcp_rto_min(sk)); - } else if (tp->srtt == 0) { + } else if (tp->srtt_us == 0) { /* RFC6298: 5.7 We've failed to get a valid RTT sample from * 3WHS. This is most likely due to retransmission, * including spurious one. Reset the RTO back to 3secs * from the more aggressive 1sec to avoid more spurious * retransmission. */ - tp->mdev = tp->mdev_max = tp->rttvar = TCP_TIMEOUT_FALLBACK; + tp->rttvar_us = jiffies_to_usecs(TCP_TIMEOUT_FALLBACK); + tp->mdev_us = tp->mdev_max_us = tp->rttvar_us; + inet_csk(sk)->icsk_rto = TCP_TIMEOUT_FALLBACK; } /* Cut cwnd down to 1 per RFC5681 if SYN or SYN-ACK has been @@ -809,10 +808,26 @@ static int tcp_metrics_fill_info(struct sk_buff *msg, nest = nla_nest_start(msg, TCP_METRICS_ATTR_VALS); if (!nest) goto nla_put_failure; - for (i = 0; i < TCP_METRIC_MAX + 1; i++) { - if (!tm->tcpm_vals[i]) + for (i = 0; i < TCP_METRIC_MAX_KERNEL + 1; i++) { + u32 val = tm->tcpm_vals[i]; + + if (!val) continue; - if (nla_put_u32(msg, i + 1, tm->tcpm_vals[i]) < 0) + if (i == TCP_METRIC_RTT) { + if (nla_put_u32(msg, TCP_METRIC_RTT_US + 1, + val) < 0) + goto nla_put_failure; + n++; + val = max(val / 1000, 1U); + } + if (i == TCP_METRIC_RTTVAR) { + if (nla_put_u32(msg, TCP_METRIC_RTTVAR_US + 1, + val) < 0) + goto nla_put_failure; + n++; + val = max(val / 1000, 1U); + } + if (nla_put_u32(msg, i + 1, val) < 0) goto nla_put_failure; n++; } diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 7a436c517e4..05c1b155251 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -398,8 +398,8 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, tcp_init_wl(newtp, treq->rcv_isn); - newtp->srtt = 0; - newtp->mdev = TCP_TIMEOUT_INIT; + newtp->srtt_us = 0; + newtp->mdev_us = jiffies_to_usecs(TCP_TIMEOUT_INIT); newicsk->icsk_rto = TCP_TIMEOUT_INIT; newtp->packets_out = 0; @@ -745,7 +745,7 @@ int tcp_child_process(struct sock *parent, struct sock *child, skb->len); /* Wakeup parent, send SIGIO */ if (state == TCP_SYN_RECV && child->sk_state != state) - parent->sk_data_ready(parent, 0); + parent->sk_data_ready(parent); } else { /* Alas, it is possible again, because we do lookup * in main socket hash table and lock on listening diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 17a11e65e57..12d6016bdd9 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -86,6 +86,9 @@ static void tcp_event_new_data_sent(struct sock *sk, const struct sk_buff *skb) icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) { tcp_rearm_rto(sk); } + + NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPORIGDATASENT, + tcp_skb_pcount(skb)); } /* SND.NXT, if window was not shrunk. @@ -269,6 +272,7 @@ EXPORT_SYMBOL(tcp_select_initial_window); static u16 tcp_select_window(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); + u32 old_win = tp->rcv_wnd; u32 cur_win = tcp_receive_window(tp); u32 new_win = __tcp_select_window(sk); @@ -281,6 +285,9 @@ static u16 tcp_select_window(struct sock *sk) * * Relax Will Robinson. */ + if (new_win == 0) + NET_INC_STATS(sock_net(sk), + LINUX_MIB_TCPWANTZEROWINDOWADV); new_win = ALIGN(cur_win, 1 << tp->rx_opt.rcv_wscale); } tp->rcv_wnd = new_win; @@ -298,8 +305,14 @@ static u16 tcp_select_window(struct sock *sk) new_win >>= tp->rx_opt.rcv_wscale; /* If we advertise zero window, disable fast path. */ - if (new_win == 0) + if (new_win == 0) { tp->pred_flags = 0; + if (old_win) + NET_INC_STATS(sock_net(sk), + LINUX_MIB_TCPTOZEROWINDOWADV); + } else if (old_win == 0) { + NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPFROMZEROWINDOWADV); + } return new_win; } @@ -867,11 +880,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, if (clone_it) { const struct sk_buff *fclone = skb + 1; - /* If congestion control is doing timestamping, we must - * take such a timestamp before we potentially clone/copy. - */ - if (icsk->icsk_ca_ops->flags & TCP_CONG_RTT_STAMP) - __net_timestamp(skb); + skb_mstamp_get(&skb->skb_mstamp); if (unlikely(skb->fclone == SKB_FCLONE_ORIG && fclone->fclone == SKB_FCLONE_CLONE)) @@ -884,6 +893,8 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, skb = skb_clone(skb, gfp_mask); if (unlikely(!skb)) return -ENOBUFS; + /* Our usage of tstamp should remain private */ + skb->tstamp.tv64 = 0; } inet = inet_sk(sk); @@ -970,7 +981,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, TCP_ADD_STATS(sock_net(sk), TCP_MIB_OUTSEGS, tcp_skb_pcount(skb)); - err = icsk->icsk_af_ops->queue_xmit(skb, &inet->cork.fl); + err = icsk->icsk_af_ops->queue_xmit(sk, skb, &inet->cork.fl); if (likely(err <= 0)) return err; @@ -1426,7 +1437,7 @@ static void tcp_minshall_update(struct tcp_sock *tp, unsigned int mss_now, * With Minshall's modification: all sent small packets are ACKed. */ static bool tcp_nagle_check(bool partial, const struct tcp_sock *tp, - unsigned int mss_now, int nonagle) + int nonagle) { return partial && ((nonagle & TCP_NAGLE_CORK) || @@ -1458,7 +1469,7 @@ static unsigned int tcp_mss_split_point(const struct sock *sk, * to include this last segment in this skb. * Otherwise, we'll split the skb at last MSS boundary */ - if (tcp_nagle_check(partial != 0, tp, mss_now, nonagle)) + if (tcp_nagle_check(partial != 0, tp, nonagle)) return needed - partial; return needed; @@ -1521,7 +1532,7 @@ static inline bool tcp_nagle_test(const struct tcp_sock *tp, const struct sk_buf if (tcp_urg_mode(tp) || (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)) return true; - if (!tcp_nagle_check(skb->len < cur_mss, tp, cur_mss, nonagle)) + if (!tcp_nagle_check(skb->len < cur_mss, tp, nonagle)) return true; return false; @@ -1975,7 +1986,7 @@ bool tcp_schedule_loss_probe(struct sock *sk) struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); u32 timeout, tlp_time_stamp, rto_time_stamp; - u32 rtt = tp->srtt >> 3; + u32 rtt = usecs_to_jiffies(tp->srtt_us >> 3); if (WARN_ON(icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS)) return false; @@ -1997,7 +2008,7 @@ bool tcp_schedule_loss_probe(struct sock *sk) /* Schedule a loss probe in 2*RTT for SACK capable connections * in Open state, that are either limited by cwnd or application. */ - if (sysctl_tcp_early_retrans < 3 || !tp->srtt || !tp->packets_out || + if (sysctl_tcp_early_retrans < 3 || !tp->srtt_us || !tp->packets_out || !tcp_is_sack(tp) || inet_csk(sk)->icsk_ca_state != TCP_CA_Open) return false; @@ -2082,7 +2093,6 @@ rearm_timer: if (likely(!err)) NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPLOSSPROBES); - return; } /* Push out any pending frames which were held back due to @@ -2180,7 +2190,8 @@ u32 __tcp_select_window(struct sock *sk) */ int mss = icsk->icsk_ack.rcv_mss; int free_space = tcp_space(sk); - int full_space = min_t(int, tp->window_clamp, tcp_full_space(sk)); + int allowed_space = tcp_full_space(sk); + int full_space = min_t(int, tp->window_clamp, allowed_space); int window; if (mss > full_space) @@ -2193,7 +2204,19 @@ u32 __tcp_select_window(struct sock *sk) tp->rcv_ssthresh = min(tp->rcv_ssthresh, 4U * tp->advmss); - if (free_space < mss) + /* free_space might become our new window, make sure we don't + * increase it due to wscale. + */ + free_space = round_down(free_space, 1 << tp->rx_opt.rcv_wscale); + + /* if free space is less than mss estimate, or is below 1/16th + * of the maximum allowed, try to move to zero-window, else + * tcp_clamp_window() will grow rcv buf up to tcp_rmem[2], and + * new incoming data is dropped due to memory limits. + * With large window, mss test triggers way too late in order + * to announce zero window in time before rmem limit kicks in. + */ + if (free_space < (allowed_space >> 4) || free_space < mss) return 0; } @@ -2418,8 +2441,14 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) err = tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC); } - if (likely(!err)) + if (likely(!err)) { TCP_SKB_CB(skb)->sacked |= TCPCB_EVER_RETRANS; + /* Update global TCP statistics. */ + TCP_INC_STATS(sock_net(sk), TCP_MIB_RETRANSSEGS); + if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN) + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSYNRETRANS); + tp->total_retrans++; + } return err; } @@ -2429,11 +2458,6 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) int err = __tcp_retransmit_skb(sk, skb); if (err == 0) { - /* Update global TCP statistics. */ - TCP_INC_STATS(sock_net(sk), TCP_MIB_RETRANSSEGS); - - tp->total_retrans++; - #if FASTRETRANS_DEBUG > 0 if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS) { net_dbg_ratelimited("retrans_out leaked\n"); @@ -2717,7 +2741,7 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst, int tcp_header_size; int mss; - skb = sock_wmalloc(sk, MAX_TCP_HEADER + 15, 1, GFP_ATOMIC); + skb = sock_wmalloc(sk, MAX_TCP_HEADER, 1, GFP_ATOMIC); if (unlikely(!skb)) { dst_release(dst); return NULL; @@ -2787,7 +2811,7 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst, th->window = htons(min(req->rcv_wnd, 65535U)); tcp_options_write((__be32 *)(th + 1), tp, &opts); th->doff = (tcp_header_size >> 2); - TCP_ADD_STATS(sock_net(sk), TCP_MIB_OUTSEGS, tcp_skb_pcount(skb)); + TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_OUTSEGS); #ifdef CONFIG_TCP_MD5SIG /* Okay, we have all we need - do the md5 hash if needed */ @@ -2959,9 +2983,15 @@ static int tcp_send_syn_data(struct sock *sk, struct sk_buff *syn) tcp_connect_queue_skb(sk, data); fo->copied = data->len; + /* syn_data is about to be sent, we need to take current time stamps + * for the packets that are in write queue : SYN packet and DATA + */ + skb_mstamp_get(&syn->skb_mstamp); + data->skb_mstamp = syn->skb_mstamp; + if (tcp_transmit_skb(sk, syn_data, 0, sk->sk_allocation) == 0) { tp->syn_data = (fo->copied > 0); - NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPFASTOPENACTIVE); + NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPORIGDATASENT); goto done; } syn_data = NULL; @@ -3049,8 +3079,9 @@ void tcp_send_delayed_ack(struct sock *sk) * Do not use inet_csk(sk)->icsk_rto here, use results of rtt measurements * directly. */ - if (tp->srtt) { - int rtt = max(tp->srtt >> 3, TCP_DELACK_MIN); + if (tp->srtt_us) { + int rtt = max_t(int, usecs_to_jiffies(tp->srtt_us >> 3), + TCP_DELACK_MIN); if (rtt < max_ato) max_ato = rtt; diff --git a/net/ipv4/tcp_probe.c b/net/ipv4/tcp_probe.c index 1f2d37613c9..3b66610d415 100644 --- a/net/ipv4/tcp_probe.c +++ b/net/ipv4/tcp_probe.c @@ -154,7 +154,7 @@ static void jtcp_rcv_established(struct sock *sk, struct sk_buff *skb, p->snd_wnd = tp->snd_wnd; p->rcv_wnd = tp->rcv_wnd; p->ssthresh = tcp_current_ssthresh(sk); - p->srtt = tp->srtt >> 3; + p->srtt = tp->srtt_us >> 3; tcp_probe.head = (tcp_probe.head + 1) & (bufsize - 1); } diff --git a/net/ipv4/tcp_scalable.c b/net/ipv4/tcp_scalable.c index 19ea6c2951f..0ac50836da4 100644 --- a/net/ipv4/tcp_scalable.c +++ b/net/ipv4/tcp_scalable.c @@ -39,7 +39,6 @@ static u32 tcp_scalable_ssthresh(struct sock *sk) static struct tcp_congestion_ops tcp_scalable __read_mostly = { .ssthresh = tcp_scalable_ssthresh, .cong_avoid = tcp_scalable_cong_avoid, - .min_cwnd = tcp_reno_min_cwnd, .owner = THIS_MODULE, .name = "scalable", diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index 64f0354c84c..286227abed1 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -165,6 +165,9 @@ static int tcp_write_timeout(struct sock *sk) dst_negative_advice(sk); if (tp->syn_fastopen || tp->syn_data) tcp_fastopen_cache_set(sk, 0, NULL, true); + if (tp->syn_data) + NET_INC_STATS_BH(sock_net(sk), + LINUX_MIB_TCPFASTOPENACTIVEFAIL); } retry_until = icsk->icsk_syn_retries ? : sysctl_tcp_syn_retries; syn_set = true; diff --git a/net/ipv4/tcp_vegas.c b/net/ipv4/tcp_vegas.c index 06cae62bf20..48539fff635 100644 --- a/net/ipv4/tcp_vegas.c +++ b/net/ipv4/tcp_vegas.c @@ -306,11 +306,9 @@ void tcp_vegas_get_info(struct sock *sk, u32 ext, struct sk_buff *skb) EXPORT_SYMBOL_GPL(tcp_vegas_get_info); static struct tcp_congestion_ops tcp_vegas __read_mostly = { - .flags = TCP_CONG_RTT_STAMP, .init = tcp_vegas_init, .ssthresh = tcp_reno_ssthresh, .cong_avoid = tcp_vegas_cong_avoid, - .min_cwnd = tcp_reno_min_cwnd, .pkts_acked = tcp_vegas_pkts_acked, .set_state = tcp_vegas_state, .cwnd_event = tcp_vegas_cwnd_event, diff --git a/net/ipv4/tcp_veno.c b/net/ipv4/tcp_veno.c index 326475a9486..1b8e28fcd7e 100644 --- a/net/ipv4/tcp_veno.c +++ b/net/ipv4/tcp_veno.c @@ -203,7 +203,6 @@ static u32 tcp_veno_ssthresh(struct sock *sk) } static struct tcp_congestion_ops tcp_veno __read_mostly = { - .flags = TCP_CONG_RTT_STAMP, .init = tcp_veno_init, .ssthresh = tcp_veno_ssthresh, .cong_avoid = tcp_veno_cong_avoid, diff --git a/net/ipv4/tcp_westwood.c b/net/ipv4/tcp_westwood.c index 76a1e23259e..b94a04ae2ed 100644 --- a/net/ipv4/tcp_westwood.c +++ b/net/ipv4/tcp_westwood.c @@ -276,7 +276,6 @@ static struct tcp_congestion_ops tcp_westwood __read_mostly = { .init = tcp_westwood_init, .ssthresh = tcp_reno_ssthresh, .cong_avoid = tcp_reno_cong_avoid, - .min_cwnd = tcp_westwood_bw_rttmin, .cwnd_event = tcp_westwood_event, .get_info = tcp_westwood_info, .pkts_acked = tcp_westwood_pkts_acked, diff --git a/net/ipv4/tcp_yeah.c b/net/ipv4/tcp_yeah.c index 1a8d271f994..5ede0e72794 100644 --- a/net/ipv4/tcp_yeah.c +++ b/net/ipv4/tcp_yeah.c @@ -227,11 +227,9 @@ static u32 tcp_yeah_ssthresh(struct sock *sk) { } static struct tcp_congestion_ops tcp_yeah __read_mostly = { - .flags = TCP_CONG_RTT_STAMP, .init = tcp_yeah_init, .ssthresh = tcp_yeah_ssthresh, .cong_avoid = tcp_yeah_cong_avoid, - .min_cwnd = tcp_reno_min_cwnd, .set_state = tcp_vegas_state, .cwnd_event = tcp_vegas_cwnd_event, .get_info = tcp_vegas_get_info, diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 77bd16fa9f3..4468e1adc09 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -931,7 +931,8 @@ int udp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, sock_tx_timestamp(sk, &ipc.tx_flags); if (msg->msg_controllen) { - err = ip_cmsg_send(sock_net(sk), msg, &ipc); + err = ip_cmsg_send(sock_net(sk), msg, &ipc, + sk->sk_family == AF_INET6); if (err) return err; if (ipc.opt) diff --git a/net/ipv4/xfrm4_input.c b/net/ipv4/xfrm4_input.c index 1f12c8b4586..aac6197b7a7 100644 --- a/net/ipv4/xfrm4_input.c +++ b/net/ipv4/xfrm4_input.c @@ -37,15 +37,6 @@ drop: return NET_RX_DROP; } -int xfrm4_rcv_encap(struct sk_buff *skb, int nexthdr, __be32 spi, - int encap_type) -{ - XFRM_SPI_SKB_CB(skb)->family = AF_INET; - XFRM_SPI_SKB_CB(skb)->daddroff = offsetof(struct iphdr, daddr); - return xfrm_input(skb, nexthdr, spi, encap_type); -} -EXPORT_SYMBOL(xfrm4_rcv_encap); - int xfrm4_transport_finish(struct sk_buff *skb, int async) { struct iphdr *iph = ip_hdr(skb); diff --git a/net/ipv4/xfrm4_mode_tunnel.c b/net/ipv4/xfrm4_mode_tunnel.c index 31b18152528..05f2b484954 100644 --- a/net/ipv4/xfrm4_mode_tunnel.c +++ b/net/ipv4/xfrm4_mode_tunnel.c @@ -15,65 +15,6 @@ #include <net/ip.h> #include <net/xfrm.h> -/* Informational hook. The decap is still done here. */ -static struct xfrm_tunnel_notifier __rcu *rcv_notify_handlers __read_mostly; -static DEFINE_MUTEX(xfrm4_mode_tunnel_input_mutex); - -int xfrm4_mode_tunnel_input_register(struct xfrm_tunnel_notifier *handler) -{ - struct xfrm_tunnel_notifier __rcu **pprev; - struct xfrm_tunnel_notifier *t; - int ret = -EEXIST; - int priority = handler->priority; - - mutex_lock(&xfrm4_mode_tunnel_input_mutex); - - for (pprev = &rcv_notify_handlers; - (t = rcu_dereference_protected(*pprev, - lockdep_is_held(&xfrm4_mode_tunnel_input_mutex))) != NULL; - pprev = &t->next) { - if (t->priority > priority) - break; - if (t->priority == priority) - goto err; - - } - - handler->next = *pprev; - rcu_assign_pointer(*pprev, handler); - - ret = 0; - -err: - mutex_unlock(&xfrm4_mode_tunnel_input_mutex); - return ret; -} -EXPORT_SYMBOL_GPL(xfrm4_mode_tunnel_input_register); - -int xfrm4_mode_tunnel_input_deregister(struct xfrm_tunnel_notifier *handler) -{ - struct xfrm_tunnel_notifier __rcu **pprev; - struct xfrm_tunnel_notifier *t; - int ret = -ENOENT; - - mutex_lock(&xfrm4_mode_tunnel_input_mutex); - for (pprev = &rcv_notify_handlers; - (t = rcu_dereference_protected(*pprev, - lockdep_is_held(&xfrm4_mode_tunnel_input_mutex))) != NULL; - pprev = &t->next) { - if (t == handler) { - *pprev = handler->next; - ret = 0; - break; - } - } - mutex_unlock(&xfrm4_mode_tunnel_input_mutex); - synchronize_net(); - - return ret; -} -EXPORT_SYMBOL_GPL(xfrm4_mode_tunnel_input_deregister); - static inline void ipip_ecn_decapsulate(struct sk_buff *skb) { struct iphdr *inner_iph = ipip_hdr(skb); @@ -127,14 +68,8 @@ static int xfrm4_mode_tunnel_output(struct xfrm_state *x, struct sk_buff *skb) return 0; } -#define for_each_input_rcu(head, handler) \ - for (handler = rcu_dereference(head); \ - handler != NULL; \ - handler = rcu_dereference(handler->next)) - static int xfrm4_mode_tunnel_input(struct xfrm_state *x, struct sk_buff *skb) { - struct xfrm_tunnel_notifier *handler; int err = -EINVAL; if (XFRM_MODE_SKB_CB(skb)->protocol != IPPROTO_IPIP) @@ -143,9 +78,6 @@ static int xfrm4_mode_tunnel_input(struct xfrm_state *x, struct sk_buff *skb) if (!pskb_may_pull(skb, sizeof(struct iphdr))) goto out; - for_each_input_rcu(rcv_notify_handlers, handler) - handler->handler(skb); - err = skb_unclone(skb, GFP_ATOMIC); if (err) goto out; diff --git a/net/ipv4/xfrm4_output.c b/net/ipv4/xfrm4_output.c index baa0f63731f..186a8ecf92f 100644 --- a/net/ipv4/xfrm4_output.c +++ b/net/ipv4/xfrm4_output.c @@ -62,10 +62,7 @@ int xfrm4_prepare_output(struct xfrm_state *x, struct sk_buff *skb) if (err) return err; - memset(IPCB(skb), 0, sizeof(*IPCB(skb))); - IPCB(skb)->flags |= IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED; - - skb->protocol = htons(ETH_P_IP); + IPCB(skb)->flags |= IPSKB_XFRM_TUNNEL_SIZE; return x->outer_mode->output2(x, skb); } @@ -73,27 +70,34 @@ EXPORT_SYMBOL(xfrm4_prepare_output); int xfrm4_output_finish(struct sk_buff *skb) { -#ifdef CONFIG_NETFILTER - if (!skb_dst(skb)->xfrm) { - IPCB(skb)->flags |= IPSKB_REROUTED; - return dst_output(skb); - } + memset(IPCB(skb), 0, sizeof(*IPCB(skb))); + skb->protocol = htons(ETH_P_IP); +#ifdef CONFIG_NETFILTER IPCB(skb)->flags |= IPSKB_XFRM_TRANSFORMED; #endif - skb->protocol = htons(ETH_P_IP); return xfrm_output(skb); } -int xfrm4_output(struct sk_buff *skb) +static int __xfrm4_output(struct sk_buff *skb) { - struct dst_entry *dst = skb_dst(skb); - struct xfrm_state *x = dst->xfrm; + struct xfrm_state *x = skb_dst(skb)->xfrm; + +#ifdef CONFIG_NETFILTER + if (!x) { + IPCB(skb)->flags |= IPSKB_REROUTED; + return dst_output(skb); + } +#endif + return x->outer_mode->afinfo->output_finish(skb); +} + +int xfrm4_output(struct sock *sk, struct sk_buff *skb) +{ return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, skb, - NULL, dst->dev, - x->outer_mode->afinfo->output_finish, + NULL, skb_dst(skb)->dev, __xfrm4_output, !(IPCB(skb)->flags & IPSKB_REROUTED)); } diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c index e1a63930a96..6156f68a1e9 100644 --- a/net/ipv4/xfrm4_policy.c +++ b/net/ipv4/xfrm4_policy.c @@ -325,6 +325,7 @@ void __init xfrm4_init(void) xfrm4_state_init(); xfrm4_policy_init(); + xfrm4_protocol_init(); #ifdef CONFIG_SYSCTL register_pernet_subsys(&xfrm4_net_ops); #endif diff --git a/net/ipv4/xfrm4_protocol.c b/net/ipv4/xfrm4_protocol.c new file mode 100644 index 00000000000..a2ce0101eaa --- /dev/null +++ b/net/ipv4/xfrm4_protocol.c @@ -0,0 +1,301 @@ +/* xfrm4_protocol.c - Generic xfrm protocol multiplexer. + * + * Copyright (C) 2013 secunet Security Networks AG + * + * Author: + * Steffen Klassert <steffen.klassert@secunet.com> + * + * Based on: + * net/ipv4/tunnel4.c + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include <linux/init.h> +#include <linux/mutex.h> +#include <linux/skbuff.h> +#include <net/icmp.h> +#include <net/ip.h> +#include <net/protocol.h> +#include <net/xfrm.h> + +static struct xfrm4_protocol __rcu *esp4_handlers __read_mostly; +static struct xfrm4_protocol __rcu *ah4_handlers __read_mostly; +static struct xfrm4_protocol __rcu *ipcomp4_handlers __read_mostly; +static DEFINE_MUTEX(xfrm4_protocol_mutex); + +static inline struct xfrm4_protocol __rcu **proto_handlers(u8 protocol) +{ + switch (protocol) { + case IPPROTO_ESP: + return &esp4_handlers; + case IPPROTO_AH: + return &ah4_handlers; + case IPPROTO_COMP: + return &ipcomp4_handlers; + } + + return NULL; +} + +#define for_each_protocol_rcu(head, handler) \ + for (handler = rcu_dereference(head); \ + handler != NULL; \ + handler = rcu_dereference(handler->next)) \ + +int xfrm4_rcv_cb(struct sk_buff *skb, u8 protocol, int err) +{ + int ret; + struct xfrm4_protocol *handler; + struct xfrm4_protocol __rcu **head = proto_handlers(protocol); + + if (!head) + return 0; + + for_each_protocol_rcu(*head, handler) + if ((ret = handler->cb_handler(skb, err)) <= 0) + return ret; + + return 0; +} +EXPORT_SYMBOL(xfrm4_rcv_cb); + +int xfrm4_rcv_encap(struct sk_buff *skb, int nexthdr, __be32 spi, + int encap_type) +{ + int ret; + struct xfrm4_protocol *handler; + struct xfrm4_protocol __rcu **head = proto_handlers(nexthdr); + + XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip4 = NULL; + XFRM_SPI_SKB_CB(skb)->family = AF_INET; + XFRM_SPI_SKB_CB(skb)->daddroff = offsetof(struct iphdr, daddr); + + if (!head) + goto out; + + for_each_protocol_rcu(*head, handler) + if ((ret = handler->input_handler(skb, nexthdr, spi, encap_type)) != -EINVAL) + return ret; + +out: + icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0); + + kfree_skb(skb); + return 0; +} +EXPORT_SYMBOL(xfrm4_rcv_encap); + +static int xfrm4_esp_rcv(struct sk_buff *skb) +{ + int ret; + struct xfrm4_protocol *handler; + + XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip4 = NULL; + + for_each_protocol_rcu(esp4_handlers, handler) + if ((ret = handler->handler(skb)) != -EINVAL) + return ret; + + icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0); + + kfree_skb(skb); + return 0; +} + +static void xfrm4_esp_err(struct sk_buff *skb, u32 info) +{ + struct xfrm4_protocol *handler; + + for_each_protocol_rcu(esp4_handlers, handler) + if (!handler->err_handler(skb, info)) + break; +} + +static int xfrm4_ah_rcv(struct sk_buff *skb) +{ + int ret; + struct xfrm4_protocol *handler; + + XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip4 = NULL; + + for_each_protocol_rcu(ah4_handlers, handler) + if ((ret = handler->handler(skb)) != -EINVAL) + return ret;; + + icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0); + + kfree_skb(skb); + return 0; +} + +static void xfrm4_ah_err(struct sk_buff *skb, u32 info) +{ + struct xfrm4_protocol *handler; + + for_each_protocol_rcu(ah4_handlers, handler) + if (!handler->err_handler(skb, info)) + break; +} + +static int xfrm4_ipcomp_rcv(struct sk_buff *skb) +{ + int ret; + struct xfrm4_protocol *handler; + + XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip4 = NULL; + + for_each_protocol_rcu(ipcomp4_handlers, handler) + if ((ret = handler->handler(skb)) != -EINVAL) + return ret; + + icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0); + + kfree_skb(skb); + return 0; +} + +static void xfrm4_ipcomp_err(struct sk_buff *skb, u32 info) +{ + struct xfrm4_protocol *handler; + + for_each_protocol_rcu(ipcomp4_handlers, handler) + if (!handler->err_handler(skb, info)) + break; +} + +static const struct net_protocol esp4_protocol = { + .handler = xfrm4_esp_rcv, + .err_handler = xfrm4_esp_err, + .no_policy = 1, + .netns_ok = 1, +}; + +static const struct net_protocol ah4_protocol = { + .handler = xfrm4_ah_rcv, + .err_handler = xfrm4_ah_err, + .no_policy = 1, + .netns_ok = 1, +}; + +static const struct net_protocol ipcomp4_protocol = { + .handler = xfrm4_ipcomp_rcv, + .err_handler = xfrm4_ipcomp_err, + .no_policy = 1, + .netns_ok = 1, +}; + +static struct xfrm_input_afinfo xfrm4_input_afinfo = { + .family = AF_INET, + .owner = THIS_MODULE, + .callback = xfrm4_rcv_cb, +}; + +static inline const struct net_protocol *netproto(unsigned char protocol) +{ + switch (protocol) { + case IPPROTO_ESP: + return &esp4_protocol; + case IPPROTO_AH: + return &ah4_protocol; + case IPPROTO_COMP: + return &ipcomp4_protocol; + } + + return NULL; +} + +int xfrm4_protocol_register(struct xfrm4_protocol *handler, + unsigned char protocol) +{ + struct xfrm4_protocol __rcu **pprev; + struct xfrm4_protocol *t; + bool add_netproto = false; + int ret = -EEXIST; + int priority = handler->priority; + + if (!proto_handlers(protocol) || !netproto(protocol)) + return -EINVAL; + + mutex_lock(&xfrm4_protocol_mutex); + + if (!rcu_dereference_protected(*proto_handlers(protocol), + lockdep_is_held(&xfrm4_protocol_mutex))) + add_netproto = true; + + for (pprev = proto_handlers(protocol); + (t = rcu_dereference_protected(*pprev, + lockdep_is_held(&xfrm4_protocol_mutex))) != NULL; + pprev = &t->next) { + if (t->priority < priority) + break; + if (t->priority == priority) + goto err; + } + + handler->next = *pprev; + rcu_assign_pointer(*pprev, handler); + + ret = 0; + +err: + mutex_unlock(&xfrm4_protocol_mutex); + + if (add_netproto) { + if (inet_add_protocol(netproto(protocol), protocol)) { + pr_err("%s: can't add protocol\n", __func__); + ret = -EAGAIN; + } + } + + return ret; +} +EXPORT_SYMBOL(xfrm4_protocol_register); + +int xfrm4_protocol_deregister(struct xfrm4_protocol *handler, + unsigned char protocol) +{ + struct xfrm4_protocol __rcu **pprev; + struct xfrm4_protocol *t; + int ret = -ENOENT; + + if (!proto_handlers(protocol) || !netproto(protocol)) + return -EINVAL; + + mutex_lock(&xfrm4_protocol_mutex); + + for (pprev = proto_handlers(protocol); + (t = rcu_dereference_protected(*pprev, + lockdep_is_held(&xfrm4_protocol_mutex))) != NULL; + pprev = &t->next) { + if (t == handler) { + *pprev = handler->next; + ret = 0; + break; + } + } + + if (!rcu_dereference_protected(*proto_handlers(protocol), + lockdep_is_held(&xfrm4_protocol_mutex))) { + if (inet_del_protocol(netproto(protocol), protocol) < 0) { + pr_err("%s: can't remove protocol\n", __func__); + ret = -EAGAIN; + } + } + + mutex_unlock(&xfrm4_protocol_mutex); + + synchronize_net(); + + return ret; +} +EXPORT_SYMBOL(xfrm4_protocol_deregister); + +void __init xfrm4_protocol_init(void) +{ + xfrm_input_register_afinfo(&xfrm4_input_afinfo); +} +EXPORT_SYMBOL(xfrm4_protocol_init); |