From 55d8694fa82c9b5858ae5a78a210353961f908f9 Mon Sep 17 00:00:00 2001
From: Florian Westphal
Date: Fri, 26 Sep 2014 22:37:32 +0200
Subject: net: tcp: assign tcp cong_ops when tcp sk is created

Split congestion control assignment and initialization into two separate
functions. This is required by follow-up patches that add the Datacenter
TCP (DCTCP) congestion control algorithm - we need to be able to determine
whether the connection is moderated by DCTCP before the 3WHS has finished.

As we walk the available congestion control list during the assignment, we
are always guaranteed to have Reno present, as it is always compiled in.
Therefore, since we are doing the early assignment, we no longer have a
real use for the Reno alias tcp_init_congestion_ops and can thus remove it.

Actual use of the congestion control operations is made only after the
3WHS has finished; in some cases, however, get_info() can be accessed via
diag if implemented, therefore we need to zero out the private area for
those modules.

Joint work with Daniel Borkmann and Glenn Judd.

Signed-off-by: Florian Westphal
Signed-off-by: Daniel Borkmann
Signed-off-by: Glenn Judd
Acked-by: Stephen Hemminger
Signed-off-by: David S. Miller
---
 include/net/tcp.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/net/tcp.h')

diff --git a/include/net/tcp.h b/include/net/tcp.h
index 02a9a2c366b..f99b0c072ee 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -824,6 +824,7 @@ struct tcp_congestion_ops {
 
 int tcp_register_congestion_control(struct tcp_congestion_ops *type);
 void tcp_unregister_congestion_control(struct tcp_congestion_ops *type);
+void tcp_assign_congestion_control(struct sock *sk);
 void tcp_init_congestion_control(struct sock *sk);
 void tcp_cleanup_congestion_control(struct sock *sk);
 int tcp_set_default_congestion_control(const char *name);
@@ -835,7 +836,6 @@ int tcp_set_congestion_control(struct sock *sk, const char *name);
 int tcp_slow_start(struct tcp_sock *tp, u32 acked);
 void tcp_cong_avoid_ai(struct tcp_sock *tp, u32 w);
 
-extern struct tcp_congestion_ops tcp_init_congestion_ops;
 u32 tcp_reno_ssthresh(struct sock *sk);
 void tcp_reno_cong_avoid(struct sock *sk, u32 ack, u32 acked);
 extern struct tcp_congestion_ops tcp_reno;
-- 
cgit v1.2.3-70-g09d2
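The diff shown here is limited to include/net/tcp.h, so only the new declaration is
visible; the corresponding net/ipv4/tcp_cong.c changes are not part of this view. As
a rough sketch - an approximation of the split described in the commit message, not
the verbatim patch - the two functions could look roughly like this:

/* Sketch only: the real implementation lives in net/ipv4/tcp_cong.c and is
 * not shown in this path-limited log.
 */
void tcp_assign_congestion_control(struct sock *sk)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	struct tcp_congestion_ops *ca;

	rcu_read_lock();
	list_for_each_entry_rcu(ca, &tcp_cong_list, list) {
		if (likely(try_module_get(ca->owner))) {
			icsk->icsk_ca_ops = ca;
			break;
		}
		/* Fall back to the next entry; Reno is always compiled in,
		 * so the walk cannot come up empty.
		 */
	}
	rcu_read_unlock();

	/* Diag may call get_info() before the 3WHS has finished, so make
	 * sure the private area does not leak stale data.
	 */
	if (ca->get_info)
		memset(icsk->icsk_ca_priv, 0, sizeof(icsk->icsk_ca_priv));
}

void tcp_init_congestion_control(struct sock *sk)
{
	const struct inet_connection_sock *icsk = inet_csk(sk);

	if (icsk->icsk_ca_ops->init)
		icsk->icsk_ca_ops->init(sk);
}

With this split, tcp_assign_congestion_control() can run at socket creation time,
while tcp_init_congestion_control() is deferred until the 3WHS has completed.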
From 30e502a34b8b21fae2c789da102bd9f6e99fef83 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann
Date: Fri, 26 Sep 2014 22:37:33 +0200
Subject: net: tcp: add flag for ca to indicate that ECN is required

This patch adds a flag to TCP congestion control algorithms that lets an
algorithm request that IPv4/IPv6 sockets mark their transport as ECN
capable, that is, ECT(0), when it requires this.

It is currently used and needed in DataCenter TCP (DCTCP), as it requires
both peers to assert ECT on all IP packets sent - it uses ECN feedback
(i.e. CE, Congestion Encountered information) from switches inside the
data center to derive feedback to the end hosts.

Therefore, simply add a new flag to icsk_ca_ops. Note that DCTCP's
algorithm/behaviour slightly diverges from RFC 3168, therefore this is
enabled only (!) if the assigned congestion control ops module has
requested it. That way, this logic is tightly coupled to the provided
congestion control ops.

Joint work with Florian Westphal and Glenn Judd.

Signed-off-by: Daniel Borkmann
Signed-off-by: Florian Westphal
Signed-off-by: Glenn Judd
Acked-by: Stephen Hemminger
Signed-off-by: David S. Miller
---
 include/net/tcp.h     | 61 +++++++++++++++++++++++++++++++++++++--------------
 net/ipv4/tcp_input.c  |  2 +-
 net/ipv4/tcp_output.c | 25 +++++++++++++++------
 3 files changed, 63 insertions(+), 25 deletions(-)

(limited to 'include/net/tcp.h')

diff --git a/include/net/tcp.h b/include/net/tcp.h
index f99b0c072ee..a12f145cfbc 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -733,23 +733,6 @@ struct tcp_skb_cb {
 
 #define TCP_SKB_CB(__skb)	((struct tcp_skb_cb *)&((__skb)->cb[0]))
 
-/* RFC3168 : 6.1.1 SYN packets must not have ECT/ECN bits set
- *
- * If we receive a SYN packet with these bits set, it means a network is
- * playing bad games with TOS bits. In order to avoid possible false congestion
- * notifications, we disable TCP ECN negociation.
- */
-static inline void
-TCP_ECN_create_request(struct request_sock *req, const struct sk_buff *skb,
-		       struct net *net)
-{
-	const struct tcphdr *th = tcp_hdr(skb);
-
-	if (net->ipv4.sysctl_tcp_ecn && th->ece && th->cwr &&
-	    INET_ECN_is_not_ect(TCP_SKB_CB(skb)->ip_dsfield))
-		inet_rsk(req)->ecn_ok = 1;
-}
-
 /* Due to TSO, an SKB can be composed of multiple actual
  * packets.  To keep these tracked properly, we use this.
  */
@@ -791,7 +774,10 @@ enum tcp_ca_event {
 #define TCP_CA_MAX	128
 #define TCP_CA_BUF_MAX	(TCP_CA_NAME_MAX*TCP_CA_MAX)
 
+/* Algorithm can be set on socket without CAP_NET_ADMIN privileges */
 #define TCP_CONG_NON_RESTRICTED 0x1
+/* Requires ECN/ECT set on all packets */
+#define TCP_CONG_NEEDS_ECN	0x2
 
 struct tcp_congestion_ops {
 	struct list_head	list;
@@ -840,6 +826,13 @@ u32 tcp_reno_ssthresh(struct sock *sk);
 void tcp_reno_cong_avoid(struct sock *sk, u32 ack, u32 acked);
 extern struct tcp_congestion_ops tcp_reno;
 
+static inline bool tcp_ca_needs_ecn(const struct sock *sk)
+{
+	const struct inet_connection_sock *icsk = inet_csk(sk);
+
+	return icsk->icsk_ca_ops->flags & TCP_CONG_NEEDS_ECN;
+}
+
 static inline void tcp_set_ca_state(struct sock *sk, const u8 ca_state)
 {
 	struct inet_connection_sock *icsk = inet_csk(sk);
@@ -857,6 +850,40 @@ static inline void tcp_ca_event(struct sock *sk, const enum tcp_ca_event event)
 		icsk->icsk_ca_ops->cwnd_event(sk, event);
 }
 
+/* RFC3168 : 6.1.1 SYN packets must not have ECT/ECN bits set
+ *
+ * If we receive a SYN packet with these bits set, it means a
+ * network is playing bad games with TOS bits. In order to
+ * avoid possible false congestion notifications, we disable
+ * TCP ECN negociation.
+ *
+ * Exception: tcp_ca wants ECN. This is required for DCTCP
+ * congestion control; it requires setting ECT on all packets,
+ * including SYN. We inverse the test in this case: If our
+ * local socket wants ECN, but peer only set ece/cwr (but not
+ * ECT in IP header) its probably a non-DCTCP aware sender.
+ */
+static inline void
+TCP_ECN_create_request(struct request_sock *req, const struct sk_buff *skb,
+		       const struct sock *listen_sk)
+{
+	const struct tcphdr *th = tcp_hdr(skb);
+	const struct net *net = sock_net(listen_sk);
+	bool th_ecn = th->ece && th->cwr;
+	bool ect, need_ecn;
+
+	if (!th_ecn)
+		return;
+
+	ect = !INET_ECN_is_not_ect(TCP_SKB_CB(skb)->ip_dsfield);
+	need_ecn = tcp_ca_needs_ecn(listen_sk);
+
+	if (!ect && !need_ecn && net->ipv4.sysctl_tcp_ecn)
+		inet_rsk(req)->ecn_ok = 1;
+	else if (ect && need_ecn)
+		inet_rsk(req)->ecn_ok = 1;
+}
+
 /* These functions determine how the current flow behaves in respect of SACK
  * handling. SACK is negotiated with the peer, and therefore it can vary
  * between different flows.
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 5073eefa6fa..fb0fe97e1c5 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -5944,7 +5944,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
 		goto drop_and_free;
 
 	if (!want_cookie || tmp_opt.tstamp_ok)
-		TCP_ECN_create_request(req, skb, sock_net(sk));
+		TCP_ECN_create_request(req, skb, sk);
 
 	if (want_cookie) {
 		isn = cookie_init_sequence(af_ops, sk, skb, &req->mss);
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 4d92703df4c..20e73271d75 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -318,11 +318,15 @@ static u16 tcp_select_window(struct sock *sk)
 }
 
 /* Packet ECN state for a SYN-ACK */
-static inline void TCP_ECN_send_synack(const struct tcp_sock *tp, struct sk_buff *skb)
+static inline void TCP_ECN_send_synack(struct sock *sk, struct sk_buff *skb)
 {
+	const struct tcp_sock *tp = tcp_sk(sk);
+
 	TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_CWR;
 	if (!(tp->ecn_flags & TCP_ECN_OK))
 		TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_ECE;
+	else if (tcp_ca_needs_ecn(sk))
+		INET_ECN_xmit(sk);
 }
 
 /* Packet ECN state for a SYN.  */
@@ -331,17 +335,24 @@ static inline void TCP_ECN_send_syn(struct sock *sk, struct sk_buff *skb)
 	struct tcp_sock *tp = tcp_sk(sk);
 
 	tp->ecn_flags = 0;
-	if (sock_net(sk)->ipv4.sysctl_tcp_ecn == 1) {
+	if (sock_net(sk)->ipv4.sysctl_tcp_ecn == 1 ||
+	    tcp_ca_needs_ecn(sk)) {
 		TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_ECE | TCPHDR_CWR;
 		tp->ecn_flags = TCP_ECN_OK;
+		if (tcp_ca_needs_ecn(sk))
+			INET_ECN_xmit(sk);
 	}
 }
 
 static __inline__ void
-TCP_ECN_make_synack(const struct request_sock *req, struct tcphdr *th)
+TCP_ECN_make_synack(const struct request_sock *req, struct tcphdr *th,
+		    struct sock *sk)
 {
-	if (inet_rsk(req)->ecn_ok)
+	if (inet_rsk(req)->ecn_ok) {
 		th->ece = 1;
+		if (tcp_ca_needs_ecn(sk))
+			INET_ECN_xmit(sk);
+	}
 }
 
 /* Set up ECN state for a packet on a ESTABLISHED socket that is about to
@@ -362,7 +373,7 @@ static inline void TCP_ECN_send(struct sock *sk, struct sk_buff *skb,
 				tcp_hdr(skb)->cwr = 1;
 			skb_shinfo(skb)->gso_type |= SKB_GSO_TCP_ECN;
 		}
-	} else {
+	} else if (!tcp_ca_needs_ecn(sk)) {
 		/* ACK or retransmitted segment: clear ECT|CE */
 		INET_ECN_dontxmit(sk);
 	}
@@ -2789,7 +2800,7 @@ int tcp_send_synack(struct sock *sk)
 		}
 
 		TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_ACK;
-		TCP_ECN_send_synack(tcp_sk(sk), skb);
+		TCP_ECN_send_synack(sk, skb);
 	}
 	return tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC);
 }
@@ -2848,7 +2859,7 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
 	memset(th, 0, sizeof(struct tcphdr));
 	th->syn = 1;
 	th->ack = 1;
-	TCP_ECN_make_synack(req, th);
+	TCP_ECN_make_synack(req, th, sk);
 	th->source = htons(ireq->ir_num);
 	th->dest = ireq->ir_rmt_port;
 	/* Setting of flags are superfluous here for callers (and ECE is
-- 
cgit v1.2.3-70-g09d2
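A congestion control module opts in to this behaviour simply by setting
TCP_CONG_NEEDS_ECN in its ops. The skeleton below is hypothetical (the name
"ecn_demo" and the Reno-based hooks are placeholders, not part of this series),
but it illustrates the intended use of the new flag:

/* Hypothetical example module: opts in to early ECT marking via
 * TCP_CONG_NEEDS_ECN; otherwise behaves like Reno.
 */
#include <linux/module.h>
#include <net/tcp.h>

static struct tcp_congestion_ops tcp_ecn_demo __read_mostly = {
	.flags		= TCP_CONG_NEEDS_ECN,
	.ssthresh	= tcp_reno_ssthresh,
	.cong_avoid	= tcp_reno_cong_avoid,
	.owner		= THIS_MODULE,
	.name		= "ecn_demo",
};

static int __init tcp_ecn_demo_register(void)
{
	return tcp_register_congestion_control(&tcp_ecn_demo);
}

static void __exit tcp_ecn_demo_unregister(void)
{
	tcp_unregister_congestion_control(&tcp_ecn_demo);
}

module_init(tcp_ecn_demo_register);
module_exit(tcp_ecn_demo_unregister);
MODULE_LICENSE("GPL");

With the flag set, tcp_ca_needs_ecn() returns true, so TCP_ECN_create_request()
above accepts ECE/CWR SYNs only when ECT is also present in the IP header, and the
output path keeps ECT asserted via INET_ECN_xmit() even on SYN and SYN-ACK segments.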
From 7354c8c389d18719dd71cc810da70b0921d66694 Mon Sep 17 00:00:00 2001
From: Florian Westphal
Date: Fri, 26 Sep 2014 22:37:34 +0200
Subject: net: tcp: split ack slow/fast events from cwnd_event

The congestion control ops "cwnd_event" currently supports
CA_EVENT_FAST_ACK and CA_EVENT_SLOW_ACK events (among others).
Both FAST and SLOW_ACK are only used by the Westwood congestion
control algorithm.

This removes both flags from cwnd_event and adds a new in_ack_event
callback for this. The goal is to be able to provide more detailed
information about ACKs, such as whether the ECE flag was set, or
whether the ACK resulted in a window update.

It is required for the DataCenter TCP (DCTCP) congestion control
algorithm, as it makes a different choice depending on whether ECE
is set or not.

Joint work with Daniel Borkmann and Glenn Judd.

Signed-off-by: Florian Westphal
Signed-off-by: Daniel Borkmann
Signed-off-by: Glenn Judd
Acked-by: Stephen Hemminger
Signed-off-by: David S. Miller
---
 include/net/tcp.h       |  8 ++++++--
 net/ipv4/tcp_input.c    | 12 ++++++++++--
 net/ipv4/tcp_westwood.c | 28 ++++++++++++++++------------
 3 files changed, 32 insertions(+), 16 deletions(-)

(limited to 'include/net/tcp.h')

diff --git a/include/net/tcp.h b/include/net/tcp.h
index a12f145cfbc..7ec6a28305c 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -763,8 +763,10 @@ enum tcp_ca_event {
 	CA_EVENT_CWND_RESTART,	/* congestion window restart */
 	CA_EVENT_COMPLETE_CWR,	/* end of congestion recovery */
 	CA_EVENT_LOSS,		/* loss timeout */
-	CA_EVENT_FAST_ACK,	/* in sequence ack */
-	CA_EVENT_SLOW_ACK,	/* other ack */
+};
+
+enum tcp_ca_ack_event_flags {
+	CA_ACK_SLOWPATH = (1 << 0),
 };
 
 /*
@@ -796,6 +798,8 @@ struct tcp_congestion_ops {
 	void (*set_state)(struct sock *sk, u8 new_state);
 	/* call when cwnd event occurs (optional) */
 	void (*cwnd_event)(struct sock *sk, enum tcp_ca_event ev);
+	/* call when ack arrives (optional) */
+	void (*in_ack_event)(struct sock *sk, u32 flags);
 	/* new value of cwnd after loss (optional) */
 	u32  (*undo_cwnd)(struct sock *sk);
 	/* hook for packet ack accounting (optional) */
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index fb0fe97e1c5..8a38774cc66 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -3362,6 +3362,14 @@ static void tcp_process_tlp_ack(struct sock *sk, u32 ack, int flag)
 	}
 }
 
+static inline void tcp_in_ack_event(struct sock *sk, u32 flags)
+{
+	const struct inet_connection_sock *icsk = inet_csk(sk);
+
+	if (icsk->icsk_ca_ops->in_ack_event)
+		icsk->icsk_ca_ops->in_ack_event(sk, flags);
+}
+
 /* This routine deals with incoming acks, but not outgoing ones. */
 static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
 {
@@ -3421,7 +3429,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
 		tp->snd_una = ack;
 		flag |= FLAG_WIN_UPDATE;
 
-		tcp_ca_event(sk, CA_EVENT_FAST_ACK);
+		tcp_in_ack_event(sk, 0);
 
 		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPHPACKS);
 	} else {
@@ -3439,7 +3447,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
 		if (TCP_ECN_rcv_ecn_echo(tp, tcp_hdr(skb)))
 			flag |= FLAG_ECE;
 
-		tcp_ca_event(sk, CA_EVENT_SLOW_ACK);
+		tcp_in_ack_event(sk, CA_ACK_SLOWPATH);
 	}
 
 	/* We passed data and got it acked, remove any soft error
diff --git a/net/ipv4/tcp_westwood.c b/net/ipv4/tcp_westwood.c
index 81911a92356..bb63fba47d4 100644
--- a/net/ipv4/tcp_westwood.c
+++ b/net/ipv4/tcp_westwood.c
@@ -220,32 +220,35 @@ static u32 tcp_westwood_bw_rttmin(const struct sock *sk)
 	return max_t(u32, (w->bw_est * w->rtt_min) / tp->mss_cache, 2);
 }
 
+static void tcp_westwood_ack(struct sock *sk, u32 ack_flags)
+{
+	if (ack_flags & CA_ACK_SLOWPATH) {
+		struct westwood *w = inet_csk_ca(sk);
+
+		westwood_update_window(sk);
+		w->bk += westwood_acked_count(sk);
+
+		update_rtt_min(w);
+		return;
+	}
+
+	westwood_fast_bw(sk);
+}
+
 static void tcp_westwood_event(struct sock *sk, enum tcp_ca_event event)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 	struct westwood *w = inet_csk_ca(sk);
 
 	switch (event) {
-	case CA_EVENT_FAST_ACK:
-		westwood_fast_bw(sk);
-		break;
-
 	case CA_EVENT_COMPLETE_CWR:
 		tp->snd_cwnd = tp->snd_ssthresh = tcp_westwood_bw_rttmin(sk);
 		break;
-
 	case CA_EVENT_LOSS:
 		tp->snd_ssthresh = tcp_westwood_bw_rttmin(sk);
 		/* Update RTT_min when next ack arrives */
 		w->reset_rtt_min = 1;
 		break;
-
-	case CA_EVENT_SLOW_ACK:
-		westwood_update_window(sk);
-		w->bk += westwood_acked_count(sk);
-		update_rtt_min(w);
-		break;
-
 	default:
 		/* don't care */
 		break;
@@ -274,6 +277,7 @@ static struct tcp_congestion_ops tcp_westwood __read_mostly = {
 	.ssthresh	= tcp_reno_ssthresh,
 	.cong_avoid	= tcp_reno_cong_avoid,
 	.cwnd_event	= tcp_westwood_event,
+	.in_ack_event	= tcp_westwood_ack,
 	.get_info	= tcp_westwood_info,
 	.pkts_acked	= tcp_westwood_pkts_acked,
 
-- 
cgit v1.2.3-70-g09d2
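The Westwood conversion above is the canonical user of the new callback: fast-path
ACKs now arrive at in_ack_event() with flags == 0 and slow-path ACKs with
CA_ACK_SLOWPATH. For any other module, the minimal shape is sketched below; the
names (demo_stats, demo_in_ack_event, "ack_demo") are illustrative only, and the
per-socket private data must fit into the ICSK_CA_PRIV_SIZE area returned by
inet_csk_ca():

/* Hypothetical sketch: count fast- vs. slow-path ACKs from in_ack_event(). */
#include <linux/module.h>
#include <net/tcp.h>

struct demo_stats {
	u32 fast_acks;
	u32 slow_acks;
};

static void demo_in_ack_event(struct sock *sk, u32 flags)
{
	struct demo_stats *ca = inet_csk_ca(sk);

	if (flags & CA_ACK_SLOWPATH)
		ca->slow_acks++;	/* ACK took the slow path in tcp_ack() */
	else
		ca->fast_acks++;	/* header-prediction fast path */
}

static struct tcp_congestion_ops tcp_ack_demo __read_mostly = {
	.ssthresh	= tcp_reno_ssthresh,
	.cong_avoid	= tcp_reno_cong_avoid,
	.in_ack_event	= demo_in_ack_event,
	.owner		= THIS_MODULE,
	.name		= "ack_demo",
};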
From 9890092e46b2996bb85f7f973e69424cb5c07bc0 Mon Sep 17 00:00:00 2001
From: Florian Westphal
Date: Fri, 26 Sep 2014 22:37:35 +0200
Subject: net: tcp: more detailed ACK events and events for CE marked packets

DataCenter TCP (DCTCP) determines cwnd growth based on ECN information
and ACK properties, e.g. an ACK that updates the window is treated
differently than a DUPACK. DCTCP also needs to know whether an ACK was
a delayed ACK.

Furthermore, DCTCP implements a CE state machine that keeps track of CE
markings of incoming packets.

Therefore, extend the congestion control framework to provide these
event types, so that DCTCP can be properly implemented as a normal
congestion algorithm module outside of the core stack.

Joint work with Daniel Borkmann and Glenn Judd.

Signed-off-by: Florian Westphal
Signed-off-by: Daniel Borkmann
Signed-off-by: Glenn Judd
Acked-by: Stephen Hemminger
Signed-off-by: David S. Miller
---
 include/net/tcp.h     |  9 ++++++++-
 net/ipv4/tcp_input.c  | 22 ++++++++++++++++++----
 net/ipv4/tcp_output.c |  4 ++++
 3 files changed, 30 insertions(+), 5 deletions(-)

(limited to 'include/net/tcp.h')

diff --git a/include/net/tcp.h b/include/net/tcp.h
index 7ec6a28305c..1f57c536349 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -763,10 +763,17 @@ enum tcp_ca_event {
 	CA_EVENT_CWND_RESTART,	/* congestion window restart */
 	CA_EVENT_COMPLETE_CWR,	/* end of congestion recovery */
 	CA_EVENT_LOSS,		/* loss timeout */
+	CA_EVENT_ECN_NO_CE,	/* ECT set, but not CE marked */
+	CA_EVENT_ECN_IS_CE,	/* received CE marked IP packet */
+	CA_EVENT_DELAYED_ACK,	/* Delayed ack is sent */
+	CA_EVENT_NON_DELAYED_ACK,
 };
 
+/* Information about inbound ACK, passed to cong_ops->in_ack_event() */
 enum tcp_ca_ack_event_flags {
-	CA_ACK_SLOWPATH = (1 << 0),
+	CA_ACK_SLOWPATH		= (1 << 0),	/* In slow path processing */
+	CA_ACK_WIN_UPDATE	= (1 << 1),	/* ACK updated window */
+	CA_ACK_ECE		= (1 << 2),	/* ECE bit is set on ack */
 };
 
 /*
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 8a38774cc66..fc133178c78 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -233,14 +233,21 @@ static inline void TCP_ECN_check_ce(struct tcp_sock *tp, const struct sk_buff *s
 		tcp_enter_quickack_mode((struct sock *)tp);
 		break;
 	case INET_ECN_CE:
+		if (tcp_ca_needs_ecn((struct sock *)tp))
+			tcp_ca_event((struct sock *)tp, CA_EVENT_ECN_IS_CE);
+
 		if (!(tp->ecn_flags & TCP_ECN_DEMAND_CWR)) {
 			/* Better not delay acks, sender can have a very low cwnd */
 			tcp_enter_quickack_mode((struct sock *)tp);
 			tp->ecn_flags |= TCP_ECN_DEMAND_CWR;
 		}
-		/* fallinto */
+		tp->ecn_flags |= TCP_ECN_SEEN;
+		break;
 	default:
+		if (tcp_ca_needs_ecn((struct sock *)tp))
+			tcp_ca_event((struct sock *)tp, CA_EVENT_ECN_NO_CE);
 		tp->ecn_flags |= TCP_ECN_SEEN;
+		break;
 	}
 }
@@ -3429,10 +3436,12 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
 		tp->snd_una = ack;
 		flag |= FLAG_WIN_UPDATE;
 
-		tcp_in_ack_event(sk, 0);
+		tcp_in_ack_event(sk, CA_ACK_WIN_UPDATE);
 
 		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPHPACKS);
 	} else {
+		u32 ack_ev_flags = CA_ACK_SLOWPATH;
+
 		if (ack_seq != TCP_SKB_CB(skb)->end_seq)
 			flag |= FLAG_DATA;
 		else
@@ -3444,10 +3453,15 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
 			flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una,
 							&sack_rtt_us);
 
-		if (TCP_ECN_rcv_ecn_echo(tp, tcp_hdr(skb)))
+		if (TCP_ECN_rcv_ecn_echo(tp, tcp_hdr(skb))) {
 			flag |= FLAG_ECE;
+			ack_ev_flags |= CA_ACK_ECE;
+		}
+
+		if (flag & FLAG_WIN_UPDATE)
+			ack_ev_flags |= CA_ACK_WIN_UPDATE;
 
-		tcp_in_ack_event(sk, CA_ACK_SLOWPATH);
+		tcp_in_ack_event(sk, ack_ev_flags);
 	}
 
 	/* We passed data and got it acked, remove any soft error
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 20e73271d75..124f9e4e459 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -3130,6 +3130,8 @@ void tcp_send_delayed_ack(struct sock *sk)
 	int ato = icsk->icsk_ack.ato;
 	unsigned long timeout;
 
+	tcp_ca_event(sk, CA_EVENT_DELAYED_ACK);
+
 	if (ato > TCP_DELACK_MIN) {
 		const struct tcp_sock *tp = tcp_sk(sk);
 		int max_ato = HZ / 2;
@@ -3186,6 +3188,8 @@ void tcp_send_ack(struct sock *sk)
 	if (sk->sk_state == TCP_CLOSE)
 		return;
 
+	tcp_ca_event(sk, CA_EVENT_NON_DELAYED_ACK);
+
 	/* We are not putting this on the write queue, so
 	 * tcp_transmit_skb() will set the ownership to this
 	 * sock.
-- 
cgit v1.2.3-70-g09d2
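These events and flags are what allow a DCTCP-style module - added later in this
series and not shown here - to estimate the fraction of ECN-marked traffic and to
run its CE state machine outside the core stack. The sketch below is illustrative
only (all names are made up) and merely shows how a module might consume the new
hooks:

/* Illustrative sketch, not the actual DCTCP module this series prepares for. */
#include <net/tcp.h>

struct ce_frac {
	u32 acks_total;
	u32 acks_ece;
	u32 ce_state;		/* last CE mark seen on incoming packets */
};

static void demo_in_ack_event(struct sock *sk, u32 flags)
{
	struct ce_frac *ca = inet_csk_ca(sk);

	/* CA_ACK_ECE: peer echoed a CE mark; CA_ACK_WIN_UPDATE and
	 * CA_ACK_SLOWPATH describe how the ACK was processed.
	 */
	ca->acks_total++;
	if (flags & CA_ACK_ECE)
		ca->acks_ece++;
}

static void demo_cwnd_event(struct sock *sk, enum tcp_ca_event ev)
{
	struct ce_frac *ca = inet_csk_ca(sk);

	switch (ev) {
	case CA_EVENT_ECN_IS_CE:
		ca->ce_state = 1;	/* incoming packet carried CE */
		break;
	case CA_EVENT_ECN_NO_CE:
		ca->ce_state = 0;	/* ECT seen, but no CE mark */
		break;
	case CA_EVENT_DELAYED_ACK:
	case CA_EVENT_NON_DELAYED_ACK:
		/* A real CE state machine would use these to decide how many
		 * segments an ECE on the next outgoing ACK accounts for.
		 */
		break;
	default:
		break;
	}
}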