From 255cac91c3c9ce7dca7713b93ab03c75b7902e0e Mon Sep 17 00:00:00 2001 From: Ilpo Järvinen Date: Mon, 4 May 2009 11:07:36 -0700 Subject: tcp: extend ECN sysctl to allow server-side only ECN MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This should be very safe compared with full enabled, so I see no reason why it shouldn't be done right away. As ECN can only be negotiated if the SYN sending party is also supporting it, somebody in the loop probably knows what he/she is doing. If SYN does not ask for ECN, the server side SYN-ACK is identical to what it is without ECN. Thus it's quite safe. The chosen value is safe w.r.t to existing configs which choose to currently set manually either 0 or 1 but silently upgrades those who have not explicitly requested ECN off. Whether to just enable both sides comes up time to time but unless that gets done now we can at least make the servers aware of ECN already. As there are some known problems to occur if ECN is enabled, it's currently questionable whether there's any real gain from enabling clients as servers mostly won't support it anyway (so we'd hit just the negative sides). After enabling the servers and getting that deployed, the client end enable really has some potential gain too. Signed-off-by: Ilpo Järvinen Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/ipv4/tcp_input.c') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index c96a6bb2543..56dcf97a97f 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -77,7 +77,7 @@ int sysctl_tcp_window_scaling __read_mostly = 1; int sysctl_tcp_sack __read_mostly = 1; int sysctl_tcp_fack __read_mostly = 1; int sysctl_tcp_reordering __read_mostly = TCP_FASTRETRANS_THRESH; -int sysctl_tcp_ecn __read_mostly; +int sysctl_tcp_ecn __read_mostly = 2; int sysctl_tcp_dsack __read_mostly = 1; int sysctl_tcp_app_win __read_mostly = 31; int sysctl_tcp_adv_win_scale __read_mostly = 2; -- cgit v1.2.3-70-g09d2 From 915219441d566f1da0caa0e262be49b666159e17 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Thu, 28 May 2009 21:35:47 -0700 Subject: tcp: Use SKB queue and list helpers instead of doing it by-hand. Signed-off-by: David S. Miller --- net/ipv4/tcp.c | 17 +++----- net/ipv4/tcp_input.c | 118 ++++++++++++++++++++++++++++++++++++--------------- 2 files changed, 90 insertions(+), 45 deletions(-) (limited to 'net/ipv4/tcp_input.c') diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 0fb8b441f1f..17b89c523f9 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -439,12 +439,14 @@ int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg) !tp->urg_data || before(tp->urg_seq, tp->copied_seq) || !before(tp->urg_seq, tp->rcv_nxt)) { + struct sk_buff *skb; + answ = tp->rcv_nxt - tp->copied_seq; /* Subtract 1, if FIN is in queue. */ - if (answ && !skb_queue_empty(&sk->sk_receive_queue)) - answ -= - tcp_hdr((struct sk_buff *)sk->sk_receive_queue.prev)->fin; + skb = skb_peek_tail(&sk->sk_receive_queue); + if (answ && skb) + answ -= tcp_hdr(skb)->fin; } else answ = tp->urg_seq - tp->copied_seq; release_sock(sk); @@ -1382,11 +1384,7 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, /* Next get a buffer. */ - skb = skb_peek(&sk->sk_receive_queue); - do { - if (!skb) - break; - + skb_queue_walk(&sk->sk_receive_queue, skb) { /* Now that we have two receive queues this * shouldn't happen. */ @@ -1403,8 +1401,7 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, if (tcp_hdr(skb)->fin) goto found_fin_ok; WARN_ON(!(flags & MSG_PEEK)); - skb = skb->next; - } while (skb != (struct sk_buff *)&sk->sk_receive_queue); + } /* Well, if we have backlog, try to process it now yet. */ diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index eeb8a92aa41..ba34a23c1bf 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -4426,7 +4426,7 @@ drop: } __skb_queue_head(&tp->out_of_order_queue, skb); } else { - struct sk_buff *skb1 = tp->out_of_order_queue.prev; + struct sk_buff *skb1 = skb_peek_tail(&tp->out_of_order_queue); u32 seq = TCP_SKB_CB(skb)->seq; u32 end_seq = TCP_SKB_CB(skb)->end_seq; @@ -4443,15 +4443,18 @@ drop: } /* Find place to insert this segment. */ - do { + while (1) { if (!after(TCP_SKB_CB(skb1)->seq, seq)) break; - } while ((skb1 = skb1->prev) != - (struct sk_buff *)&tp->out_of_order_queue); + if (skb_queue_is_first(&tp->out_of_order_queue, skb1)) { + skb1 = NULL; + break; + } + skb1 = skb_queue_prev(&tp->out_of_order_queue, skb1); + } /* Do skb overlap to previous one? */ - if (skb1 != (struct sk_buff *)&tp->out_of_order_queue && - before(seq, TCP_SKB_CB(skb1)->end_seq)) { + if (skb1 && before(seq, TCP_SKB_CB(skb1)->end_seq)) { if (!after(end_seq, TCP_SKB_CB(skb1)->end_seq)) { /* All the bits are present. Drop. */ __kfree_skb(skb); @@ -4463,24 +4466,41 @@ drop: tcp_dsack_set(sk, seq, TCP_SKB_CB(skb1)->end_seq); } else { - skb1 = skb1->prev; + if (skb_queue_is_first(&tp->out_of_order_queue, + skb1)) + skb1 = NULL; + else + skb1 = skb_queue_prev( + &tp->out_of_order_queue, + skb1); } } - __skb_queue_after(&tp->out_of_order_queue, skb1, skb); + if (!skb1) + __skb_queue_head(&tp->out_of_order_queue, skb); + else + __skb_queue_after(&tp->out_of_order_queue, skb1, skb); /* And clean segments covered by new one as whole. */ - while ((skb1 = skb->next) != - (struct sk_buff *)&tp->out_of_order_queue && - after(end_seq, TCP_SKB_CB(skb1)->seq)) { - if (before(end_seq, TCP_SKB_CB(skb1)->end_seq)) { + if (skb1 && !skb_queue_is_last(&tp->out_of_order_queue, skb1)) { + struct sk_buff *n; + + skb1 = skb_queue_next(&tp->out_of_order_queue, skb1); + skb_queue_walk_from_safe(&tp->out_of_order_queue, + skb1, n) { + if (!after(end_seq, TCP_SKB_CB(skb1)->seq)) + break; + if (before(end_seq, + TCP_SKB_CB(skb1)->end_seq)) { + tcp_dsack_extend(sk, + TCP_SKB_CB(skb1)->seq, + end_seq); + break; + } + __skb_unlink(skb1, &tp->out_of_order_queue); tcp_dsack_extend(sk, TCP_SKB_CB(skb1)->seq, - end_seq); - break; + TCP_SKB_CB(skb1)->end_seq); + __kfree_skb(skb1); } - __skb_unlink(skb1, &tp->out_of_order_queue); - tcp_dsack_extend(sk, TCP_SKB_CB(skb1)->seq, - TCP_SKB_CB(skb1)->end_seq); - __kfree_skb(skb1); } add_sack: @@ -4492,7 +4512,10 @@ add_sack: static struct sk_buff *tcp_collapse_one(struct sock *sk, struct sk_buff *skb, struct sk_buff_head *list) { - struct sk_buff *next = skb->next; + struct sk_buff *next = NULL; + + if (!skb_queue_is_last(list, skb)) + next = skb_queue_next(list, skb); __skb_unlink(skb, list); __kfree_skb(skb); @@ -4503,6 +4526,9 @@ static struct sk_buff *tcp_collapse_one(struct sock *sk, struct sk_buff *skb, /* Collapse contiguous sequence of skbs head..tail with * sequence numbers start..end. + * + * If tail is NULL, this means until the end of the list. + * * Segments with FIN/SYN are not collapsed (only because this * simplifies code) */ @@ -4511,15 +4537,23 @@ tcp_collapse(struct sock *sk, struct sk_buff_head *list, struct sk_buff *head, struct sk_buff *tail, u32 start, u32 end) { - struct sk_buff *skb; + struct sk_buff *skb, *n; + bool end_of_skbs; /* First, check that queue is collapsible and find * the point where collapsing can be useful. */ - for (skb = head; skb != tail;) { + skb = head; +restart: + end_of_skbs = true; + skb_queue_walk_from_safe(list, skb, n) { + if (skb == tail) + break; /* No new bits? It is possible on ofo queue. */ if (!before(start, TCP_SKB_CB(skb)->end_seq)) { skb = tcp_collapse_one(sk, skb, list); - continue; + if (!skb) + break; + goto restart; } /* The first skb to collapse is: @@ -4529,16 +4563,24 @@ tcp_collapse(struct sock *sk, struct sk_buff_head *list, */ if (!tcp_hdr(skb)->syn && !tcp_hdr(skb)->fin && (tcp_win_from_space(skb->truesize) > skb->len || - before(TCP_SKB_CB(skb)->seq, start) || - (skb->next != tail && - TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb->next)->seq))) + before(TCP_SKB_CB(skb)->seq, start))) { + end_of_skbs = false; break; + } + + if (!skb_queue_is_last(list, skb)) { + struct sk_buff *next = skb_queue_next(list, skb); + if (next != tail && + TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(next)->seq) { + end_of_skbs = false; + break; + } + } /* Decided to skip this, advance start seq. */ start = TCP_SKB_CB(skb)->end_seq; - skb = skb->next; } - if (skb == tail || tcp_hdr(skb)->syn || tcp_hdr(skb)->fin) + if (end_of_skbs || tcp_hdr(skb)->syn || tcp_hdr(skb)->fin) return; while (before(start, end)) { @@ -4583,7 +4625,8 @@ tcp_collapse(struct sock *sk, struct sk_buff_head *list, } if (!before(start, TCP_SKB_CB(skb)->end_seq)) { skb = tcp_collapse_one(sk, skb, list); - if (skb == tail || + if (!skb || + skb == tail || tcp_hdr(skb)->syn || tcp_hdr(skb)->fin) return; @@ -4610,17 +4653,21 @@ static void tcp_collapse_ofo_queue(struct sock *sk) head = skb; for (;;) { - skb = skb->next; + struct sk_buff *next = NULL; + + if (!skb_queue_is_last(&tp->out_of_order_queue, skb)) + next = skb_queue_next(&tp->out_of_order_queue, skb); + skb = next; /* Segment is terminated when we see gap or when * we are at the end of all the queue. */ - if (skb == (struct sk_buff *)&tp->out_of_order_queue || + if (!skb || after(TCP_SKB_CB(skb)->seq, end) || before(TCP_SKB_CB(skb)->end_seq, start)) { tcp_collapse(sk, &tp->out_of_order_queue, head, skb, start, end); head = skb; - if (skb == (struct sk_buff *)&tp->out_of_order_queue) + if (!skb) break; /* Start new segment */ start = TCP_SKB_CB(skb)->seq; @@ -4681,10 +4728,11 @@ static int tcp_prune_queue(struct sock *sk) tp->rcv_ssthresh = min(tp->rcv_ssthresh, 4U * tp->advmss); tcp_collapse_ofo_queue(sk); - tcp_collapse(sk, &sk->sk_receive_queue, - sk->sk_receive_queue.next, - (struct sk_buff *)&sk->sk_receive_queue, - tp->copied_seq, tp->rcv_nxt); + if (!skb_queue_empty(&sk->sk_receive_queue)) + tcp_collapse(sk, &sk->sk_receive_queue, + skb_peek(&sk->sk_receive_queue), + NULL, + tp->copied_seq, tp->rcv_nxt); sk_mem_reclaim(sk); if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf) -- cgit v1.2.3-70-g09d2 From 2df9001edc382c331f338f45d259feeaa740c418 Mon Sep 17 00:00:00 2001 From: Ilpo Järvinen Date: Fri, 29 May 2009 15:02:29 -0700 Subject: tcp: fix loop in ofo handling code and reduce its complexity MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Somewhat luckily, I was looking into these parts with very fine comb because I've made somewhat similar changes on the same area (conflicts that arose weren't that lucky though). The loop was very much overengineered recently in commit 915219441d566 (tcp: Use SKB queue and list helpers instead of doing it by-hand), while it basically just wants to know if there are skbs after 'skb'. Also it got broken because skb1 = skb->next got translated into skb1 = skb1->next (though abstracted) improperly. Note that 'skb1' is pointing to previous sk_buff than skb or NULL if at head. Two things went wrong: - We'll kfree 'skb' on the first iteration instead of the skbuff following 'skb' (it would require required SACK reneging to recover I think). - The list head case where 'skb1' is NULL is checked too early and the loop won't execute whereas it previously did. Conclusion, mostly revert the recent changes which makes the cset very messy looking but using proper accessor in the previous-like version. The effective changes against the original can be viewed with: git-diff 915219441d566f1da0caa0e262be49b666159e17^ \ net/ipv4/tcp_input.c | sed -n -e '57,70 p' Signed-off-by: Ilpo Järvinen Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 28 +++++++++++----------------- 1 file changed, 11 insertions(+), 17 deletions(-) (limited to 'net/ipv4/tcp_input.c') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index ba34a23c1bf..2bdb0da237e 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -4481,26 +4481,20 @@ drop: __skb_queue_after(&tp->out_of_order_queue, skb1, skb); /* And clean segments covered by new one as whole. */ - if (skb1 && !skb_queue_is_last(&tp->out_of_order_queue, skb1)) { - struct sk_buff *n; + while (!skb_queue_is_last(&tp->out_of_order_queue, skb)) { + skb1 = skb_queue_next(&tp->out_of_order_queue, skb); - skb1 = skb_queue_next(&tp->out_of_order_queue, skb1); - skb_queue_walk_from_safe(&tp->out_of_order_queue, - skb1, n) { - if (!after(end_seq, TCP_SKB_CB(skb1)->seq)) - break; - if (before(end_seq, - TCP_SKB_CB(skb1)->end_seq)) { - tcp_dsack_extend(sk, - TCP_SKB_CB(skb1)->seq, - end_seq); - break; - } - __skb_unlink(skb1, &tp->out_of_order_queue); + if (!after(end_seq, TCP_SKB_CB(skb1)->seq)) + break; + if (before(end_seq, TCP_SKB_CB(skb1)->end_seq)) { tcp_dsack_extend(sk, TCP_SKB_CB(skb1)->seq, - TCP_SKB_CB(skb1)->end_seq); - __kfree_skb(skb1); + end_seq); + break; } + __skb_unlink(skb1, &tp->out_of_order_queue); + tcp_dsack_extend(sk, TCP_SKB_CB(skb1)->seq, + TCP_SKB_CB(skb1)->end_seq); + __kfree_skb(skb1); } add_sack: -- cgit v1.2.3-70-g09d2