6 files changed, 122 insertions, 34 deletions
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index 21e5e32d8c6..2b79377b468 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -351,12 +351,13 @@ static inline u32 inet_sk_port_offset(const struct sock *sk)
 					  inet->inet_dport);
 }
 
-void __inet_hash_nolisten(struct sock *sk)
+int __inet_hash_nolisten(struct sock *sk, struct inet_timewait_sock *tw)
 {
 	struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
 	struct hlist_nulls_head *list;
 	spinlock_t *lock;
 	struct inet_ehash_bucket *head;
+	int twrefcnt = 0;
 
 	WARN_ON(!sk_unhashed(sk));
 
@@ -367,8 +368,13 @@ void __inet_hash_nolisten(struct sock *sk)
 
 	spin_lock(lock);
 	__sk_nulls_add_node_rcu(sk, list);
+	if (tw) {
+		WARN_ON(sk->sk_hash != tw->tw_hash);
+		twrefcnt = inet_twsk_unhash(tw);
+	}
 	spin_unlock(lock);
 	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
+	return twrefcnt;
 }
 EXPORT_SYMBOL_GPL(__inet_hash_nolisten);
 
@@ -378,7 +384,7 @@ static void __inet_hash(struct sock *sk)
 	struct inet_listen_hashbucket *ilb;
 
 	if (sk->sk_state != TCP_LISTEN) {
-		__inet_hash_nolisten(sk);
+		__inet_hash_nolisten(sk, NULL);
 		return;
 	}
 
@@ -427,7 +433,7 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row,
 		struct sock *sk, u32 port_offset,
 		int (*check_established)(struct inet_timewait_death_row *,
 			struct sock *, __u16, struct inet_timewait_sock **),
-		void (*hash)(struct sock *sk))
+		int (*hash)(struct sock *sk, struct inet_timewait_sock *twp))
 {
 	struct inet_hashinfo *hinfo = death_row->hashinfo;
 	const unsigned short snum = inet_sk(sk)->inet_num;
@@ -435,6 +441,7 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row,
 	struct inet_bind_bucket *tb;
 	int ret;
 	struct net *net = sock_net(sk);
+	int twrefcnt = 1;
 
 	if (!snum) {
 		int i, remaining, low, high, port;
@@ -493,13 +500,18 @@ ok:
 		inet_bind_hash(sk, tb, port);
 		if (sk_unhashed(sk)) {
 			inet_sk(sk)->inet_sport = htons(port);
-			hash(sk);
+			twrefcnt += hash(sk, tw);
 		}
+		if (tw)
+			twrefcnt += inet_twsk_bind_unhash(tw, hinfo);
 		spin_unlock(&head->lock);
 
 		if (tw) {
 			inet_twsk_deschedule(tw, death_row);
-			inet_twsk_put(tw);
+			while (twrefcnt) {
+				twrefcnt--;
+				inet_twsk_put(tw);
+			}
 		}
 
 		ret = 0;
@@ -510,7 +522,7 @@ ok:
 	tb  = inet_csk(sk)->icsk_bind_hash;
 	spin_lock_bh(&head->lock);
 	if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) {
-		hash(sk);
+		hash(sk, NULL);
 		spin_unlock_bh(&head->lock);
 		return 0;
 	} else {
diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c
index 0fdf45e4c90..cc94cc2d8b2 100644
--- a/net/ipv4/inet_timewait_sock.c
+++ b/net/ipv4/inet_timewait_sock.c
@@ -15,9 +15,13 @@
 #include <net/ip.h>
 
 
-/*
- * unhash a timewait socket from established hash
- * lock must be hold by caller
+/**
+ *	inet_twsk_unhash - unhash a timewait socket from established hash
+ *	@tw: timewait socket
+ *
+ *	unhash a timewait socket from established hash, if hashed.
+ *	ehash lock must be held by caller.
+ *	Returns 1 if caller should call inet_twsk_put() after lock release.
  */
 int inet_twsk_unhash(struct inet_timewait_sock *tw)
 {
@@ -26,6 +30,37 @@ int inet_twsk_unhash(struct inet_timewait_sock *tw)
 
 	hlist_nulls_del_rcu(&tw->tw_node);
 	sk_nulls_node_init(&tw->tw_node);
+	/*
+	 * We cannot call inet_twsk_put() ourself under lock,
+	 * caller must call it for us.
+	 */
+	return 1;
+}
+
+/**
+ *	inet_twsk_bind_unhash - unhash a timewait socket from bind hash
+ *	@tw: timewait socket
+ *	@hashinfo: hashinfo pointer
+ *
+ *	unhash a timewait socket from bind hash, if hashed.
+ *	bind hash lock must be held by caller.
+ *	Returns 1 if caller should call inet_twsk_put() after lock release.
+ */
+int inet_twsk_bind_unhash(struct inet_timewait_sock *tw,
+			  struct inet_hashinfo *hashinfo)
+{
+	struct inet_bind_bucket *tb = tw->tw_tb;
+
+	if (!tb)
+		return 0;
+
+	__hlist_del(&tw->tw_bind_node);
+	tw->tw_tb = NULL;
+	inet_bind_bucket_destroy(hashinfo->bind_bucket_cachep, tb);
+	/*
+	 * We cannot call inet_twsk_put() ourself under lock,
+	 * caller must call it for us.
+	 */
 	return 1;
 }
 
@@ -34,7 +69,6 @@ static void __inet_twsk_kill(struct inet_timewait_sock *tw,
 			     struct inet_hashinfo *hashinfo)
 {
 	struct inet_bind_hashbucket *bhead;
-	struct inet_bind_bucket *tb;
 	int refcnt;
 	/* Unlink from established hashes. */
 	spinlock_t *lock = inet_ehash_lockp(hashinfo, tw->tw_hash);
@@ -46,15 +80,11 @@ static void __inet_twsk_kill(struct inet_timewait_sock *tw,
 	/* Disassociate with bind bucket. */
 	bhead = &hashinfo->bhash[inet_bhashfn(twsk_net(tw), tw->tw_num,
 			hashinfo->bhash_size)];
+
 	spin_lock(&bhead->lock);
-	tb = tw->tw_tb;
-	if (tb) {
-		__hlist_del(&tw->tw_bind_node);
-		tw->tw_tb = NULL;
-		inet_bind_bucket_destroy(hashinfo->bind_bucket_cachep, tb);
-		refcnt++;
-	}
+	refcnt += inet_twsk_bind_unhash(tw, hashinfo);
 	spin_unlock(&bhead->lock);
+
 #ifdef SOCK_REFCNT_DEBUG
 	if (atomic_read(&tw->tw_refcnt) != 1) {
 		printk(KERN_DEBUG "%s timewait_sock %p refcnt=%d\n",
@@ -126,7 +156,7 @@ void __inet_twsk_hashdance(struct inet_timewait_sock *tw, struct sock *sk,
 
 	/*
 	 * Notes :
- 	 * - We initially set tw_refcnt to 0 in inet_twsk_alloc()
+	 * - We initially set tw_refcnt to 0 in inet_twsk_alloc()
 	 * - We add one reference for the bhash link
 	 * - We add one reference for the ehash link
 	 * - We want this refcnt update done before allowing other
@@ -136,7 +166,6 @@ void __inet_twsk_hashdance(struct inet_timewait_sock *tw, struct sock *sk,
 
 	spin_unlock(lock);
 }
-
 EXPORT_SYMBOL_GPL(__inet_twsk_hashdance);
 
 struct inet_timewait_sock *inet_twsk_alloc(const struct sock *sk, const int state)
@@ -177,7 +206,6 @@ struct inet_timewait_sock *inet_twsk_alloc(const struct sock *sk, const int stat
 
 	return tw;
 }
-
 EXPORT_SYMBOL_GPL(inet_twsk_alloc);
 
 /* Returns non-zero if quota exceeded.  */
@@ -256,7 +284,6 @@ void inet_twdr_hangman(unsigned long data)
 out:
 	spin_unlock(&twdr->death_lock);
 }
-
 EXPORT_SYMBOL_GPL(inet_twdr_hangman);
 
 void inet_twdr_twkill_work(struct work_struct *work)
@@ -287,7 +314,6 @@ void inet_twdr_twkill_work(struct work_struct *work)
 		spin_unlock_bh(&twdr->death_lock);
 	}
 }
-
 EXPORT_SYMBOL_GPL(inet_twdr_twkill_work);
 
 /* These are always called from BH context.  See callers in
@@ -307,7 +333,6 @@ void inet_twsk_deschedule(struct inet_timewait_sock *tw,
 	spin_unlock(&twdr->death_lock);
 	__inet_twsk_kill(tw, twdr->hashinfo);
 }
-
 EXPORT_SYMBOL(inet_twsk_deschedule);
 
 void inet_twsk_schedule(struct inet_timewait_sock *tw,
@@ -388,7 +413,6 @@ void inet_twsk_schedule(struct inet_timewait_sock *tw,
 		mod_timer(&twdr->tw_timer, jiffies + twdr->period);
 	spin_unlock(&twdr->death_lock);
 }
-
 EXPORT_SYMBOL_GPL(inet_twsk_schedule);
 
 void inet_twdr_twcal_tick(unsigned long data)
@@ -449,7 +473,6 @@ out:
 #endif
 	spin_unlock(&twdr->death_lock);
 }
-
 EXPORT_SYMBOL_GPL(inet_twdr_twcal_tick);
 
 void inet_twsk_purge(struct inet_hashinfo *hashinfo,
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index c8666b70cde..b0a26bb25e2 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -2540,11 +2540,6 @@ static int do_tcp_getsockopt(struct sock *sk, int level,
 			ctd.tcpct_cookie_desired = cvp->cookie_desired;
 			ctd.tcpct_s_data_desired = cvp->s_data_desired;
 
-			/* Cookie(s) saved, return as nonce */
-			if (sizeof(ctd.tcpct_value) < cvp->cookie_pair_size) {
-				/* impossible? */
-				return -EINVAL;
-			}
 			memcpy(&ctd.tcpct_value[0], &cvp->cookie_pair[0],
 			       cvp->cookie_pair_size);
 			ctd.tcpct_used = cvp->cookie_pair_size;
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 57ae96a0422..12cab7d74db 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -2717,6 +2717,35 @@ static void tcp_try_undo_dsack(struct sock *sk)
 	}
 }
 
+/* We can clear retrans_stamp when there are no retransmissions in the
+ * window. It would seem that it is trivially available for us in
+ * tp->retrans_out, however, that kind of assumptions doesn't consider
+ * what will happen if errors occur when sending retransmission for the
+ * second time. ...It could the that such segment has only
+ * TCPCB_EVER_RETRANS set at the present time. It seems that checking
+ * the head skb is enough except for some reneging corner cases that
+ * are not worth the effort.
+ *
+ * Main reason for all this complexity is the fact that connection dying
+ * time now depends on the validity of the retrans_stamp, in particular,
+ * that successive retransmissions of a segment must not advance
+ * retrans_stamp under any conditions.
+ */
+static int tcp_any_retrans_done(struct sock *sk)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct sk_buff *skb;
+
+	if (tp->retrans_out)
+		return 1;
+
+	skb = tcp_write_queue_head(sk);
+	if (unlikely(skb && TCP_SKB_CB(skb)->sacked & TCPCB_EVER_RETRANS))
+		return 1;
+
+	return 0;
+}
+
 /* Undo during fast recovery after partial ACK. */
 
 static int tcp_try_undo_partial(struct sock *sk, int acked)
@@ -2729,7 +2758,7 @@ static int tcp_try_undo_partial(struct sock *sk, int acked)
 		/* Plain luck! Hole if filled with delayed
 		 * packet, rather than with a retransmit.
 		 */
-		if (tp->retrans_out == 0)
+		if (!tcp_any_retrans_done(sk))
 			tp->retrans_stamp = 0;
 
 		tcp_update_reordering(sk, tcp_fackets_out(tp) + acked, 1);
@@ -2788,7 +2817,7 @@ static void tcp_try_keep_open(struct sock *sk)
 	struct tcp_sock *tp = tcp_sk(sk);
 	int state = TCP_CA_Open;
 
-	if (tcp_left_out(tp) || tp->retrans_out || tp->undo_marker)
+	if (tcp_left_out(tp) || tcp_any_retrans_done(sk) || tp->undo_marker)
 		state = TCP_CA_Disorder;
 
 	if (inet_csk(sk)->icsk_ca_state != state) {
@@ -2803,7 +2832,7 @@ static void tcp_try_to_open(struct sock *sk, int flag)
 
 	tcp_verify_left_out(tp);
 
-	if (!tp->frto_counter && tp->retrans_out == 0)
+	if (!tp->frto_counter && !tcp_any_retrans_done(sk))
 		tp->retrans_stamp = 0;
 
 	if (flag & FLAG_ECE)
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 29002ab26e0..15e96030ce4 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1464,7 +1464,7 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
 	}
 #endif
 
-	__inet_hash_nolisten(newsk);
+	__inet_hash_nolisten(newsk, NULL);
 	__inet_inherit_port(sk, newsk);
 
 	return newsk;
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index 8353a538cd4..8816a20c259 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -132,6 +132,35 @@ static void tcp_mtu_probing(struct inet_connection_sock *icsk, struct sock *sk)
 	}
 }
 
+/* This function calculates a "timeout" which is equivalent to the timeout of a
+ * TCP connection after "boundary" unsucessful, exponentially backed-off
+ * retransmissions with an initial RTO of TCP_RTO_MIN.
+ */
+static bool retransmits_timed_out(struct sock *sk,
+				  unsigned int boundary)
+{
+	unsigned int timeout, linear_backoff_thresh;
+	unsigned int start_ts;
+
+	if (!inet_csk(sk)->icsk_retransmits)
+		return false;
+
+	if (unlikely(!tcp_sk(sk)->retrans_stamp))
+		start_ts = TCP_SKB_CB(tcp_write_queue_head(sk))->when;
+	else
+		start_ts = tcp_sk(sk)->retrans_stamp;
+
+	linear_backoff_thresh = ilog2(TCP_RTO_MAX/TCP_RTO_MIN);
+
+	if (boundary <= linear_backoff_thresh)
+		timeout = ((2 << boundary) - 1) * TCP_RTO_MIN;
+	else
+		timeout = ((2 << linear_backoff_thresh) - 1) * TCP_RTO_MIN +
+			  (boundary - linear_backoff_thresh) * TCP_RTO_MAX;
+
+	return (tcp_time_stamp - start_ts) >= timeout;
+}
+
 /* A write timeout has occurred. Process the after effects. */
 static int tcp_write_timeout(struct sock *sk)
 {