summaryrefslogtreecommitdiffstats
path: root/net/ipv4/tcp.c
diff options
context:
space:
mode:
Diffstat (limited to 'net/ipv4/tcp.c')
-rw-r--r--net/ipv4/tcp.c363
1 files changed, 335 insertions, 28 deletions
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 64d0af67582..c8666b70cde 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -264,6 +264,7 @@
#include <linux/cache.h>
#include <linux/err.h>
#include <linux/crypto.h>
+#include <linux/time.h>
#include <net/icmp.h>
#include <net/tcp.h>
@@ -326,6 +327,43 @@ void tcp_enter_memory_pressure(struct sock *sk)
EXPORT_SYMBOL(tcp_enter_memory_pressure);
+/* Convert seconds to retransmits based on initial and max timeout */
+static u8 secs_to_retrans(int seconds, int timeout, int rto_max)
+{
+ u8 res = 0;
+
+ if (seconds > 0) {
+ int period = timeout;
+
+ res = 1;
+ while (seconds > period && res < 255) {
+ res++;
+ timeout <<= 1;
+ if (timeout > rto_max)
+ timeout = rto_max;
+ period += timeout;
+ }
+ }
+ return res;
+}
+
+/* Convert retransmits to seconds based on initial and max timeout */
+static int retrans_to_secs(u8 retrans, int timeout, int rto_max)
+{
+ int period = 0;
+
+ if (retrans > 0) {
+ period = timeout;
+ while (--retrans) {
+ timeout <<= 1;
+ if (timeout > rto_max)
+ timeout = rto_max;
+ period += timeout;
+ }
+ }
+ return period;
+}
+
/*
* Wait for a TCP event.
*
@@ -1146,7 +1184,9 @@ void tcp_cleanup_rbuf(struct sock *sk, int copied)
#if TCP_DEBUG
struct sk_buff *skb = skb_peek(&sk->sk_receive_queue);
- WARN_ON(skb && !before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq));
+ WARN(skb && !before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq),
+ KERN_INFO "cleanup rbuf bug: copied %X seq %X rcvnxt %X\n",
+ tp->copied_seq, TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt);
#endif
if (inet_csk_ack_scheduled(sk)) {
@@ -1393,11 +1433,13 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
/* Now that we have two receive queues this
* shouldn't happen.
*/
- if (before(*seq, TCP_SKB_CB(skb)->seq)) {
- printk(KERN_INFO "recvmsg bug: copied %X "
- "seq %X\n", *seq, TCP_SKB_CB(skb)->seq);
+ if (WARN(before(*seq, TCP_SKB_CB(skb)->seq),
+ KERN_INFO "recvmsg bug: copied %X "
+ "seq %X rcvnxt %X fl %X\n", *seq,
+ TCP_SKB_CB(skb)->seq, tp->rcv_nxt,
+ flags))
break;
- }
+
offset = *seq - TCP_SKB_CB(skb)->seq;
if (tcp_hdr(skb)->syn)
offset--;
@@ -1405,7 +1447,10 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
goto found_ok_skb;
if (tcp_hdr(skb)->fin)
goto found_fin_ok;
- WARN_ON(!(flags & MSG_PEEK));
+ WARN(!(flags & MSG_PEEK), KERN_INFO "recvmsg bug 2: "
+ "copied %X seq %X rcvnxt %X fl %X\n",
+ *seq, TCP_SKB_CB(skb)->seq,
+ tp->rcv_nxt, flags);
}
/* Well, if we have backlog, try to process it now yet. */
@@ -1998,7 +2043,7 @@ int tcp_disconnect(struct sock *sk, int flags)
__skb_queue_purge(&sk->sk_async_wait_queue);
#endif
- inet->dport = 0;
+ inet->inet_dport = 0;
if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK))
inet_reset_saddr(sk);
@@ -2015,6 +2060,7 @@ int tcp_disconnect(struct sock *sk, int flags)
tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
tp->snd_cwnd_cnt = 0;
tp->bytes_acked = 0;
+ tp->window_clamp = 0;
tcp_set_ca_state(sk, TCP_CA_Open);
tcp_clear_retrans(tp);
inet_csk_delack_init(sk);
@@ -2022,7 +2068,7 @@ int tcp_disconnect(struct sock *sk, int flags)
memset(&tp->rx_opt, 0, sizeof(tp->rx_opt));
__sk_dst_reset(sk);
- WARN_ON(inet->num && !icsk->icsk_bind_hash);
+ WARN_ON(inet->inet_num && !icsk->icsk_bind_hash);
sk->sk_error_report(sk);
return err;
@@ -2039,8 +2085,9 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
int val;
int err = 0;
- /* This is a string value all the others are int's */
- if (optname == TCP_CONGESTION) {
+ /* These are data/string values, all the others are ints */
+ switch (optname) {
+ case TCP_CONGESTION: {
char name[TCP_CA_NAME_MAX];
if (optlen < 1)
@@ -2057,6 +2104,93 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
release_sock(sk);
return err;
}
+ case TCP_COOKIE_TRANSACTIONS: {
+ struct tcp_cookie_transactions ctd;
+ struct tcp_cookie_values *cvp = NULL;
+
+ if (sizeof(ctd) > optlen)
+ return -EINVAL;
+ if (copy_from_user(&ctd, optval, sizeof(ctd)))
+ return -EFAULT;
+
+ if (ctd.tcpct_used > sizeof(ctd.tcpct_value) ||
+ ctd.tcpct_s_data_desired > TCP_MSS_DESIRED)
+ return -EINVAL;
+
+ if (ctd.tcpct_cookie_desired == 0) {
+ /* default to global value */
+ } else if ((0x1 & ctd.tcpct_cookie_desired) ||
+ ctd.tcpct_cookie_desired > TCP_COOKIE_MAX ||
+ ctd.tcpct_cookie_desired < TCP_COOKIE_MIN) {
+ return -EINVAL;
+ }
+
+ if (TCP_COOKIE_OUT_NEVER & ctd.tcpct_flags) {
+ /* Supercedes all other values */
+ lock_sock(sk);
+ if (tp->cookie_values != NULL) {
+ kref_put(&tp->cookie_values->kref,
+ tcp_cookie_values_release);
+ tp->cookie_values = NULL;
+ }
+ tp->rx_opt.cookie_in_always = 0; /* false */
+ tp->rx_opt.cookie_out_never = 1; /* true */
+ release_sock(sk);
+ return err;
+ }
+
+ /* Allocate ancillary memory before locking.
+ */
+ if (ctd.tcpct_used > 0 ||
+ (tp->cookie_values == NULL &&
+ (sysctl_tcp_cookie_size > 0 ||
+ ctd.tcpct_cookie_desired > 0 ||
+ ctd.tcpct_s_data_desired > 0))) {
+ cvp = kzalloc(sizeof(*cvp) + ctd.tcpct_used,
+ GFP_KERNEL);
+ if (cvp == NULL)
+ return -ENOMEM;
+ }
+ lock_sock(sk);
+ tp->rx_opt.cookie_in_always =
+ (TCP_COOKIE_IN_ALWAYS & ctd.tcpct_flags);
+ tp->rx_opt.cookie_out_never = 0; /* false */
+
+ if (tp->cookie_values != NULL) {
+ if (cvp != NULL) {
+ /* Changed values are recorded by a changed
+ * pointer, ensuring the cookie will differ,
+ * without separately hashing each value later.
+ */
+ kref_put(&tp->cookie_values->kref,
+ tcp_cookie_values_release);
+ kref_init(&cvp->kref);
+ tp->cookie_values = cvp;
+ } else {
+ cvp = tp->cookie_values;
+ }
+ }
+ if (cvp != NULL) {
+ cvp->cookie_desired = ctd.tcpct_cookie_desired;
+
+ if (ctd.tcpct_used > 0) {
+ memcpy(cvp->s_data_payload, ctd.tcpct_value,
+ ctd.tcpct_used);
+ cvp->s_data_desired = ctd.tcpct_used;
+ cvp->s_data_constant = 1; /* true */
+ } else {
+ /* No constant payload data. */
+ cvp->s_data_desired = ctd.tcpct_s_data_desired;
+ cvp->s_data_constant = 0; /* false */
+ }
+ }
+ release_sock(sk);
+ return err;
+ }
+ default:
+ /* fallthru */
+ break;
+ };
if (optlen < sizeof(int))
return -EINVAL;
@@ -2163,16 +2297,10 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
break;
case TCP_DEFER_ACCEPT:
- icsk->icsk_accept_queue.rskq_defer_accept = 0;
- if (val > 0) {
- /* Translate value in seconds to number of
- * retransmits */
- while (icsk->icsk_accept_queue.rskq_defer_accept < 32 &&
- val > ((TCP_TIMEOUT_INIT / HZ) <<
- icsk->icsk_accept_queue.rskq_defer_accept))
- icsk->icsk_accept_queue.rskq_defer_accept++;
- icsk->icsk_accept_queue.rskq_defer_accept++;
- }
+ /* Translate value in seconds to number of retransmits */
+ icsk->icsk_accept_queue.rskq_defer_accept =
+ secs_to_retrans(val, TCP_TIMEOUT_INIT / HZ,
+ TCP_RTO_MAX / HZ);
break;
case TCP_WINDOW_CLAMP:
@@ -2353,8 +2481,8 @@ static int do_tcp_getsockopt(struct sock *sk, int level,
val = (val ? : sysctl_tcp_fin_timeout) / HZ;
break;
case TCP_DEFER_ACCEPT:
- val = !icsk->icsk_accept_queue.rskq_defer_accept ? 0 :
- ((TCP_TIMEOUT_INIT / HZ) << (icsk->icsk_accept_queue.rskq_defer_accept - 1));
+ val = retrans_to_secs(icsk->icsk_accept_queue.rskq_defer_accept,
+ TCP_TIMEOUT_INIT / HZ, TCP_RTO_MAX / HZ);
break;
case TCP_WINDOW_CLAMP:
val = tp->window_clamp;
@@ -2387,6 +2515,47 @@ static int do_tcp_getsockopt(struct sock *sk, int level,
if (copy_to_user(optval, icsk->icsk_ca_ops->name, len))
return -EFAULT;
return 0;
+
+ case TCP_COOKIE_TRANSACTIONS: {
+ struct tcp_cookie_transactions ctd;
+ struct tcp_cookie_values *cvp = tp->cookie_values;
+
+ if (get_user(len, optlen))
+ return -EFAULT;
+ if (len < sizeof(ctd))
+ return -EINVAL;
+
+ memset(&ctd, 0, sizeof(ctd));
+ ctd.tcpct_flags = (tp->rx_opt.cookie_in_always ?
+ TCP_COOKIE_IN_ALWAYS : 0)
+ | (tp->rx_opt.cookie_out_never ?
+ TCP_COOKIE_OUT_NEVER : 0);
+
+ if (cvp != NULL) {
+ ctd.tcpct_flags |= (cvp->s_data_in ?
+ TCP_S_DATA_IN : 0)
+ | (cvp->s_data_out ?
+ TCP_S_DATA_OUT : 0);
+
+ ctd.tcpct_cookie_desired = cvp->cookie_desired;
+ ctd.tcpct_s_data_desired = cvp->s_data_desired;
+
+ /* Cookie(s) saved, return as nonce */
+ if (sizeof(ctd.tcpct_value) < cvp->cookie_pair_size) {
+ /* impossible? */
+ return -EINVAL;
+ }
+ memcpy(&ctd.tcpct_value[0], &cvp->cookie_pair[0],
+ cvp->cookie_pair_size);
+ ctd.tcpct_used = cvp->cookie_pair_size;
+ }
+
+ if (put_user(sizeof(ctd), optlen))
+ return -EFAULT;
+ if (copy_to_user(optval, &ctd, sizeof(ctd)))
+ return -EFAULT;
+ return 0;
+ }
default:
return -ENOPROTOOPT;
}
@@ -2809,6 +2978,135 @@ EXPORT_SYMBOL(tcp_md5_hash_key);
#endif
+/**
+ * Each Responder maintains up to two secret values concurrently for
+ * efficient secret rollover. Each secret value has 4 states:
+ *
+ * Generating. (tcp_secret_generating != tcp_secret_primary)
+ * Generates new Responder-Cookies, but not yet used for primary
+ * verification. This is a short-term state, typically lasting only
+ * one round trip time (RTT).
+ *
+ * Primary. (tcp_secret_generating == tcp_secret_primary)
+ * Used both for generation and primary verification.
+ *
+ * Retiring. (tcp_secret_retiring != tcp_secret_secondary)
+ * Used for verification, until the first failure that can be
+ * verified by the newer Generating secret. At that time, this
+ * cookie's state is changed to Secondary, and the Generating
+ * cookie's state is changed to Primary. This is a short-term state,
+ * typically lasting only one round trip time (RTT).
+ *
+ * Secondary. (tcp_secret_retiring == tcp_secret_secondary)
+ * Used for secondary verification, after primary verification
+ * failures. This state lasts no more than twice the Maximum Segment
+ * Lifetime (2MSL). Then, the secret is discarded.
+ */
+struct tcp_cookie_secret {
+ /* The secret is divided into two parts. The digest part is the
+ * equivalent of previously hashing a secret and saving the state,
+ * and serves as an initialization vector (IV). The message part
+ * serves as the trailing secret.
+ */
+ u32 secrets[COOKIE_WORKSPACE_WORDS];
+ unsigned long expires;
+};
+
+#define TCP_SECRET_1MSL (HZ * TCP_PAWS_MSL)
+#define TCP_SECRET_2MSL (HZ * TCP_PAWS_MSL * 2)
+#define TCP_SECRET_LIFE (HZ * 600)
+
+static struct tcp_cookie_secret tcp_secret_one;
+static struct tcp_cookie_secret tcp_secret_two;
+
+/* Essentially a circular list, without dynamic allocation. */
+static struct tcp_cookie_secret *tcp_secret_generating;
+static struct tcp_cookie_secret *tcp_secret_primary;
+static struct tcp_cookie_secret *tcp_secret_retiring;
+static struct tcp_cookie_secret *tcp_secret_secondary;
+
+static DEFINE_SPINLOCK(tcp_secret_locker);
+
+/* Select a pseudo-random word in the cookie workspace.
+ */
+static inline u32 tcp_cookie_work(const u32 *ws, const int n)
+{
+ return ws[COOKIE_DIGEST_WORDS + ((COOKIE_MESSAGE_WORDS-1) & ws[n])];
+}
+
+/* Fill bakery[COOKIE_WORKSPACE_WORDS] with generator, updating as needed.
+ * Called in softirq context.
+ * Returns: 0 for success.
+ */
+int tcp_cookie_generator(u32 *bakery)
+{
+ unsigned long jiffy = jiffies;
+
+ if (unlikely(time_after_eq(jiffy, tcp_secret_generating->expires))) {
+ spin_lock_bh(&tcp_secret_locker);
+ if (!time_after_eq(jiffy, tcp_secret_generating->expires)) {
+ /* refreshed by another */
+ memcpy(bakery,
+ &tcp_secret_generating->secrets[0],
+ COOKIE_WORKSPACE_WORDS);
+ } else {
+ /* still needs refreshing */
+ get_random_bytes(bakery, COOKIE_WORKSPACE_WORDS);
+
+ /* The first time, paranoia assumes that the
+ * randomization function isn't as strong. But,
+ * this secret initialization is delayed until
+ * the last possible moment (packet arrival).
+ * Although that time is observable, it is
+ * unpredictably variable. Mash in the most
+ * volatile clock bits available, and expire the
+ * secret extra quickly.
+ */
+ if (unlikely(tcp_secret_primary->expires ==
+ tcp_secret_secondary->expires)) {
+ struct timespec tv;
+
+ getnstimeofday(&tv);
+ bakery[COOKIE_DIGEST_WORDS+0] ^=
+ (u32)tv.tv_nsec;
+
+ tcp_secret_secondary->expires = jiffy
+ + TCP_SECRET_1MSL
+ + (0x0f & tcp_cookie_work(bakery, 0));
+ } else {
+ tcp_secret_secondary->expires = jiffy
+ + TCP_SECRET_LIFE
+ + (0xff & tcp_cookie_work(bakery, 1));
+ tcp_secret_primary->expires = jiffy
+ + TCP_SECRET_2MSL
+ + (0x1f & tcp_cookie_work(bakery, 2));
+ }
+ memcpy(&tcp_secret_secondary->secrets[0],
+ bakery, COOKIE_WORKSPACE_WORDS);
+
+ rcu_assign_pointer(tcp_secret_generating,
+ tcp_secret_secondary);
+ rcu_assign_pointer(tcp_secret_retiring,
+ tcp_secret_primary);
+ /*
+ * Neither call_rcu() nor synchronize_rcu() needed.
+ * Retiring data is not freed. It is replaced after
+ * further (locked) pointer updates, and a quiet time
+ * (minimum 1MSL, maximum LIFE - 2MSL).
+ */
+ }
+ spin_unlock_bh(&tcp_secret_locker);
+ } else {
+ rcu_read_lock_bh();
+ memcpy(bakery,
+ &rcu_dereference(tcp_secret_generating)->secrets[0],
+ COOKIE_WORKSPACE_WORDS);
+ rcu_read_unlock_bh();
+ }
+ return 0;
+}
+EXPORT_SYMBOL(tcp_cookie_generator);
+
void tcp_done(struct sock *sk)
{
if (sk->sk_state == TCP_SYN_SENT || sk->sk_state == TCP_SYN_RECV)
@@ -2843,6 +3141,7 @@ void __init tcp_init(void)
struct sk_buff *skb = NULL;
unsigned long nr_pages, limit;
int order, i, max_share;
+ unsigned long jiffy = jiffies;
BUILD_BUG_ON(sizeof(struct tcp_skb_cb) > sizeof(skb->cb));
@@ -2865,11 +3164,10 @@ void __init tcp_init(void)
(totalram_pages >= 128 * 1024) ?
13 : 15,
0,
- &tcp_hashinfo.ehash_size,
NULL,
+ &tcp_hashinfo.ehash_mask,
thash_entries ? 0 : 512 * 1024);
- tcp_hashinfo.ehash_size = 1 << tcp_hashinfo.ehash_size;
- for (i = 0; i < tcp_hashinfo.ehash_size; i++) {
+ for (i = 0; i <= tcp_hashinfo.ehash_mask; i++) {
INIT_HLIST_NULLS_HEAD(&tcp_hashinfo.ehash[i].chain, i);
INIT_HLIST_NULLS_HEAD(&tcp_hashinfo.ehash[i].twchain, i);
}
@@ -2878,7 +3176,7 @@ void __init tcp_init(void)
tcp_hashinfo.bhash =
alloc_large_system_hash("TCP bind",
sizeof(struct inet_bind_hashbucket),
- tcp_hashinfo.ehash_size,
+ tcp_hashinfo.ehash_mask + 1,
(totalram_pages >= 128 * 1024) ?
13 : 15,
0,
@@ -2933,10 +3231,19 @@ void __init tcp_init(void)
sysctl_tcp_rmem[2] = max(87380, max_share);
printk(KERN_INFO "TCP: Hash tables configured "
- "(established %d bind %d)\n",
- tcp_hashinfo.ehash_size, tcp_hashinfo.bhash_size);
+ "(established %u bind %u)\n",
+ tcp_hashinfo.ehash_mask + 1, tcp_hashinfo.bhash_size);
tcp_register_congestion_control(&tcp_reno);
+
+ memset(&tcp_secret_one.secrets[0], 0, sizeof(tcp_secret_one.secrets));
+ memset(&tcp_secret_two.secrets[0], 0, sizeof(tcp_secret_two.secrets));
+ tcp_secret_one.expires = jiffy; /* past due */
+ tcp_secret_two.expires = jiffy; /* past due */
+ tcp_secret_generating = &tcp_secret_one;
+ tcp_secret_primary = &tcp_secret_one;
+ tcp_secret_retiring = &tcp_secret_two;
+ tcp_secret_secondary = &tcp_secret_two;
}
EXPORT_SYMBOL(tcp_close);