summaryrefslogtreecommitdiffstats
path: root/net/ipv4
diff options
context:
space:
mode:
Diffstat (limited to 'net/ipv4')
-rw-r--r--net/ipv4/af_inet.c44
-rw-r--r--net/ipv4/icmp.c9
-rw-r--r--net/ipv4/inet_connection_sock.c8
-rw-r--r--net/ipv4/inetpeer.c34
-rw-r--r--net/ipv4/ip_input.c48
-rw-r--r--net/ipv4/protocol.c8
-rw-r--r--net/ipv4/route.c5
-rw-r--r--net/ipv4/sysctl_net_ipv4.c7
-rw-r--r--net/ipv4/tcp_input.c16
-rw-r--r--net/ipv4/tcp_ipv4.c64
-rw-r--r--net/ipv4/tcp_minisocks.c2
11 files changed, 169 insertions, 76 deletions
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index e4e8e00a2c9..07a02f6e969 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -157,6 +157,7 @@ void inet_sock_destruct(struct sock *sk)
kfree(rcu_dereference_protected(inet->inet_opt, 1));
dst_release(rcu_dereference_check(sk->sk_dst_cache, 1));
+ dst_release(sk->sk_rx_dst);
sk_refcnt_debug_dec(sk);
}
EXPORT_SYMBOL(inet_sock_destruct);
@@ -242,20 +243,18 @@ void build_ehash_secret(void)
}
EXPORT_SYMBOL(build_ehash_secret);
-static inline int inet_netns_ok(struct net *net, int protocol)
+static inline int inet_netns_ok(struct net *net, __u8 protocol)
{
- int hash;
const struct net_protocol *ipprot;
if (net_eq(net, &init_net))
return 1;
- hash = protocol & (MAX_INET_PROTOS - 1);
- ipprot = rcu_dereference(inet_protos[hash]);
-
- if (ipprot == NULL)
+ ipprot = rcu_dereference(inet_protos[protocol]);
+ if (ipprot == NULL) {
/* raw IP is OK */
return 1;
+ }
return ipprot->netns_ok;
}
@@ -1216,8 +1215,8 @@ EXPORT_SYMBOL(inet_sk_rebuild_header);
static int inet_gso_send_check(struct sk_buff *skb)
{
- const struct iphdr *iph;
const struct net_protocol *ops;
+ const struct iphdr *iph;
int proto;
int ihl;
int err = -EINVAL;
@@ -1236,7 +1235,7 @@ static int inet_gso_send_check(struct sk_buff *skb)
__skb_pull(skb, ihl);
skb_reset_transport_header(skb);
iph = ip_hdr(skb);
- proto = iph->protocol & (MAX_INET_PROTOS - 1);
+ proto = iph->protocol;
err = -EPROTONOSUPPORT;
rcu_read_lock();
@@ -1253,8 +1252,8 @@ static struct sk_buff *inet_gso_segment(struct sk_buff *skb,
netdev_features_t features)
{
struct sk_buff *segs = ERR_PTR(-EINVAL);
- struct iphdr *iph;
const struct net_protocol *ops;
+ struct iphdr *iph;
int proto;
int ihl;
int id;
@@ -1286,7 +1285,7 @@ static struct sk_buff *inet_gso_segment(struct sk_buff *skb,
skb_reset_transport_header(skb);
iph = ip_hdr(skb);
id = ntohs(iph->id);
- proto = iph->protocol & (MAX_INET_PROTOS - 1);
+ proto = iph->protocol;
segs = ERR_PTR(-EPROTONOSUPPORT);
rcu_read_lock();
@@ -1340,7 +1339,7 @@ static struct sk_buff **inet_gro_receive(struct sk_buff **head,
goto out;
}
- proto = iph->protocol & (MAX_INET_PROTOS - 1);
+ proto = iph->protocol;
rcu_read_lock();
ops = rcu_dereference(inet_protos[proto]);
@@ -1398,11 +1397,11 @@ out:
static int inet_gro_complete(struct sk_buff *skb)
{
- const struct net_protocol *ops;
+ __be16 newlen = htons(skb->len - skb_network_offset(skb));
struct iphdr *iph = ip_hdr(skb);
- int proto = iph->protocol & (MAX_INET_PROTOS - 1);
+ const struct net_protocol *ops;
+ int proto = iph->protocol;
int err = -ENOSYS;
- __be16 newlen = htons(skb->len - skb_network_offset(skb));
csum_replace2(&iph->check, iph->tot_len, newlen);
iph->tot_len = newlen;
@@ -1520,14 +1519,15 @@ static const struct net_protocol igmp_protocol = {
#endif
static const struct net_protocol tcp_protocol = {
- .handler = tcp_v4_rcv,
- .err_handler = tcp_v4_err,
- .gso_send_check = tcp_v4_gso_send_check,
- .gso_segment = tcp_tso_segment,
- .gro_receive = tcp4_gro_receive,
- .gro_complete = tcp4_gro_complete,
- .no_policy = 1,
- .netns_ok = 1,
+ .early_demux = tcp_v4_early_demux,
+ .handler = tcp_v4_rcv,
+ .err_handler = tcp_v4_err,
+ .gso_send_check = tcp_v4_gso_send_check,
+ .gso_segment = tcp_tso_segment,
+ .gro_receive = tcp4_gro_receive,
+ .gro_complete = tcp4_gro_complete,
+ .no_policy = 1,
+ .netns_ok = 1,
};
static const struct net_protocol udp_protocol = {
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index e1caa1abe5d..49a74cc79dc 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -637,12 +637,12 @@ EXPORT_SYMBOL(icmp_send);
static void icmp_unreach(struct sk_buff *skb)
{
+ const struct net_protocol *ipprot;
const struct iphdr *iph;
struct icmphdr *icmph;
- int hash, protocol;
- const struct net_protocol *ipprot;
- u32 info = 0;
struct net *net;
+ u32 info = 0;
+ int protocol;
net = dev_net(skb_dst(skb)->dev);
@@ -731,9 +731,8 @@ static void icmp_unreach(struct sk_buff *skb)
*/
raw_icmp_error(skb, protocol, info);
- hash = protocol & (MAX_INET_PROTOS - 1);
rcu_read_lock();
- ipprot = rcu_dereference(inet_protos[hash]);
+ ipprot = rcu_dereference(inet_protos[protocol]);
if (ipprot && ipprot->err_handler)
ipprot->err_handler(skb, info);
rcu_read_unlock();
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index f9ee7417f6a..034ddbe42ad 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -368,17 +368,21 @@ EXPORT_SYMBOL(inet_csk_reset_keepalive_timer);
struct dst_entry *inet_csk_route_req(struct sock *sk,
struct flowi4 *fl4,
- const struct request_sock *req)
+ const struct request_sock *req,
+ bool nocache)
{
struct rtable *rt;
const struct inet_request_sock *ireq = inet_rsk(req);
struct ip_options_rcu *opt = inet_rsk(req)->opt;
struct net *net = sock_net(sk);
+ int flags = inet_sk_flowi_flags(sk) & ~FLOWI_FLAG_PRECOW_METRICS;
+ if (nocache)
+ flags |= FLOWI_FLAG_RT_NOCACHE;
flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
sk->sk_protocol,
- inet_sk_flowi_flags(sk) & ~FLOWI_FLAG_PRECOW_METRICS,
+ flags,
(opt && opt->opt.srr) ? opt->opt.faddr : ireq->rmt_addr,
ireq->loc_addr, ireq->rmt_port, inet_sk(sk)->inet_sport);
security_req_classify_flow(req, flowi4_to_flowi(fl4));
diff --git a/net/ipv4/inetpeer.c b/net/ipv4/inetpeer.c
index cac02ad1425..da90a8cab61 100644
--- a/net/ipv4/inetpeer.c
+++ b/net/ipv4/inetpeer.c
@@ -126,7 +126,7 @@ int inet_peer_maxttl __read_mostly = 10 * 60 * HZ; /* usual time to live: 10 min
static void inetpeer_gc_worker(struct work_struct *work)
{
- struct inet_peer *p, *n;
+ struct inet_peer *p, *n, *c;
LIST_HEAD(list);
spin_lock_bh(&gc_lock);
@@ -138,17 +138,19 @@ static void inetpeer_gc_worker(struct work_struct *work)
list_for_each_entry_safe(p, n, &list, gc_list) {
- if(need_resched())
+ if (need_resched())
cond_resched();
- if (p->avl_left != peer_avl_empty) {
- list_add_tail(&p->avl_left->gc_list, &list);
- p->avl_left = peer_avl_empty;
+ c = rcu_dereference_protected(p->avl_left, 1);
+ if (c != peer_avl_empty) {
+ list_add_tail(&c->gc_list, &list);
+ p->avl_left = peer_avl_empty_rcu;
}
- if (p->avl_right != peer_avl_empty) {
- list_add_tail(&p->avl_right->gc_list, &list);
- p->avl_right = peer_avl_empty;
+ c = rcu_dereference_protected(p->avl_right, 1);
+ if (c != peer_avl_empty) {
+ list_add_tail(&c->gc_list, &list);
+ p->avl_right = peer_avl_empty_rcu;
}
n = list_entry(p->gc_list.next, struct inet_peer, gc_list);
@@ -587,23 +589,17 @@ static void inetpeer_inval_rcu(struct rcu_head *head)
void inetpeer_invalidate_tree(struct inet_peer_base *base)
{
- struct inet_peer *old, *new, *prev;
+ struct inet_peer *root;
write_seqlock_bh(&base->lock);
- old = base->root;
- if (old == peer_avl_empty_rcu)
- goto out;
-
- new = peer_avl_empty_rcu;
-
- prev = cmpxchg(&base->root, old, new);
- if (prev == old) {
+ root = rcu_deref_locked(base->root, base);
+ if (root != peer_avl_empty) {
+ base->root = peer_avl_empty_rcu;
base->total = 0;
- call_rcu(&prev->gc_rcu, inetpeer_inval_rcu);
+ call_rcu(&root->gc_rcu, inetpeer_inval_rcu);
}
-out:
write_sequnlock_bh(&base->lock);
}
EXPORT_SYMBOL(inetpeer_invalidate_tree);
diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c
index 8590144ca33..bca25179cdb 100644
--- a/net/ipv4/ip_input.c
+++ b/net/ipv4/ip_input.c
@@ -198,14 +198,13 @@ static int ip_local_deliver_finish(struct sk_buff *skb)
rcu_read_lock();
{
int protocol = ip_hdr(skb)->protocol;
- int hash, raw;
const struct net_protocol *ipprot;
+ int raw;
resubmit:
raw = raw_local_deliver(skb, protocol);
- hash = protocol & (MAX_INET_PROTOS - 1);
- ipprot = rcu_dereference(inet_protos[hash]);
+ ipprot = rcu_dereference(inet_protos[protocol]);
if (ipprot != NULL) {
int ret;
@@ -314,6 +313,8 @@ drop:
return true;
}
+int sysctl_ip_early_demux __read_mostly = 1;
+
static int ip_rcv_finish(struct sk_buff *skb)
{
const struct iphdr *iph = ip_hdr(skb);
@@ -324,19 +325,34 @@ static int ip_rcv_finish(struct sk_buff *skb)
* how the packet travels inside Linux networking.
*/
if (skb_dst(skb) == NULL) {
- int err = ip_route_input_noref(skb, iph->daddr, iph->saddr,
- iph->tos, skb->dev);
- if (unlikely(err)) {
- if (err == -EHOSTUNREACH)
- IP_INC_STATS_BH(dev_net(skb->dev),
- IPSTATS_MIB_INADDRERRORS);
- else if (err == -ENETUNREACH)
- IP_INC_STATS_BH(dev_net(skb->dev),
- IPSTATS_MIB_INNOROUTES);
- else if (err == -EXDEV)
- NET_INC_STATS_BH(dev_net(skb->dev),
- LINUX_MIB_IPRPFILTER);
- goto drop;
+ int err = -ENOENT;
+
+ if (sysctl_ip_early_demux) {
+ const struct net_protocol *ipprot;
+ int protocol = iph->protocol;
+
+ rcu_read_lock();
+ ipprot = rcu_dereference(inet_protos[protocol]);
+ if (ipprot && ipprot->early_demux)
+ err = ipprot->early_demux(skb);
+ rcu_read_unlock();
+ }
+
+ if (err) {
+ err = ip_route_input_noref(skb, iph->daddr, iph->saddr,
+ iph->tos, skb->dev);
+ if (unlikely(err)) {
+ if (err == -EHOSTUNREACH)
+ IP_INC_STATS_BH(dev_net(skb->dev),
+ IPSTATS_MIB_INADDRERRORS);
+ else if (err == -ENETUNREACH)
+ IP_INC_STATS_BH(dev_net(skb->dev),
+ IPSTATS_MIB_INNOROUTES);
+ else if (err == -EXDEV)
+ NET_INC_STATS_BH(dev_net(skb->dev),
+ LINUX_MIB_IPRPFILTER);
+ goto drop;
+ }
}
}
diff --git a/net/ipv4/protocol.c b/net/ipv4/protocol.c
index 9ae5c01cd0b..8918eff1426 100644
--- a/net/ipv4/protocol.c
+++ b/net/ipv4/protocol.c
@@ -36,9 +36,7 @@ const struct net_protocol __rcu *inet_protos[MAX_INET_PROTOS] __read_mostly;
int inet_add_protocol(const struct net_protocol *prot, unsigned char protocol)
{
- int hash = protocol & (MAX_INET_PROTOS - 1);
-
- return !cmpxchg((const struct net_protocol **)&inet_protos[hash],
+ return !cmpxchg((const struct net_protocol **)&inet_protos[protocol],
NULL, prot) ? 0 : -1;
}
EXPORT_SYMBOL(inet_add_protocol);
@@ -49,9 +47,9 @@ EXPORT_SYMBOL(inet_add_protocol);
int inet_del_protocol(const struct net_protocol *prot, unsigned char protocol)
{
- int ret, hash = protocol & (MAX_INET_PROTOS - 1);
+ int ret;
- ret = (cmpxchg((const struct net_protocol **)&inet_protos[hash],
+ ret = (cmpxchg((const struct net_protocol **)&inet_protos[protocol],
prot, NULL) == prot) ? 0 : -1;
synchronize_net();
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index a91f6d33804..8d62d85e68d 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -1156,7 +1156,7 @@ restart:
candp = NULL;
now = jiffies;
- if (!rt_caching(dev_net(rt->dst.dev))) {
+ if (!rt_caching(dev_net(rt->dst.dev)) || (rt->dst.flags & DST_NOCACHE)) {
/*
* If we're not caching, just tell the caller we
* were successful and don't touch the route. The
@@ -2582,6 +2582,9 @@ static struct rtable *__mkroute_output(const struct fib_result *res,
rt_set_nexthop(rth, fl4, res, fi, type, 0);
+ if (fl4->flowi4_flags & FLOWI_FLAG_RT_NOCACHE)
+ rth->dst.flags |= DST_NOCACHE;
+
return rth;
}
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index ef32956ed65..12aa0c5867c 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -301,6 +301,13 @@ static struct ctl_table ipv4_table[] = {
.proc_handler = proc_dointvec
},
{
+ .procname = "ip_early_demux",
+ .data = &sysctl_ip_early_demux,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec
+ },
+ {
.procname = "ip_dynaddr",
.data = &sysctl_ip_dynaddr,
.maxlen = sizeof(int),
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index b224eb8bce8..8416f8a68e6 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -5518,6 +5518,18 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
struct tcp_sock *tp = tcp_sk(sk);
int res;
+ if (sk->sk_rx_dst) {
+ struct dst_entry *dst = sk->sk_rx_dst;
+ if (unlikely(dst->obsolete)) {
+ if (dst->ops->check(dst, 0) == NULL) {
+ dst_release(dst);
+ sk->sk_rx_dst = NULL;
+ }
+ }
+ }
+ if (unlikely(sk->sk_rx_dst == NULL))
+ sk->sk_rx_dst = dst_clone(skb_dst(skb));
+
/*
* Header prediction.
* The code loosely follows the one in the famous
@@ -5729,8 +5741,10 @@ void tcp_finish_connect(struct sock *sk, struct sk_buff *skb)
tcp_set_state(sk, TCP_ESTABLISHED);
- if (skb != NULL)
+ if (skb != NULL) {
+ sk->sk_rx_dst = dst_clone(skb_dst(skb));
security_inet_conn_established(sk, skb);
+ }
/* Make sure socket is routed, for correct metrics. */
icsk->icsk_af_ops->rebuild_header(sk);
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index fda2ca17135..b52934f5334 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -825,7 +825,8 @@ static void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst,
struct request_sock *req,
struct request_values *rvp,
- u16 queue_mapping)
+ u16 queue_mapping,
+ bool nocache)
{
const struct inet_request_sock *ireq = inet_rsk(req);
struct flowi4 fl4;
@@ -833,7 +834,7 @@ static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst,
struct sk_buff * skb;
/* First, grab a route. */
- if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
+ if (!dst && (dst = inet_csk_route_req(sk, &fl4, req, nocache)) == NULL)
return -1;
skb = tcp_make_synack(sk, dst, req, rvp);
@@ -855,7 +856,7 @@ static int tcp_v4_rtx_synack(struct sock *sk, struct request_sock *req,
struct request_values *rvp)
{
TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_RETRANSSEGS);
- return tcp_v4_send_synack(sk, NULL, req, rvp, 0);
+ return tcp_v4_send_synack(sk, NULL, req, rvp, 0, false);
}
/*
@@ -1388,7 +1389,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
*/
if (tmp_opt.saw_tstamp &&
tcp_death_row.sysctl_tw_recycle &&
- (dst = inet_csk_route_req(sk, &fl4, req)) != NULL &&
+ (dst = inet_csk_route_req(sk, &fl4, req, want_cookie)) != NULL &&
fl4.daddr == saddr &&
(peer = rt_get_peer((struct rtable *)dst, fl4.daddr)) != NULL) {
inet_peer_refcheck(peer);
@@ -1424,7 +1425,8 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
if (tcp_v4_send_synack(sk, dst, req,
(struct request_values *)&tmp_ext,
- skb_get_queue_mapping(skb)) ||
+ skb_get_queue_mapping(skb),
+ want_cookie) ||
want_cookie)
goto drop_and_free;
@@ -1671,6 +1673,58 @@ csum_err:
}
EXPORT_SYMBOL(tcp_v4_do_rcv);
+int tcp_v4_early_demux(struct sk_buff *skb)
+{
+ struct net *net = dev_net(skb->dev);
+ const struct iphdr *iph;
+ const struct tcphdr *th;
+ struct net_device *dev;
+ struct sock *sk;
+ int err;
+
+ err = -ENOENT;
+ if (skb->pkt_type != PACKET_HOST)
+ goto out_err;
+
+ if (!pskb_may_pull(skb, ip_hdrlen(skb) + sizeof(struct tcphdr)))
+ goto out_err;
+
+ iph = ip_hdr(skb);
+ th = (struct tcphdr *) ((char *)iph + ip_hdrlen(skb));
+
+ if (th->doff < sizeof(struct tcphdr) / 4)
+ goto out_err;
+
+ if (!pskb_may_pull(skb, ip_hdrlen(skb) + th->doff * 4))
+ goto out_err;
+
+ dev = skb->dev;
+ sk = __inet_lookup_established(net, &tcp_hashinfo,
+ iph->saddr, th->source,
+ iph->daddr, th->dest,
+ dev->ifindex);
+ if (sk) {
+ skb->sk = sk;
+ skb->destructor = sock_edemux;
+ if (sk->sk_state != TCP_TIME_WAIT) {
+ struct dst_entry *dst = sk->sk_rx_dst;
+ if (dst)
+ dst = dst_check(dst, 0);
+ if (dst) {
+ struct rtable *rt = (struct rtable *) dst;
+
+ if (rt->rt_iif == dev->ifindex) {
+ skb_dst_set_noref(skb, dst);
+ err = 0;
+ }
+ }
+ }
+ }
+
+out_err:
+ return err;
+}
+
/*
* From tcp_input.c
*/
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index cb015317c9f..72b7c63b1a3 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -445,6 +445,8 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req,
struct tcp_sock *oldtp = tcp_sk(sk);
struct tcp_cookie_values *oldcvp = oldtp->cookie_values;
+ newsk->sk_rx_dst = dst_clone(skb_dst(skb));
+
/* TCP Cookie Transactions require space for the cookie pair,
* as it differs for each connection. There is no need to
* copy any s_data_payload stored at the original socket.