From f1da70632fa0875f80fc60991a010c31f40983ff Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Mon, 2 Oct 2006 16:10:47 -0700 Subject: [NETFILTER]: Kconfig: fix xt_physdev dependencies xt_physdev depends on bridge netfilter, which is a boolean, but can still be built modular because of special handling in the bridge makefile. Add a dependency on BRIDGE to prevent XT_MATCH_PHYSDEV=y, BRIDGE=m. Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller --- net/netfilter/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig index 0a28d2c5c44..ce94732b8e2 100644 --- a/net/netfilter/Kconfig +++ b/net/netfilter/Kconfig @@ -365,7 +365,7 @@ config NETFILTER_XT_MATCH_MULTIPORT config NETFILTER_XT_MATCH_PHYSDEV tristate '"physdev" match support' - depends on NETFILTER_XTABLES && BRIDGE_NETFILTER + depends on NETFILTER_XTABLES && BRIDGE && BRIDGE_NETFILTER help Physdev packet matching matches against the physical bridge ports the IP packet arrived on or will leave by. -- cgit v1.2.3-70-g09d2 From b4c4ed175ff0ee816df48571cfa9b73f521964b6 Mon Sep 17 00:00:00 2001 From: Simon Horman Date: Mon, 2 Oct 2006 16:11:13 -0700 Subject: [NETFILTER]: add type parameter to ip_route_me_harder By adding a type parameter to ip_route_me_harder() the expensive call to inet_addr_type() can be avoided in some cases. A followup patch where ip_route_me_harder() is called from within ip_vs_out() is one such example. Signed-off-By: Simon Horman Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller --- include/linux/netfilter_ipv4.h | 2 +- net/ipv4/netfilter.c | 9 ++++++--- net/ipv4/netfilter/ip_nat_standalone.c | 3 ++- net/ipv4/netfilter/iptable_mangle.c | 3 ++- 4 files changed, 11 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/include/linux/netfilter_ipv4.h b/include/linux/netfilter_ipv4.h index ce02c984f3b..5b63a231a76 100644 --- a/include/linux/netfilter_ipv4.h +++ b/include/linux/netfilter_ipv4.h @@ -77,7 +77,7 @@ enum nf_ip_hook_priorities { #define SO_ORIGINAL_DST 80 #ifdef __KERNEL__ -extern int ip_route_me_harder(struct sk_buff **pskb); +extern int ip_route_me_harder(struct sk_buff **pskb, unsigned addr_type); extern int ip_xfrm_me_harder(struct sk_buff **pskb); extern unsigned int nf_ip_checksum(struct sk_buff *skb, unsigned int hook, unsigned int dataoff, u_int8_t protocol); diff --git a/net/ipv4/netfilter.c b/net/ipv4/netfilter.c index 5ac15379a0c..e2005c6810a 100644 --- a/net/ipv4/netfilter.c +++ b/net/ipv4/netfilter.c @@ -8,7 +8,7 @@ #include /* route_me_harder function, used by iptable_nat, iptable_mangle + ip_queue */ -int ip_route_me_harder(struct sk_buff **pskb) +int ip_route_me_harder(struct sk_buff **pskb, unsigned addr_type) { struct iphdr *iph = (*pskb)->nh.iph; struct rtable *rt; @@ -16,10 +16,13 @@ int ip_route_me_harder(struct sk_buff **pskb) struct dst_entry *odst; unsigned int hh_len; + if (addr_type == RTN_UNSPEC) + addr_type = inet_addr_type(iph->saddr); + /* some non-standard hacks like ipt_REJECT.c:send_reset() can cause * packets with foreign saddr to appear on the NF_IP_LOCAL_OUT hook. */ - if (inet_addr_type(iph->saddr) == RTN_LOCAL) { + if (addr_type == RTN_LOCAL) { fl.nl_u.ip4_u.daddr = iph->daddr; fl.nl_u.ip4_u.saddr = iph->saddr; fl.nl_u.ip4_u.tos = RT_TOS(iph->tos); @@ -156,7 +159,7 @@ static int nf_ip_reroute(struct sk_buff **pskb, const struct nf_info *info) if (!(iph->tos == rt_info->tos && iph->daddr == rt_info->daddr && iph->saddr == rt_info->saddr)) - return ip_route_me_harder(pskb); + return ip_route_me_harder(pskb, RTN_UNSPEC); } return 0; } diff --git a/net/ipv4/netfilter/ip_nat_standalone.c b/net/ipv4/netfilter/ip_nat_standalone.c index 021395b6746..d85d2de5044 100644 --- a/net/ipv4/netfilter/ip_nat_standalone.c +++ b/net/ipv4/netfilter/ip_nat_standalone.c @@ -265,7 +265,8 @@ ip_nat_local_fn(unsigned int hooknum, ct->tuplehash[!dir].tuple.src.u.all #endif ) - return ip_route_me_harder(pskb) == 0 ? ret : NF_DROP; + if (ip_route_me_harder(pskb, RTN_UNSPEC)) + ret = NF_DROP; } return ret; } diff --git a/net/ipv4/netfilter/iptable_mangle.c b/net/ipv4/netfilter/iptable_mangle.c index e62ea2bb9c0..b91f3582359 100644 --- a/net/ipv4/netfilter/iptable_mangle.c +++ b/net/ipv4/netfilter/iptable_mangle.c @@ -157,7 +157,8 @@ ipt_local_hook(unsigned int hook, || (*pskb)->nfmark != nfmark #endif || (*pskb)->nh.iph->tos != tos)) - return ip_route_me_harder(pskb) == 0 ? ret : NF_DROP; + if (ip_route_me_harder(pskb, RTN_UNSPEC)) + ret = NF_DROP; return ret; } -- cgit v1.2.3-70-g09d2 From 901eaf6c8f997f18ebc8fcbb85411c79161ab3b2 Mon Sep 17 00:00:00 2001 From: Simon Horman Date: Mon, 2 Oct 2006 16:11:51 -0700 Subject: [NETFILTER]: Honour source routing for LVS-NAT For policy routing, packets originating from this machine itself may be routed differently to packets passing through. We want this packet to be routed as if it came from this machine itself. So re-compute the routing information using ip_route_me_harder(). This patch is derived from work by Ken Brownfield Cc: Ken Brownfield Signed-off-by: Simon Horman Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller --- net/ipv4/ipvs/ip_vs_core.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'net') diff --git a/net/ipv4/ipvs/ip_vs_core.c b/net/ipv4/ipvs/ip_vs_core.c index 6dee03935f7..1445bb47fea 100644 --- a/net/ipv4/ipvs/ip_vs_core.c +++ b/net/ipv4/ipvs/ip_vs_core.c @@ -813,6 +813,16 @@ ip_vs_out(unsigned int hooknum, struct sk_buff **pskb, skb->nh.iph->saddr = cp->vaddr; ip_send_check(skb->nh.iph); + /* For policy routing, packets originating from this + * machine itself may be routed differently to packets + * passing through. We want this packet to be routed as + * if it came from this machine itself. So re-compute + * the routing information. + */ + if (ip_route_me_harder(pskb, RTN_LOCAL) != 0) + goto drop; + skb = *pskb; + IP_VS_DBG_PKT(10, pp, skb, 0, "After SNAT"); ip_vs_out_stats(cp, skb); -- cgit v1.2.3-70-g09d2 From 9d02002d2dc2c7423e5891b97727fde4d667adf1 Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Mon, 2 Oct 2006 16:12:20 -0700 Subject: [NETFILTER]: ipt_REJECT: remove largely duplicate route_reverse function Use ip_route_me_harder instead, which now allows to specify how we wish the packet to be routed. Based on patch by Simon Horman . Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller --- net/ipv4/netfilter/ipt_REJECT.c | 97 ++++++++--------------------------------- 1 file changed, 19 insertions(+), 78 deletions(-) (limited to 'net') diff --git a/net/ipv4/netfilter/ipt_REJECT.c b/net/ipv4/netfilter/ipt_REJECT.c index fd0c05efed8..ad0312d0e4f 100644 --- a/net/ipv4/netfilter/ipt_REJECT.c +++ b/net/ipv4/netfilter/ipt_REJECT.c @@ -38,76 +38,16 @@ MODULE_DESCRIPTION("iptables REJECT target module"); #define DEBUGP(format, args...) #endif -static inline struct rtable *route_reverse(struct sk_buff *skb, - struct tcphdr *tcph, int hook) -{ - struct iphdr *iph = skb->nh.iph; - struct dst_entry *odst; - struct flowi fl = {}; - struct rtable *rt; - - /* We don't require ip forwarding to be enabled to be able to - * send a RST reply for bridged traffic. */ - if (hook != NF_IP_FORWARD -#ifdef CONFIG_BRIDGE_NETFILTER - || (skb->nf_bridge && skb->nf_bridge->mask & BRNF_BRIDGED) -#endif - ) { - fl.nl_u.ip4_u.daddr = iph->saddr; - if (hook == NF_IP_LOCAL_IN) - fl.nl_u.ip4_u.saddr = iph->daddr; - fl.nl_u.ip4_u.tos = RT_TOS(iph->tos); - - if (ip_route_output_key(&rt, &fl) != 0) - return NULL; - } else { - /* non-local src, find valid iif to satisfy - * rp-filter when calling ip_route_input. */ - fl.nl_u.ip4_u.daddr = iph->daddr; - if (ip_route_output_key(&rt, &fl) != 0) - return NULL; - - odst = skb->dst; - if (ip_route_input(skb, iph->saddr, iph->daddr, - RT_TOS(iph->tos), rt->u.dst.dev) != 0) { - dst_release(&rt->u.dst); - return NULL; - } - dst_release(&rt->u.dst); - rt = (struct rtable *)skb->dst; - skb->dst = odst; - - fl.nl_u.ip4_u.daddr = iph->saddr; - fl.nl_u.ip4_u.saddr = iph->daddr; - fl.nl_u.ip4_u.tos = RT_TOS(iph->tos); - } - - if (rt->u.dst.error) { - dst_release(&rt->u.dst); - return NULL; - } - - fl.proto = IPPROTO_TCP; - fl.fl_ip_sport = tcph->dest; - fl.fl_ip_dport = tcph->source; - security_skb_classify_flow(skb, &fl); - - xfrm_lookup((struct dst_entry **)&rt, &fl, NULL, 0); - - return rt; -} - /* Send RST reply */ static void send_reset(struct sk_buff *oldskb, int hook) { struct sk_buff *nskb; struct iphdr *iph = oldskb->nh.iph; struct tcphdr _otcph, *oth, *tcph; - struct rtable *rt; __be16 tmp_port; __be32 tmp_addr; int needs_ack; - int hh_len; + unsigned int addr_type; /* IP header checks: fragment. */ if (oldskb->nh.iph->frag_off & htons(IP_OFFSET)) @@ -126,23 +66,13 @@ static void send_reset(struct sk_buff *oldskb, int hook) if (nf_ip_checksum(oldskb, hook, iph->ihl * 4, IPPROTO_TCP)) return; - if ((rt = route_reverse(oldskb, oth, hook)) == NULL) - return; - - hh_len = LL_RESERVED_SPACE(rt->u.dst.dev); - /* We need a linear, writeable skb. We also need to expand headroom in case hh_len of incoming interface < hh_len of outgoing interface */ - nskb = skb_copy_expand(oldskb, hh_len, skb_tailroom(oldskb), + nskb = skb_copy_expand(oldskb, LL_MAX_HEADER, skb_tailroom(oldskb), GFP_ATOMIC); - if (!nskb) { - dst_release(&rt->u.dst); + if (!nskb) return; - } - - dst_release(nskb->dst); - nskb->dst = &rt->u.dst; /* This packet will not be the same as the other: clear nf fields */ nf_reset(nskb); @@ -184,6 +114,21 @@ static void send_reset(struct sk_buff *oldskb, int hook) tcph->window = 0; tcph->urg_ptr = 0; + /* Set DF, id = 0 */ + nskb->nh.iph->frag_off = htons(IP_DF); + nskb->nh.iph->id = 0; + + addr_type = RTN_UNSPEC; + if (hook != NF_IP_FORWARD +#ifdef CONFIG_BRIDGE_NETFILTER + || (nskb->nf_bridge && nskb->nf_bridge->mask & BRNF_BRIDGED) +#endif + ) + addr_type = RTN_LOCAL; + + if (ip_route_me_harder(&nskb, addr_type)) + goto free_nskb; + /* Adjust TCP checksum */ nskb->ip_summed = CHECKSUM_NONE; tcph->check = 0; @@ -192,12 +137,8 @@ static void send_reset(struct sk_buff *oldskb, int hook) nskb->nh.iph->daddr, csum_partial((char *)tcph, sizeof(struct tcphdr), 0)); - - /* Adjust IP TTL, DF */ + /* Adjust IP TTL */ nskb->nh.iph->ttl = dst_metric(nskb->dst, RTAX_HOPLIMIT); - /* Set DF, id = 0 */ - nskb->nh.iph->frag_off = htons(IP_DF); - nskb->nh.iph->id = 0; /* Adjust IP checksum */ nskb->nh.iph->check = 0; -- cgit v1.2.3-70-g09d2 From b18dfa90c008850e0f3bfd63638dd8fbe8e08701 Mon Sep 17 00:00:00 2001 From: Bart De Schuymer Date: Mon, 2 Oct 2006 16:12:52 -0700 Subject: [NETFILTER]: ebt_mark: add or/and/xor action support to mark target The following patch adds or/and/xor functionality for the mark target, while staying backwards compatible. Signed-off-by: Bart De Schuymer Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller --- include/linux/netfilter_bridge/ebt_mark_t.h | 12 ++++++++++++ net/bridge/netfilter/ebt_mark.c | 21 +++++++++++++++++---- 2 files changed, 29 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/include/linux/netfilter_bridge/ebt_mark_t.h b/include/linux/netfilter_bridge/ebt_mark_t.h index 110fec6a40a..6270f6f3369 100644 --- a/include/linux/netfilter_bridge/ebt_mark_t.h +++ b/include/linux/netfilter_bridge/ebt_mark_t.h @@ -1,6 +1,18 @@ #ifndef __LINUX_BRIDGE_EBT_MARK_T_H #define __LINUX_BRIDGE_EBT_MARK_T_H +/* The target member is reused for adding new actions, the + * value of the real target is -1 to -NUM_STANDARD_TARGETS. + * For backward compatibility, the 4 lsb (2 would be enough, + * but let's play it safe) are kept to designate this target. + * The remaining bits designate the action. By making the set + * action 0xfffffff0, the result will look ok for older + * versions. [September 2006] */ +#define MARK_SET_VALUE (0xfffffff0) +#define MARK_OR_VALUE (0xffffffe0) +#define MARK_AND_VALUE (0xffffffd0) +#define MARK_XOR_VALUE (0xffffffc0) + struct ebt_mark_t_info { unsigned long mark; diff --git a/net/bridge/netfilter/ebt_mark.c b/net/bridge/netfilter/ebt_mark.c index 770c0df972a..b54306a934e 100644 --- a/net/bridge/netfilter/ebt_mark.c +++ b/net/bridge/netfilter/ebt_mark.c @@ -22,24 +22,37 @@ static int ebt_target_mark(struct sk_buff **pskb, unsigned int hooknr, const void *data, unsigned int datalen) { struct ebt_mark_t_info *info = (struct ebt_mark_t_info *)data; + int action = info->target & -16; - if ((*pskb)->nfmark != info->mark) + if (action == MARK_SET_VALUE) (*pskb)->nfmark = info->mark; + else if (action == MARK_OR_VALUE) + (*pskb)->nfmark |= info->mark; + else if (action == MARK_AND_VALUE) + (*pskb)->nfmark &= info->mark; + else + (*pskb)->nfmark ^= info->mark; - return info->target; + return info->target | -16; } static int ebt_target_mark_check(const char *tablename, unsigned int hookmask, const struct ebt_entry *e, void *data, unsigned int datalen) { struct ebt_mark_t_info *info = (struct ebt_mark_t_info *)data; + int tmp; if (datalen != EBT_ALIGN(sizeof(struct ebt_mark_t_info))) return -EINVAL; - if (BASE_CHAIN && info->target == EBT_RETURN) + tmp = info->target | -16; + if (BASE_CHAIN && tmp == EBT_RETURN) return -EINVAL; CLEAR_BASE_CHAIN_BIT; - if (INVALID_TARGET) + if (tmp < -NUM_STANDARD_TARGETS || tmp >= 0) + return -EINVAL; + tmp = info->target & -16; + if (tmp != MARK_SET_VALUE && tmp != MARK_OR_VALUE && + tmp != MARK_AND_VALUE && tmp != MARK_XOR_VALUE) return -EINVAL; return 0; } -- cgit v1.2.3-70-g09d2 From 81771b3b20fb4e98c6f2b2aac2bc10ed41a8f006 Mon Sep 17 00:00:00 2001 From: Ismail Donmez Date: Tue, 3 Oct 2006 13:49:10 -0700 Subject: [NET_SCHED]: Revert "HTB: fix incorrect use of RB_EMPTY_NODE" With commit 10fd48f2376db52f08bf0420d2c4f580e39269e1 [1] , RB_EMPTY_NODE changed behaviour so it returns true when the node is empty as expected. Hence Patrick McHardy's fix for sched_htb.c should be reverted. Signed-off-by: Ismail Donmez ACKed-by: Patrick McHardy Signed-off-by: David S. Miller --- net/sched/sch_htb.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c index 6c058e3660c..bb3ddd4784b 100644 --- a/net/sched/sch_htb.c +++ b/net/sched/sch_htb.c @@ -391,7 +391,7 @@ static inline void htb_add_class_to_row(struct htb_sched *q, /* If this triggers, it is a bug in this code, but it need not be fatal */ static void htb_safe_rb_erase(struct rb_node *rb, struct rb_root *root) { - if (!RB_EMPTY_NODE(rb)) { + if (RB_EMPTY_NODE(rb)) { WARN_ON(1); } else { rb_erase(rb, root); -- cgit v1.2.3-70-g09d2 From 132a55f3c5c0b1a364d32f65595ad8838c30a60e Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Tue, 3 Oct 2006 14:34:00 -0700 Subject: [UDP6]: Fix flowi clobbering The udp6_sendmsg function uses a shared buffer to store the flow without taking any locks. This leads to races with SMP. This patch moves the flowi object onto the stack. Signed-off-by: Herbert Xu Acked-by: James Morris Signed-off-by: David S. Miller --- net/ipv6/udp.c | 62 +++++++++++++++++++++++++++++----------------------------- 1 file changed, 31 insertions(+), 31 deletions(-) (limited to 'net') diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 9662561701d..552ec0f449a 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -546,7 +546,7 @@ static int udpv6_sendmsg(struct kiocb *iocb, struct sock *sk, struct in6_addr *daddr, *final_p = NULL, final; struct ipv6_txoptions *opt = NULL; struct ip6_flowlabel *flowlabel = NULL; - struct flowi *fl = &inet->cork.fl; + struct flowi fl; struct dst_entry *dst; int addr_len = msg->msg_namelen; int ulen = len; @@ -626,19 +626,19 @@ do_udp_sendmsg: } ulen += sizeof(struct udphdr); - memset(fl, 0, sizeof(*fl)); + memset(&fl, 0, sizeof(fl)); if (sin6) { if (sin6->sin6_port == 0) return -EINVAL; - fl->fl_ip_dport = sin6->sin6_port; + fl.fl_ip_dport = sin6->sin6_port; daddr = &sin6->sin6_addr; if (np->sndflow) { - fl->fl6_flowlabel = sin6->sin6_flowinfo&IPV6_FLOWINFO_MASK; - if (fl->fl6_flowlabel&IPV6_FLOWLABEL_MASK) { - flowlabel = fl6_sock_lookup(sk, fl->fl6_flowlabel); + fl.fl6_flowlabel = sin6->sin6_flowinfo&IPV6_FLOWINFO_MASK; + if (fl.fl6_flowlabel&IPV6_FLOWLABEL_MASK) { + flowlabel = fl6_sock_lookup(sk, fl.fl6_flowlabel); if (flowlabel == NULL) return -EINVAL; daddr = &flowlabel->dst; @@ -656,32 +656,32 @@ do_udp_sendmsg: if (addr_len >= sizeof(struct sockaddr_in6) && sin6->sin6_scope_id && ipv6_addr_type(daddr)&IPV6_ADDR_LINKLOCAL) - fl->oif = sin6->sin6_scope_id; + fl.oif = sin6->sin6_scope_id; } else { if (sk->sk_state != TCP_ESTABLISHED) return -EDESTADDRREQ; - fl->fl_ip_dport = inet->dport; + fl.fl_ip_dport = inet->dport; daddr = &np->daddr; - fl->fl6_flowlabel = np->flow_label; + fl.fl6_flowlabel = np->flow_label; connected = 1; } - if (!fl->oif) - fl->oif = sk->sk_bound_dev_if; + if (!fl.oif) + fl.oif = sk->sk_bound_dev_if; if (msg->msg_controllen) { opt = &opt_space; memset(opt, 0, sizeof(struct ipv6_txoptions)); opt->tot_len = sizeof(*opt); - err = datagram_send_ctl(msg, fl, opt, &hlimit, &tclass); + err = datagram_send_ctl(msg, &fl, opt, &hlimit, &tclass); if (err < 0) { fl6_sock_release(flowlabel); return err; } - if ((fl->fl6_flowlabel&IPV6_FLOWLABEL_MASK) && !flowlabel) { - flowlabel = fl6_sock_lookup(sk, fl->fl6_flowlabel); + if ((fl.fl6_flowlabel&IPV6_FLOWLABEL_MASK) && !flowlabel) { + flowlabel = fl6_sock_lookup(sk, fl.fl6_flowlabel); if (flowlabel == NULL) return -EINVAL; } @@ -695,39 +695,39 @@ do_udp_sendmsg: opt = fl6_merge_options(&opt_space, flowlabel, opt); opt = ipv6_fixup_options(&opt_space, opt); - fl->proto = IPPROTO_UDP; - ipv6_addr_copy(&fl->fl6_dst, daddr); - if (ipv6_addr_any(&fl->fl6_src) && !ipv6_addr_any(&np->saddr)) - ipv6_addr_copy(&fl->fl6_src, &np->saddr); - fl->fl_ip_sport = inet->sport; + fl.proto = IPPROTO_UDP; + ipv6_addr_copy(&fl.fl6_dst, daddr); + if (ipv6_addr_any(&fl.fl6_src) && !ipv6_addr_any(&np->saddr)) + ipv6_addr_copy(&fl.fl6_src, &np->saddr); + fl.fl_ip_sport = inet->sport; /* merge ip6_build_xmit from ip6_output */ if (opt && opt->srcrt) { struct rt0_hdr *rt0 = (struct rt0_hdr *) opt->srcrt; - ipv6_addr_copy(&final, &fl->fl6_dst); - ipv6_addr_copy(&fl->fl6_dst, rt0->addr); + ipv6_addr_copy(&final, &fl.fl6_dst); + ipv6_addr_copy(&fl.fl6_dst, rt0->addr); final_p = &final; connected = 0; } - if (!fl->oif && ipv6_addr_is_multicast(&fl->fl6_dst)) { - fl->oif = np->mcast_oif; + if (!fl.oif && ipv6_addr_is_multicast(&fl.fl6_dst)) { + fl.oif = np->mcast_oif; connected = 0; } - security_sk_classify_flow(sk, fl); + security_sk_classify_flow(sk, &fl); - err = ip6_sk_dst_lookup(sk, &dst, fl); + err = ip6_sk_dst_lookup(sk, &dst, &fl); if (err) goto out; if (final_p) - ipv6_addr_copy(&fl->fl6_dst, final_p); + ipv6_addr_copy(&fl.fl6_dst, final_p); - if ((err = xfrm_lookup(&dst, fl, sk, 0)) < 0) + if ((err = xfrm_lookup(&dst, &fl, sk, 0)) < 0) goto out; if (hlimit < 0) { - if (ipv6_addr_is_multicast(&fl->fl6_dst)) + if (ipv6_addr_is_multicast(&fl.fl6_dst)) hlimit = np->mcast_hops; else hlimit = np->hop_limit; @@ -763,7 +763,7 @@ back_from_confirm: do_append_data: up->len += ulen; err = ip6_append_data(sk, ip_generic_getfrag, msg->msg_iov, ulen, - sizeof(struct udphdr), hlimit, tclass, opt, fl, + sizeof(struct udphdr), hlimit, tclass, opt, &fl, (struct rt6_info*)dst, corkreq ? msg->msg_flags|MSG_MORE : msg->msg_flags); if (err) @@ -774,10 +774,10 @@ do_append_data: if (dst) { if (connected) { ip6_dst_store(sk, dst, - ipv6_addr_equal(&fl->fl6_dst, &np->daddr) ? + ipv6_addr_equal(&fl.fl6_dst, &np->daddr) ? &np->daddr : NULL, #ifdef CONFIG_IPV6_SUBTREES - ipv6_addr_equal(&fl->fl6_src, &np->saddr) ? + ipv6_addr_equal(&fl.fl6_src, &np->saddr) ? &np->saddr : #endif NULL); -- cgit v1.2.3-70-g09d2 From 1e0c14f49d6b393179f423abbac47f85618d3d46 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Tue, 3 Oct 2006 14:35:49 -0700 Subject: [UDP]: Fix MSG_PROBE crash UDP tracks corking status through the pending variable. The IP layer also tracks it through the socket write queue. It is possible for the two to get out of sync when MSG_PROBE is used. This patch changes UDP to check the write queue to ensure that the two stay in sync. Signed-off-by: Herbert Xu Signed-off-by: David S. Miller --- net/ipv4/udp.c | 2 ++ net/ipv6/udp.c | 2 ++ 2 files changed, 4 insertions(+) (limited to 'net') diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 6d6142f9c47..865d75214a9 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -675,6 +675,8 @@ do_append_data: udp_flush_pending_frames(sk); else if (!corkreq) err = udp_push_pending_frames(sk, up); + else if (unlikely(skb_queue_empty(&sk->sk_write_queue))) + up->pending = 0; release_sock(sk); out: diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 552ec0f449a..e0c3934a7e4 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -770,6 +770,8 @@ do_append_data: udp_v6_flush_pending_frames(sk); else if (!corkreq) err = udp_v6_push_pending_frames(sk, up); + else if (unlikely(skb_queue_empty(&sk->sk_write_queue))) + up->pending = 0; if (dst) { if (connected) { -- cgit v1.2.3-70-g09d2 From c5e29460f5f9eb189cab5d9fdaa137e64f7734b6 Mon Sep 17 00:00:00 2001 From: Julian Anastasov Date: Tue, 3 Oct 2006 15:49:46 -0700 Subject: [NEIGH]: always use hash_mask under tbl lock Make sure hash_mask is protected with tbl->lock in all cases just like the hash_buckets. Signed-off-by: Julian Anastasov Signed-off-by: David S. Miller --- net/core/neighbour.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/core/neighbour.c b/net/core/neighbour.c index 8ce8c471d86..b4b478353b2 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -344,12 +344,12 @@ struct neighbour *neigh_lookup(struct neigh_table *tbl, const void *pkey, { struct neighbour *n; int key_len = tbl->key_len; - u32 hash_val = tbl->hash(pkey, dev) & tbl->hash_mask; + u32 hash_val = tbl->hash(pkey, dev); NEIGH_CACHE_STAT_INC(tbl, lookups); read_lock_bh(&tbl->lock); - for (n = tbl->hash_buckets[hash_val]; n; n = n->next) { + for (n = tbl->hash_buckets[hash_val & tbl->hash_mask]; n; n = n->next) { if (dev == n->dev && !memcmp(n->primary_key, pkey, key_len)) { neigh_hold(n); NEIGH_CACHE_STAT_INC(tbl, hits); @@ -364,12 +364,12 @@ struct neighbour *neigh_lookup_nodev(struct neigh_table *tbl, const void *pkey) { struct neighbour *n; int key_len = tbl->key_len; - u32 hash_val = tbl->hash(pkey, NULL) & tbl->hash_mask; + u32 hash_val = tbl->hash(pkey, NULL); NEIGH_CACHE_STAT_INC(tbl, lookups); read_lock_bh(&tbl->lock); - for (n = tbl->hash_buckets[hash_val]; n; n = n->next) { + for (n = tbl->hash_buckets[hash_val & tbl->hash_mask]; n; n = n->next) { if (!memcmp(n->primary_key, pkey, key_len)) { neigh_hold(n); NEIGH_CACHE_STAT_INC(tbl, hits); @@ -1998,12 +1998,12 @@ static int neigh_dump_table(struct neigh_table *tbl, struct sk_buff *skb, int rc, h, s_h = cb->args[1]; int idx, s_idx = idx = cb->args[2]; + read_lock_bh(&tbl->lock); for (h = 0; h <= tbl->hash_mask; h++) { if (h < s_h) continue; if (h > s_h) s_idx = 0; - read_lock_bh(&tbl->lock); for (n = tbl->hash_buckets[h], idx = 0; n; n = n->next, idx++) { if (idx < s_idx) continue; @@ -2016,8 +2016,8 @@ static int neigh_dump_table(struct neigh_table *tbl, struct sk_buff *skb, goto out; } } - read_unlock_bh(&tbl->lock); } + read_unlock_bh(&tbl->lock); rc = skb->len; out: cb->args[1] = h; -- cgit v1.2.3-70-g09d2 From 667bbcb6c099d1b74f95c6963ddf37a32e7afc29 Mon Sep 17 00:00:00 2001 From: Masahide NAKAMURA Date: Tue, 3 Oct 2006 15:56:09 -0700 Subject: [XFRM] STATE: Use destination address for src hash. Src hash is introduced for Mobile IPv6 route optimization usage. On current kenrel code it is calculated with source address only. It results we uses the same hash value for outbound state (when the node has only one address for Mobile IPv6). This patch use also destination address as peer information for src hash to be dispersed. Signed-off-by: Masahide NAKAMURA Signed-off-by: David S. Miller --- net/xfrm/xfrm_hash.h | 7 ++++--- net/xfrm/xfrm_state.c | 16 +++++++++------- 2 files changed, 13 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/net/xfrm/xfrm_hash.h b/net/xfrm/xfrm_hash.h index 6ac4e4f033a..d401dc8f05e 100644 --- a/net/xfrm/xfrm_hash.h +++ b/net/xfrm/xfrm_hash.h @@ -41,17 +41,18 @@ static inline unsigned int __xfrm_dst_hash(xfrm_address_t *daddr, xfrm_address_t return (h ^ (h >> 16)) & hmask; } -static inline unsigned __xfrm_src_hash(xfrm_address_t *saddr, +static inline unsigned __xfrm_src_hash(xfrm_address_t *daddr, + xfrm_address_t *saddr, unsigned short family, unsigned int hmask) { unsigned int h = family; switch (family) { case AF_INET: - h ^= __xfrm4_addr_hash(saddr); + h ^= __xfrm4_daddr_saddr_hash(daddr, saddr); break; case AF_INET6: - h ^= __xfrm6_addr_hash(saddr); + h ^= __xfrm6_daddr_saddr_hash(daddr, saddr); break; }; return (h ^ (h >> 16)) & hmask; diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index f927b7330f0..39b8bf3a9de 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -63,10 +63,11 @@ static inline unsigned int xfrm_dst_hash(xfrm_address_t *daddr, return __xfrm_dst_hash(daddr, saddr, reqid, family, xfrm_state_hmask); } -static inline unsigned int xfrm_src_hash(xfrm_address_t *addr, +static inline unsigned int xfrm_src_hash(xfrm_address_t *daddr, + xfrm_address_t *saddr, unsigned short family) { - return __xfrm_src_hash(addr, family, xfrm_state_hmask); + return __xfrm_src_hash(daddr, saddr, family, xfrm_state_hmask); } static inline unsigned int @@ -92,7 +93,8 @@ static void xfrm_hash_transfer(struct hlist_head *list, nhashmask); hlist_add_head(&x->bydst, ndsttable+h); - h = __xfrm_src_hash(&x->props.saddr, x->props.family, + h = __xfrm_src_hash(&x->id.daddr, &x->props.saddr, + x->props.family, nhashmask); hlist_add_head(&x->bysrc, nsrctable+h); @@ -458,7 +460,7 @@ static struct xfrm_state *__xfrm_state_lookup(xfrm_address_t *daddr, __be32 spi, static struct xfrm_state *__xfrm_state_lookup_byaddr(xfrm_address_t *daddr, xfrm_address_t *saddr, u8 proto, unsigned short family) { - unsigned int h = xfrm_src_hash(saddr, family); + unsigned int h = xfrm_src_hash(daddr, saddr, family); struct xfrm_state *x; struct hlist_node *entry; @@ -587,7 +589,7 @@ xfrm_state_find(xfrm_address_t *daddr, xfrm_address_t *saddr, if (km_query(x, tmpl, pol) == 0) { x->km.state = XFRM_STATE_ACQ; hlist_add_head(&x->bydst, xfrm_state_bydst+h); - h = xfrm_src_hash(saddr, family); + h = xfrm_src_hash(daddr, saddr, family); hlist_add_head(&x->bysrc, xfrm_state_bysrc+h); if (x->id.spi) { h = xfrm_spi_hash(&x->id.daddr, x->id.spi, x->id.proto, family); @@ -622,7 +624,7 @@ static void __xfrm_state_insert(struct xfrm_state *x) x->props.reqid, x->props.family); hlist_add_head(&x->bydst, xfrm_state_bydst+h); - h = xfrm_src_hash(&x->props.saddr, x->props.family); + h = xfrm_src_hash(&x->id.daddr, &x->props.saddr, x->props.family); hlist_add_head(&x->bysrc, xfrm_state_bysrc+h); if (x->id.spi) { @@ -748,7 +750,7 @@ static struct xfrm_state *__find_acq_core(unsigned short family, u8 mode, u32 re x->timer.expires = jiffies + XFRM_ACQ_EXPIRES*HZ; add_timer(&x->timer); hlist_add_head(&x->bydst, xfrm_state_bydst+h); - h = xfrm_src_hash(saddr, family); + h = xfrm_src_hash(daddr, saddr, family); hlist_add_head(&x->bysrc, xfrm_state_bysrc+h); wake_up(&km_waitq); } -- cgit v1.2.3-70-g09d2 From ae8c05779ac2f286b872db9ebea0c3c0a031ad1e Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Tue, 3 Oct 2006 16:00:26 -0700 Subject: [XFRM]: Clearing xfrm_policy_count[] to zero during flush is incorrect. When we flush policies, we do a type match so we might not actually delete all policies matching a certain direction. So keep track of how many policies we actually kill and subtract that number from xfrm_policy_count[dir] at the end. Based upon a patch by Masahide NAKAMURA. Signed-off-by: David S. Miller --- net/xfrm/xfrm_policy.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index b6e2e79d726..2a7861661f1 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -778,8 +778,9 @@ void xfrm_policy_flush(u8 type) for (dir = 0; dir < XFRM_POLICY_MAX; dir++) { struct xfrm_policy *pol; struct hlist_node *entry; - int i; + int i, killed; + killed = 0; again1: hlist_for_each_entry(pol, entry, &xfrm_policy_inexact[dir], bydst) { @@ -790,6 +791,7 @@ void xfrm_policy_flush(u8 type) write_unlock_bh(&xfrm_policy_lock); xfrm_policy_kill(pol); + killed++; write_lock_bh(&xfrm_policy_lock); goto again1; @@ -807,13 +809,14 @@ void xfrm_policy_flush(u8 type) write_unlock_bh(&xfrm_policy_lock); xfrm_policy_kill(pol); + killed++; write_lock_bh(&xfrm_policy_lock); goto again2; } } - xfrm_policy_count[dir] = 0; + xfrm_policy_count[dir] -= killed; } atomic_inc(&flow_cache_genid); write_unlock_bh(&xfrm_policy_lock); -- cgit v1.2.3-70-g09d2 From 617dbeaa3f2987acc83c1149409685005e9dd740 Mon Sep 17 00:00:00 2001 From: Jeff Garzik Date: Tue, 3 Oct 2006 16:25:34 -0700 Subject: [TIPC]: fix printk warning MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit gcc spits out this warning: net/tipc/link.c: In function ‘link_retransmit_failure’: net/tipc/link.c:1669: warning: cast from pointer to integer of different size More than a little bit ugly, storing integers in void*, but at least the code is correct, unlike some of the more crufty Linux kernel code found elsewhere. Rather than having two casts to massage the value into u32, it's easier just to have a single cast and use "%lu", since it's just a printk. Signed-off-by: Jeff Garzik Signed-off-by: David S. Miller --- net/tipc/link.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/tipc/link.c b/net/tipc/link.c index 693f02eca6d..53bc8cb5adb 100644 --- a/net/tipc/link.c +++ b/net/tipc/link.c @@ -1666,8 +1666,9 @@ static void link_retransmit_failure(struct link *l_ptr, struct sk_buff *buf) char addr_string[16]; tipc_printf(TIPC_OUTPUT, "Msg seq number: %u, ", msg_seqno(msg)); - tipc_printf(TIPC_OUTPUT, "Outstanding acks: %u\n", (u32)TIPC_SKB_CB(buf)->handle); - + tipc_printf(TIPC_OUTPUT, "Outstanding acks: %lu\n", + (unsigned long) TIPC_SKB_CB(buf)->handle); + n_ptr = l_ptr->owner->next; tipc_node_lock(n_ptr); -- cgit v1.2.3-70-g09d2 From 2473ffe3cae0f86341958e3cf962bb4fc261d028 Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Tue, 3 Oct 2006 16:29:52 -0700 Subject: [NET_SCHED]: Remove old estimator implementation Remove unused file, estimators live in net/core/gen_estimator.c now. Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller --- net/sched/estimator.c | 196 -------------------------------------------------- 1 file changed, 196 deletions(-) delete mode 100644 net/sched/estimator.c (limited to 'net') diff --git a/net/sched/estimator.c b/net/sched/estimator.c deleted file mode 100644 index 0ebc98e9be2..00000000000 --- a/net/sched/estimator.c +++ /dev/null @@ -1,196 +0,0 @@ -/* - * net/sched/estimator.c Simple rate estimator. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * - * Authors: Alexey Kuznetsov, - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -/* - This code is NOT intended to be used for statistics collection, - its purpose is to provide a base for statistical multiplexing - for controlled load service. - If you need only statistics, run a user level daemon which - periodically reads byte counters. - - Unfortunately, rate estimation is not a very easy task. - F.e. I did not find a simple way to estimate the current peak rate - and even failed to formulate the problem 8)8) - - So I preferred not to built an estimator into the scheduler, - but run this task separately. - Ideally, it should be kernel thread(s), but for now it runs - from timers, which puts apparent top bounds on the number of rated - flows, has minimal overhead on small, but is enough - to handle controlled load service, sets of aggregates. - - We measure rate over A=(1<next) { - struct tc_stats *st = e->stats; - u64 nbytes; - u32 npackets; - u32 rate; - - spin_lock(e->stats_lock); - nbytes = st->bytes; - npackets = st->packets; - rate = (nbytes - e->last_bytes)<<(7 - idx); - e->last_bytes = nbytes; - e->avbps += ((long)rate - (long)e->avbps) >> e->ewma_log; - st->bps = (e->avbps+0xF)>>5; - - rate = (npackets - e->last_packets)<<(12 - idx); - e->last_packets = npackets; - e->avpps += ((long)rate - (long)e->avpps) >> e->ewma_log; - e->stats->pps = (e->avpps+0x1FF)>>10; - spin_unlock(e->stats_lock); - } - - mod_timer(&elist[idx].timer, jiffies + ((HZ<interval < -2 || parm->interval > 3) - return -EINVAL; - - est = kzalloc(sizeof(*est), GFP_KERNEL); - if (est == NULL) - return -ENOBUFS; - - est->interval = parm->interval + 2; - est->stats = stats; - est->stats_lock = stats_lock; - est->ewma_log = parm->ewma_log; - est->last_bytes = stats->bytes; - est->avbps = stats->bps<<5; - est->last_packets = stats->packets; - est->avpps = stats->pps<<10; - - est->next = elist[est->interval].list; - if (est->next == NULL) { - init_timer(&elist[est->interval].timer); - elist[est->interval].timer.data = est->interval; - elist[est->interval].timer.expires = jiffies + ((HZ<interval)/4); - elist[est->interval].timer.function = est_timer; - add_timer(&elist[est->interval].timer); - } - write_lock_bh(&est_lock); - elist[est->interval].list = est; - write_unlock_bh(&est_lock); - return 0; -} - -void qdisc_kill_estimator(struct tc_stats *stats) -{ - int idx; - struct qdisc_estimator *est, **pest; - - for (idx=0; idx <= EST_MAX_INTERVAL; idx++) { - int killed = 0; - pest = &elist[idx].list; - while ((est=*pest) != NULL) { - if (est->stats != stats) { - pest = &est->next; - continue; - } - - write_lock_bh(&est_lock); - *pest = est->next; - write_unlock_bh(&est_lock); - - kfree(est); - killed++; - } - if (killed && elist[idx].list == NULL) - del_timer(&elist[idx].timer); - } -} - -EXPORT_SYMBOL(qdisc_kill_estimator); -EXPORT_SYMBOL(qdisc_new_estimator); -- cgit v1.2.3-70-g09d2 From 80246ab36ec8baf7d107254adb166baa555a59f8 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Tue, 3 Oct 2006 16:49:53 -0700 Subject: [TCP]: Kill warning in tcp_clean_rtx_queue(). GCC can't tell we always initialize 'tv' in all the cases we actually use it, so explicitly set it up with zeros. Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 3f884cea14f..cf06accbe68 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -2259,7 +2259,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p) u32 pkts_acked = 0; void (*rtt_sample)(struct sock *sk, u32 usrtt) = icsk->icsk_ca_ops->rtt_sample; - struct timeval tv; + struct timeval tv = { .tv_sec = 0, .tv_usec = 0 }; while ((skb = skb_peek(&sk->sk_write_queue)) && skb != sk->sk_send_head) { -- cgit v1.2.3-70-g09d2 From 0a69452cb45add0841c2bc1e75c25f6bd4f1d8d9 Mon Sep 17 00:00:00 2001 From: Diego Beltrami Date: Tue, 3 Oct 2006 23:47:05 -0700 Subject: [XFRM]: BEET mode This patch introduces the BEET mode (Bound End-to-End Tunnel) with as specified by the ietf draft at the following link: http://www.ietf.org/internet-drafts/draft-nikander-esp-beet-mode-06.txt The patch provides only single family support (i.e. inner family = outer family). Signed-off-by: Diego Beltrami Signed-off-by: Miika Komu Signed-off-by: Herbert Xu Signed-off-by: Abhinav Pathak Signed-off-by: Jeff Ahrenholz Signed-off-by: David S. Miller --- include/linux/in.h | 1 + include/linux/ip.h | 9 +++ include/linux/ipsec.h | 3 +- include/linux/xfrm.h | 3 +- net/ipv4/Kconfig | 9 +++ net/ipv4/Makefile | 1 + net/ipv4/esp4.c | 26 ++++++--- net/ipv4/ipcomp.c | 5 +- net/ipv4/xfrm4_mode_beet.c | 139 +++++++++++++++++++++++++++++++++++++++++++++ net/ipv6/Kconfig | 10 ++++ net/ipv6/Makefile | 1 + net/ipv6/ipcomp6.c | 5 +- net/ipv6/xfrm6_mode_beet.c | 107 ++++++++++++++++++++++++++++++++++ net/xfrm/xfrm_user.c | 1 + 14 files changed, 309 insertions(+), 11 deletions(-) create mode 100644 net/ipv4/xfrm4_mode_beet.c create mode 100644 net/ipv6/xfrm6_mode_beet.c (limited to 'net') diff --git a/include/linux/in.h b/include/linux/in.h index d79fc75fa7c..2619859f6e1 100644 --- a/include/linux/in.h +++ b/include/linux/in.h @@ -40,6 +40,7 @@ enum { IPPROTO_ESP = 50, /* Encapsulation Security Payload protocol */ IPPROTO_AH = 51, /* Authentication Header protocol */ + IPPROTO_BEETPH = 94, /* IP option pseudo header for BEET */ IPPROTO_PIM = 103, /* Protocol Independent Multicast */ IPPROTO_COMP = 108, /* Compression Header protocol */ diff --git a/include/linux/ip.h b/include/linux/ip.h index 6b25d36fc54..ecee9bb27d0 100644 --- a/include/linux/ip.h +++ b/include/linux/ip.h @@ -80,6 +80,8 @@ #define IPOPT_TS_TSANDADDR 1 /* timestamps and addresses */ #define IPOPT_TS_PRESPEC 3 /* specified modules only */ +#define IPV4_BEET_PHMAXLEN 8 + struct iphdr { #if defined(__LITTLE_ENDIAN_BITFIELD) __u8 ihl:4, @@ -123,4 +125,11 @@ struct ip_comp_hdr { __be16 cpi; }; +struct ip_beet_phdr { + __u8 nexthdr; + __u8 hdrlen; + __u8 padlen; + __u8 reserved; +}; + #endif /* _LINUX_IP_H */ diff --git a/include/linux/ipsec.h b/include/linux/ipsec.h index d3c527616b5..d17a6302a0e 100644 --- a/include/linux/ipsec.h +++ b/include/linux/ipsec.h @@ -12,7 +12,8 @@ enum { IPSEC_MODE_ANY = 0, /* We do not support this for SA */ IPSEC_MODE_TRANSPORT = 1, - IPSEC_MODE_TUNNEL = 2 + IPSEC_MODE_TUNNEL = 2, + IPSEC_MODE_BEET = 3 }; enum { diff --git a/include/linux/xfrm.h b/include/linux/xfrm.h index 430afd05826..8ae7f744917 100644 --- a/include/linux/xfrm.h +++ b/include/linux/xfrm.h @@ -129,7 +129,8 @@ enum #define XFRM_MODE_TUNNEL 1 #define XFRM_MODE_ROUTEOPTIMIZATION 2 #define XFRM_MODE_IN_TRIGGER 3 -#define XFRM_MODE_MAX 4 +#define XFRM_MODE_BEET 4 +#define XFRM_MODE_MAX 5 /* Netlink configuration messages. */ enum { diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig index d172a980444..5572071af73 100644 --- a/net/ipv4/Kconfig +++ b/net/ipv4/Kconfig @@ -434,6 +434,15 @@ config INET_XFRM_MODE_TUNNEL If unsure, say Y. +config INET_XFRM_MODE_BEET + tristate "IP: IPsec BEET mode" + default y + select XFRM + ---help--- + Support for IPsec BEET mode. + + If unsure, say Y. + config INET_DIAG tristate "INET: socket monitoring interface" default y diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile index f66049e28ae..15645c51520 100644 --- a/net/ipv4/Makefile +++ b/net/ipv4/Makefile @@ -23,6 +23,7 @@ obj-$(CONFIG_INET_AH) += ah4.o obj-$(CONFIG_INET_ESP) += esp4.o obj-$(CONFIG_INET_IPCOMP) += ipcomp.o obj-$(CONFIG_INET_XFRM_TUNNEL) += xfrm4_tunnel.o +obj-$(CONFIG_INET_XFRM_MODE_BEET) += xfrm4_mode_beet.o obj-$(CONFIG_INET_TUNNEL) += tunnel4.o obj-$(CONFIG_INET_XFRM_MODE_TRANSPORT) += xfrm4_mode_transport.o obj-$(CONFIG_INET_XFRM_MODE_TUNNEL) += xfrm4_mode_tunnel.o diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c index 13b29360d10..b5c205b5766 100644 --- a/net/ipv4/esp4.c +++ b/net/ipv4/esp4.c @@ -253,7 +253,8 @@ static int esp_input(struct xfrm_state *x, struct sk_buff *skb) * as per draft-ietf-ipsec-udp-encaps-06, * section 3.1.2 */ - if (x->props.mode == XFRM_MODE_TRANSPORT) + if (x->props.mode == XFRM_MODE_TRANSPORT || + x->props.mode == XFRM_MODE_BEET) skb->ip_summed = CHECKSUM_UNNECESSARY; } @@ -271,17 +272,28 @@ static u32 esp4_get_max_size(struct xfrm_state *x, int mtu) { struct esp_data *esp = x->data; u32 blksize = ALIGN(crypto_blkcipher_blocksize(esp->conf.tfm), 4); - - if (x->props.mode == XFRM_MODE_TUNNEL) { - mtu = ALIGN(mtu + 2, blksize); - } else { - /* The worst case. */ + int enclen = 0; + + switch (x->props.mode) { + case XFRM_MODE_TUNNEL: + mtu = ALIGN(mtu +2, blksize); + break; + default: + case XFRM_MODE_TRANSPORT: + /* The worst case */ mtu = ALIGN(mtu + 2, 4) + blksize - 4; + break; + case XFRM_MODE_BEET: + /* The worst case. */ + enclen = IPV4_BEET_PHMAXLEN; + mtu = ALIGN(mtu + enclen + 2, blksize); + break; } + if (esp->conf.padlen) mtu = ALIGN(mtu, esp->conf.padlen); - return mtu + x->props.header_len + esp->auth.icv_trunc_len; + return mtu + x->props.header_len + esp->auth.icv_trunc_len - enclen; } static void esp4_err(struct sk_buff *skb, u32 info) diff --git a/net/ipv4/ipcomp.c b/net/ipv4/ipcomp.c index 2017d36024d..3839b706142 100644 --- a/net/ipv4/ipcomp.c +++ b/net/ipv4/ipcomp.c @@ -206,6 +206,7 @@ static void ipcomp4_err(struct sk_buff *skb, u32 info) static struct xfrm_state *ipcomp_tunnel_create(struct xfrm_state *x) { struct xfrm_state *t; + u8 mode = XFRM_MODE_TUNNEL; t = xfrm_state_alloc(); if (t == NULL) @@ -216,7 +217,9 @@ static struct xfrm_state *ipcomp_tunnel_create(struct xfrm_state *x) t->id.daddr.a4 = x->id.daddr.a4; memcpy(&t->sel, &x->sel, sizeof(t->sel)); t->props.family = AF_INET; - t->props.mode = XFRM_MODE_TUNNEL; + if (x->props.mode == XFRM_MODE_BEET) + mode = x->props.mode; + t->props.mode = mode; t->props.saddr.a4 = x->props.saddr.a4; t->props.flags = x->props.flags; diff --git a/net/ipv4/xfrm4_mode_beet.c b/net/ipv4/xfrm4_mode_beet.c new file mode 100644 index 00000000000..89cf59ea7bb --- /dev/null +++ b/net/ipv4/xfrm4_mode_beet.c @@ -0,0 +1,139 @@ +/* + * xfrm4_mode_beet.c - BEET mode encapsulation for IPv4. + * + * Copyright (c) 2006 Diego Beltrami + * Miika Komu + * Herbert Xu + * Abhinav Pathak + * Jeff Ahrenholz + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +/* Add encapsulation header. + * + * The top IP header will be constructed per draft-nikander-esp-beet-mode-06.txt. + * The following fields in it shall be filled in by x->type->output: + * tot_len + * check + * + * On exit, skb->h will be set to the start of the payload to be processed + * by x->type->output and skb->nh will be set to the top IP header. + */ +static int xfrm4_beet_output(struct xfrm_state *x, struct sk_buff *skb) +{ + struct iphdr *iph, *top_iph = NULL; + int hdrlen, optlen; + + iph = skb->nh.iph; + skb->h.ipiph = iph; + + hdrlen = 0; + optlen = iph->ihl * 4 - sizeof(*iph); + if (unlikely(optlen)) + hdrlen += IPV4_BEET_PHMAXLEN - (optlen & 4); + + skb->nh.raw = skb_push(skb, x->props.header_len + hdrlen); + top_iph = skb->nh.iph; + hdrlen = iph->ihl * 4 - optlen; + skb->h.raw += hdrlen; + + memmove(top_iph, iph, hdrlen); + if (unlikely(optlen)) { + struct ip_beet_phdr *ph; + + BUG_ON(optlen < 0); + + ph = (struct ip_beet_phdr *)skb->h.raw; + ph->padlen = 4 - (optlen & 4); + ph->hdrlen = (optlen + ph->padlen + sizeof(*ph)) / 8; + ph->nexthdr = top_iph->protocol; + + top_iph->protocol = IPPROTO_BEETPH; + top_iph->ihl = sizeof(struct iphdr) / 4; + } + + top_iph->saddr = x->props.saddr.a4; + top_iph->daddr = x->id.daddr.a4; + + return 0; +} + +static int xfrm4_beet_input(struct xfrm_state *x, struct sk_buff *skb) +{ + struct iphdr *iph = skb->nh.iph; + int phlen = 0; + int optlen = 0; + __u8 ph_nexthdr = 0, protocol = 0; + int err = -EINVAL; + + protocol = iph->protocol; + + if (unlikely(iph->protocol == IPPROTO_BEETPH)) { + struct ip_beet_phdr *ph = (struct ip_beet_phdr*)(iph + 1); + + if (!pskb_may_pull(skb, sizeof(*ph))) + goto out; + + phlen = ph->hdrlen * 8; + optlen = phlen - ph->padlen - sizeof(*ph); + if (optlen < 0 || optlen & 3 || optlen > 250) + goto out; + + if (!pskb_may_pull(skb, phlen)) + goto out; + + ph_nexthdr = ph->nexthdr; + } + + skb_push(skb, sizeof(*iph) - phlen + optlen); + memmove(skb->data, skb->nh.raw, sizeof(*iph)); + skb->nh.raw = skb->data; + + iph = skb->nh.iph; + iph->ihl = (sizeof(*iph) + optlen) / 4; + iph->tot_len = htons(skb->len); + iph->daddr = x->sel.daddr.a4; + iph->saddr = x->sel.saddr.a4; + if (ph_nexthdr) + iph->protocol = ph_nexthdr; + else + iph->protocol = protocol; + iph->check = 0; + iph->check = ip_fast_csum(skb->nh.raw, iph->ihl); + err = 0; +out: + return err; +} + +static struct xfrm_mode xfrm4_beet_mode = { + .input = xfrm4_beet_input, + .output = xfrm4_beet_output, + .owner = THIS_MODULE, + .encap = XFRM_MODE_BEET, +}; + +static int __init xfrm4_beet_init(void) +{ + return xfrm_register_mode(&xfrm4_beet_mode, AF_INET); +} + +static void __exit xfrm4_beet_exit(void) +{ + int err; + + err = xfrm_unregister_mode(&xfrm4_beet_mode, AF_INET); + BUG_ON(err); +} + +module_init(xfrm4_beet_init); +module_exit(xfrm4_beet_exit); +MODULE_LICENSE("GPL"); +MODULE_ALIAS_XFRM_MODE(AF_INET, XFRM_MODE_BEET); diff --git a/net/ipv6/Kconfig b/net/ipv6/Kconfig index a2d211da2ab..a460e8132b4 100644 --- a/net/ipv6/Kconfig +++ b/net/ipv6/Kconfig @@ -136,6 +136,16 @@ config INET6_XFRM_MODE_TUNNEL If unsure, say Y. +config INET6_XFRM_MODE_BEET + tristate "IPv6: IPsec BEET mode" + depends on IPV6 + default IPV6 + select XFRM + ---help--- + Support for IPsec BEET mode. + + If unsure, say Y. + config INET6_XFRM_MODE_ROUTEOPTIMIZATION tristate "IPv6: MIPv6 route optimization mode (EXPERIMENTAL)" depends on IPV6 && EXPERIMENTAL diff --git a/net/ipv6/Makefile b/net/ipv6/Makefile index 0213c6612b5..87274e47fe3 100644 --- a/net/ipv6/Makefile +++ b/net/ipv6/Makefile @@ -26,6 +26,7 @@ obj-$(CONFIG_INET6_TUNNEL) += tunnel6.o obj-$(CONFIG_INET6_XFRM_MODE_TRANSPORT) += xfrm6_mode_transport.o obj-$(CONFIG_INET6_XFRM_MODE_TUNNEL) += xfrm6_mode_tunnel.o obj-$(CONFIG_INET6_XFRM_MODE_ROUTEOPTIMIZATION) += xfrm6_mode_ro.o +obj-$(CONFIG_INET6_XFRM_MODE_BEET) += xfrm6_mode_beet.o obj-$(CONFIG_NETFILTER) += netfilter/ obj-$(CONFIG_IPV6_TUNNEL) += ip6_tunnel.o diff --git a/net/ipv6/ipcomp6.c b/net/ipv6/ipcomp6.c index a2860e35efd..71f59f18ede 100644 --- a/net/ipv6/ipcomp6.c +++ b/net/ipv6/ipcomp6.c @@ -199,6 +199,7 @@ static void ipcomp6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, static struct xfrm_state *ipcomp6_tunnel_create(struct xfrm_state *x) { struct xfrm_state *t = NULL; + u8 mode = XFRM_MODE_TUNNEL; t = xfrm_state_alloc(); if (!t) @@ -212,7 +213,9 @@ static struct xfrm_state *ipcomp6_tunnel_create(struct xfrm_state *x) memcpy(t->id.daddr.a6, x->id.daddr.a6, sizeof(struct in6_addr)); memcpy(&t->sel, &x->sel, sizeof(t->sel)); t->props.family = AF_INET6; - t->props.mode = XFRM_MODE_TUNNEL; + if (x->props.mode == XFRM_MODE_BEET) + mode = x->props.mode; + t->props.mode = mode; memcpy(t->props.saddr.a6, x->props.saddr.a6, sizeof(struct in6_addr)); if (xfrm_init_state(t)) diff --git a/net/ipv6/xfrm6_mode_beet.c b/net/ipv6/xfrm6_mode_beet.c new file mode 100644 index 00000000000..edcfffa9e87 --- /dev/null +++ b/net/ipv6/xfrm6_mode_beet.c @@ -0,0 +1,107 @@ +/* + * xfrm6_mode_beet.c - BEET mode encapsulation for IPv6. + * + * Copyright (c) 2006 Diego Beltrami + * Miika Komu + * Herbert Xu + * Abhinav Pathak + * Jeff Ahrenholz + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* Add encapsulation header. + * + * The top IP header will be constructed per draft-nikander-esp-beet-mode-06.txt. + * The following fields in it shall be filled in by x->type->output: + * payload_len + * + * On exit, skb->h will be set to the start of the encapsulation header to be + * filled in by x->type->output and skb->nh will be set to the nextheader field + * of the extension header directly preceding the encapsulation header, or in + * its absence, that of the top IP header. The value of skb->data will always + * point to the top IP header. + */ +static int xfrm6_beet_output(struct xfrm_state *x, struct sk_buff *skb) +{ + struct ipv6hdr *iph, *top_iph; + u8 *prevhdr; + int hdr_len; + + skb_push(skb, x->props.header_len); + iph = skb->nh.ipv6h; + + hdr_len = ip6_find_1stfragopt(skb, &prevhdr); + skb->nh.raw = prevhdr - x->props.header_len; + skb->h.raw = skb->data + hdr_len; + memmove(skb->data, iph, hdr_len); + + skb->nh.raw = skb->data; + top_iph = skb->nh.ipv6h; + skb->nh.raw = &top_iph->nexthdr; + skb->h.ipv6h = top_iph + 1; + + ipv6_addr_copy(&top_iph->saddr, (struct in6_addr *)&x->props.saddr); + ipv6_addr_copy(&top_iph->daddr, (struct in6_addr *)&x->id.daddr); + + return 0; +} + +static int xfrm6_beet_input(struct xfrm_state *x, struct sk_buff *skb) +{ + struct ipv6hdr *ip6h; + int size = sizeof(struct ipv6hdr); + int err = -EINVAL; + + if (!pskb_may_pull(skb, sizeof(struct ipv6hdr))) + goto out; + + skb_push(skb, size); + memmove(skb->data, skb->nh.raw, size); + skb->nh.raw = skb->data; + + skb->mac.raw = memmove(skb->data - skb->mac_len, + skb->mac.raw, skb->mac_len); + + ip6h = skb->nh.ipv6h; + ip6h->payload_len = htons(skb->len - size); + ipv6_addr_copy(&ip6h->daddr, (struct in6_addr *) &x->sel.daddr.a6); + ipv6_addr_copy(&ip6h->saddr, (struct in6_addr *) &x->sel.saddr.a6); + err = 0; +out: + return err; +} + +static struct xfrm_mode xfrm6_beet_mode = { + .input = xfrm6_beet_input, + .output = xfrm6_beet_output, + .owner = THIS_MODULE, + .encap = XFRM_MODE_BEET, +}; + +static int __init xfrm6_beet_init(void) +{ + return xfrm_register_mode(&xfrm6_beet_mode, AF_INET6); +} + +static void __exit xfrm6_beet_exit(void) +{ + int err; + + err = xfrm_unregister_mode(&xfrm6_beet_mode, AF_INET6); + BUG_ON(err); +} + +module_init(xfrm6_beet_init); +module_exit(xfrm6_beet_exit); +MODULE_LICENSE("GPL"); +MODULE_ALIAS_XFRM_MODE(AF_INET6, XFRM_MODE_BEET); diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index c59a78d2923..d54b3a70d5d 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -211,6 +211,7 @@ static int verify_newsa_info(struct xfrm_usersa_info *p, case XFRM_MODE_TRANSPORT: case XFRM_MODE_TUNNEL: case XFRM_MODE_ROUTEOPTIMIZATION: + case XFRM_MODE_BEET: break; default: -- cgit v1.2.3-70-g09d2