From aa3c487f355ff1477b8369d9f0b9860387ae21d4 Mon Sep 17 00:00:00 2001 From: Jan Engelhardt Date: Thu, 29 Oct 2009 15:35:10 +0100 Subject: netfilter: xt_socket: make module available for INPUT chain This should make it possible to test for the existence of local sockets in the INPUT path. References: http://marc.info/?l=netfilter-devel&m=125380481517129&w=2 Signed-off-by: Jan Engelhardt Signed-off-by: Balazs Scheidler Signed-off-by: Patrick McHardy --- net/netfilter/xt_socket.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'net/netfilter') diff --git a/net/netfilter/xt_socket.c b/net/netfilter/xt_socket.c index 362afbd60a9..6a902564d24 100644 --- a/net/netfilter/xt_socket.c +++ b/net/netfilter/xt_socket.c @@ -192,7 +192,8 @@ static struct xt_match socket_mt_reg[] __read_mostly = { .revision = 0, .family = NFPROTO_IPV4, .match = socket_mt_v0, - .hooks = 1 << NF_INET_PRE_ROUTING, + .hooks = (1 << NF_INET_PRE_ROUTING) | + (1 << NF_INET_LOCAL_IN), .me = THIS_MODULE, }, { @@ -201,7 +202,8 @@ static struct xt_match socket_mt_reg[] __read_mostly = { .family = NFPROTO_IPV4, .match = socket_mt_v1, .matchsize = sizeof(struct xt_socket_mtinfo1), - .hooks = 1 << NF_INET_PRE_ROUTING, + .hooks = (1 << NF_INET_PRE_ROUTING) | + (1 << NF_INET_LOCAL_IN), .me = THIS_MODULE, }, }; -- cgit v1.2.3-70-g09d2 From 5ae27aa2b16478a84d833ab4065798e752941c5a Mon Sep 17 00:00:00 2001 From: Changli Gao Date: Thu, 5 Nov 2009 14:51:31 +0100 Subject: netfilter: nf_conntrack: avoid additional compare. 
Signed-off-by: Changli Gao Signed-off-by: Patrick McHardy --- net/netfilter/nf_conntrack_core.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) (limited to 'net/netfilter') diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 7c9ec3dee96..8e572d7c08c 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -511,11 +511,17 @@ static noinline int early_drop(struct net *net, unsigned int hash) cnt++; } - if (ct && unlikely(nf_ct_is_dying(ct) || - !atomic_inc_not_zero(&ct->ct_general.use))) - ct = NULL; - if (ct || cnt >= NF_CT_EVICTION_RANGE) + if (ct != NULL) { + if (likely(!nf_ct_is_dying(ct) && + atomic_inc_not_zero(&ct->ct_general.use))) + break; + else + ct = NULL; + } + + if (cnt >= NF_CT_EVICTION_RANGE) break; + hash = (hash + 1) % nf_conntrack_htable_size; } rcu_read_unlock(); -- cgit v1.2.3-70-g09d2 From dee5817e88ac8195e5938d6671f434a071e35698 Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Fri, 6 Nov 2009 17:04:00 +0100 Subject: netfilter: remove unnecessary checks from netlink notifiers The NETLINK_URELEASE notifier is only invoked for bound sockets, so there is no need to check ->pid again. 
Signed-off-by: Patrick McHardy --- net/ipv4/netfilter/ip_queue.c | 3 +-- net/ipv6/netfilter/ip6_queue.c | 3 +-- net/netfilter/nfnetlink_log.c | 3 +-- net/netfilter/nfnetlink_queue.c | 3 +-- 4 files changed, 4 insertions(+), 8 deletions(-) (limited to 'net/netfilter') diff --git a/net/ipv4/netfilter/ip_queue.c b/net/ipv4/netfilter/ip_queue.c index 9811a456fb5..9f078709195 100644 --- a/net/ipv4/netfilter/ip_queue.c +++ b/net/ipv4/netfilter/ip_queue.c @@ -497,8 +497,7 @@ ipq_rcv_nl_event(struct notifier_block *this, { struct netlink_notify *n = ptr; - if (event == NETLINK_URELEASE && - n->protocol == NETLINK_FIREWALL && n->pid) { + if (event == NETLINK_URELEASE && n->protocol == NETLINK_FIREWALL) { write_lock_bh(&queue_lock); if ((n->net == &init_net) && (n->pid == peer_pid)) __ipq_reset(); diff --git a/net/ipv6/netfilter/ip6_queue.c b/net/ipv6/netfilter/ip6_queue.c index a82016fd5d6..47a3623e711 100644 --- a/net/ipv6/netfilter/ip6_queue.c +++ b/net/ipv6/netfilter/ip6_queue.c @@ -499,8 +499,7 @@ ipq_rcv_nl_event(struct notifier_block *this, { struct netlink_notify *n = ptr; - if (event == NETLINK_URELEASE && - n->protocol == NETLINK_IP6_FW && n->pid) { + if (event == NETLINK_URELEASE && n->protocol == NETLINK_IP6_FW) { write_lock_bh(&queue_lock); if ((n->net == &init_net) && (n->pid == peer_pid)) __ipq_reset(); diff --git a/net/netfilter/nfnetlink_log.c b/net/netfilter/nfnetlink_log.c index f900dc3194a..3aa66b2f9e8 100644 --- a/net/netfilter/nfnetlink_log.c +++ b/net/netfilter/nfnetlink_log.c @@ -666,8 +666,7 @@ nfulnl_rcv_nl_event(struct notifier_block *this, { struct netlink_notify *n = ptr; - if (event == NETLINK_URELEASE && - n->protocol == NETLINK_NETFILTER && n->pid) { + if (event == NETLINK_URELEASE && n->protocol == NETLINK_NETFILTER) { int i; /* destroy all instances for this pid */ diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c index 7a9dec9fb82..7e3fa410641 100644 --- a/net/netfilter/nfnetlink_queue.c +++ 
b/net/netfilter/nfnetlink_queue.c @@ -574,8 +574,7 @@ nfqnl_rcv_nl_event(struct notifier_block *this, { struct netlink_notify *n = ptr; - if (event == NETLINK_URELEASE && - n->protocol == NETLINK_NETFILTER && n->pid) { + if (event == NETLINK_URELEASE && n->protocol == NETLINK_NETFILTER) { int i; /* destroy all instances for this pid */ -- cgit v1.2.3-70-g09d2 From c4832c7bbc3f7a4813347e871d7238651bf437d3 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Mon, 23 Nov 2009 10:34:39 +0100 Subject: netfilter: nf_ct_tcp: improve out-of-sync situation in TCP tracking Without this patch, if we receive a SYN packet from the client while the firewall is out-of-sync, we let it go through. Then, if we see the SYN/ACK reply coming from the server, we destroy the conntrack entry and drop the packet to trigger a new retransmission. Then, the retransmission from the client is used to start a new clean session. This patch improves the current handling. Basically, if we see an unexpected SYN packet, we annotate the TCP options. Then, if we see the reply SYN/ACK, this means that the firewall was indeed out-of-sync. Therefore, we set a clean new session from the existing entry based on the annotated values. This patch adds two new 8-bit fields that fit in a 16-bit gap of the ip_ct_tcp structure. This patch is particularly useful for conntrackd since the asynchronous nature of the state-synchronization allows for backup nodes that are not perfect copies of the master. This helps to improve the recovery under some worst-case scenarios. 
I have tested this by creating lots of conntrack entries in wrong state: for ((i=1024;i<65535;i++)); do conntrack -I -p tcp -s 192.168.2.101 -d 192.168.2.2 --sport $i --dport 80 -t 800 --state ESTABLISHED -u ASSURED,SEEN_REPLY; done Then, I make some TCP connections: $ echo GET / | nc 192.168.2.2 80 The events show the result: [UPDATE] tcp 6 60 SYN_RECV src=192.168.2.101 dst=192.168.2.2 sport=33220 dport=80 src=192.168.2.2 dst=192.168.2.101 sport=80 dport=33220 [ASSURED] [UPDATE] tcp 6 432000 ESTABLISHED src=192.168.2.101 dst=192.168.2.2 sport=33220 dport=80 src=192.168.2.2 dst=192.168.2.101 sport=80 dport=33220 [ASSURED] [UPDATE] tcp 6 120 FIN_WAIT src=192.168.2.101 dst=192.168.2.2 sport=33220 dport=80 src=192.168.2.2 dst=192.168.2.101 sport=80 dport=33220 [ASSURED] [UPDATE] tcp 6 30 LAST_ACK src=192.168.2.101 dst=192.168.2.2 sport=33220 dport=80 src=192.168.2.2 dst=192.168.2.101 sport=80 dport=33220 [ASSURED] [UPDATE] tcp 6 120 TIME_WAIT src=192.168.2.101 dst=192.168.2.2 sport=33220 dport=80 src=192.168.2.2 dst=192.168.2.101 sport=80 dport=33220 [ASSURED] and tcpdump shows no retransmissions: 20:47:57.271951 IP 192.168.2.101.33221 > 192.168.2.2.www: S 435402517:435402517(0) win 5840 20:47:57.273538 IP 192.168.2.2.www > 192.168.2.101.33221: S 3509927945:3509927945(0) ack 435402518 win 5792 20:47:57.273608 IP 192.168.2.101.33221 > 192.168.2.2.www: . ack 3509927946 win 92 20:47:57.273693 IP 192.168.2.101.33221 > 192.168.2.2.www: P 435402518:435402524(6) ack 3509927946 win 92 20:47:57.275492 IP 192.168.2.2.www > 192.168.2.101.33221: . ack 435402524 win 362 20:47:57.276492 IP 192.168.2.2.www > 192.168.2.101.33221: P 3509927946:3509928082(136) ack 435402524 win 362 20:47:57.276515 IP 192.168.2.101.33221 > 192.168.2.2.www: . 
ack 3509928082 win 108 20:47:57.276521 IP 192.168.2.2.www > 192.168.2.101.33221: F 3509928082:3509928082(0) ack 435402524 win 362 20:47:57.277369 IP 192.168.2.101.33221 > 192.168.2.2.www: F 435402524:435402524(0) ack 3509928083 win 108 20:47:57.279491 IP 192.168.2.2.www > 192.168.2.101.33221: . ack 435402525 win 362 I also added a rule to log invalid packets, with no occurrences :-) . Signed-off-by: Pablo Neira Ayuso Acked-by: Jozsef Kadlecsik Signed-off-by: Patrick McHardy --- include/linux/netfilter/nf_conntrack_tcp.h | 3 ++ net/netfilter/nf_conntrack_proto_tcp.c | 51 ++++++++++++++++++++++++------ 2 files changed, 44 insertions(+), 10 deletions(-) (limited to 'net/netfilter') diff --git a/include/linux/netfilter/nf_conntrack_tcp.h b/include/linux/netfilter/nf_conntrack_tcp.h index 4352feed237..ece22e94dcb 100644 --- a/include/linux/netfilter/nf_conntrack_tcp.h +++ b/include/linux/netfilter/nf_conntrack_tcp.h @@ -67,6 +67,9 @@ struct ip_ct_tcp u_int32_t last_ack; /* Last sequence number seen in opposite dir */ u_int32_t last_end; /* Last seq + len */ u_int16_t last_win; /* Last window advertisement seen in dir */ + /* For SYN packets while we may be out-of-sync */ + u_int8_t last_wscale; /* Last window scaling factor seen */ + u_int8_t last_flags; /* Last flags set */ }; #endif /* __KERNEL__ */ diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c index 97a82ba7537..9cc6b5cb06a 100644 --- a/net/netfilter/nf_conntrack_proto_tcp.c +++ b/net/netfilter/nf_conntrack_proto_tcp.c @@ -908,23 +908,54 @@ static int tcp_packet(struct nf_conn *ct, /* b) This SYN/ACK acknowledges a SYN that we earlier * ignored as invalid. This means that the client and * the server are both in sync, while the firewall is - * not. We kill this session and block the SYN/ACK so - * that the client cannot but retransmit its SYN and - * thus initiate a clean new session. + * not. We get in sync from the previously annotated + * values. 
*/ - spin_unlock_bh(&ct->lock); - if (LOG_INVALID(net, IPPROTO_TCP)) - nf_log_packet(pf, 0, skb, NULL, NULL, NULL, - "nf_ct_tcp: killing out of sync session "); - nf_ct_kill(ct); - return NF_DROP; + old_state = TCP_CONNTRACK_SYN_SENT; + new_state = TCP_CONNTRACK_SYN_RECV; + ct->proto.tcp.seen[ct->proto.tcp.last_dir].td_end = + ct->proto.tcp.last_end; + ct->proto.tcp.seen[ct->proto.tcp.last_dir].td_maxend = + ct->proto.tcp.last_end; + ct->proto.tcp.seen[ct->proto.tcp.last_dir].td_maxwin = + ct->proto.tcp.last_win == 0 ? + 1 : ct->proto.tcp.last_win; + ct->proto.tcp.seen[ct->proto.tcp.last_dir].td_scale = + ct->proto.tcp.last_wscale; + ct->proto.tcp.seen[ct->proto.tcp.last_dir].flags = + ct->proto.tcp.last_flags; + memset(&ct->proto.tcp.seen[dir], 0, + sizeof(struct ip_ct_tcp_state)); + break; } ct->proto.tcp.last_index = index; ct->proto.tcp.last_dir = dir; ct->proto.tcp.last_seq = ntohl(th->seq); ct->proto.tcp.last_end = segment_seq_plus_len(ntohl(th->seq), skb->len, dataoff, th); - + ct->proto.tcp.last_win = ntohs(th->window); + + /* a) This is a SYN in ORIGINAL. The client and the server + * may be in sync but we are not. In that case, we annotate + * the TCP options and let the packet go through. If it is a + * valid SYN packet, the server will reply with a SYN/ACK, and + * then we'll get in sync. Otherwise, the server ignores it. 
*/ + if (index == TCP_SYN_SET && dir == IP_CT_DIR_ORIGINAL) { + struct ip_ct_tcp_state seen = {}; + + ct->proto.tcp.last_flags = + ct->proto.tcp.last_wscale = 0; + tcp_options(skb, dataoff, th, &seen); + if (seen.flags & IP_CT_TCP_FLAG_WINDOW_SCALE) { + ct->proto.tcp.last_flags |= + IP_CT_TCP_FLAG_WINDOW_SCALE; + ct->proto.tcp.last_wscale = seen.td_scale; + } + if (seen.flags & IP_CT_TCP_FLAG_SACK_PERM) { + ct->proto.tcp.last_flags |= + IP_CT_TCP_FLAG_SACK_PERM; + } + } spin_unlock_bh(&ct->lock); if (LOG_INVALID(net, IPPROTO_TCP)) nf_log_packet(pf, 0, skb, NULL, NULL, NULL, -- cgit v1.2.3-70-g09d2 From 3a0429292daa0e1ec848bd26479f5e48b0d54a42 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Mon, 23 Nov 2009 10:43:57 +0100 Subject: netfilter: xtables: fix conntrack match v1 ipt-save output commit d6d3f08b0fd998b647a05540cedd11a067b72867 (netfilter: xtables: conntrack match revision 2) breaks the v1 conntrack match iptables-save output in a subtle way. The problem is as follows: up = kmalloc(sizeof(*up), GFP_KERNEL); [..] /* * The strategy here is to minimize the overhead of v1 matching, * by prebuilding a v2 struct and putting the pointer into the * v1 dataspace. */ memcpy(up, info, offsetof(typeof(*info), state_mask)); [..] *(void **)info = up; As the v2 struct pointer is saved in the match data space, it clobbers the first structure member (->origsrc_addr). Because the _v1 match function grabs this pointer and does not actually look at the v1 origsrc, run time functionality does not break. But iptables -nvL (or iptables-save) cannot know that v1 origsrc_addr has been overloaded in this way: $ iptables -p tcp -A OUTPUT -m conntrack --ctorigsrc 10.0.0.1 -j ACCEPT $ iptables-save -A OUTPUT -p tcp -m conntrack --ctorigsrc 128.173.134.206 -j ACCEPT (128.173... is the address of the v2 match structure). To fix this, we take advantage of the fact that the v1 and v2 structures are identical with the exception of the last two structure members (u8 in v1, u16 in v2). 
We extract them as early as possible and prevent the v2 matching function from looking at those two members directly. Previously reported by Michel Messerschmidt via Ben Hutchings, also see Debian Bug tracker #556587. Signed-off-by: Florian Westphal Signed-off-by: Patrick McHardy --- net/netfilter/xt_conntrack.c | 61 ++++++++++++-------------------------------- 1 file changed, 17 insertions(+), 44 deletions(-) (limited to 'net/netfilter') diff --git a/net/netfilter/xt_conntrack.c b/net/netfilter/xt_conntrack.c index 6dc4652f2fe..ae66305f0fe 100644 --- a/net/netfilter/xt_conntrack.c +++ b/net/netfilter/xt_conntrack.c @@ -113,7 +113,8 @@ ct_proto_port_check(const struct xt_conntrack_mtinfo2 *info, } static bool -conntrack_mt(const struct sk_buff *skb, const struct xt_match_param *par) +conntrack_mt(const struct sk_buff *skb, const struct xt_match_param *par, + u16 state_mask, u16 status_mask) { const struct xt_conntrack_mtinfo2 *info = par->matchinfo; enum ip_conntrack_info ctinfo; @@ -136,7 +137,7 @@ conntrack_mt(const struct sk_buff *skb, const struct xt_match_param *par) if (test_bit(IPS_DST_NAT_BIT, &ct->status)) statebit |= XT_CONNTRACK_STATE_DNAT; } - if (!!(info->state_mask & statebit) ^ + if (!!(state_mask & statebit) ^ !(info->invert_flags & XT_CONNTRACK_STATE)) return false; } @@ -172,7 +173,7 @@ conntrack_mt(const struct sk_buff *skb, const struct xt_match_param *par) return false; if ((info->match_flags & XT_CONNTRACK_STATUS) && - (!!(info->status_mask & ct->status) ^ + (!!(status_mask & ct->status) ^ !(info->invert_flags & XT_CONNTRACK_STATUS))) return false; @@ -192,11 +193,17 @@ conntrack_mt(const struct sk_buff *skb, const struct xt_match_param *par) static bool conntrack_mt_v1(const struct sk_buff *skb, const struct xt_match_param *par) { - const struct xt_conntrack_mtinfo2 *const *info = par->matchinfo; - struct xt_match_param newpar = *par; + const struct xt_conntrack_mtinfo1 *info = par->matchinfo; - newpar.matchinfo = *info; - return 
conntrack_mt(skb, &newpar); + return conntrack_mt(skb, par, info->state_mask, info->status_mask); +} + +static bool +conntrack_mt_v2(const struct sk_buff *skb, const struct xt_match_param *par) +{ + const struct xt_conntrack_mtinfo2 *info = par->matchinfo; + + return conntrack_mt(skb, par, info->state_mask, info->status_mask); } static bool conntrack_mt_check(const struct xt_mtchk_param *par) @@ -209,45 +216,11 @@ static bool conntrack_mt_check(const struct xt_mtchk_param *par) return true; } -static bool conntrack_mt_check_v1(const struct xt_mtchk_param *par) -{ - struct xt_conntrack_mtinfo1 *info = par->matchinfo; - struct xt_conntrack_mtinfo2 *up; - int ret = conntrack_mt_check(par); - - if (ret < 0) - return ret; - - up = kmalloc(sizeof(*up), GFP_KERNEL); - if (up == NULL) { - nf_ct_l3proto_module_put(par->family); - return -ENOMEM; - } - - /* - * The strategy here is to minimize the overhead of v1 matching, - * by prebuilding a v2 struct and putting the pointer into the - * v1 dataspace. 
- */ - memcpy(up, info, offsetof(typeof(*info), state_mask)); - up->state_mask = info->state_mask; - up->status_mask = info->status_mask; - *(void **)info = up; - return true; -} - static void conntrack_mt_destroy(const struct xt_mtdtor_param *par) { nf_ct_l3proto_module_put(par->family); } -static void conntrack_mt_destroy_v1(const struct xt_mtdtor_param *par) -{ - struct xt_conntrack_mtinfo2 **info = par->matchinfo; - kfree(*info); - conntrack_mt_destroy(par); -} - static struct xt_match conntrack_mt_reg[] __read_mostly = { { .name = "conntrack", @@ -255,8 +228,8 @@ static struct xt_match conntrack_mt_reg[] __read_mostly = { .family = NFPROTO_UNSPEC, .matchsize = sizeof(struct xt_conntrack_mtinfo1), .match = conntrack_mt_v1, - .checkentry = conntrack_mt_check_v1, - .destroy = conntrack_mt_destroy_v1, + .checkentry = conntrack_mt_check, + .destroy = conntrack_mt_destroy, .me = THIS_MODULE, }, { @@ -264,7 +237,7 @@ static struct xt_match conntrack_mt_reg[] __read_mostly = { .revision = 2, .family = NFPROTO_UNSPEC, .matchsize = sizeof(struct xt_conntrack_mtinfo2), - .match = conntrack_mt, + .match = conntrack_mt_v2, .checkentry = conntrack_mt_check, .destroy = conntrack_mt_destroy, .me = THIS_MODULE, -- cgit v1.2.3-70-g09d2