summaryrefslogtreecommitdiffstats
path: root/net
diff options
context:
space:
mode:
Diffstat (limited to 'net')
-rw-r--r--net/8021q/vlan.c2
-rw-r--r--net/9p/trans_rdma.c1
-rw-r--r--net/Kconfig6
-rw-r--r--net/batman-adv/Makefile2
-rw-r--r--net/batman-adv/aggregation.c2
-rw-r--r--net/batman-adv/aggregation.h2
-rw-r--r--net/batman-adv/bat_debugfs.c6
-rw-r--r--net/batman-adv/bat_debugfs.h2
-rw-r--r--net/batman-adv/bat_sysfs.c2
-rw-r--r--net/batman-adv/bat_sysfs.h2
-rw-r--r--net/batman-adv/bitarray.c2
-rw-r--r--net/batman-adv/bitarray.h2
-rw-r--r--net/batman-adv/gateway_client.c2
-rw-r--r--net/batman-adv/gateway_client.h2
-rw-r--r--net/batman-adv/gateway_common.c2
-rw-r--r--net/batman-adv/gateway_common.h2
-rw-r--r--net/batman-adv/hard-interface.c13
-rw-r--r--net/batman-adv/hard-interface.h6
-rw-r--r--net/batman-adv/hash.c2
-rw-r--r--net/batman-adv/hash.h7
-rw-r--r--net/batman-adv/icmp_socket.c3
-rw-r--r--net/batman-adv/icmp_socket.h4
-rw-r--r--net/batman-adv/main.c3
-rw-r--r--net/batman-adv/main.h23
-rw-r--r--net/batman-adv/originator.c4
-rw-r--r--net/batman-adv/originator.h2
-rw-r--r--net/batman-adv/packet.h17
-rw-r--r--net/batman-adv/ring_buffer.c2
-rw-r--r--net/batman-adv/ring_buffer.h2
-rw-r--r--net/batman-adv/routing.c27
-rw-r--r--net/batman-adv/routing.h7
-rw-r--r--net/batman-adv/send.c7
-rw-r--r--net/batman-adv/send.h4
-rw-r--r--net/batman-adv/soft-interface.c3
-rw-r--r--net/batman-adv/soft-interface.h2
-rw-r--r--net/batman-adv/translation-table.c3
-rw-r--r--net/batman-adv/translation-table.h4
-rw-r--r--net/batman-adv/types.h6
-rw-r--r--net/batman-adv/unicast.c55
-rw-r--r--net/batman-adv/unicast.h25
-rw-r--r--net/batman-adv/vis.c16
-rw-r--r--net/batman-adv/vis.h2
-rw-r--r--net/bridge/br_device.c17
-rw-r--r--net/bridge/br_fdb.c4
-rw-r--r--net/bridge/br_if.c15
-rw-r--r--net/bridge/br_input.c2
-rw-r--r--net/bridge/br_multicast.c19
-rw-r--r--net/bridge/br_private.h5
-rw-r--r--net/bridge/netfilter/ebt_ip6.c46
-rw-r--r--net/bridge/netfilter/ebtables.c1
-rw-r--r--net/caif/cfcnfg.c11
-rw-r--r--net/caif/cfdgml.c1
-rw-r--r--net/caif/cfserl.c1
-rw-r--r--net/caif/cfutill.c2
-rw-r--r--net/caif/cfveil.c2
-rw-r--r--net/caif/chnl_net.c4
-rw-r--r--net/can/bcm.c3
-rw-r--r--net/can/raw.c3
-rw-r--r--net/core/dev.c406
-rw-r--r--net/core/dst.c43
-rw-r--r--net/core/ethtool.c531
-rw-r--r--net/core/filter.c6
-rw-r--r--net/core/neighbour.c13
-rw-r--r--net/core/net-sysfs.c17
-rw-r--r--net/core/pktgen.c234
-rw-r--r--net/core/rtnetlink.c94
-rw-r--r--net/core/skbuff.c13
-rw-r--r--net/dcb/dcbnl.c22
-rw-r--r--net/dccp/ccids/ccid2.c9
-rw-r--r--net/decnet/dn_route.c22
-rw-r--r--net/decnet/dn_table.c1
-rw-r--r--net/dsa/dsa.c2
-rw-r--r--net/econet/af_econet.c4
-rw-r--r--net/ipv4/Kconfig42
-rw-r--r--net/ipv4/Makefile4
-rw-r--r--net/ipv4/af_inet.c18
-rw-r--r--net/ipv4/ah4.c25
-rw-r--r--net/ipv4/arp.c11
-rw-r--r--net/ipv4/devinet.c108
-rw-r--r--net/ipv4/fib_frontend.c63
-rw-r--r--net/ipv4/fib_hash.c1133
-rw-r--r--net/ipv4/fib_lookup.h2
-rw-r--r--net/ipv4/fib_rules.c12
-rw-r--r--net/ipv4/fib_semantics.c125
-rw-r--r--net/ipv4/fib_trie.c217
-rw-r--r--net/ipv4/icmp.c49
-rw-r--r--net/ipv4/inet_diag.c2
-rw-r--r--net/ipv4/inet_timewait_sock.c2
-rw-r--r--net/ipv4/inetpeer.c54
-rw-r--r--net/ipv4/ip_gre.c1
-rw-r--r--net/ipv4/ip_input.c2
-rw-r--r--net/ipv4/ipmr.c76
-rw-r--r--net/ipv4/netfilter/Kconfig3
-rw-r--r--net/ipv4/netfilter/arp_tables.c2
-rw-r--r--net/ipv4/netfilter/arpt_mangle.c6
-rw-r--r--net/ipv4/netfilter/ip_tables.c2
-rw-r--r--net/ipv4/netfilter/ipt_CLUSTERIP.c7
-rw-r--r--net/ipv4/netfilter/ipt_LOG.c3
-rw-r--r--net/ipv4/netfilter/iptable_mangle.c2
-rw-r--r--net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c17
-rw-r--r--net/ipv4/netfilter/nf_nat_amanda.c8
-rw-r--r--net/ipv4/netfilter/nf_nat_core.c33
-rw-r--r--net/ipv4/netfilter/nf_nat_snmp_basic.c9
-rw-r--r--net/ipv4/raw.c19
-rw-r--r--net/ipv4/route.c733
-rw-r--r--net/ipv4/tcp.c11
-rw-r--r--net/ipv4/tcp_input.c4
-rw-r--r--net/ipv4/tcp_ipv4.c8
-rw-r--r--net/ipv4/tcp_timer.c3
-rw-r--r--net/ipv4/udp.c2
-rw-r--r--net/ipv4/xfrm4_policy.c4
-rw-r--r--net/ipv6/addrconf.c84
-rw-r--r--net/ipv6/af_inet6.c2
-rw-r--r--net/ipv6/icmp.c16
-rw-r--r--net/ipv6/ip6_output.c5
-rw-r--r--net/ipv6/ip6mr.c75
-rw-r--r--net/ipv6/ndisc.c4
-rw-r--r--net/ipv6/netfilter/ip6_tables.c2
-rw-r--r--net/ipv6/netfilter/ip6t_LOG.c5
-rw-r--r--net/ipv6/netfilter/nf_conntrack_reasm.c3
-rw-r--r--net/ipv6/raw.c33
-rw-r--r--net/ipv6/route.c88
-rw-r--r--net/ipv6/sit.c23
-rw-r--r--net/ipv6/sysctl_net_ipv6.c9
-rw-r--r--net/ipv6/tcp_ipv6.c6
-rw-r--r--net/ipv6/udp.c2
-rw-r--r--net/ipv6/xfrm6_policy.c8
-rw-r--r--net/mac80211/Kconfig6
-rw-r--r--net/netfilter/Kconfig66
-rw-r--r--net/netfilter/Makefile9
-rw-r--r--net/netfilter/core.c23
-rw-r--r--net/netfilter/ipset/Kconfig121
-rw-r--r--net/netfilter/ipset/Makefile24
-rw-r--r--net/netfilter/ipset/ip_set_bitmap_ip.c587
-rw-r--r--net/netfilter/ipset/ip_set_bitmap_ipmac.c652
-rw-r--r--net/netfilter/ipset/ip_set_bitmap_port.c515
-rw-r--r--net/netfilter/ipset/ip_set_core.c1671
-rw-r--r--net/netfilter/ipset/ip_set_getport.c141
-rw-r--r--net/netfilter/ipset/ip_set_hash_ip.c464
-rw-r--r--net/netfilter/ipset/ip_set_hash_ipport.c544
-rw-r--r--net/netfilter/ipset/ip_set_hash_ipportip.c562
-rw-r--r--net/netfilter/ipset/ip_set_hash_ipportnet.c628
-rw-r--r--net/netfilter/ipset/ip_set_hash_net.c458
-rw-r--r--net/netfilter/ipset/ip_set_hash_netport.c578
-rw-r--r--net/netfilter/ipset/ip_set_list_set.c584
-rw-r--r--net/netfilter/ipset/pfxlen.c291
-rw-r--r--net/netfilter/ipvs/ip_vs_app.c98
-rw-r--r--net/netfilter/ipvs/ip_vs_conn.c195
-rw-r--r--net/netfilter/ipvs/ip_vs_core.c376
-rw-r--r--net/netfilter/ipvs/ip_vs_ctl.c892
-rw-r--r--net/netfilter/ipvs/ip_vs_est.c134
-rw-r--r--net/netfilter/ipvs/ip_vs_ftp.c61
-rw-r--r--net/netfilter/ipvs/ip_vs_lblc.c67
-rw-r--r--net/netfilter/ipvs/ip_vs_lblcr.c72
-rw-r--r--net/netfilter/ipvs/ip_vs_nfct.c6
-rw-r--r--net/netfilter/ipvs/ip_vs_pe.c17
-rw-r--r--net/netfilter/ipvs/ip_vs_pe_sip.c3
-rw-r--r--net/netfilter/ipvs/ip_vs_proto.c129
-rw-r--r--net/netfilter/ipvs/ip_vs_proto_ah_esp.c45
-rw-r--r--net/netfilter/ipvs/ip_vs_proto_sctp.c153
-rw-r--r--net/netfilter/ipvs/ip_vs_proto_tcp.c142
-rw-r--r--net/netfilter/ipvs/ip_vs_proto_udp.c110
-rw-r--r--net/netfilter/ipvs/ip_vs_sync.c1239
-rw-r--r--net/netfilter/ipvs/ip_vs_xmit.c26
-rw-r--r--net/netfilter/nf_conntrack_broadcast.c82
-rw-r--r--net/netfilter/nf_conntrack_core.c68
-rw-r--r--net/netfilter/nf_conntrack_ecache.c3
-rw-r--r--net/netfilter/nf_conntrack_expect.c34
-rw-r--r--net/netfilter/nf_conntrack_extend.c11
-rw-r--r--net/netfilter/nf_conntrack_helper.c20
-rw-r--r--net/netfilter/nf_conntrack_netbios_ns.c74
-rw-r--r--net/netfilter/nf_conntrack_netlink.c54
-rw-r--r--net/netfilter/nf_conntrack_proto.c24
-rw-r--r--net/netfilter/nf_conntrack_proto_dccp.c3
-rw-r--r--net/netfilter/nf_conntrack_proto_sctp.c1
-rw-r--r--net/netfilter/nf_conntrack_proto_tcp.c14
-rw-r--r--net/netfilter/nf_conntrack_snmp.c77
-rw-r--r--net/netfilter/nf_conntrack_standalone.c45
-rw-r--r--net/netfilter/nf_conntrack_timestamp.c120
-rw-r--r--net/netfilter/nf_log.c6
-rw-r--r--net/netfilter/nf_queue.c82
-rw-r--r--net/netfilter/nf_tproxy_core.c27
-rw-r--r--net/netfilter/nfnetlink_log.c6
-rw-r--r--net/netfilter/nfnetlink_queue.c22
-rw-r--r--net/netfilter/x_tables.c98
-rw-r--r--net/netfilter/xt_AUDIT.c204
-rw-r--r--net/netfilter/xt_CLASSIFY.c36
-rw-r--r--net/netfilter/xt_IDLETIMER.c2
-rw-r--r--net/netfilter/xt_LED.c2
-rw-r--r--net/netfilter/xt_NFQUEUE.c34
-rw-r--r--net/netfilter/xt_TPROXY.c22
-rw-r--r--net/netfilter/xt_connlimit.c62
-rw-r--r--net/netfilter/xt_conntrack.c75
-rw-r--r--net/netfilter/xt_cpu.c2
-rw-r--r--net/netfilter/xt_devgroup.c82
-rw-r--r--net/netfilter/xt_iprange.c34
-rw-r--r--net/netfilter/xt_ipvs.c2
-rw-r--r--net/netfilter/xt_set.c359
-rw-r--r--net/netfilter/xt_socket.c13
-rw-r--r--net/netlink/genetlink.c2
-rw-r--r--net/packet/af_packet.c38
-rw-r--r--net/rds/rds.h1
-rw-r--r--net/rfkill/Kconfig4
-rw-r--r--net/rose/af_rose.c7
-rw-r--r--net/rose/rose_route.c28
-rw-r--r--net/sched/Kconfig28
-rw-r--r--net/sched/Makefile3
-rw-r--r--net/sched/act_api.c46
-rw-r--r--net/sched/act_csum.c2
-rw-r--r--net/sched/act_gact.c8
-rw-r--r--net/sched/act_ipt.c16
-rw-r--r--net/sched/act_mirred.c4
-rw-r--r--net/sched/act_nat.c2
-rw-r--r--net/sched/act_pedit.c10
-rw-r--r--net/sched/act_police.c9
-rw-r--r--net/sched/act_simple.c10
-rw-r--r--net/sched/act_skbedit.c8
-rw-r--r--net/sched/cls_api.c33
-rw-r--r--net/sched/cls_basic.c17
-rw-r--r--net/sched/cls_cgroup.c8
-rw-r--r--net/sched/cls_flow.c6
-rw-r--r--net/sched/cls_fw.c38
-rw-r--r--net/sched/cls_route.c126
-rw-r--r--net/sched/cls_rsvp.h95
-rw-r--r--net/sched/cls_tcindex.c2
-rw-r--r--net/sched/cls_u32.c77
-rw-r--r--net/sched/em_cmp.c47
-rw-r--r--net/sched/em_meta.c44
-rw-r--r--net/sched/em_nbyte.c3
-rw-r--r--net/sched/em_text.c3
-rw-r--r--net/sched/em_u32.c2
-rw-r--r--net/sched/ematch.c37
-rw-r--r--net/sched/sch_api.c169
-rw-r--r--net/sched/sch_atm.c16
-rw-r--r--net/sched/sch_cbq.c365
-rw-r--r--net/sched/sch_choke.c677
-rw-r--r--net/sched/sch_drr.c2
-rw-r--r--net/sched/sch_dsmark.c23
-rw-r--r--net/sched/sch_fifo.c27
-rw-r--r--net/sched/sch_generic.c40
-rw-r--r--net/sched/sch_gred.c85
-rw-r--r--net/sched/sch_hfsc.c39
-rw-r--r--net/sched/sch_htb.c118
-rw-r--r--net/sched/sch_mq.c1
-rw-r--r--net/sched/sch_mqprio.c416
-rw-r--r--net/sched/sch_multiq.c10
-rw-r--r--net/sched/sch_netem.c11
-rw-r--r--net/sched/sch_prio.c36
-rw-r--r--net/sched/sch_red.c72
-rw-r--r--net/sched/sch_sfq.c72
-rw-r--r--net/sched/sch_tbf.c41
-rw-r--r--net/sched/sch_teql.c39
-rw-r--r--net/sctp/sm_make_chunk.c10
-rw-r--r--net/sctp/socket.c13
-rw-r--r--net/sctp/tsnmap.c2
-rw-r--r--net/socket.c23
-rw-r--r--net/sunrpc/svcsock.c36
-rw-r--r--net/unix/af_unix.c68
-rw-r--r--net/wanrouter/wanmain.c2
-rw-r--r--net/wireless/Kconfig2
-rw-r--r--net/x25/x25_facilities.c28
-rw-r--r--net/x25/x25_in.c14
-rw-r--r--net/x25/x25_link.c5
-rw-r--r--net/xfrm/xfrm_policy.c7
-rw-r--r--net/xfrm/xfrm_user.c2
265 files changed, 16630 insertions, 5122 deletions
diff --git a/net/8021q/vlan.c b/net/8021q/vlan.c
index 6e64f7c6a2e..7850412f52b 100644
--- a/net/8021q/vlan.c
+++ b/net/8021q/vlan.c
@@ -327,7 +327,7 @@ static void vlan_sync_address(struct net_device *dev,
static void vlan_transfer_features(struct net_device *dev,
struct net_device *vlandev)
{
- unsigned long old_features = vlandev->features;
+ u32 old_features = vlandev->features;
vlandev->features &= ~dev->vlan_features;
vlandev->features |= dev->features & dev->vlan_features;
diff --git a/net/9p/trans_rdma.c b/net/9p/trans_rdma.c
index 17c5ba7551a..29a54ccd213 100644
--- a/net/9p/trans_rdma.c
+++ b/net/9p/trans_rdma.c
@@ -59,7 +59,6 @@
* safely advertise a maxsize
* of 64k */
-#define P9_RDMA_MAX_SGE (P9_RDMA_MAXSIZE >> PAGE_SHIFT)
/**
* struct p9_trans_rdma - RDMA transport instance
*
diff --git a/net/Kconfig b/net/Kconfig
index 72840626284..79cabf1ee68 100644
--- a/net/Kconfig
+++ b/net/Kconfig
@@ -221,6 +221,12 @@ config RPS
depends on SMP && SYSFS && USE_GENERIC_SMP_HELPERS
default y
+config RFS_ACCEL
+ boolean
+ depends on RPS && GENERIC_HARDIRQS
+ select CPU_RMAP
+ default y
+
config XPS
boolean
depends on SMP && SYSFS && USE_GENERIC_SMP_HELPERS
diff --git a/net/batman-adv/Makefile b/net/batman-adv/Makefile
index d936aeccd19..2de93d00631 100644
--- a/net/batman-adv/Makefile
+++ b/net/batman-adv/Makefile
@@ -1,5 +1,5 @@
#
-# Copyright (C) 2007-2010 B.A.T.M.A.N. contributors:
+# Copyright (C) 2007-2011 B.A.T.M.A.N. contributors:
#
# Marek Lindner, Simon Wunderlich
#
diff --git a/net/batman-adv/aggregation.c b/net/batman-adv/aggregation.c
index 3850a3ecf94..1997725a243 100644
--- a/net/batman-adv/aggregation.c
+++ b/net/batman-adv/aggregation.c
@@ -1,5 +1,5 @@
/*
- * Copyright (C) 2007-2010 B.A.T.M.A.N. contributors:
+ * Copyright (C) 2007-2011 B.A.T.M.A.N. contributors:
*
* Marek Lindner, Simon Wunderlich
*
diff --git a/net/batman-adv/aggregation.h b/net/batman-adv/aggregation.h
index 71a91b3da91..6ce305b4001 100644
--- a/net/batman-adv/aggregation.h
+++ b/net/batman-adv/aggregation.h
@@ -1,5 +1,5 @@
/*
- * Copyright (C) 2007-2010 B.A.T.M.A.N. contributors:
+ * Copyright (C) 2007-2011 B.A.T.M.A.N. contributors:
*
* Marek Lindner, Simon Wunderlich
*
diff --git a/net/batman-adv/bat_debugfs.c b/net/batman-adv/bat_debugfs.c
index 0ae81d07f10..0e9d4350993 100644
--- a/net/batman-adv/bat_debugfs.c
+++ b/net/batman-adv/bat_debugfs.c
@@ -1,5 +1,5 @@
/*
- * Copyright (C) 2010 B.A.T.M.A.N. contributors:
+ * Copyright (C) 2010-2011 B.A.T.M.A.N. contributors:
*
* Marek Lindner
*
@@ -52,7 +52,6 @@ static void emit_log_char(struct debug_log *debug_log, char c)
static int fdebug_log(struct debug_log *debug_log, char *fmt, ...)
{
- int printed_len;
va_list args;
static char debug_log_buf[256];
char *p;
@@ -62,8 +61,7 @@ static int fdebug_log(struct debug_log *debug_log, char *fmt, ...)
spin_lock_bh(&debug_log->lock);
va_start(args, fmt);
- printed_len = vscnprintf(debug_log_buf, sizeof(debug_log_buf),
- fmt, args);
+ vscnprintf(debug_log_buf, sizeof(debug_log_buf), fmt, args);
va_end(args);
for (p = debug_log_buf; *p != 0; p++)
diff --git a/net/batman-adv/bat_debugfs.h b/net/batman-adv/bat_debugfs.h
index 72df532b7d5..bc9cda3f01e 100644
--- a/net/batman-adv/bat_debugfs.h
+++ b/net/batman-adv/bat_debugfs.h
@@ -1,5 +1,5 @@
/*
- * Copyright (C) 2010 B.A.T.M.A.N. contributors:
+ * Copyright (C) 2010-2011 B.A.T.M.A.N. contributors:
*
* Marek Lindner
*
diff --git a/net/batman-adv/bat_sysfs.c b/net/batman-adv/bat_sysfs.c
index cd7bb51825f..f7b93a0805f 100644
--- a/net/batman-adv/bat_sysfs.c
+++ b/net/batman-adv/bat_sysfs.c
@@ -1,5 +1,5 @@
/*
- * Copyright (C) 2010 B.A.T.M.A.N. contributors:
+ * Copyright (C) 2010-2011 B.A.T.M.A.N. contributors:
*
* Marek Lindner
*
diff --git a/net/batman-adv/bat_sysfs.h b/net/batman-adv/bat_sysfs.h
index 7f186c007b4..02f1fa7aadf 100644
--- a/net/batman-adv/bat_sysfs.h
+++ b/net/batman-adv/bat_sysfs.h
@@ -1,5 +1,5 @@
/*
- * Copyright (C) 2010 B.A.T.M.A.N. contributors:
+ * Copyright (C) 2010-2011 B.A.T.M.A.N. contributors:
*
* Marek Lindner
*
diff --git a/net/batman-adv/bitarray.c b/net/batman-adv/bitarray.c
index bbcd8f744cd..ad2ca925b3e 100644
--- a/net/batman-adv/bitarray.c
+++ b/net/batman-adv/bitarray.c
@@ -1,5 +1,5 @@
/*
- * Copyright (C) 2006-2010 B.A.T.M.A.N. contributors:
+ * Copyright (C) 2006-2011 B.A.T.M.A.N. contributors:
*
* Simon Wunderlich, Marek Lindner
*
diff --git a/net/batman-adv/bitarray.h b/net/batman-adv/bitarray.h
index ac54017601b..769c246d1fc 100644
--- a/net/batman-adv/bitarray.h
+++ b/net/batman-adv/bitarray.h
@@ -1,5 +1,5 @@
/*
- * Copyright (C) 2006-2010 B.A.T.M.A.N. contributors:
+ * Copyright (C) 2006-2011 B.A.T.M.A.N. contributors:
*
* Simon Wunderlich, Marek Lindner
*
diff --git a/net/batman-adv/gateway_client.c b/net/batman-adv/gateway_client.c
index 0065ffb8d96..429a013d2e0 100644
--- a/net/batman-adv/gateway_client.c
+++ b/net/batman-adv/gateway_client.c
@@ -1,5 +1,5 @@
/*
- * Copyright (C) 2009-2010 B.A.T.M.A.N. contributors:
+ * Copyright (C) 2009-2011 B.A.T.M.A.N. contributors:
*
* Marek Lindner
*
diff --git a/net/batman-adv/gateway_client.h b/net/batman-adv/gateway_client.h
index 4585e654984..2aa439124ee 100644
--- a/net/batman-adv/gateway_client.h
+++ b/net/batman-adv/gateway_client.h
@@ -1,5 +1,5 @@
/*
- * Copyright (C) 2009-2010 B.A.T.M.A.N. contributors:
+ * Copyright (C) 2009-2011 B.A.T.M.A.N. contributors:
*
* Marek Lindner
*
diff --git a/net/batman-adv/gateway_common.c b/net/batman-adv/gateway_common.c
index b962982f017..50d3a59a3d7 100644
--- a/net/batman-adv/gateway_common.c
+++ b/net/batman-adv/gateway_common.c
@@ -1,5 +1,5 @@
/*
- * Copyright (C) 2009-2010 B.A.T.M.A.N. contributors:
+ * Copyright (C) 2009-2011 B.A.T.M.A.N. contributors:
*
* Marek Lindner
*
diff --git a/net/batman-adv/gateway_common.h b/net/batman-adv/gateway_common.h
index 5e728d0b795..55e527a489f 100644
--- a/net/batman-adv/gateway_common.h
+++ b/net/batman-adv/gateway_common.h
@@ -1,5 +1,5 @@
/*
- * Copyright (C) 2009-2010 B.A.T.M.A.N. contributors:
+ * Copyright (C) 2009-2011 B.A.T.M.A.N. contributors:
*
* Marek Lindner
*
diff --git a/net/batman-adv/hard-interface.c b/net/batman-adv/hard-interface.c
index 4f95777ce08..f2131f45aa9 100644
--- a/net/batman-adv/hard-interface.c
+++ b/net/batman-adv/hard-interface.c
@@ -1,5 +1,5 @@
/*
- * Copyright (C) 2007-2010 B.A.T.M.A.N. contributors:
+ * Copyright (C) 2007-2011 B.A.T.M.A.N. contributors:
*
* Marek Lindner, Simon Wunderlich
*
@@ -34,6 +34,12 @@
/* protect update critical side of if_list - but not the content */
static DEFINE_SPINLOCK(if_list_lock);
+
+static int batman_skb_recv(struct sk_buff *skb,
+ struct net_device *dev,
+ struct packet_type *ptype,
+ struct net_device *orig_dev);
+
static void hardif_free_rcu(struct rcu_head *rcu)
{
struct batman_if *batman_if;
@@ -549,8 +555,9 @@ out:
/* receive a packet with the batman ethertype coming on a hard
* interface */
-int batman_skb_recv(struct sk_buff *skb, struct net_device *dev,
- struct packet_type *ptype, struct net_device *orig_dev)
+static int batman_skb_recv(struct sk_buff *skb, struct net_device *dev,
+ struct packet_type *ptype,
+ struct net_device *orig_dev)
{
struct bat_priv *bat_priv;
struct batman_packet *batman_packet;
diff --git a/net/batman-adv/hard-interface.h b/net/batman-adv/hard-interface.h
index 30ec3b8db45..ad195438428 100644
--- a/net/batman-adv/hard-interface.h
+++ b/net/batman-adv/hard-interface.h
@@ -1,5 +1,5 @@
/*
- * Copyright (C) 2007-2010 B.A.T.M.A.N. contributors:
+ * Copyright (C) 2007-2011 B.A.T.M.A.N. contributors:
*
* Marek Lindner, Simon Wunderlich
*
@@ -35,10 +35,6 @@ struct batman_if *get_batman_if_by_netdev(struct net_device *net_dev);
int hardif_enable_interface(struct batman_if *batman_if, char *iface_name);
void hardif_disable_interface(struct batman_if *batman_if);
void hardif_remove_interfaces(void);
-int batman_skb_recv(struct sk_buff *skb,
- struct net_device *dev,
- struct packet_type *ptype,
- struct net_device *orig_dev);
int hardif_min_mtu(struct net_device *soft_iface);
void update_min_mtu(struct net_device *soft_iface);
diff --git a/net/batman-adv/hash.c b/net/batman-adv/hash.c
index 26e623eb9de..fa2693973ab 100644
--- a/net/batman-adv/hash.c
+++ b/net/batman-adv/hash.c
@@ -1,5 +1,5 @@
/*
- * Copyright (C) 2006-2010 B.A.T.M.A.N. contributors:
+ * Copyright (C) 2006-2011 B.A.T.M.A.N. contributors:
*
* Simon Wunderlich, Marek Lindner
*
diff --git a/net/batman-adv/hash.h b/net/batman-adv/hash.h
index 09216ade16f..eae24402fd0 100644
--- a/net/batman-adv/hash.h
+++ b/net/batman-adv/hash.h
@@ -1,5 +1,5 @@
/*
- * Copyright (C) 2006-2010 B.A.T.M.A.N. contributors:
+ * Copyright (C) 2006-2011 B.A.T.M.A.N. contributors:
*
* Simon Wunderlich, Marek Lindner
*
@@ -49,11 +49,6 @@ struct hashtable_t {
/* allocates and clears the hash */
struct hashtable_t *hash_new(int size);
-/* remove element if you already found the element you want to delete and don't
- * need the overhead to find it again with hash_remove(). But usually, you
- * don't want to use this function, as it fiddles with hash-internals. */
-void *hash_remove_element(struct hashtable_t *hash, struct element_t *elem);
-
/* free only the hashtable and the hash itself. */
void hash_destroy(struct hashtable_t *hash);
diff --git a/net/batman-adv/icmp_socket.c b/net/batman-adv/icmp_socket.c
index ecf6d7ffab2..319a7ccf6ef 100644
--- a/net/batman-adv/icmp_socket.c
+++ b/net/batman-adv/icmp_socket.c
@@ -1,5 +1,5 @@
/*
- * Copyright (C) 2007-2010 B.A.T.M.A.N. contributors:
+ * Copyright (C) 2007-2011 B.A.T.M.A.N. contributors:
*
* Marek Lindner
*
@@ -24,7 +24,6 @@
#include <linux/slab.h>
#include "icmp_socket.h"
#include "send.h"
-#include "types.h"
#include "hash.h"
#include "originator.h"
#include "hard-interface.h"
diff --git a/net/batman-adv/icmp_socket.h b/net/batman-adv/icmp_socket.h
index bf9b348cde2..462b190fa10 100644
--- a/net/batman-adv/icmp_socket.h
+++ b/net/batman-adv/icmp_socket.h
@@ -1,5 +1,5 @@
/*
- * Copyright (C) 2007-2010 B.A.T.M.A.N. contributors:
+ * Copyright (C) 2007-2011 B.A.T.M.A.N. contributors:
*
* Marek Lindner
*
@@ -22,8 +22,6 @@
#ifndef _NET_BATMAN_ADV_ICMP_SOCKET_H_
#define _NET_BATMAN_ADV_ICMP_SOCKET_H_
-#include "types.h"
-
#define ICMP_SOCKET "socket"
void bat_socket_init(void);
diff --git a/net/batman-adv/main.c b/net/batman-adv/main.c
index b827f6a158c..06d956c91c2 100644
--- a/net/batman-adv/main.c
+++ b/net/batman-adv/main.c
@@ -1,5 +1,5 @@
/*
- * Copyright (C) 2007-2010 B.A.T.M.A.N. contributors:
+ * Copyright (C) 2007-2011 B.A.T.M.A.N. contributors:
*
* Marek Lindner, Simon Wunderlich
*
@@ -30,7 +30,6 @@
#include "translation-table.h"
#include "hard-interface.h"
#include "gateway_client.h"
-#include "types.h"
#include "vis.h"
#include "hash.h"
diff --git a/net/batman-adv/main.h b/net/batman-adv/main.h
index d4d9926c220..e235d7bbe04 100644
--- a/net/batman-adv/main.h
+++ b/net/batman-adv/main.h
@@ -1,5 +1,5 @@
/*
- * Copyright (C) 2007-2010 B.A.T.M.A.N. contributors:
+ * Copyright (C) 2007-2011 B.A.T.M.A.N. contributors:
*
* Marek Lindner, Simon Wunderlich
*
@@ -22,9 +22,6 @@
#ifndef _NET_BATMAN_ADV_MAIN_H_
#define _NET_BATMAN_ADV_MAIN_H_
-/* Kernel Programming */
-#define LINUX
-
#define DRIVER_AUTHOR "Marek Lindner <lindner_marek@yahoo.de>, " \
"Simon Wunderlich <siwu@hrz.tu-chemnitz.de>"
#define DRIVER_DESC "B.A.T.M.A.N. advanced"
@@ -54,7 +51,6 @@
#define NUM_WORDS (TQ_LOCAL_WINDOW_SIZE / WORD_BIT_SIZE)
-#define PACKBUFF_SIZE 2000
#define LOG_BUF_LEN 8192 /* has to be a power of 2 */
#define VIS_INTERVAL 5000 /* 5 seconds */
@@ -96,15 +92,11 @@
#define DBG_ROUTES 2 /* route or hna added / changed / deleted */
#define DBG_ALL 3
-#define LOG_BUF_LEN 8192 /* has to be a power of 2 */
-
/*
* Vis
*/
-/* #define VIS_SUBCLUSTERS_DISABLED */
-
/*
* Kernel headers
*/
@@ -151,20 +143,13 @@ int debug_log(struct bat_priv *bat_priv, char *fmt, ...);
} \
while (0)
#else /* !CONFIG_BATMAN_ADV_DEBUG */
-static inline void bat_dbg(char type __attribute__((unused)),
- struct bat_priv *bat_priv __attribute__((unused)),
- char *fmt __attribute__((unused)), ...)
+static inline void bat_dbg(char type __always_unused,
+ struct bat_priv *bat_priv __always_unused,
+ char *fmt __always_unused, ...)
{
}
#endif
-#define bat_warning(net_dev, fmt, arg...) \
- do { \
- struct net_device *_netdev = (net_dev); \
- struct bat_priv *_batpriv = netdev_priv(_netdev); \
- bat_dbg(DBG_ALL, _batpriv, fmt, ## arg); \
- pr_warning("%s: " fmt, _netdev->name, ## arg); \
- } while (0)
#define bat_info(net_dev, fmt, arg...) \
do { \
struct net_device *_netdev = (net_dev); \
diff --git a/net/batman-adv/originator.c b/net/batman-adv/originator.c
index 6b7fb6b7e6f..54863c9385d 100644
--- a/net/batman-adv/originator.c
+++ b/net/batman-adv/originator.c
@@ -1,5 +1,5 @@
/*
- * Copyright (C) 2009-2010 B.A.T.M.A.N. contributors:
+ * Copyright (C) 2009-2011 B.A.T.M.A.N. contributors:
*
* Marek Lindner, Simon Wunderlich
*
@@ -247,7 +247,7 @@ static bool purge_orig_node(struct bat_priv *bat_priv,
orig_node->hna_buff_len);
/* update bonding candidates, we could have lost
* some candidates. */
- update_bonding_candidates(bat_priv, orig_node);
+ update_bonding_candidates(orig_node);
}
}
diff --git a/net/batman-adv/originator.h b/net/batman-adv/originator.h
index d474ceb2a4e..8019fbddffd 100644
--- a/net/batman-adv/originator.h
+++ b/net/batman-adv/originator.h
@@ -1,5 +1,5 @@
/*
- * Copyright (C) 2007-2010 B.A.T.M.A.N. contributors:
+ * Copyright (C) 2007-2011 B.A.T.M.A.N. contributors:
*
* Marek Lindner, Simon Wunderlich
*
diff --git a/net/batman-adv/packet.h b/net/batman-adv/packet.h
index b49fdf70a6d..e7571879af3 100644
--- a/net/batman-adv/packet.h
+++ b/net/batman-adv/packet.h
@@ -1,5 +1,5 @@
/*
- * Copyright (C) 2007-2010 B.A.T.M.A.N. contributors:
+ * Copyright (C) 2007-2011 B.A.T.M.A.N. contributors:
*
* Marek Lindner, Simon Wunderlich
*
@@ -50,6 +50,7 @@
/* fragmentation defines */
#define UNI_FRAG_HEAD 0x01
+#define UNI_FRAG_LARGETAIL 0x02
struct batman_packet {
uint8_t packet_type;
@@ -63,7 +64,7 @@ struct batman_packet {
uint8_t num_hna;
uint8_t gw_flags; /* flags related to gateway class */
uint8_t align;
-} __attribute__((packed));
+} __packed;
#define BAT_PACKET_LEN sizeof(struct batman_packet)
@@ -76,7 +77,7 @@ struct icmp_packet {
uint8_t orig[6];
uint16_t seqno;
uint8_t uid;
-} __attribute__((packed));
+} __packed;
#define BAT_RR_LEN 16
@@ -93,14 +94,14 @@ struct icmp_packet_rr {
uint8_t uid;
uint8_t rr_cur;
uint8_t rr[BAT_RR_LEN][ETH_ALEN];
-} __attribute__((packed));
+} __packed;
struct unicast_packet {
uint8_t packet_type;
uint8_t version; /* batman version field */
uint8_t dest[6];
uint8_t ttl;
-} __attribute__((packed));
+} __packed;
struct unicast_frag_packet {
uint8_t packet_type;
@@ -110,7 +111,7 @@ struct unicast_frag_packet {
uint8_t flags;
uint8_t orig[6];
uint16_t seqno;
-} __attribute__((packed));
+} __packed;
struct bcast_packet {
uint8_t packet_type;
@@ -118,7 +119,7 @@ struct bcast_packet {
uint8_t orig[6];
uint8_t ttl;
uint32_t seqno;
-} __attribute__((packed));
+} __packed;
struct vis_packet {
uint8_t packet_type;
@@ -131,6 +132,6 @@ struct vis_packet {
* neighbors */
uint8_t target_orig[6]; /* who should receive this packet */
uint8_t sender_orig[6]; /* who sent or rebroadcasted this packet */
-} __attribute__((packed));
+} __packed;
#endif /* _NET_BATMAN_ADV_PACKET_H_ */
diff --git a/net/batman-adv/ring_buffer.c b/net/batman-adv/ring_buffer.c
index defd37c9be1..5bb6a619afe 100644
--- a/net/batman-adv/ring_buffer.c
+++ b/net/batman-adv/ring_buffer.c
@@ -1,5 +1,5 @@
/*
- * Copyright (C) 2007-2010 B.A.T.M.A.N. contributors:
+ * Copyright (C) 2007-2011 B.A.T.M.A.N. contributors:
*
* Marek Lindner
*
diff --git a/net/batman-adv/ring_buffer.h b/net/batman-adv/ring_buffer.h
index 6b0cb9aaeba..0395b274186 100644
--- a/net/batman-adv/ring_buffer.h
+++ b/net/batman-adv/ring_buffer.h
@@ -1,5 +1,5 @@
/*
- * Copyright (C) 2007-2010 B.A.T.M.A.N. contributors:
+ * Copyright (C) 2007-2011 B.A.T.M.A.N. contributors:
*
* Marek Lindner
*
diff --git a/net/batman-adv/routing.c b/net/batman-adv/routing.c
index 8828eddd3f7..827414067e4 100644
--- a/net/batman-adv/routing.c
+++ b/net/batman-adv/routing.c
@@ -1,5 +1,5 @@
/*
- * Copyright (C) 2007-2010 B.A.T.M.A.N. contributors:
+ * Copyright (C) 2007-2011 B.A.T.M.A.N. contributors:
*
* Marek Lindner, Simon Wunderlich
*
@@ -28,7 +28,6 @@
#include "icmp_socket.h"
#include "translation-table.h"
#include "originator.h"
-#include "types.h"
#include "ring_buffer.h"
#include "vis.h"
#include "aggregation.h"
@@ -433,8 +432,7 @@ static char count_real_packets(struct ethhdr *ethhdr,
}
/* copy primary address for bonding */
-static void mark_bonding_address(struct bat_priv *bat_priv,
- struct orig_node *orig_node,
+static void mark_bonding_address(struct orig_node *orig_node,
struct orig_node *orig_neigh_node,
struct batman_packet *batman_packet)
@@ -447,8 +445,7 @@ static void mark_bonding_address(struct bat_priv *bat_priv,
}
/* mark possible bond.candidates in the neighbor list */
-void update_bonding_candidates(struct bat_priv *bat_priv,
- struct orig_node *orig_node)
+void update_bonding_candidates(struct orig_node *orig_node)
{
int candidates;
int interference_candidate;
@@ -730,9 +727,8 @@ void receive_bat_packet(struct ethhdr *ethhdr,
update_orig(bat_priv, orig_node, ethhdr, batman_packet,
if_incoming, hna_buff, hna_buff_len, is_duplicate);
- mark_bonding_address(bat_priv, orig_node,
- orig_neigh_node, batman_packet);
- update_bonding_candidates(bat_priv, orig_node);
+ mark_bonding_address(orig_node, orig_neigh_node, batman_packet);
+ update_bonding_candidates(orig_node);
/* is single hop (direct) neighbor */
if (is_single_hop_neigh) {
@@ -810,13 +806,11 @@ static int recv_my_icmp_packet(struct bat_priv *bat_priv,
{
struct orig_node *orig_node;
struct icmp_packet_rr *icmp_packet;
- struct ethhdr *ethhdr;
struct batman_if *batman_if;
int ret;
uint8_t dstaddr[ETH_ALEN];
icmp_packet = (struct icmp_packet_rr *)skb->data;
- ethhdr = (struct ethhdr *)skb_mac_header(skb);
/* add data to device queue */
if (icmp_packet->msg_type != ECHO_REQUEST) {
@@ -848,7 +842,6 @@ static int recv_my_icmp_packet(struct bat_priv *bat_priv,
return NET_RX_DROP;
icmp_packet = (struct icmp_packet_rr *)skb->data;
- ethhdr = (struct ethhdr *)skb_mac_header(skb);
memcpy(icmp_packet->dst, icmp_packet->orig, ETH_ALEN);
memcpy(icmp_packet->orig,
@@ -866,17 +859,15 @@ static int recv_my_icmp_packet(struct bat_priv *bat_priv,
}
static int recv_icmp_ttl_exceeded(struct bat_priv *bat_priv,
- struct sk_buff *skb, size_t icmp_len)
+ struct sk_buff *skb)
{
struct orig_node *orig_node;
struct icmp_packet *icmp_packet;
- struct ethhdr *ethhdr;
struct batman_if *batman_if;
int ret;
uint8_t dstaddr[ETH_ALEN];
icmp_packet = (struct icmp_packet *)skb->data;
- ethhdr = (struct ethhdr *)skb_mac_header(skb);
/* send TTL exceeded if packet is an echo request (traceroute) */
if (icmp_packet->msg_type != ECHO_REQUEST) {
@@ -909,7 +900,6 @@ static int recv_icmp_ttl_exceeded(struct bat_priv *bat_priv,
return NET_RX_DROP;
icmp_packet = (struct icmp_packet *) skb->data;
- ethhdr = (struct ethhdr *)skb_mac_header(skb);
memcpy(icmp_packet->dst, icmp_packet->orig, ETH_ALEN);
memcpy(icmp_packet->orig,
@@ -978,7 +968,7 @@ int recv_icmp_packet(struct sk_buff *skb, struct batman_if *recv_if)
/* TTL exceeded */
if (icmp_packet->ttl < 2)
- return recv_icmp_ttl_exceeded(bat_priv, skb, hdr_size);
+ return recv_icmp_ttl_exceeded(bat_priv, skb);
ret = NET_RX_DROP;
@@ -1001,7 +991,6 @@ int recv_icmp_packet(struct sk_buff *skb, struct batman_if *recv_if)
return NET_RX_DROP;
icmp_packet = (struct icmp_packet_rr *)skb->data;
- ethhdr = (struct ethhdr *)skb_mac_header(skb);
/* decrement ttl */
icmp_packet->ttl--;
@@ -1193,7 +1182,7 @@ int route_unicast_packet(struct sk_buff *skb, struct batman_if *recv_if,
dstaddr);
if (unicast_packet->packet_type == BAT_UNICAST_FRAG &&
- 2 * skb->len - hdr_size <= batman_if->net_dev->mtu) {
+ frag_can_reassemble(skb, batman_if->net_dev->mtu)) {
ret = frag_reassemble_skb(skb, bat_priv, &new_skb);
diff --git a/net/batman-adv/routing.h b/net/batman-adv/routing.h
index f108f230bfd..a09d16f0c3a 100644
--- a/net/batman-adv/routing.h
+++ b/net/batman-adv/routing.h
@@ -1,5 +1,5 @@
/*
- * Copyright (C) 2007-2010 B.A.T.M.A.N. contributors:
+ * Copyright (C) 2007-2011 B.A.T.M.A.N. contributors:
*
* Marek Lindner, Simon Wunderlich
*
@@ -22,8 +22,6 @@
#ifndef _NET_BATMAN_ADV_ROUTING_H_
#define _NET_BATMAN_ADV_ROUTING_H_
-#include "types.h"
-
void slide_own_bcast_window(struct batman_if *batman_if);
void receive_bat_packet(struct ethhdr *ethhdr,
struct batman_packet *batman_packet,
@@ -42,7 +40,6 @@ int recv_vis_packet(struct sk_buff *skb, struct batman_if *recv_if);
int recv_bat_packet(struct sk_buff *skb, struct batman_if *recv_if);
struct neigh_node *find_router(struct bat_priv *bat_priv,
struct orig_node *orig_node, struct batman_if *recv_if);
-void update_bonding_candidates(struct bat_priv *bat_priv,
- struct orig_node *orig_node);
+void update_bonding_candidates(struct orig_node *orig_node);
#endif /* _NET_BATMAN_ADV_ROUTING_H_ */
diff --git a/net/batman-adv/send.c b/net/batman-adv/send.c
index b89b9f7709a..831427694fc 100644
--- a/net/batman-adv/send.c
+++ b/net/batman-adv/send.c
@@ -1,5 +1,5 @@
/*
- * Copyright (C) 2007-2010 B.A.T.M.A.N. contributors:
+ * Copyright (C) 2007-2011 B.A.T.M.A.N. contributors:
*
* Marek Lindner, Simon Wunderlich
*
@@ -25,7 +25,6 @@
#include "translation-table.h"
#include "soft-interface.h"
#include "hard-interface.h"
-#include "types.h"
#include "vis.h"
#include "aggregation.h"
#include "gateway_common.h"
@@ -49,7 +48,7 @@ static unsigned long own_send_time(struct bat_priv *bat_priv)
}
/* when do we schedule a forwarded packet to be sent */
-static unsigned long forward_send_time(struct bat_priv *bat_priv)
+static unsigned long forward_send_time(void)
{
return jiffies + msecs_to_jiffies(random32() % (JITTER/2));
}
@@ -356,7 +355,7 @@ void schedule_forward_packet(struct orig_node *orig_node,
else
batman_packet->flags &= ~DIRECTLINK;
- send_time = forward_send_time(bat_priv);
+ send_time = forward_send_time();
add_bat_packet_to_list(bat_priv,
(unsigned char *)batman_packet,
sizeof(struct batman_packet) + hna_buff_len,
diff --git a/net/batman-adv/send.h b/net/batman-adv/send.h
index c4cefa8e4f8..b68c272cb84 100644
--- a/net/batman-adv/send.h
+++ b/net/batman-adv/send.h
@@ -1,5 +1,5 @@
/*
- * Copyright (C) 2007-2010 B.A.T.M.A.N. contributors:
+ * Copyright (C) 2007-2011 B.A.T.M.A.N. contributors:
*
* Marek Lindner, Simon Wunderlich
*
@@ -22,8 +22,6 @@
#ifndef _NET_BATMAN_ADV_SEND_H_
#define _NET_BATMAN_ADV_SEND_H_
-#include "types.h"
-
int send_skb_packet(struct sk_buff *skb,
struct batman_if *batman_if,
uint8_t *dst_addr);
diff --git a/net/batman-adv/soft-interface.c b/net/batman-adv/soft-interface.c
index e89ede192ed..bd088f877e3 100644
--- a/net/batman-adv/soft-interface.c
+++ b/net/batman-adv/soft-interface.c
@@ -1,5 +1,5 @@
/*
- * Copyright (C) 2007-2010 B.A.T.M.A.N. contributors:
+ * Copyright (C) 2007-2011 B.A.T.M.A.N. contributors:
*
* Marek Lindner, Simon Wunderlich
*
@@ -26,7 +26,6 @@
#include "send.h"
#include "bat_debugfs.h"
#include "translation-table.h"
-#include "types.h"
#include "hash.h"
#include "gateway_common.h"
#include "gateway_client.h"
diff --git a/net/batman-adv/soft-interface.h b/net/batman-adv/soft-interface.h
index 02b77334d10..e7b0e1a34a5 100644
--- a/net/batman-adv/soft-interface.h
+++ b/net/batman-adv/soft-interface.h
@@ -1,5 +1,5 @@
/*
- * Copyright (C) 2007-2010 B.A.T.M.A.N. contributors:
+ * Copyright (C) 2007-2011 B.A.T.M.A.N. contributors:
*
* Marek Lindner
*
diff --git a/net/batman-adv/translation-table.c b/net/batman-adv/translation-table.c
index a633b5a435e..7fb6726ccbd 100644
--- a/net/batman-adv/translation-table.c
+++ b/net/batman-adv/translation-table.c
@@ -1,5 +1,5 @@
/*
- * Copyright (C) 2007-2010 B.A.T.M.A.N. contributors:
+ * Copyright (C) 2007-2011 B.A.T.M.A.N. contributors:
*
* Marek Lindner, Simon Wunderlich
*
@@ -22,7 +22,6 @@
#include "main.h"
#include "translation-table.h"
#include "soft-interface.h"
-#include "types.h"
#include "hash.h"
#include "originator.h"
diff --git a/net/batman-adv/translation-table.h b/net/batman-adv/translation-table.h
index 10c4c5c319b..f19931ca145 100644
--- a/net/batman-adv/translation-table.h
+++ b/net/batman-adv/translation-table.h
@@ -1,5 +1,5 @@
/*
- * Copyright (C) 2007-2010 B.A.T.M.A.N. contributors:
+ * Copyright (C) 2007-2011 B.A.T.M.A.N. contributors:
*
* Marek Lindner, Simon Wunderlich
*
@@ -22,8 +22,6 @@
#ifndef _NET_BATMAN_ADV_TRANSLATION_TABLE_H_
#define _NET_BATMAN_ADV_TRANSLATION_TABLE_H_
-#include "types.h"
-
int hna_local_init(struct bat_priv *bat_priv);
void hna_local_add(struct net_device *soft_iface, uint8_t *addr);
void hna_local_remove(struct bat_priv *bat_priv,
diff --git a/net/batman-adv/types.h b/net/batman-adv/types.h
index 97cb23dd3e6..7270405046e 100644
--- a/net/batman-adv/types.h
+++ b/net/batman-adv/types.h
@@ -1,5 +1,5 @@
/*
- * Copyright (C) 2007-2010 B.A.T.M.A.N. contributors:
+ * Copyright (C) 2007-2011 B.A.T.M.A.N. contributors:
*
* Marek Lindner, Simon Wunderlich
*
@@ -246,13 +246,13 @@ struct vis_info {
/* this packet might be part of the vis send queue. */
struct sk_buff *skb_packet;
/* vis_info may follow here*/
-} __attribute__((packed));
+} __packed;
struct vis_info_entry {
uint8_t src[ETH_ALEN];
uint8_t dest[ETH_ALEN];
uint8_t quality; /* quality = 0 means HNA */
-} __attribute__((packed));
+} __packed;
struct recvlist_node {
struct list_head list;
diff --git a/net/batman-adv/unicast.c b/net/batman-adv/unicast.c
index dc2e28bed84..121b11d2a23 100644
--- a/net/batman-adv/unicast.c
+++ b/net/batman-adv/unicast.c
@@ -1,5 +1,5 @@
/*
- * Copyright (C) 2010 B.A.T.M.A.N. contributors:
+ * Copyright (C) 2010-2011 B.A.T.M.A.N. contributors:
*
* Andreas Langer
*
@@ -39,8 +39,8 @@ static struct sk_buff *frag_merge_packet(struct list_head *head,
(struct unicast_frag_packet *)skb->data;
struct sk_buff *tmp_skb;
struct unicast_packet *unicast_packet;
- int hdr_len = sizeof(struct unicast_packet),
- uni_diff = sizeof(struct unicast_frag_packet) - hdr_len;
+ int hdr_len = sizeof(struct unicast_packet);
+ int uni_diff = sizeof(struct unicast_frag_packet) - hdr_len;
/* set skb to the first part and tmp_skb to the second part */
if (up->flags & UNI_FRAG_HEAD) {
@@ -50,12 +50,12 @@ static struct sk_buff *frag_merge_packet(struct list_head *head,
skb = tfp->skb;
}
+ if (skb_linearize(skb) < 0 || skb_linearize(tmp_skb) < 0)
+ goto err;
+
skb_pull(tmp_skb, sizeof(struct unicast_frag_packet));
- if (pskb_expand_head(skb, 0, tmp_skb->len, GFP_ATOMIC) < 0) {
- /* free buffered skb, skb will be freed later */
- kfree_skb(tfp->skb);
- return NULL;
- }
+ if (pskb_expand_head(skb, 0, tmp_skb->len, GFP_ATOMIC) < 0)
+ goto err;
/* move free entry to end */
tfp->skb = NULL;
@@ -70,6 +70,11 @@ static struct sk_buff *frag_merge_packet(struct list_head *head,
unicast_packet->packet_type = BAT_UNICAST;
return skb;
+
+err:
+ /* free buffered skb, skb will be freed later */
+ kfree_skb(tfp->skb);
+ return NULL;
}
static void frag_create_entry(struct list_head *head, struct sk_buff *skb)
@@ -224,16 +229,21 @@ int frag_send_skb(struct sk_buff *skb, struct bat_priv *bat_priv,
struct unicast_frag_packet *frag1, *frag2;
int uc_hdr_len = sizeof(struct unicast_packet);
int ucf_hdr_len = sizeof(struct unicast_frag_packet);
- int data_len = skb->len;
+ int data_len = skb->len - uc_hdr_len;
+ int large_tail = 0;
+ uint16_t seqno;
if (!bat_priv->primary_if)
goto dropped;
- unicast_packet = (struct unicast_packet *) skb->data;
+ frag_skb = dev_alloc_skb(data_len - (data_len / 2) + ucf_hdr_len);
+ if (!frag_skb)
+ goto dropped;
+ skb_reserve(frag_skb, ucf_hdr_len);
+ unicast_packet = (struct unicast_packet *) skb->data;
memcpy(&tmp_uc, unicast_packet, uc_hdr_len);
- frag_skb = dev_alloc_skb(data_len - (data_len / 2) + ucf_hdr_len);
- skb_split(skb, frag_skb, data_len / 2);
+ skb_split(skb, frag_skb, data_len / 2 + uc_hdr_len);
if (my_skb_head_push(skb, ucf_hdr_len - uc_hdr_len) < 0 ||
my_skb_head_push(frag_skb, ucf_hdr_len) < 0)
@@ -251,13 +261,15 @@ int frag_send_skb(struct sk_buff *skb, struct bat_priv *bat_priv,
memcpy(frag1->orig, bat_priv->primary_if->net_dev->dev_addr, ETH_ALEN);
memcpy(frag2, frag1, sizeof(struct unicast_frag_packet));
- frag1->flags |= UNI_FRAG_HEAD;
- frag2->flags &= ~UNI_FRAG_HEAD;
+ if (data_len & 1)
+ large_tail = UNI_FRAG_LARGETAIL;
- frag1->seqno = htons((uint16_t)atomic_inc_return(
- &batman_if->frag_seqno));
- frag2->seqno = htons((uint16_t)atomic_inc_return(
- &batman_if->frag_seqno));
+ frag1->flags = UNI_FRAG_HEAD | large_tail;
+ frag2->flags = large_tail;
+
+ seqno = atomic_add_return(2, &batman_if->frag_seqno);
+ frag1->seqno = htons(seqno - 1);
+ frag2->seqno = htons(seqno);
send_skb_packet(skb, batman_if, dstaddr);
send_skb_packet(frag_skb, batman_if, dstaddr);
@@ -274,7 +286,7 @@ int unicast_send_skb(struct sk_buff *skb, struct bat_priv *bat_priv)
{
struct ethhdr *ethhdr = (struct ethhdr *)skb->data;
struct unicast_packet *unicast_packet;
- struct orig_node *orig_node;
+ struct orig_node *orig_node = NULL;
struct batman_if *batman_if;
struct neigh_node *router;
int data_len = skb->len;
@@ -285,11 +297,6 @@ int unicast_send_skb(struct sk_buff *skb, struct bat_priv *bat_priv)
/* get routing information */
if (is_multicast_ether_addr(ethhdr->h_dest))
orig_node = (struct orig_node *)gw_get_selected(bat_priv);
- else
- orig_node = ((struct orig_node *)hash_find(bat_priv->orig_hash,
- compare_orig,
- choose_orig,
- ethhdr->h_dest));
/* check for hna host */
if (!orig_node)
diff --git a/net/batman-adv/unicast.h b/net/batman-adv/unicast.h
index e32b7867a9a..8897308281d 100644
--- a/net/batman-adv/unicast.h
+++ b/net/batman-adv/unicast.h
@@ -1,5 +1,5 @@
/*
- * Copyright (C) 2010 B.A.T.M.A.N. contributors:
+ * Copyright (C) 2010-2011 B.A.T.M.A.N. contributors:
*
* Andreas Langer
*
@@ -22,6 +22,8 @@
#ifndef _NET_BATMAN_ADV_UNICAST_H_
#define _NET_BATMAN_ADV_UNICAST_H_
+#include "packet.h"
+
#define FRAG_TIMEOUT 10000 /* purge frag list entrys after time in ms */
#define FRAG_BUFFER_SIZE 6 /* number of list elements in buffer */
@@ -32,4 +34,25 @@ int unicast_send_skb(struct sk_buff *skb, struct bat_priv *bat_priv);
int frag_send_skb(struct sk_buff *skb, struct bat_priv *bat_priv,
struct batman_if *batman_if, uint8_t dstaddr[]);
+static inline int frag_can_reassemble(struct sk_buff *skb, int mtu)
+{
+ struct unicast_frag_packet *unicast_packet;
+ int uneven_correction = 0;
+ unsigned int merged_size;
+
+ unicast_packet = (struct unicast_frag_packet *)skb->data;
+
+ if (unicast_packet->flags & UNI_FRAG_LARGETAIL) {
+ if (unicast_packet->flags & UNI_FRAG_HEAD)
+ uneven_correction = 1;
+ else
+ uneven_correction = -1;
+ }
+
+ merged_size = (skb->len - sizeof(struct unicast_frag_packet)) * 2;
+ merged_size += sizeof(struct unicast_packet) + uneven_correction;
+
+ return merged_size <= mtu;
+}
+
#endif /* _NET_BATMAN_ADV_UNICAST_H_ */
diff --git a/net/batman-adv/vis.c b/net/batman-adv/vis.c
index cd4c4231fa4..7db9ad82cc0 100644
--- a/net/batman-adv/vis.c
+++ b/net/batman-adv/vis.c
@@ -1,5 +1,5 @@
/*
- * Copyright (C) 2008-2010 B.A.T.M.A.N. contributors:
+ * Copyright (C) 2008-2011 B.A.T.M.A.N. contributors:
*
* Simon Wunderlich
*
@@ -64,6 +64,7 @@ static void free_info(struct kref *ref)
spin_unlock_bh(&bat_priv->vis_list_lock);
kfree_skb(info->skb_packet);
+ kfree(info);
}
/* Compare two vis packets, used by the hashing algorithm */
@@ -268,10 +269,10 @@ int vis_seq_print_text(struct seq_file *seq, void *offset)
buff_pos += sprintf(buff + buff_pos, "%pM,",
entry->addr);
- for (i = 0; i < packet->entries; i++)
+ for (j = 0; j < packet->entries; j++)
buff_pos += vis_data_read_entry(
buff + buff_pos,
- &entries[i],
+ &entries[j],
entry->addr,
entry->primary);
@@ -444,7 +445,7 @@ static struct vis_info *add_packet(struct bat_priv *bat_priv,
info);
if (hash_added < 0) {
/* did not work (for some reason) */
- kref_put(&old_info->refcount, free_info);
+ kref_put(&info->refcount, free_info);
info = NULL;
}
@@ -815,7 +816,7 @@ static void send_vis_packets(struct work_struct *work)
container_of(work, struct delayed_work, work);
struct bat_priv *bat_priv =
container_of(delayed_work, struct bat_priv, vis_work);
- struct vis_info *info, *temp;
+ struct vis_info *info;
spin_lock_bh(&bat_priv->vis_hash_lock);
purge_vis_packets(bat_priv);
@@ -825,8 +826,9 @@ static void send_vis_packets(struct work_struct *work)
send_list_add(bat_priv, bat_priv->my_vis_info);
}
- list_for_each_entry_safe(info, temp, &bat_priv->vis_send_list,
- send_list) {
+ while (!list_empty(&bat_priv->vis_send_list)) {
+ info = list_first_entry(&bat_priv->vis_send_list,
+ typeof(*info), send_list);
kref_get(&info->refcount);
spin_unlock_bh(&bat_priv->vis_hash_lock);
diff --git a/net/batman-adv/vis.h b/net/batman-adv/vis.h
index 2c3b33089a9..31b820d07f2 100644
--- a/net/batman-adv/vis.h
+++ b/net/batman-adv/vis.h
@@ -1,5 +1,5 @@
/*
- * Copyright (C) 2008-2010 B.A.T.M.A.N. contributors:
+ * Copyright (C) 2008-2011 B.A.T.M.A.N. contributors:
*
* Simon Wunderlich, Marek Lindner
*
diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c
index 556443566e9..1461b19efd3 100644
--- a/net/bridge/br_device.c
+++ b/net/bridge/br_device.c
@@ -297,6 +297,21 @@ void br_netpoll_disable(struct net_bridge_port *p)
#endif
+static int br_add_slave(struct net_device *dev, struct net_device *slave_dev)
+
+{
+ struct net_bridge *br = netdev_priv(dev);
+
+ return br_add_if(br, slave_dev);
+}
+
+static int br_del_slave(struct net_device *dev, struct net_device *slave_dev)
+{
+ struct net_bridge *br = netdev_priv(dev);
+
+ return br_del_if(br, slave_dev);
+}
+
static const struct ethtool_ops br_ethtool_ops = {
.get_drvinfo = br_getinfo,
.get_link = ethtool_op_get_link,
@@ -326,6 +341,8 @@ static const struct net_device_ops br_netdev_ops = {
.ndo_netpoll_cleanup = br_netpoll_cleanup,
.ndo_poll_controller = br_poll_controller,
#endif
+ .ndo_add_slave = br_add_slave,
+ .ndo_del_slave = br_del_slave,
};
static void br_dev_free(struct net_device *dev)
diff --git a/net/bridge/br_fdb.c b/net/bridge/br_fdb.c
index 2872393b293..88485cc74dc 100644
--- a/net/bridge/br_fdb.c
+++ b/net/bridge/br_fdb.c
@@ -328,12 +328,12 @@ static struct net_bridge_fdb_entry *fdb_create(struct hlist_head *head,
fdb = kmem_cache_alloc(br_fdb_cache, GFP_ATOMIC);
if (fdb) {
memcpy(fdb->addr.addr, addr, ETH_ALEN);
- hlist_add_head_rcu(&fdb->hlist, head);
-
fdb->dst = source;
fdb->is_local = is_local;
fdb->is_static = is_local;
fdb->ageing_timer = jiffies;
+
+ hlist_add_head_rcu(&fdb->hlist, head);
}
return fdb;
}
diff --git a/net/bridge/br_if.c b/net/bridge/br_if.c
index d9d1e2bac1d..dce8f0009a1 100644
--- a/net/bridge/br_if.c
+++ b/net/bridge/br_if.c
@@ -148,6 +148,8 @@ static void del_nbp(struct net_bridge_port *p)
netdev_rx_handler_unregister(dev);
+ netdev_set_master(dev, NULL);
+
br_multicast_del_port(p);
kobject_uevent(&p->kobj, KOBJ_REMOVE);
@@ -365,7 +367,7 @@ int br_min_mtu(const struct net_bridge *br)
void br_features_recompute(struct net_bridge *br)
{
struct net_bridge_port *p;
- unsigned long features, mask;
+ u32 features, mask;
features = mask = br->feature_mask;
if (list_empty(&br->port_list))
@@ -379,7 +381,7 @@ void br_features_recompute(struct net_bridge *br)
}
done:
- br->dev->features = netdev_fix_features(features, NULL);
+ br->dev->features = netdev_fix_features(br->dev, features);
}
/* called with RTNL */
@@ -429,10 +431,14 @@ int br_add_if(struct net_bridge *br, struct net_device *dev)
if (br_netpoll_info(br) && ((err = br_netpoll_enable(p))))
goto err3;
- err = netdev_rx_handler_register(dev, br_handle_frame, p);
+ err = netdev_set_master(dev, br->dev);
if (err)
goto err3;
+ err = netdev_rx_handler_register(dev, br_handle_frame, p);
+ if (err)
+ goto err4;
+
dev->priv_flags |= IFF_BRIDGE_PORT;
dev_disable_lro(dev);
@@ -455,6 +461,9 @@ int br_add_if(struct net_bridge *br, struct net_device *dev)
kobject_uevent(&p->kobj, KOBJ_ADD);
return 0;
+
+err4:
+ netdev_set_master(dev, NULL);
err3:
sysfs_remove_link(br->ifobj, p->dev->name);
err2:
diff --git a/net/bridge/br_input.c b/net/bridge/br_input.c
index 6f6d8e1b776..88e4aa9cb1f 100644
--- a/net/bridge/br_input.c
+++ b/net/bridge/br_input.c
@@ -80,7 +80,7 @@ int br_handle_frame_finish(struct sk_buff *skb)
if (is_multicast_ether_addr(dest)) {
mdst = br_mdb_get(br, skb);
if (mdst || BR_INPUT_SKB_CB_MROUTERS_ONLY(skb)) {
- if ((mdst && !hlist_unhashed(&mdst->mglist)) ||
+ if ((mdst && mdst->mglist) ||
br_multicast_is_router(br))
skb2 = skb;
br_multicast_forward(mdst, skb, skb2);
diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c
index f701a21acb3..09d5c098792 100644
--- a/net/bridge/br_multicast.c
+++ b/net/bridge/br_multicast.c
@@ -232,8 +232,7 @@ static void br_multicast_group_expired(unsigned long data)
if (!netif_running(br->dev) || timer_pending(&mp->timer))
goto out;
- if (!hlist_unhashed(&mp->mglist))
- hlist_del_init(&mp->mglist);
+ mp->mglist = false;
if (mp->ports)
goto out;
@@ -276,7 +275,7 @@ static void br_multicast_del_pg(struct net_bridge *br,
del_timer(&p->query_timer);
call_rcu_bh(&p->rcu, br_multicast_free_pg);
- if (!mp->ports && hlist_unhashed(&mp->mglist) &&
+ if (!mp->ports && !mp->mglist &&
netif_running(br->dev))
mod_timer(&mp->timer, jiffies);
@@ -528,7 +527,7 @@ static void br_multicast_group_query_expired(unsigned long data)
struct net_bridge *br = mp->br;
spin_lock(&br->multicast_lock);
- if (!netif_running(br->dev) || hlist_unhashed(&mp->mglist) ||
+ if (!netif_running(br->dev) || !mp->mglist ||
mp->queries_sent >= br->multicast_last_member_count)
goto out;
@@ -719,7 +718,7 @@ static int br_multicast_add_group(struct net_bridge *br,
goto err;
if (!port) {
- hlist_add_head(&mp->mglist, &br->mglist);
+ mp->mglist = true;
mod_timer(&mp->timer, now + br->multicast_membership_interval);
goto out;
}
@@ -1165,7 +1164,7 @@ static int br_ip4_multicast_query(struct net_bridge *br,
max_delay *= br->multicast_last_member_count;
- if (!hlist_unhashed(&mp->mglist) &&
+ if (mp->mglist &&
(timer_pending(&mp->timer) ?
time_after(mp->timer.expires, now + max_delay) :
try_to_del_timer_sync(&mp->timer) >= 0))
@@ -1177,7 +1176,7 @@ static int br_ip4_multicast_query(struct net_bridge *br,
if (timer_pending(&p->timer) ?
time_after(p->timer.expires, now + max_delay) :
try_to_del_timer_sync(&p->timer) >= 0)
- mod_timer(&mp->timer, now + max_delay);
+ mod_timer(&p->timer, now + max_delay);
}
out:
@@ -1236,7 +1235,7 @@ static int br_ip6_multicast_query(struct net_bridge *br,
goto out;
max_delay *= br->multicast_last_member_count;
- if (!hlist_unhashed(&mp->mglist) &&
+ if (mp->mglist &&
(timer_pending(&mp->timer) ?
time_after(mp->timer.expires, now + max_delay) :
try_to_del_timer_sync(&mp->timer) >= 0))
@@ -1248,7 +1247,7 @@ static int br_ip6_multicast_query(struct net_bridge *br,
if (timer_pending(&p->timer) ?
time_after(p->timer.expires, now + max_delay) :
try_to_del_timer_sync(&p->timer) >= 0)
- mod_timer(&mp->timer, now + max_delay);
+ mod_timer(&p->timer, now + max_delay);
}
out:
@@ -1283,7 +1282,7 @@ static void br_multicast_leave_group(struct net_bridge *br,
br->multicast_last_member_interval;
if (!port) {
- if (!hlist_unhashed(&mp->mglist) &&
+ if (mp->mglist &&
(timer_pending(&mp->timer) ?
time_after(mp->timer.expires, time) :
try_to_del_timer_sync(&mp->timer) >= 0)) {
diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h
index 84aac7734bf..f7afc364d77 100644
--- a/net/bridge/br_private.h
+++ b/net/bridge/br_private.h
@@ -84,13 +84,13 @@ struct net_bridge_port_group {
struct net_bridge_mdb_entry
{
struct hlist_node hlist[2];
- struct hlist_node mglist;
struct net_bridge *br;
struct net_bridge_port_group __rcu *ports;
struct rcu_head rcu;
struct timer_list timer;
struct timer_list query_timer;
struct br_ip addr;
+ bool mglist;
u32 queries_sent;
};
@@ -182,7 +182,7 @@ struct net_bridge
struct br_cpu_netstats __percpu *stats;
spinlock_t hash_lock;
struct hlist_head hash[BR_HASH_SIZE];
- unsigned long feature_mask;
+ u32 feature_mask;
#ifdef CONFIG_BRIDGE_NETFILTER
struct rtable fake_rtable;
bool nf_call_iptables;
@@ -238,7 +238,6 @@ struct net_bridge
spinlock_t multicast_lock;
struct net_bridge_mdb_htable __rcu *mdb;
struct hlist_head router_list;
- struct hlist_head mglist;
struct timer_list multicast_router_timer;
struct timer_list multicast_querier_timer;
diff --git a/net/bridge/netfilter/ebt_ip6.c b/net/bridge/netfilter/ebt_ip6.c
index 50a46afc2bc..2ed0056a39a 100644
--- a/net/bridge/netfilter/ebt_ip6.c
+++ b/net/bridge/netfilter/ebt_ip6.c
@@ -22,9 +22,15 @@
#include <linux/netfilter_bridge/ebtables.h>
#include <linux/netfilter_bridge/ebt_ip6.h>
-struct tcpudphdr {
- __be16 src;
- __be16 dst;
+union pkthdr {
+ struct {
+ __be16 src;
+ __be16 dst;
+ } tcpudphdr;
+ struct {
+ u8 type;
+ u8 code;
+ } icmphdr;
};
static bool
@@ -33,8 +39,8 @@ ebt_ip6_mt(const struct sk_buff *skb, struct xt_action_param *par)
const struct ebt_ip6_info *info = par->matchinfo;
const struct ipv6hdr *ih6;
struct ipv6hdr _ip6h;
- const struct tcpudphdr *pptr;
- struct tcpudphdr _ports;
+ const union pkthdr *pptr;
+ union pkthdr _pkthdr;
ih6 = skb_header_pointer(skb, 0, sizeof(_ip6h), &_ip6h);
if (ih6 == NULL)
@@ -56,26 +62,34 @@ ebt_ip6_mt(const struct sk_buff *skb, struct xt_action_param *par)
return false;
if (FWINV(info->protocol != nexthdr, EBT_IP6_PROTO))
return false;
- if (!(info->bitmask & EBT_IP6_DPORT) &&
- !(info->bitmask & EBT_IP6_SPORT))
+ if (!(info->bitmask & ( EBT_IP6_DPORT |
+ EBT_IP6_SPORT | EBT_IP6_ICMP6)))
return true;
- pptr = skb_header_pointer(skb, offset_ph, sizeof(_ports),
- &_ports);
+
+ /* min icmpv6 headersize is 4, so sizeof(_pkthdr) is ok. */
+ pptr = skb_header_pointer(skb, offset_ph, sizeof(_pkthdr),
+ &_pkthdr);
if (pptr == NULL)
return false;
if (info->bitmask & EBT_IP6_DPORT) {
- u32 dst = ntohs(pptr->dst);
+ u16 dst = ntohs(pptr->tcpudphdr.dst);
if (FWINV(dst < info->dport[0] ||
dst > info->dport[1], EBT_IP6_DPORT))
return false;
}
if (info->bitmask & EBT_IP6_SPORT) {
- u32 src = ntohs(pptr->src);
+ u16 src = ntohs(pptr->tcpudphdr.src);
if (FWINV(src < info->sport[0] ||
src > info->sport[1], EBT_IP6_SPORT))
return false;
}
- return true;
+ if ((info->bitmask & EBT_IP6_ICMP6) &&
+ FWINV(pptr->icmphdr.type < info->icmpv6_type[0] ||
+ pptr->icmphdr.type > info->icmpv6_type[1] ||
+ pptr->icmphdr.code < info->icmpv6_code[0] ||
+ pptr->icmphdr.code > info->icmpv6_code[1],
+ EBT_IP6_ICMP6))
+ return false;
}
return true;
}
@@ -103,6 +117,14 @@ static int ebt_ip6_mt_check(const struct xt_mtchk_param *par)
return -EINVAL;
if (info->bitmask & EBT_IP6_SPORT && info->sport[0] > info->sport[1])
return -EINVAL;
+ if (info->bitmask & EBT_IP6_ICMP6) {
+ if ((info->invflags & EBT_IP6_PROTO) ||
+ info->protocol != IPPROTO_ICMPV6)
+ return -EINVAL;
+ if (info->icmpv6_type[0] > info->icmpv6_type[1] ||
+ info->icmpv6_code[0] > info->icmpv6_code[1])
+ return -EINVAL;
+ }
return 0;
}
diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c
index 16df0532d4b..5f1825df9dc 100644
--- a/net/bridge/netfilter/ebtables.c
+++ b/net/bridge/netfilter/ebtables.c
@@ -1764,6 +1764,7 @@ static int compat_table_info(const struct ebt_table_info *info,
newinfo->entries_size = size;
+ xt_compat_init_offsets(AF_INET, info->nentries);
return EBT_ENTRY_ITERATE(entries, size, compat_calc_entry, info,
entries, newinfo);
}
diff --git a/net/caif/cfcnfg.c b/net/caif/cfcnfg.c
index 21ede141018..f1f98d967d8 100644
--- a/net/caif/cfcnfg.c
+++ b/net/caif/cfcnfg.c
@@ -23,10 +23,8 @@
#include <asm/atomic.h>
#define MAX_PHY_LAYERS 7
-#define PHY_NAME_LEN 20
#define container_obj(layr) container_of(layr, struct cfcnfg, layer)
-#define RFM_FRAGMENT_SIZE 4030
/* Information about CAIF physical interfaces held by Config Module in order
* to manage physical interfaces
@@ -191,6 +189,7 @@ int cfcnfg_disconn_adapt_layer(struct cfcnfg *cnfg, struct cflayer *adap_layer)
struct cflayer *servl = NULL;
struct cfcnfg_phyinfo *phyinfo = NULL;
u8 phyid = 0;
+
caif_assert(adap_layer != NULL);
channel_id = adap_layer->id;
if (adap_layer->dn == NULL || channel_id == 0) {
@@ -199,16 +198,16 @@ int cfcnfg_disconn_adapt_layer(struct cfcnfg *cnfg, struct cflayer *adap_layer)
goto end;
}
servl = cfmuxl_remove_uplayer(cnfg->mux, channel_id);
- if (servl == NULL)
- goto end;
- layer_set_up(servl, NULL);
- ret = cfctrl_linkdown_req(cnfg->ctrl, channel_id, adap_layer);
if (servl == NULL) {
pr_err("PROTOCOL ERROR - Error removing service_layer Channel_Id(%d)",
channel_id);
ret = -EINVAL;
goto end;
}
+ layer_set_up(servl, NULL);
+ ret = cfctrl_linkdown_req(cnfg->ctrl, channel_id, adap_layer);
+ if (ret)
+ goto end;
caif_assert(channel_id == servl->id);
if (adap_layer->dn != NULL) {
phyid = cfsrvl_getphyid(adap_layer->dn);
diff --git a/net/caif/cfdgml.c b/net/caif/cfdgml.c
index d3ed264ad6c..27dab26ad3b 100644
--- a/net/caif/cfdgml.c
+++ b/net/caif/cfdgml.c
@@ -18,7 +18,6 @@
#define DGM_CMD_BIT 0x80
#define DGM_FLOW_OFF 0x81
#define DGM_FLOW_ON 0x80
-#define DGM_CTRL_PKT_SIZE 1
#define DGM_MTU 1500
static int cfdgml_receive(struct cflayer *layr, struct cfpkt *pkt);
diff --git a/net/caif/cfserl.c b/net/caif/cfserl.c
index 9297f7dea9d..8303fe3ebf8 100644
--- a/net/caif/cfserl.c
+++ b/net/caif/cfserl.c
@@ -25,7 +25,6 @@ struct cfserl {
spinlock_t sync;
bool usestx;
};
-#define STXLEN(layr) (layr->usestx ? 1 : 0)
static int cfserl_receive(struct cflayer *layr, struct cfpkt *pkt);
static int cfserl_transmit(struct cflayer *layr, struct cfpkt *pkt);
diff --git a/net/caif/cfutill.c b/net/caif/cfutill.c
index efad410e4c8..315c0d60136 100644
--- a/net/caif/cfutill.c
+++ b/net/caif/cfutill.c
@@ -20,7 +20,7 @@
#define UTIL_REMOTE_SHUTDOWN 0x82
#define UTIL_FLOW_OFF 0x81
#define UTIL_FLOW_ON 0x80
-#define UTIL_CTRL_PKT_SIZE 1
+
static int cfutill_receive(struct cflayer *layr, struct cfpkt *pkt);
static int cfutill_transmit(struct cflayer *layr, struct cfpkt *pkt);
diff --git a/net/caif/cfveil.c b/net/caif/cfveil.c
index 3b425b189a9..c3b1dec4acf 100644
--- a/net/caif/cfveil.c
+++ b/net/caif/cfveil.c
@@ -17,7 +17,7 @@
#define VEI_FLOW_OFF 0x81
#define VEI_FLOW_ON 0x80
#define VEI_SET_PIN 0x82
-#define VEI_CTRL_PKT_SIZE 1
+
#define container_obj(layr) container_of(layr, struct cfsrvl, layer)
static int cfvei_receive(struct cflayer *layr, struct cfpkt *pkt);
diff --git a/net/caif/chnl_net.c b/net/caif/chnl_net.c
index fa9dab372b6..6008d6dc18a 100644
--- a/net/caif/chnl_net.c
+++ b/net/caif/chnl_net.c
@@ -394,9 +394,7 @@ static void ipcaif_net_setup(struct net_device *dev)
priv->conn_req.sockaddr.u.dgm.connection_id = -1;
priv->flowenabled = false;
- ASSERT_RTNL();
init_waitqueue_head(&priv->netmgmt_wq);
- list_add(&priv->list_field, &chnl_net_list);
}
@@ -453,6 +451,8 @@ static int ipcaif_newlink(struct net *src_net, struct net_device *dev,
ret = register_netdevice(dev);
if (ret)
pr_warn("device rtml registration failed\n");
+ else
+ list_add(&caifdev->list_field, &chnl_net_list);
return ret;
}
diff --git a/net/can/bcm.c b/net/can/bcm.c
index 9d5e8accfab..092dc88a7c6 100644
--- a/net/can/bcm.c
+++ b/net/can/bcm.c
@@ -1256,6 +1256,9 @@ static int bcm_sendmsg(struct kiocb *iocb, struct socket *sock,
struct sockaddr_can *addr =
(struct sockaddr_can *)msg->msg_name;
+ if (msg->msg_namelen < sizeof(*addr))
+ return -EINVAL;
+
if (addr->can_family != AF_CAN)
return -EINVAL;
diff --git a/net/can/raw.c b/net/can/raw.c
index e88f610fdb7..883e9d74fdd 100644
--- a/net/can/raw.c
+++ b/net/can/raw.c
@@ -649,6 +649,9 @@ static int raw_sendmsg(struct kiocb *iocb, struct socket *sock,
struct sockaddr_can *addr =
(struct sockaddr_can *)msg->msg_name;
+ if (msg->msg_namelen < sizeof(*addr))
+ return -EINVAL;
+
if (addr->can_family != AF_CAN)
return -EINVAL;
diff --git a/net/core/dev.c b/net/core/dev.c
index 54277df0f73..578415c1ef7 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -132,6 +132,7 @@
#include <trace/events/skb.h>
#include <linux/pci.h>
#include <linux/inetdevice.h>
+#include <linux/cpu_rmap.h>
#include "net-sysfs.h"
@@ -749,7 +750,8 @@ EXPORT_SYMBOL(dev_get_by_index);
* @ha: hardware address
*
* Search for an interface by MAC address. Returns NULL if the device
- * is not found or a pointer to the device. The caller must hold RCU
+ * is not found or a pointer to the device.
+ * The caller must hold RCU or RTNL.
* The returned device has not had its ref count increased
* and the caller must therefore be careful about locking
*
@@ -1279,13 +1281,16 @@ static int __dev_close_many(struct list_head *head)
static int __dev_close(struct net_device *dev)
{
+ int retval;
LIST_HEAD(single);
list_add(&dev->unreg_list, &single);
- return __dev_close_many(&single);
+ retval = __dev_close_many(&single);
+ list_del(&single);
+ return retval;
}
-int dev_close_many(struct list_head *head)
+static int dev_close_many(struct list_head *head)
{
struct net_device *dev, *tmp;
LIST_HEAD(tmp_list);
@@ -1324,7 +1329,7 @@ int dev_close(struct net_device *dev)
list_add(&dev->unreg_list, &single);
dev_close_many(&single);
-
+ list_del(&single);
return 0;
}
EXPORT_SYMBOL(dev_close);
@@ -1593,6 +1598,48 @@ static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
rcu_read_unlock();
}
+/* netif_setup_tc - Handle tc mappings on real_num_tx_queues change
+ * @dev: Network device
+ * @txq: number of queues available
+ *
+ * If real_num_tx_queues is changed the tc mappings may no longer be
+ * valid. To resolve this verify the tc mapping remains valid and if
+ * not NULL the mapping. With no priorities mapping to this
+ * offset/count pair it will no longer be used. In the worst case TC0
+ * is invalid nothing can be done so disable priority mappings. If is
+ * expected that drivers will fix this mapping if they can before
+ * calling netif_set_real_num_tx_queues.
+ */
+static void netif_setup_tc(struct net_device *dev, unsigned int txq)
+{
+ int i;
+ struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
+
+ /* If TC0 is invalidated disable TC mapping */
+ if (tc->offset + tc->count > txq) {
+ pr_warning("Number of in use tx queues changed "
+ "invalidating tc mappings. Priority "
+ "traffic classification disabled!\n");
+ dev->num_tc = 0;
+ return;
+ }
+
+ /* Invalidated prio to tc mappings set to TC0 */
+ for (i = 1; i < TC_BITMASK + 1; i++) {
+ int q = netdev_get_prio_tc_map(dev, i);
+
+ tc = &dev->tc_to_txq[q];
+ if (tc->offset + tc->count > txq) {
+ pr_warning("Number of in use tx queues "
+ "changed. Priority %i to tc "
+ "mapping %i is no longer valid "
+ "setting map to 0\n",
+ i, q);
+ netdev_set_prio_tc_map(dev, i, 0);
+ }
+ }
+}
+
/*
* Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
* greater then real_num_tx_queues stale skbs on the qdisc must be flushed.
@@ -1604,7 +1651,8 @@ int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
if (txq < 1 || txq > dev->num_tx_queues)
return -EINVAL;
- if (dev->reg_state == NETREG_REGISTERED) {
+ if (dev->reg_state == NETREG_REGISTERED ||
+ dev->reg_state == NETREG_UNREGISTERING) {
ASSERT_RTNL();
rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues,
@@ -1612,6 +1660,9 @@ int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
if (rc)
return rc;
+ if (dev->num_tc)
+ netif_setup_tc(dev, txq);
+
if (txq < dev->real_num_tx_queues)
qdisc_reset_all_tx_gt(dev, txq);
}
@@ -1811,7 +1862,7 @@ EXPORT_SYMBOL(skb_checksum_help);
* It may return NULL if the skb requires no segmentation. This is
* only possible when GSO is used for verifying header integrity.
*/
-struct sk_buff *skb_gso_segment(struct sk_buff *skb, int features)
+struct sk_buff *skb_gso_segment(struct sk_buff *skb, u32 features)
{
struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
struct packet_type *ptype;
@@ -1999,9 +2050,9 @@ static bool can_checksum_protocol(unsigned long features, __be16 protocol)
protocol == htons(ETH_P_FCOE)));
}
-static int harmonize_features(struct sk_buff *skb, __be16 protocol, int features)
+static u32 harmonize_features(struct sk_buff *skb, __be16 protocol, u32 features)
{
- if (!can_checksum_protocol(protocol, features)) {
+ if (!can_checksum_protocol(features, protocol)) {
features &= ~NETIF_F_ALL_CSUM;
features &= ~NETIF_F_SG;
} else if (illegal_highdma(skb->dev, skb)) {
@@ -2011,10 +2062,10 @@ static int harmonize_features(struct sk_buff *skb, __be16 protocol, int features
return features;
}
-int netif_skb_features(struct sk_buff *skb)
+u32 netif_skb_features(struct sk_buff *skb)
{
__be16 protocol = skb->protocol;
- int features = skb->dev->features;
+ u32 features = skb->dev->features;
if (protocol == htons(ETH_P_8021Q)) {
struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data;
@@ -2023,13 +2074,13 @@ int netif_skb_features(struct sk_buff *skb)
return harmonize_features(skb, protocol, features);
}
- features &= skb->dev->vlan_features;
+ features &= (skb->dev->vlan_features | NETIF_F_HW_VLAN_TX);
if (protocol != htons(ETH_P_8021Q)) {
return harmonize_features(skb, protocol, features);
} else {
features &= NETIF_F_SG | NETIF_F_HIGHDMA | NETIF_F_FRAGLIST |
- NETIF_F_GEN_CSUM;
+ NETIF_F_GEN_CSUM | NETIF_F_HW_VLAN_TX;
return harmonize_features(skb, protocol, features);
}
}
@@ -2059,7 +2110,7 @@ int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
int rc = NETDEV_TX_OK;
if (likely(!skb->next)) {
- int features;
+ u32 features;
/*
* If device doesnt need skb->dst, release it right now while
@@ -2161,6 +2212,8 @@ u16 __skb_tx_hash(const struct net_device *dev, const struct sk_buff *skb,
unsigned int num_tx_queues)
{
u32 hash;
+ u16 qoffset = 0;
+ u16 qcount = num_tx_queues;
if (skb_rx_queue_recorded(skb)) {
hash = skb_get_rx_queue(skb);
@@ -2169,13 +2222,19 @@ u16 __skb_tx_hash(const struct net_device *dev, const struct sk_buff *skb,
return hash;
}
+ if (dev->num_tc) {
+ u8 tc = netdev_get_prio_tc_map(dev, skb->priority);
+ qoffset = dev->tc_to_txq[tc].offset;
+ qcount = dev->tc_to_txq[tc].count;
+ }
+
if (skb->sk && skb->sk->sk_hash)
hash = skb->sk->sk_hash;
else
hash = (__force u16) skb->protocol ^ skb->rxhash;
hash = jhash_1word(hash, hashrnd);
- return (u16) (((u64) hash * num_tx_queues) >> 32);
+ return (u16) (((u64) hash * qcount) >> 32) + qoffset;
}
EXPORT_SYMBOL(__skb_tx_hash);
@@ -2272,15 +2331,18 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
struct netdev_queue *txq)
{
spinlock_t *root_lock = qdisc_lock(q);
- bool contended = qdisc_is_running(q);
+ bool contended;
int rc;
+ qdisc_skb_cb(skb)->pkt_len = skb->len;
+ qdisc_calculate_pkt_len(skb, q);
/*
* Heuristic to force contended enqueues to serialize on a
* separate lock before trying to get qdisc main lock.
* This permits __QDISC_STATE_RUNNING owner to get the lock more often
* and dequeue packets faster.
*/
+ contended = qdisc_is_running(q);
if (unlikely(contended))
spin_lock(&q->busylock);
@@ -2298,7 +2360,6 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
if (!(dev->priv_flags & IFF_XMIT_DST_RELEASE))
skb_dst_force(skb);
- qdisc_skb_cb(skb)->pkt_len = skb->len;
qdisc_bstats_update(q, skb);
if (sch_direct_xmit(skb, q, dev, txq, root_lock)) {
@@ -2313,7 +2374,7 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
rc = NET_XMIT_SUCCESS;
} else {
skb_dst_force(skb);
- rc = qdisc_enqueue_root(skb, q);
+ rc = q->enqueue(skb, q) & NET_XMIT_MASK;
if (qdisc_run_begin(q)) {
if (unlikely(contended)) {
spin_unlock(&q->busylock);
@@ -2532,6 +2593,54 @@ EXPORT_SYMBOL(__skb_get_rxhash);
struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly;
EXPORT_SYMBOL(rps_sock_flow_table);
+static struct rps_dev_flow *
+set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
+ struct rps_dev_flow *rflow, u16 next_cpu)
+{
+ u16 tcpu;
+
+ tcpu = rflow->cpu = next_cpu;
+ if (tcpu != RPS_NO_CPU) {
+#ifdef CONFIG_RFS_ACCEL
+ struct netdev_rx_queue *rxqueue;
+ struct rps_dev_flow_table *flow_table;
+ struct rps_dev_flow *old_rflow;
+ u32 flow_id;
+ u16 rxq_index;
+ int rc;
+
+ /* Should we steer this flow to a different hardware queue? */
+ if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap ||
+ !(dev->features & NETIF_F_NTUPLE))
+ goto out;
+ rxq_index = cpu_rmap_lookup_index(dev->rx_cpu_rmap, next_cpu);
+ if (rxq_index == skb_get_rx_queue(skb))
+ goto out;
+
+ rxqueue = dev->_rx + rxq_index;
+ flow_table = rcu_dereference(rxqueue->rps_flow_table);
+ if (!flow_table)
+ goto out;
+ flow_id = skb->rxhash & flow_table->mask;
+ rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb,
+ rxq_index, flow_id);
+ if (rc < 0)
+ goto out;
+ old_rflow = rflow;
+ rflow = &flow_table->flows[flow_id];
+ rflow->cpu = next_cpu;
+ rflow->filter = rc;
+ if (old_rflow->filter == rflow->filter)
+ old_rflow->filter = RPS_NO_FILTER;
+ out:
+#endif
+ rflow->last_qtail =
+ per_cpu(softnet_data, tcpu).input_queue_head;
+ }
+
+ return rflow;
+}
+
/*
* get_rps_cpu is called from netif_receive_skb and returns the target
* CPU from the RPS map of the receiving queue for a given skb.
@@ -2562,7 +2671,8 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
map = rcu_dereference(rxqueue->rps_map);
if (map) {
- if (map->len == 1) {
+ if (map->len == 1 &&
+ !rcu_dereference_raw(rxqueue->rps_flow_table)) {
tcpu = map->cpus[0];
if (cpu_online(tcpu))
cpu = tcpu;
@@ -2602,12 +2712,9 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
if (unlikely(tcpu != next_cpu) &&
(tcpu == RPS_NO_CPU || !cpu_online(tcpu) ||
((int)(per_cpu(softnet_data, tcpu).input_queue_head -
- rflow->last_qtail)) >= 0)) {
- tcpu = rflow->cpu = next_cpu;
- if (tcpu != RPS_NO_CPU)
- rflow->last_qtail = per_cpu(softnet_data,
- tcpu).input_queue_head;
- }
+ rflow->last_qtail)) >= 0))
+ rflow = set_rps_cpu(dev, skb, rflow, next_cpu);
+
if (tcpu != RPS_NO_CPU && cpu_online(tcpu)) {
*rflowp = rflow;
cpu = tcpu;
@@ -2628,6 +2735,46 @@ done:
return cpu;
}
+#ifdef CONFIG_RFS_ACCEL
+
+/**
+ * rps_may_expire_flow - check whether an RFS hardware filter may be removed
+ * @dev: Device on which the filter was set
+ * @rxq_index: RX queue index
+ * @flow_id: Flow ID passed to ndo_rx_flow_steer()
+ * @filter_id: Filter ID returned by ndo_rx_flow_steer()
+ *
+ * Drivers that implement ndo_rx_flow_steer() should periodically call
+ * this function for each installed filter and remove the filters for
+ * which it returns %true.
+ */
+bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
+ u32 flow_id, u16 filter_id)
+{
+ struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index;
+ struct rps_dev_flow_table *flow_table;
+ struct rps_dev_flow *rflow;
+ bool expire = true;
+ int cpu;
+
+ rcu_read_lock();
+ flow_table = rcu_dereference(rxqueue->rps_flow_table);
+ if (flow_table && flow_id <= flow_table->mask) {
+ rflow = &flow_table->flows[flow_id];
+ cpu = ACCESS_ONCE(rflow->cpu);
+ if (rflow->filter == filter_id && cpu != RPS_NO_CPU &&
+ ((int)(per_cpu(softnet_data, cpu).input_queue_head -
+ rflow->last_qtail) <
+ (int)(10 * flow_table->mask)))
+ expire = false;
+ }
+ rcu_read_unlock();
+ return expire;
+}
+EXPORT_SYMBOL(rps_may_expire_flow);
+
+#endif /* CONFIG_RFS_ACCEL */
+
/* Called from hardirq (IPI) context */
static void rps_trigger_softirq(void *data)
{
@@ -2963,7 +3110,8 @@ static inline void skb_bond_set_mac_by_master(struct sk_buff *skb,
* duplicates except for 802.3ad ETH_P_SLOW, alb non-mcast/bcast, and
* ARP on active-backup slaves with arp_validate enabled.
*/
-int __skb_bond_should_drop(struct sk_buff *skb, struct net_device *master)
+static int __skb_bond_should_drop(struct sk_buff *skb,
+ struct net_device *master)
{
struct net_device *dev = skb->dev;
@@ -2997,14 +3145,12 @@ int __skb_bond_should_drop(struct sk_buff *skb, struct net_device *master)
}
return 0;
}
-EXPORT_SYMBOL(__skb_bond_should_drop);
static int __netif_receive_skb(struct sk_buff *skb)
{
struct packet_type *ptype, *pt_prev;
rx_handler_func_t *rx_handler;
struct net_device *orig_dev;
- struct net_device *master;
struct net_device *null_or_orig;
struct net_device *orig_or_bond;
int ret = NET_RX_DROP;
@@ -3031,15 +3177,19 @@ static int __netif_receive_skb(struct sk_buff *skb)
*/
null_or_orig = NULL;
orig_dev = skb->dev;
- master = ACCESS_ONCE(orig_dev->master);
if (skb->deliver_no_wcard)
null_or_orig = orig_dev;
- else if (master) {
- if (skb_bond_should_drop(skb, master)) {
- skb->deliver_no_wcard = 1;
- null_or_orig = orig_dev; /* deliver only exact match */
- } else
- skb->dev = master;
+ else if (netif_is_bond_slave(orig_dev)) {
+ struct net_device *bond_master = ACCESS_ONCE(orig_dev->master);
+
+ if (likely(bond_master)) {
+ if (__skb_bond_should_drop(skb, bond_master)) {
+ skb->deliver_no_wcard = 1;
+ /* deliver only exact match */
+ null_or_orig = orig_dev;
+ } else
+ skb->dev = bond_master;
+ }
}
__this_cpu_inc(softnet_data.processed);
@@ -3423,6 +3573,8 @@ static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
__skb_pull(skb, skb_headlen(skb));
skb_reserve(skb, NET_IP_ALIGN - skb_headroom(skb));
skb->vlan_tci = 0;
+ skb->dev = napi->dev;
+ skb->skb_iif = 0;
napi->skb = skb;
}
@@ -3910,12 +4062,15 @@ void *dev_seq_start(struct seq_file *seq, loff_t *pos)
void *dev_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
- struct net_device *dev = (v == SEQ_START_TOKEN) ?
- first_net_device(seq_file_net(seq)) :
- next_net_device((struct net_device *)v);
+ struct net_device *dev = v;
+
+ if (v == SEQ_START_TOKEN)
+ dev = first_net_device_rcu(seq_file_net(seq));
+ else
+ dev = next_net_device_rcu(dev);
++*pos;
- return rcu_dereference(dev);
+ return dev;
}
void dev_seq_stop(struct seq_file *seq, void *v)
@@ -4199,15 +4354,14 @@ static int __init dev_proc_init(void)
/**
- * netdev_set_master - set up master/slave pair
+ * netdev_set_master - set up master pointer
* @slave: slave device
* @master: new master device
*
* Changes the master device of the slave. Pass %NULL to break the
* bonding. The caller must hold the RTNL semaphore. On a failure
* a negative errno code is returned. On success the reference counts
- * are adjusted, %RTM_NEWLINK is sent to the routing socket and the
- * function returns zero.
+ * are adjusted and the function returns zero.
*/
int netdev_set_master(struct net_device *slave, struct net_device *master)
{
@@ -4227,6 +4381,29 @@ int netdev_set_master(struct net_device *slave, struct net_device *master)
synchronize_net();
dev_put(old);
}
+ return 0;
+}
+EXPORT_SYMBOL(netdev_set_master);
+
+/**
+ * netdev_set_bond_master - set up bonding master/slave pair
+ * @slave: slave device
+ * @master: new master device
+ *
+ * Changes the master device of the slave. Pass %NULL to break the
+ * bonding. The caller must hold the RTNL semaphore. On a failure
+ * a negative errno code is returned. On success %RTM_NEWLINK is sent
+ * to the routing socket and the function returns zero.
+ */
+int netdev_set_bond_master(struct net_device *slave, struct net_device *master)
+{
+ int err;
+
+ ASSERT_RTNL();
+
+ err = netdev_set_master(slave, master);
+ if (err)
+ return err;
if (master)
slave->flags |= IFF_SLAVE;
else
@@ -4235,7 +4412,7 @@ int netdev_set_master(struct net_device *slave, struct net_device *master)
rtmsg_ifinfo(RTM_NEWLINK, slave, IFF_SLAVE);
return 0;
}
-EXPORT_SYMBOL(netdev_set_master);
+EXPORT_SYMBOL(netdev_set_bond_master);
static void dev_change_rx_flags(struct net_device *dev, int flags)
{
@@ -4572,6 +4749,17 @@ int dev_set_mtu(struct net_device *dev, int new_mtu)
EXPORT_SYMBOL(dev_set_mtu);
/**
+ * dev_set_group - Change group this device belongs to
+ * @dev: device
+ * @new_group: group this device should belong to
+ */
+void dev_set_group(struct net_device *dev, int new_group)
+{
+ dev->group = new_group;
+}
+EXPORT_SYMBOL(dev_set_group);
+
+/**
* dev_set_mac_address - Change Media Access Control Address
* @dev: device
* @sa: new address
@@ -5059,43 +5247,58 @@ static void rollback_registered(struct net_device *dev)
list_add(&dev->unreg_list, &single);
rollback_registered_many(&single);
+ list_del(&single);
}
-unsigned long netdev_fix_features(unsigned long features, const char *name)
+u32 netdev_fix_features(struct net_device *dev, u32 features)
{
+ /* Fix illegal checksum combinations */
+ if ((features & NETIF_F_HW_CSUM) &&
+ (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
+ netdev_info(dev, "mixed HW and IP checksum settings.\n");
+ features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
+ }
+
+ if ((features & NETIF_F_NO_CSUM) &&
+ (features & (NETIF_F_HW_CSUM|NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
+ netdev_info(dev, "mixed no checksumming and other settings.\n");
+ features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM|NETIF_F_HW_CSUM);
+ }
+
/* Fix illegal SG+CSUM combinations. */
if ((features & NETIF_F_SG) &&
!(features & NETIF_F_ALL_CSUM)) {
- if (name)
- printk(KERN_NOTICE "%s: Dropping NETIF_F_SG since no "
- "checksum feature.\n", name);
+ netdev_info(dev,
+ "Dropping NETIF_F_SG since no checksum feature.\n");
features &= ~NETIF_F_SG;
}
/* TSO requires that SG is present as well. */
if ((features & NETIF_F_TSO) && !(features & NETIF_F_SG)) {
- if (name)
- printk(KERN_NOTICE "%s: Dropping NETIF_F_TSO since no "
- "SG feature.\n", name);
+ netdev_info(dev, "Dropping NETIF_F_TSO since no SG feature.\n");
features &= ~NETIF_F_TSO;
}
+ /* Software GSO depends on SG. */
+ if ((features & NETIF_F_GSO) && !(features & NETIF_F_SG)) {
+ netdev_info(dev, "Dropping NETIF_F_GSO since no SG feature.\n");
+ features &= ~NETIF_F_GSO;
+ }
+
+ /* UFO needs SG and checksumming */
if (features & NETIF_F_UFO) {
/* maybe split UFO into V4 and V6? */
if (!((features & NETIF_F_GEN_CSUM) ||
(features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))
== (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
- if (name)
- printk(KERN_ERR "%s: Dropping NETIF_F_UFO "
- "since no checksum offload features.\n",
- name);
+ netdev_info(dev,
+ "Dropping NETIF_F_UFO since no checksum offload features.\n");
features &= ~NETIF_F_UFO;
}
if (!(features & NETIF_F_SG)) {
- if (name)
- printk(KERN_ERR "%s: Dropping NETIF_F_UFO "
- "since no NETIF_F_SG feature.\n", name);
+ netdev_info(dev,
+ "Dropping NETIF_F_UFO since no NETIF_F_SG feature.\n");
features &= ~NETIF_F_UFO;
}
}
@@ -5104,6 +5307,37 @@ unsigned long netdev_fix_features(unsigned long features, const char *name)
}
EXPORT_SYMBOL(netdev_fix_features);
+void netdev_update_features(struct net_device *dev)
+{
+ u32 features;
+ int err = 0;
+
+ features = netdev_get_wanted_features(dev);
+
+ if (dev->netdev_ops->ndo_fix_features)
+ features = dev->netdev_ops->ndo_fix_features(dev, features);
+
+ /* driver might be less strict about feature dependencies */
+ features = netdev_fix_features(dev, features);
+
+ if (dev->features == features)
+ return;
+
+ netdev_info(dev, "Features changed: 0x%08x -> 0x%08x\n",
+ dev->features, features);
+
+ if (dev->netdev_ops->ndo_set_features)
+ err = dev->netdev_ops->ndo_set_features(dev, features);
+
+ if (!err)
+ dev->features = features;
+ else if (err < 0)
+ netdev_err(dev,
+ "set_features() failed (%d); wanted 0x%08x, left 0x%08x\n",
+ err, features, dev->features);
+}
+EXPORT_SYMBOL(netdev_update_features);
+
/**
* netif_stacked_transfer_operstate - transfer operstate
* @rootdev: the root or lower level device to transfer state from
@@ -5238,26 +5472,18 @@ int register_netdevice(struct net_device *dev)
if (dev->iflink == -1)
dev->iflink = dev->ifindex;
- /* Fix illegal checksum combinations */
- if ((dev->features & NETIF_F_HW_CSUM) &&
- (dev->features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
- printk(KERN_NOTICE "%s: mixed HW and IP checksum settings.\n",
- dev->name);
- dev->features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
- }
-
- if ((dev->features & NETIF_F_NO_CSUM) &&
- (dev->features & (NETIF_F_HW_CSUM|NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
- printk(KERN_NOTICE "%s: mixed no checksumming and other settings.\n",
- dev->name);
- dev->features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM|NETIF_F_HW_CSUM);
- }
+ /* Transfer changeable features to wanted_features and enable
+ * software offloads (GSO and GRO).
+ */
+ dev->hw_features |= NETIF_F_SOFT_FEATURES;
+ dev->wanted_features = (dev->features & dev->hw_features)
+ | NETIF_F_SOFT_FEATURES;
- dev->features = netdev_fix_features(dev->features, dev->name);
+ /* Avoid warning from netdev_fix_features() for GSO without SG */
+ if (!(dev->wanted_features & NETIF_F_SG))
+ dev->wanted_features &= ~NETIF_F_GSO;
- /* Enable software GSO if SG is supported. */
- if (dev->features & NETIF_F_SG)
- dev->features |= NETIF_F_GSO;
+ netdev_update_features(dev);
/* Enable GRO and NETIF_F_HIGHDMA for vlans by default,
* vlan_dev_init() will do the dev->features check, so these features
@@ -5656,30 +5882,36 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
dev_net_set(dev, &init_net);
+ dev->gso_max_size = GSO_MAX_SIZE;
+
+ INIT_LIST_HEAD(&dev->ethtool_ntuple_list.list);
+ dev->ethtool_ntuple_list.count = 0;
+ INIT_LIST_HEAD(&dev->napi_list);
+ INIT_LIST_HEAD(&dev->unreg_list);
+ INIT_LIST_HEAD(&dev->link_watch_list);
+ dev->priv_flags = IFF_XMIT_DST_RELEASE;
+ setup(dev);
+
dev->num_tx_queues = txqs;
dev->real_num_tx_queues = txqs;
if (netif_alloc_netdev_queues(dev))
- goto free_pcpu;
+ goto free_all;
#ifdef CONFIG_RPS
dev->num_rx_queues = rxqs;
dev->real_num_rx_queues = rxqs;
if (netif_alloc_rx_queues(dev))
- goto free_pcpu;
+ goto free_all;
#endif
- dev->gso_max_size = GSO_MAX_SIZE;
-
- INIT_LIST_HEAD(&dev->ethtool_ntuple_list.list);
- dev->ethtool_ntuple_list.count = 0;
- INIT_LIST_HEAD(&dev->napi_list);
- INIT_LIST_HEAD(&dev->unreg_list);
- INIT_LIST_HEAD(&dev->link_watch_list);
- dev->priv_flags = IFF_XMIT_DST_RELEASE;
- setup(dev);
strcpy(dev->name, name);
+ dev->group = INIT_NETDEV_GROUP;
return dev;
+free_all:
+ free_netdev(dev);
+ return NULL;
+
free_pcpu:
free_percpu(dev->pcpu_refcnt);
kfree(dev->_tx);
@@ -5988,8 +6220,7 @@ static int dev_cpu_callback(struct notifier_block *nfb,
* @one to the master device with current feature set @all. Will not
* enable anything that is off in @mask. Returns the new feature set.
*/
-unsigned long netdev_increment_features(unsigned long all, unsigned long one,
- unsigned long mask)
+u32 netdev_increment_features(u32 all, u32 one, u32 mask)
{
/* If device needs checksumming, downgrade to it. */
if (all & NETIF_F_NO_CSUM && !(one & NETIF_F_NO_CSUM))
@@ -6207,6 +6438,7 @@ static void __net_exit default_device_exit_batch(struct list_head *net_list)
}
}
unregister_netdevice_many(&dev_kill_list);
+ list_del(&dev_kill_list);
rtnl_unlock();
}
diff --git a/net/core/dst.c b/net/core/dst.c
index b99c7c7ffce..91104d35de7 100644
--- a/net/core/dst.c
+++ b/net/core/dst.c
@@ -164,7 +164,9 @@ int dst_discard(struct sk_buff *skb)
}
EXPORT_SYMBOL(dst_discard);
-void *dst_alloc(struct dst_ops *ops)
+const u32 dst_default_metrics[RTAX_MAX];
+
+void *dst_alloc(struct dst_ops *ops, int initial_ref)
{
struct dst_entry *dst;
@@ -175,11 +177,12 @@ void *dst_alloc(struct dst_ops *ops)
dst = kmem_cache_zalloc(ops->kmem_cachep, GFP_ATOMIC);
if (!dst)
return NULL;
- atomic_set(&dst->__refcnt, 0);
+ atomic_set(&dst->__refcnt, initial_ref);
dst->ops = ops;
dst->lastuse = jiffies;
dst->path = dst;
dst->input = dst->output = dst_discard;
+ dst_init_metrics(dst, dst_default_metrics, true);
#if RT_CACHE_DEBUG >= 2
atomic_inc(&dst_total);
#endif
@@ -282,6 +285,42 @@ void dst_release(struct dst_entry *dst)
}
EXPORT_SYMBOL(dst_release);
+u32 *dst_cow_metrics_generic(struct dst_entry *dst, unsigned long old)
+{
+ u32 *p = kmalloc(sizeof(u32) * RTAX_MAX, GFP_ATOMIC);
+
+ if (p) {
+ u32 *old_p = __DST_METRICS_PTR(old);
+ unsigned long prev, new;
+
+ memcpy(p, old_p, sizeof(u32) * RTAX_MAX);
+
+ new = (unsigned long) p;
+ prev = cmpxchg(&dst->_metrics, old, new);
+
+ if (prev != old) {
+ kfree(p);
+ p = __DST_METRICS_PTR(prev);
+ if (prev & DST_METRICS_READ_ONLY)
+ p = NULL;
+ }
+ }
+ return p;
+}
+EXPORT_SYMBOL(dst_cow_metrics_generic);
+
+/* Caller asserts that dst_metrics_read_only(dst) is false. */
+void __dst_destroy_metrics_generic(struct dst_entry *dst, unsigned long old)
+{
+ unsigned long prev, new;
+
+ new = (unsigned long) dst_default_metrics;
+ prev = cmpxchg(&dst->_metrics, old, new);
+ if (prev == old)
+ kfree(__DST_METRICS_PTR(old));
+}
+EXPORT_SYMBOL(__dst_destroy_metrics_generic);
+
/**
* skb_dst_set_noref - sets skb dst, without a reference
* @skb: buffer
diff --git a/net/core/ethtool.c b/net/core/ethtool.c
index 17741782a34..66cdc76770c 100644
--- a/net/core/ethtool.c
+++ b/net/core/ethtool.c
@@ -34,12 +34,6 @@ u32 ethtool_op_get_link(struct net_device *dev)
}
EXPORT_SYMBOL(ethtool_op_get_link);
-u32 ethtool_op_get_rx_csum(struct net_device *dev)
-{
- return (dev->features & NETIF_F_ALL_CSUM) != 0;
-}
-EXPORT_SYMBOL(ethtool_op_get_rx_csum);
-
u32 ethtool_op_get_tx_csum(struct net_device *dev)
{
return (dev->features & NETIF_F_ALL_CSUM) != 0;
@@ -55,6 +49,7 @@ int ethtool_op_set_tx_csum(struct net_device *dev, u32 data)
return 0;
}
+EXPORT_SYMBOL(ethtool_op_set_tx_csum);
int ethtool_op_set_tx_hw_csum(struct net_device *dev, u32 data)
{
@@ -171,6 +166,306 @@ EXPORT_SYMBOL(ethtool_ntuple_flush);
/* Handlers for each ethtool command */
+#define ETHTOOL_DEV_FEATURE_WORDS 1
+
+static int ethtool_get_features(struct net_device *dev, void __user *useraddr)
+{
+ struct ethtool_gfeatures cmd = {
+ .cmd = ETHTOOL_GFEATURES,
+ .size = ETHTOOL_DEV_FEATURE_WORDS,
+ };
+ struct ethtool_get_features_block features[ETHTOOL_DEV_FEATURE_WORDS] = {
+ {
+ .available = dev->hw_features,
+ .requested = dev->wanted_features,
+ .active = dev->features,
+ .never_changed = NETIF_F_NEVER_CHANGE,
+ },
+ };
+ u32 __user *sizeaddr;
+ u32 copy_size;
+
+ sizeaddr = useraddr + offsetof(struct ethtool_gfeatures, size);
+ if (get_user(copy_size, sizeaddr))
+ return -EFAULT;
+
+ if (copy_size > ETHTOOL_DEV_FEATURE_WORDS)
+ copy_size = ETHTOOL_DEV_FEATURE_WORDS;
+
+ if (copy_to_user(useraddr, &cmd, sizeof(cmd)))
+ return -EFAULT;
+ useraddr += sizeof(cmd);
+ if (copy_to_user(useraddr, features, copy_size * sizeof(*features)))
+ return -EFAULT;
+
+ return 0;
+}
+
+static int ethtool_set_features(struct net_device *dev, void __user *useraddr)
+{
+ struct ethtool_sfeatures cmd;
+ struct ethtool_set_features_block features[ETHTOOL_DEV_FEATURE_WORDS];
+ int ret = 0;
+
+ if (copy_from_user(&cmd, useraddr, sizeof(cmd)))
+ return -EFAULT;
+ useraddr += sizeof(cmd);
+
+ if (cmd.size != ETHTOOL_DEV_FEATURE_WORDS)
+ return -EINVAL;
+
+ if (copy_from_user(features, useraddr, sizeof(features)))
+ return -EFAULT;
+
+ if (features[0].valid & ~NETIF_F_ETHTOOL_BITS)
+ return -EINVAL;
+
+ if (features[0].valid & ~dev->hw_features) {
+ features[0].valid &= dev->hw_features;
+ ret |= ETHTOOL_F_UNSUPPORTED;
+ }
+
+ dev->wanted_features &= ~features[0].valid;
+ dev->wanted_features |= features[0].valid & features[0].requested;
+ netdev_update_features(dev);
+
+ if ((dev->wanted_features ^ dev->features) & features[0].valid)
+ ret |= ETHTOOL_F_WISH;
+
+ return ret;
+}
+
+static const char netdev_features_strings[ETHTOOL_DEV_FEATURE_WORDS * 32][ETH_GSTRING_LEN] = {
+ /* NETIF_F_SG */ "tx-scatter-gather",
+ /* NETIF_F_IP_CSUM */ "tx-checksum-ipv4",
+ /* NETIF_F_NO_CSUM */ "tx-checksum-unneeded",
+ /* NETIF_F_HW_CSUM */ "tx-checksum-ip-generic",
+ /* NETIF_F_IPV6_CSUM */ "tx_checksum-ipv6",
+ /* NETIF_F_HIGHDMA */ "highdma",
+ /* NETIF_F_FRAGLIST */ "tx-scatter-gather-fraglist",
+ /* NETIF_F_HW_VLAN_TX */ "tx-vlan-hw-insert",
+
+ /* NETIF_F_HW_VLAN_RX */ "rx-vlan-hw-parse",
+ /* NETIF_F_HW_VLAN_FILTER */ "rx-vlan-filter",
+ /* NETIF_F_VLAN_CHALLENGED */ "vlan-challenged",
+ /* NETIF_F_GSO */ "tx-generic-segmentation",
+ /* NETIF_F_LLTX */ "tx-lockless",
+ /* NETIF_F_NETNS_LOCAL */ "netns-local",
+ /* NETIF_F_GRO */ "rx-gro",
+ /* NETIF_F_LRO */ "rx-lro",
+
+ /* NETIF_F_TSO */ "tx-tcp-segmentation",
+ /* NETIF_F_UFO */ "tx-udp-fragmentation",
+ /* NETIF_F_GSO_ROBUST */ "tx-gso-robust",
+ /* NETIF_F_TSO_ECN */ "tx-tcp-ecn-segmentation",
+ /* NETIF_F_TSO6 */ "tx-tcp6-segmentation",
+ /* NETIF_F_FSO */ "tx-fcoe-segmentation",
+ "",
+ "",
+
+ /* NETIF_F_FCOE_CRC */ "tx-checksum-fcoe-crc",
+ /* NETIF_F_SCTP_CSUM */ "tx-checksum-sctp",
+ /* NETIF_F_FCOE_MTU */ "fcoe-mtu",
+ /* NETIF_F_NTUPLE */ "rx-ntuple-filter",
+ /* NETIF_F_RXHASH */ "rx-hashing",
+ /* NETIF_F_RXCSUM */ "rx-checksum",
+ "",
+ "",
+};
+
+static int __ethtool_get_sset_count(struct net_device *dev, int sset)
+{
+ const struct ethtool_ops *ops = dev->ethtool_ops;
+
+ if (sset == ETH_SS_FEATURES)
+ return ARRAY_SIZE(netdev_features_strings);
+
+ if (ops && ops->get_sset_count && ops->get_strings)
+ return ops->get_sset_count(dev, sset);
+ else
+ return -EOPNOTSUPP;
+}
+
+static void __ethtool_get_strings(struct net_device *dev,
+ u32 stringset, u8 *data)
+{
+ const struct ethtool_ops *ops = dev->ethtool_ops;
+
+ if (stringset == ETH_SS_FEATURES)
+ memcpy(data, netdev_features_strings,
+ sizeof(netdev_features_strings));
+ else
+ /* ops->get_strings is valid because checked earlier */
+ ops->get_strings(dev, stringset, data);
+}
+
+static u32 ethtool_get_feature_mask(u32 eth_cmd)
+{
+ /* feature masks of legacy discrete ethtool ops */
+
+ switch (eth_cmd) {
+ case ETHTOOL_GTXCSUM:
+ case ETHTOOL_STXCSUM:
+ return NETIF_F_ALL_CSUM | NETIF_F_SCTP_CSUM;
+ case ETHTOOL_GRXCSUM:
+ case ETHTOOL_SRXCSUM:
+ return NETIF_F_RXCSUM;
+ case ETHTOOL_GSG:
+ case ETHTOOL_SSG:
+ return NETIF_F_SG;
+ case ETHTOOL_GTSO:
+ case ETHTOOL_STSO:
+ return NETIF_F_ALL_TSO;
+ case ETHTOOL_GUFO:
+ case ETHTOOL_SUFO:
+ return NETIF_F_UFO;
+ case ETHTOOL_GGSO:
+ case ETHTOOL_SGSO:
+ return NETIF_F_GSO;
+ case ETHTOOL_GGRO:
+ case ETHTOOL_SGRO:
+ return NETIF_F_GRO;
+ default:
+ BUG();
+ }
+}
+
+static void *__ethtool_get_one_feature_actor(struct net_device *dev, u32 ethcmd)
+{
+ const struct ethtool_ops *ops = dev->ethtool_ops;
+
+ if (!ops)
+ return NULL;
+
+ switch (ethcmd) {
+ case ETHTOOL_GTXCSUM:
+ return ops->get_tx_csum;
+ case ETHTOOL_GRXCSUM:
+ return ops->get_rx_csum;
+ case ETHTOOL_SSG:
+ return ops->get_sg;
+ case ETHTOOL_STSO:
+ return ops->get_tso;
+ case ETHTOOL_SUFO:
+ return ops->get_ufo;
+ default:
+ return NULL;
+ }
+}
+
+static u32 __ethtool_get_rx_csum_oldbug(struct net_device *dev)
+{
+ return !!(dev->features & NETIF_F_ALL_CSUM);
+}
+
+static int ethtool_get_one_feature(struct net_device *dev,
+ char __user *useraddr, u32 ethcmd)
+{
+ u32 mask = ethtool_get_feature_mask(ethcmd);
+ struct ethtool_value edata = {
+ .cmd = ethcmd,
+ .data = !!(dev->features & mask),
+ };
+
+ /* compatibility with discrete get_ ops */
+ if (!(dev->hw_features & mask)) {
+ u32 (*actor)(struct net_device *);
+
+ actor = __ethtool_get_one_feature_actor(dev, ethcmd);
+
+ /* bug compatibility with old get_rx_csum */
+ if (ethcmd == ETHTOOL_GRXCSUM && !actor)
+ actor = __ethtool_get_rx_csum_oldbug;
+
+ if (actor)
+ edata.data = actor(dev);
+ }
+
+ if (copy_to_user(useraddr, &edata, sizeof(edata)))
+ return -EFAULT;
+ return 0;
+}
+
+static int __ethtool_set_tx_csum(struct net_device *dev, u32 data);
+static int __ethtool_set_rx_csum(struct net_device *dev, u32 data);
+static int __ethtool_set_sg(struct net_device *dev, u32 data);
+static int __ethtool_set_tso(struct net_device *dev, u32 data);
+static int __ethtool_set_ufo(struct net_device *dev, u32 data);
+
+static int ethtool_set_one_feature(struct net_device *dev,
+ void __user *useraddr, u32 ethcmd)
+{
+ struct ethtool_value edata;
+ u32 mask;
+
+ if (copy_from_user(&edata, useraddr, sizeof(edata)))
+ return -EFAULT;
+
+ mask = ethtool_get_feature_mask(ethcmd);
+ mask &= dev->hw_features;
+ if (mask) {
+ if (edata.data)
+ dev->wanted_features |= mask;
+ else
+ dev->wanted_features &= ~mask;
+
+ netdev_update_features(dev);
+ return 0;
+ }
+
+ /* Driver is not converted to ndo_fix_features or does not
+ * support changing this offload. In the latter case it won't
+ * have corresponding ethtool_ops field set.
+ *
+ * Following part is to be removed after all drivers advertise
+ * their changeable features in netdev->hw_features and stop
+ * using discrete offload setting ops.
+ */
+
+ switch (ethcmd) {
+ case ETHTOOL_STXCSUM:
+ return __ethtool_set_tx_csum(dev, edata.data);
+ case ETHTOOL_SRXCSUM:
+ return __ethtool_set_rx_csum(dev, edata.data);
+ case ETHTOOL_SSG:
+ return __ethtool_set_sg(dev, edata.data);
+ case ETHTOOL_STSO:
+ return __ethtool_set_tso(dev, edata.data);
+ case ETHTOOL_SUFO:
+ return __ethtool_set_ufo(dev, edata.data);
+ default:
+ return -EOPNOTSUPP;
+ }
+}
+
+static int __ethtool_set_flags(struct net_device *dev, u32 data)
+{
+ u32 changed;
+
+ if (data & ~flags_dup_features)
+ return -EINVAL;
+
+ /* legacy set_flags() op */
+ if (dev->ethtool_ops->set_flags) {
+ if (unlikely(dev->hw_features & flags_dup_features))
+ netdev_warn(dev,
+ "driver BUG: mixed hw_features and set_flags()\n");
+ return dev->ethtool_ops->set_flags(dev, data);
+ }
+
+ /* allow changing only bits set in hw_features */
+ changed = (data ^ dev->wanted_features) & flags_dup_features;
+ if (changed & ~dev->hw_features)
+ return (changed & dev->hw_features) ? -EINVAL : -EOPNOTSUPP;
+
+ dev->wanted_features =
+ (dev->wanted_features & ~changed) | data;
+
+ netdev_update_features(dev);
+
+ return 0;
+}
+
static int ethtool_get_settings(struct net_device *dev, void __user *useraddr)
{
struct ethtool_cmd cmd = { .cmd = ETHTOOL_GSET };
@@ -251,14 +546,10 @@ static noinline_for_stack int ethtool_get_sset_info(struct net_device *dev,
void __user *useraddr)
{
struct ethtool_sset_info info;
- const struct ethtool_ops *ops = dev->ethtool_ops;
u64 sset_mask;
int i, idx = 0, n_bits = 0, ret, rc;
u32 *info_buf = NULL;
- if (!ops->get_sset_count)
- return -EOPNOTSUPP;
-
if (copy_from_user(&info, useraddr, sizeof(info)))
return -EFAULT;
@@ -285,7 +576,7 @@ static noinline_for_stack int ethtool_get_sset_info(struct net_device *dev,
if (!(sset_mask & (1ULL << i)))
continue;
- rc = ops->get_sset_count(dev, i);
+ rc = __ethtool_get_sset_count(dev, i);
if (rc >= 0) {
info.sset_mask |= (1ULL << i);
info_buf[idx++] = rc;
@@ -817,7 +1108,7 @@ static int ethtool_get_regs(struct net_device *dev, char __user *useraddr)
if (regs.len > reglen)
regs.len = reglen;
- regbuf = vmalloc(reglen);
+ regbuf = vzalloc(reglen);
if (!regbuf)
return -ENOMEM;
@@ -1091,6 +1382,9 @@ static int __ethtool_set_sg(struct net_device *dev, u32 data)
{
int err;
+ if (data && !(dev->features & NETIF_F_ALL_CSUM))
+ return -EINVAL;
+
if (!data && dev->ethtool_ops->set_tso) {
err = dev->ethtool_ops->set_tso(dev, 0);
if (err)
@@ -1105,145 +1399,55 @@ static int __ethtool_set_sg(struct net_device *dev, u32 data)
return dev->ethtool_ops->set_sg(dev, data);
}
-static int ethtool_set_tx_csum(struct net_device *dev, char __user *useraddr)
+static int __ethtool_set_tx_csum(struct net_device *dev, u32 data)
{
- struct ethtool_value edata;
int err;
if (!dev->ethtool_ops->set_tx_csum)
return -EOPNOTSUPP;
- if (copy_from_user(&edata, useraddr, sizeof(edata)))
- return -EFAULT;
-
- if (!edata.data && dev->ethtool_ops->set_sg) {
+ if (!data && dev->ethtool_ops->set_sg) {
err = __ethtool_set_sg(dev, 0);
if (err)
return err;
}
- return dev->ethtool_ops->set_tx_csum(dev, edata.data);
+ return dev->ethtool_ops->set_tx_csum(dev, data);
}
-EXPORT_SYMBOL(ethtool_op_set_tx_csum);
-static int ethtool_set_rx_csum(struct net_device *dev, char __user *useraddr)
+static int __ethtool_set_rx_csum(struct net_device *dev, u32 data)
{
- struct ethtool_value edata;
-
if (!dev->ethtool_ops->set_rx_csum)
return -EOPNOTSUPP;
- if (copy_from_user(&edata, useraddr, sizeof(edata)))
- return -EFAULT;
-
- if (!edata.data && dev->ethtool_ops->set_sg)
+ if (!data)
dev->features &= ~NETIF_F_GRO;
- return dev->ethtool_ops->set_rx_csum(dev, edata.data);
+ return dev->ethtool_ops->set_rx_csum(dev, data);
}
-static int ethtool_set_sg(struct net_device *dev, char __user *useraddr)
+static int __ethtool_set_tso(struct net_device *dev, u32 data)
{
- struct ethtool_value edata;
-
- if (!dev->ethtool_ops->set_sg)
- return -EOPNOTSUPP;
-
- if (copy_from_user(&edata, useraddr, sizeof(edata)))
- return -EFAULT;
-
- if (edata.data &&
- !(dev->features & NETIF_F_ALL_CSUM))
- return -EINVAL;
-
- return __ethtool_set_sg(dev, edata.data);
-}
-
-static int ethtool_set_tso(struct net_device *dev, char __user *useraddr)
-{
- struct ethtool_value edata;
-
if (!dev->ethtool_ops->set_tso)
return -EOPNOTSUPP;
- if (copy_from_user(&edata, useraddr, sizeof(edata)))
- return -EFAULT;
-
- if (edata.data && !(dev->features & NETIF_F_SG))
+ if (data && !(dev->features & NETIF_F_SG))
return -EINVAL;
- return dev->ethtool_ops->set_tso(dev, edata.data);
+ return dev->ethtool_ops->set_tso(dev, data);
}
-static int ethtool_set_ufo(struct net_device *dev, char __user *useraddr)
+static int __ethtool_set_ufo(struct net_device *dev, u32 data)
{
- struct ethtool_value edata;
-
if (!dev->ethtool_ops->set_ufo)
return -EOPNOTSUPP;
- if (copy_from_user(&edata, useraddr, sizeof(edata)))
- return -EFAULT;
- if (edata.data && !(dev->features & NETIF_F_SG))
+ if (data && !(dev->features & NETIF_F_SG))
return -EINVAL;
- if (edata.data && !((dev->features & NETIF_F_GEN_CSUM) ||
+ if (data && !((dev->features & NETIF_F_GEN_CSUM) ||
(dev->features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))
== (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM)))
return -EINVAL;
- return dev->ethtool_ops->set_ufo(dev, edata.data);
-}
-
-static int ethtool_get_gso(struct net_device *dev, char __user *useraddr)
-{
- struct ethtool_value edata = { ETHTOOL_GGSO };
-
- edata.data = dev->features & NETIF_F_GSO;
- if (copy_to_user(useraddr, &edata, sizeof(edata)))
- return -EFAULT;
- return 0;
-}
-
-static int ethtool_set_gso(struct net_device *dev, char __user *useraddr)
-{
- struct ethtool_value edata;
-
- if (copy_from_user(&edata, useraddr, sizeof(edata)))
- return -EFAULT;
- if (edata.data)
- dev->features |= NETIF_F_GSO;
- else
- dev->features &= ~NETIF_F_GSO;
- return 0;
-}
-
-static int ethtool_get_gro(struct net_device *dev, char __user *useraddr)
-{
- struct ethtool_value edata = { ETHTOOL_GGRO };
-
- edata.data = dev->features & NETIF_F_GRO;
- if (copy_to_user(useraddr, &edata, sizeof(edata)))
- return -EFAULT;
- return 0;
-}
-
-static int ethtool_set_gro(struct net_device *dev, char __user *useraddr)
-{
- struct ethtool_value edata;
-
- if (copy_from_user(&edata, useraddr, sizeof(edata)))
- return -EFAULT;
-
- if (edata.data) {
- u32 rxcsum = dev->ethtool_ops->get_rx_csum ?
- dev->ethtool_ops->get_rx_csum(dev) :
- ethtool_op_get_rx_csum(dev);
-
- if (!rxcsum)
- return -EINVAL;
- dev->features |= NETIF_F_GRO;
- } else
- dev->features &= ~NETIF_F_GRO;
-
- return 0;
+ return dev->ethtool_ops->set_ufo(dev, data);
}
static int ethtool_self_test(struct net_device *dev, char __user *useraddr)
@@ -1287,17 +1491,13 @@ static int ethtool_self_test(struct net_device *dev, char __user *useraddr)
static int ethtool_get_strings(struct net_device *dev, void __user *useraddr)
{
struct ethtool_gstrings gstrings;
- const struct ethtool_ops *ops = dev->ethtool_ops;
u8 *data;
int ret;
- if (!ops->get_strings || !ops->get_sset_count)
- return -EOPNOTSUPP;
-
if (copy_from_user(&gstrings, useraddr, sizeof(gstrings)))
return -EFAULT;
- ret = ops->get_sset_count(dev, gstrings.string_set);
+ ret = __ethtool_get_sset_count(dev, gstrings.string_set);
if (ret < 0)
return ret;
@@ -1307,7 +1507,7 @@ static int ethtool_get_strings(struct net_device *dev, void __user *useraddr)
if (!data)
return -ENOMEM;
- ops->get_strings(dev, gstrings.string_set, data);
+ __ethtool_get_strings(dev, gstrings.string_set, data);
ret = -EFAULT;
if (copy_to_user(useraddr, &gstrings, sizeof(gstrings)))
@@ -1317,7 +1517,7 @@ static int ethtool_get_strings(struct net_device *dev, void __user *useraddr)
goto out;
ret = 0;
- out:
+out:
kfree(data);
return ret;
}
@@ -1458,7 +1658,7 @@ int dev_ethtool(struct net *net, struct ifreq *ifr)
void __user *useraddr = ifr->ifr_data;
u32 ethcmd;
int rc;
- unsigned long old_features;
+ u32 old_features;
if (!dev || !netif_device_present(dev))
return -ENODEV;
@@ -1500,6 +1700,7 @@ int dev_ethtool(struct net *net, struct ifreq *ifr)
case ETHTOOL_GRXCLSRLCNT:
case ETHTOOL_GRXCLSRULE:
case ETHTOOL_GRXCLSRLALL:
+ case ETHTOOL_GFEATURES:
break;
default:
if (!capable(CAP_NET_ADMIN))
@@ -1570,42 +1771,6 @@ int dev_ethtool(struct net *net, struct ifreq *ifr)
case ETHTOOL_SPAUSEPARAM:
rc = ethtool_set_pauseparam(dev, useraddr);
break;
- case ETHTOOL_GRXCSUM:
- rc = ethtool_get_value(dev, useraddr, ethcmd,
- (dev->ethtool_ops->get_rx_csum ?
- dev->ethtool_ops->get_rx_csum :
- ethtool_op_get_rx_csum));
- break;
- case ETHTOOL_SRXCSUM:
- rc = ethtool_set_rx_csum(dev, useraddr);
- break;
- case ETHTOOL_GTXCSUM:
- rc = ethtool_get_value(dev, useraddr, ethcmd,
- (dev->ethtool_ops->get_tx_csum ?
- dev->ethtool_ops->get_tx_csum :
- ethtool_op_get_tx_csum));
- break;
- case ETHTOOL_STXCSUM:
- rc = ethtool_set_tx_csum(dev, useraddr);
- break;
- case ETHTOOL_GSG:
- rc = ethtool_get_value(dev, useraddr, ethcmd,
- (dev->ethtool_ops->get_sg ?
- dev->ethtool_ops->get_sg :
- ethtool_op_get_sg));
- break;
- case ETHTOOL_SSG:
- rc = ethtool_set_sg(dev, useraddr);
- break;
- case ETHTOOL_GTSO:
- rc = ethtool_get_value(dev, useraddr, ethcmd,
- (dev->ethtool_ops->get_tso ?
- dev->ethtool_ops->get_tso :
- ethtool_op_get_tso));
- break;
- case ETHTOOL_STSO:
- rc = ethtool_set_tso(dev, useraddr);
- break;
case ETHTOOL_TEST:
rc = ethtool_self_test(dev, useraddr);
break;
@@ -1621,21 +1786,6 @@ int dev_ethtool(struct net *net, struct ifreq *ifr)
case ETHTOOL_GPERMADDR:
rc = ethtool_get_perm_addr(dev, useraddr);
break;
- case ETHTOOL_GUFO:
- rc = ethtool_get_value(dev, useraddr, ethcmd,
- (dev->ethtool_ops->get_ufo ?
- dev->ethtool_ops->get_ufo :
- ethtool_op_get_ufo));
- break;
- case ETHTOOL_SUFO:
- rc = ethtool_set_ufo(dev, useraddr);
- break;
- case ETHTOOL_GGSO:
- rc = ethtool_get_gso(dev, useraddr);
- break;
- case ETHTOOL_SGSO:
- rc = ethtool_set_gso(dev, useraddr);
- break;
case ETHTOOL_GFLAGS:
rc = ethtool_get_value(dev, useraddr, ethcmd,
(dev->ethtool_ops->get_flags ?
@@ -1643,8 +1793,7 @@ int dev_ethtool(struct net *net, struct ifreq *ifr)
ethtool_op_get_flags));
break;
case ETHTOOL_SFLAGS:
- rc = ethtool_set_value(dev, useraddr,
- dev->ethtool_ops->set_flags);
+ rc = ethtool_set_value(dev, useraddr, __ethtool_set_flags);
break;
case ETHTOOL_GPFLAGS:
rc = ethtool_get_value(dev, useraddr, ethcmd,
@@ -1666,12 +1815,6 @@ int dev_ethtool(struct net *net, struct ifreq *ifr)
case ETHTOOL_SRXCLSRLINS:
rc = ethtool_set_rxnfc(dev, ethcmd, useraddr);
break;
- case ETHTOOL_GGRO:
- rc = ethtool_get_gro(dev, useraddr);
- break;
- case ETHTOOL_SGRO:
- rc = ethtool_set_gro(dev, useraddr);
- break;
case ETHTOOL_FLASHDEV:
rc = ethtool_flash_device(dev, useraddr);
break;
@@ -1693,6 +1836,30 @@ int dev_ethtool(struct net *net, struct ifreq *ifr)
case ETHTOOL_SRXFHINDIR:
rc = ethtool_set_rxfh_indir(dev, useraddr);
break;
+ case ETHTOOL_GFEATURES:
+ rc = ethtool_get_features(dev, useraddr);
+ break;
+ case ETHTOOL_SFEATURES:
+ rc = ethtool_set_features(dev, useraddr);
+ break;
+ case ETHTOOL_GTXCSUM:
+ case ETHTOOL_GRXCSUM:
+ case ETHTOOL_GSG:
+ case ETHTOOL_GTSO:
+ case ETHTOOL_GUFO:
+ case ETHTOOL_GGSO:
+ case ETHTOOL_GGRO:
+ rc = ethtool_get_one_feature(dev, useraddr, ethcmd);
+ break;
+ case ETHTOOL_STXCSUM:
+ case ETHTOOL_SRXCSUM:
+ case ETHTOOL_SSG:
+ case ETHTOOL_STSO:
+ case ETHTOOL_SUFO:
+ case ETHTOOL_SGSO:
+ case ETHTOOL_SGRO:
+ rc = ethtool_set_one_feature(dev, useraddr, ethcmd);
+ break;
default:
rc = -EOPNOTSUPP;
}
diff --git a/net/core/filter.c b/net/core/filter.c
index afc58374ca9..232b1873bb2 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -142,14 +142,14 @@ int sk_filter(struct sock *sk, struct sk_buff *skb)
if (err)
return err;
- rcu_read_lock_bh();
- filter = rcu_dereference_bh(sk->sk_filter);
+ rcu_read_lock();
+ filter = rcu_dereference(sk->sk_filter);
if (filter) {
unsigned int pkt_len = sk_run_filter(skb, filter->insns);
err = pkt_len ? pskb_trim(skb, pkt_len) : -EPERM;
}
- rcu_read_unlock_bh();
+ rcu_read_unlock();
return err;
}
diff --git a/net/core/neighbour.c b/net/core/neighbour.c
index 60a90291342..799f06e03a2 100644
--- a/net/core/neighbour.c
+++ b/net/core/neighbour.c
@@ -316,7 +316,7 @@ static struct neigh_hash_table *neigh_hash_alloc(unsigned int entries)
{
size_t size = entries * sizeof(struct neighbour *);
struct neigh_hash_table *ret;
- struct neighbour **buckets;
+ struct neighbour __rcu **buckets;
ret = kmalloc(sizeof(*ret), GFP_ATOMIC);
if (!ret)
@@ -324,14 +324,14 @@ static struct neigh_hash_table *neigh_hash_alloc(unsigned int entries)
if (size <= PAGE_SIZE)
buckets = kzalloc(size, GFP_ATOMIC);
else
- buckets = (struct neighbour **)
+ buckets = (struct neighbour __rcu **)
__get_free_pages(GFP_ATOMIC | __GFP_ZERO,
get_order(size));
if (!buckets) {
kfree(ret);
return NULL;
}
- rcu_assign_pointer(ret->hash_buckets, buckets);
+ ret->hash_buckets = buckets;
ret->hash_mask = entries - 1;
get_random_bytes(&ret->hash_rnd, sizeof(ret->hash_rnd));
return ret;
@@ -343,7 +343,7 @@ static void neigh_hash_free_rcu(struct rcu_head *head)
struct neigh_hash_table,
rcu);
size_t size = (nht->hash_mask + 1) * sizeof(struct neighbour *);
- struct neighbour **buckets = nht->hash_buckets;
+ struct neighbour __rcu **buckets = nht->hash_buckets;
if (size <= PAGE_SIZE)
kfree(buckets);
@@ -1540,7 +1540,7 @@ void neigh_table_init_no_netlink(struct neigh_table *tbl)
panic("cannot create neighbour proc dir entry");
#endif
- tbl->nht = neigh_hash_alloc(8);
+ RCU_INIT_POINTER(tbl->nht, neigh_hash_alloc(8));
phsize = (PNEIGH_HASHMASK + 1) * sizeof(struct pneigh_entry *);
tbl->phash_buckets = kzalloc(phsize, GFP_KERNEL);
@@ -1602,7 +1602,8 @@ int neigh_table_clear(struct neigh_table *tbl)
}
write_unlock(&neigh_tbl_lock);
- call_rcu(&tbl->nht->rcu, neigh_hash_free_rcu);
+ call_rcu(&rcu_dereference_protected(tbl->nht, 1)->rcu,
+ neigh_hash_free_rcu);
tbl->nht = NULL;
kfree(tbl->phash_buckets);
diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
index e23c01be5a5..5ceb257e860 100644
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -99,7 +99,7 @@ NETDEVICE_SHOW(addr_assign_type, fmt_dec);
NETDEVICE_SHOW(addr_len, fmt_dec);
NETDEVICE_SHOW(iflink, fmt_dec);
NETDEVICE_SHOW(ifindex, fmt_dec);
-NETDEVICE_SHOW(features, fmt_long_hex);
+NETDEVICE_SHOW(features, fmt_hex);
NETDEVICE_SHOW(type, fmt_dec);
NETDEVICE_SHOW(link_mode, fmt_dec);
@@ -295,6 +295,20 @@ static ssize_t show_ifalias(struct device *dev,
return ret;
}
+NETDEVICE_SHOW(group, fmt_dec);
+
+static int change_group(struct net_device *net, unsigned long new_group)
+{
+ dev_set_group(net, (int) new_group);
+ return 0;
+}
+
+static ssize_t store_group(struct device *dev, struct device_attribute *attr,
+ const char *buf, size_t len)
+{
+ return netdev_store(dev, attr, buf, len, change_group);
+}
+
static struct device_attribute net_class_attributes[] = {
__ATTR(addr_assign_type, S_IRUGO, show_addr_assign_type, NULL),
__ATTR(addr_len, S_IRUGO, show_addr_len, NULL),
@@ -316,6 +330,7 @@ static struct device_attribute net_class_attributes[] = {
__ATTR(flags, S_IRUGO | S_IWUSR, show_flags, store_flags),
__ATTR(tx_queue_len, S_IRUGO | S_IWUSR, show_tx_queue_len,
store_tx_queue_len),
+ __ATTR(netdev_group, S_IRUGO | S_IWUSR, show_group, store_group),
{}
};
diff --git a/net/core/pktgen.c b/net/core/pktgen.c
index a9e7fc4c461..d73b77adb67 100644
--- a/net/core/pktgen.c
+++ b/net/core/pktgen.c
@@ -251,6 +251,7 @@ struct pktgen_dev {
int max_pkt_size; /* = ETH_ZLEN; */
int pkt_overhead; /* overhead for MPLS, VLANs, IPSEC etc */
int nfrags;
+ struct page *page;
u64 delay; /* nano-seconds */
__u64 count; /* Default No packets to send */
@@ -1134,6 +1135,10 @@ static ssize_t pktgen_if_write(struct file *file,
if (node_possible(value)) {
pkt_dev->node = value;
sprintf(pg_result, "OK: node=%d", pkt_dev->node);
+ if (pkt_dev->page) {
+ put_page(pkt_dev->page);
+ pkt_dev->page = NULL;
+ }
}
else
sprintf(pg_result, "ERROR: node not possible");
@@ -2605,6 +2610,90 @@ static inline __be16 build_tci(unsigned int id, unsigned int cfi,
return htons(id | (cfi << 12) | (prio << 13));
}
+static void pktgen_finalize_skb(struct pktgen_dev *pkt_dev, struct sk_buff *skb,
+ int datalen)
+{
+ struct timeval timestamp;
+ struct pktgen_hdr *pgh;
+
+ pgh = (struct pktgen_hdr *)skb_put(skb, sizeof(*pgh));
+ datalen -= sizeof(*pgh);
+
+ if (pkt_dev->nfrags <= 0) {
+ pgh = (struct pktgen_hdr *)skb_put(skb, datalen);
+ memset(pgh + 1, 0, datalen);
+ } else {
+ int frags = pkt_dev->nfrags;
+ int i, len;
+
+
+ if (frags > MAX_SKB_FRAGS)
+ frags = MAX_SKB_FRAGS;
+ len = datalen - frags * PAGE_SIZE;
+ if (len > 0) {
+ memset(skb_put(skb, len), 0, len);
+ datalen = frags * PAGE_SIZE;
+ }
+
+ i = 0;
+ while (datalen > 0) {
+ if (unlikely(!pkt_dev->page)) {
+ int node = numa_node_id();
+
+ if (pkt_dev->node >= 0 && (pkt_dev->flags & F_NODE))
+ node = pkt_dev->node;
+ pkt_dev->page = alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, 0);
+ if (!pkt_dev->page)
+ break;
+ }
+ skb_shinfo(skb)->frags[i].page = pkt_dev->page;
+ get_page(pkt_dev->page);
+ skb_shinfo(skb)->frags[i].page_offset = 0;
+ skb_shinfo(skb)->frags[i].size =
+ (datalen < PAGE_SIZE ? datalen : PAGE_SIZE);
+ datalen -= skb_shinfo(skb)->frags[i].size;
+ skb->len += skb_shinfo(skb)->frags[i].size;
+ skb->data_len += skb_shinfo(skb)->frags[i].size;
+ i++;
+ skb_shinfo(skb)->nr_frags = i;
+ }
+
+ while (i < frags) {
+ int rem;
+
+ if (i == 0)
+ break;
+
+ rem = skb_shinfo(skb)->frags[i - 1].size / 2;
+ if (rem == 0)
+ break;
+
+ skb_shinfo(skb)->frags[i - 1].size -= rem;
+
+ skb_shinfo(skb)->frags[i] =
+ skb_shinfo(skb)->frags[i - 1];
+ get_page(skb_shinfo(skb)->frags[i].page);
+ skb_shinfo(skb)->frags[i].page =
+ skb_shinfo(skb)->frags[i - 1].page;
+ skb_shinfo(skb)->frags[i].page_offset +=
+ skb_shinfo(skb)->frags[i - 1].size;
+ skb_shinfo(skb)->frags[i].size = rem;
+ i++;
+ skb_shinfo(skb)->nr_frags = i;
+ }
+ }
+
+ /* Stamp the time, and sequence number,
+ * convert them to network byte order
+ */
+ pgh->pgh_magic = htonl(PKTGEN_MAGIC);
+ pgh->seq_num = htonl(pkt_dev->seq_num);
+
+ do_gettimeofday(&timestamp);
+ pgh->tv_sec = htonl(timestamp.tv_sec);
+ pgh->tv_usec = htonl(timestamp.tv_usec);
+}
+
static struct sk_buff *fill_packet_ipv4(struct net_device *odev,
struct pktgen_dev *pkt_dev)
{
@@ -2613,7 +2702,6 @@ static struct sk_buff *fill_packet_ipv4(struct net_device *odev,
struct udphdr *udph;
int datalen, iplen;
struct iphdr *iph;
- struct pktgen_hdr *pgh = NULL;
__be16 protocol = htons(ETH_P_IP);
__be32 *mpls;
__be16 *vlan_tci = NULL; /* Encapsulates priority and VLAN ID */
@@ -2729,76 +2817,7 @@ static struct sk_buff *fill_packet_ipv4(struct net_device *odev,
pkt_dev->pkt_overhead);
skb->dev = odev;
skb->pkt_type = PACKET_HOST;
-
- if (pkt_dev->nfrags <= 0) {
- pgh = (struct pktgen_hdr *)skb_put(skb, datalen);
- memset(pgh + 1, 0, datalen - sizeof(struct pktgen_hdr));
- } else {
- int frags = pkt_dev->nfrags;
- int i, len;
-
- pgh = (struct pktgen_hdr *)(((char *)(udph)) + 8);
-
- if (frags > MAX_SKB_FRAGS)
- frags = MAX_SKB_FRAGS;
- if (datalen > frags * PAGE_SIZE) {
- len = datalen - frags * PAGE_SIZE;
- memset(skb_put(skb, len), 0, len);
- datalen = frags * PAGE_SIZE;
- }
-
- i = 0;
- while (datalen > 0) {
- struct page *page = alloc_pages(GFP_KERNEL | __GFP_ZERO, 0);
- skb_shinfo(skb)->frags[i].page = page;
- skb_shinfo(skb)->frags[i].page_offset = 0;
- skb_shinfo(skb)->frags[i].size =
- (datalen < PAGE_SIZE ? datalen : PAGE_SIZE);
- datalen -= skb_shinfo(skb)->frags[i].size;
- skb->len += skb_shinfo(skb)->frags[i].size;
- skb->data_len += skb_shinfo(skb)->frags[i].size;
- i++;
- skb_shinfo(skb)->nr_frags = i;
- }
-
- while (i < frags) {
- int rem;
-
- if (i == 0)
- break;
-
- rem = skb_shinfo(skb)->frags[i - 1].size / 2;
- if (rem == 0)
- break;
-
- skb_shinfo(skb)->frags[i - 1].size -= rem;
-
- skb_shinfo(skb)->frags[i] =
- skb_shinfo(skb)->frags[i - 1];
- get_page(skb_shinfo(skb)->frags[i].page);
- skb_shinfo(skb)->frags[i].page =
- skb_shinfo(skb)->frags[i - 1].page;
- skb_shinfo(skb)->frags[i].page_offset +=
- skb_shinfo(skb)->frags[i - 1].size;
- skb_shinfo(skb)->frags[i].size = rem;
- i++;
- skb_shinfo(skb)->nr_frags = i;
- }
- }
-
- /* Stamp the time, and sequence number,
- * convert them to network byte order
- */
- if (pgh) {
- struct timeval timestamp;
-
- pgh->pgh_magic = htonl(PKTGEN_MAGIC);
- pgh->seq_num = htonl(pkt_dev->seq_num);
-
- do_gettimeofday(&timestamp);
- pgh->tv_sec = htonl(timestamp.tv_sec);
- pgh->tv_usec = htonl(timestamp.tv_usec);
- }
+ pktgen_finalize_skb(pkt_dev, skb, datalen);
#ifdef CONFIG_XFRM
if (!process_ipsec(pkt_dev, skb, protocol))
@@ -2980,7 +2999,6 @@ static struct sk_buff *fill_packet_ipv6(struct net_device *odev,
struct udphdr *udph;
int datalen;
struct ipv6hdr *iph;
- struct pktgen_hdr *pgh = NULL;
__be16 protocol = htons(ETH_P_IPV6);
__be32 *mpls;
__be16 *vlan_tci = NULL; /* Encapsulates priority and VLAN ID */
@@ -3083,75 +3101,7 @@ static struct sk_buff *fill_packet_ipv6(struct net_device *odev,
skb->dev = odev;
skb->pkt_type = PACKET_HOST;
- if (pkt_dev->nfrags <= 0)
- pgh = (struct pktgen_hdr *)skb_put(skb, datalen);
- else {
- int frags = pkt_dev->nfrags;
- int i;
-
- pgh = (struct pktgen_hdr *)(((char *)(udph)) + 8);
-
- if (frags > MAX_SKB_FRAGS)
- frags = MAX_SKB_FRAGS;
- if (datalen > frags * PAGE_SIZE) {
- skb_put(skb, datalen - frags * PAGE_SIZE);
- datalen = frags * PAGE_SIZE;
- }
-
- i = 0;
- while (datalen > 0) {
- struct page *page = alloc_pages(GFP_KERNEL, 0);
- skb_shinfo(skb)->frags[i].page = page;
- skb_shinfo(skb)->frags[i].page_offset = 0;
- skb_shinfo(skb)->frags[i].size =
- (datalen < PAGE_SIZE ? datalen : PAGE_SIZE);
- datalen -= skb_shinfo(skb)->frags[i].size;
- skb->len += skb_shinfo(skb)->frags[i].size;
- skb->data_len += skb_shinfo(skb)->frags[i].size;
- i++;
- skb_shinfo(skb)->nr_frags = i;
- }
-
- while (i < frags) {
- int rem;
-
- if (i == 0)
- break;
-
- rem = skb_shinfo(skb)->frags[i - 1].size / 2;
- if (rem == 0)
- break;
-
- skb_shinfo(skb)->frags[i - 1].size -= rem;
-
- skb_shinfo(skb)->frags[i] =
- skb_shinfo(skb)->frags[i - 1];
- get_page(skb_shinfo(skb)->frags[i].page);
- skb_shinfo(skb)->frags[i].page =
- skb_shinfo(skb)->frags[i - 1].page;
- skb_shinfo(skb)->frags[i].page_offset +=
- skb_shinfo(skb)->frags[i - 1].size;
- skb_shinfo(skb)->frags[i].size = rem;
- i++;
- skb_shinfo(skb)->nr_frags = i;
- }
- }
-
- /* Stamp the time, and sequence number,
- * convert them to network byte order
- * should we update cloned packets too ?
- */
- if (pgh) {
- struct timeval timestamp;
-
- pgh->pgh_magic = htonl(PKTGEN_MAGIC);
- pgh->seq_num = htonl(pkt_dev->seq_num);
-
- do_gettimeofday(&timestamp);
- pgh->tv_sec = htonl(timestamp.tv_sec);
- pgh->tv_usec = htonl(timestamp.tv_usec);
- }
- /* pkt_dev->seq_num++; FF: you really mean this? */
+ pktgen_finalize_skb(pkt_dev, skb, datalen);
return skb;
}
@@ -3884,6 +3834,8 @@ static int pktgen_remove_device(struct pktgen_thread *t,
free_SAs(pkt_dev);
#endif
vfree(pkt_dev->flows);
+ if (pkt_dev->page)
+ put_page(pkt_dev->page);
kfree(pkt_dev);
return 0;
}
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index a5f7535aab5..49f7ea5b4c7 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -868,6 +868,7 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev,
netif_running(dev) ? dev->operstate : IF_OPER_DOWN);
NLA_PUT_U8(skb, IFLA_LINKMODE, dev->link_mode);
NLA_PUT_U32(skb, IFLA_MTU, dev->mtu);
+ NLA_PUT_U32(skb, IFLA_GROUP, dev->group);
if (dev->ifindex != dev->iflink)
NLA_PUT_U32(skb, IFLA_LINK, dev->iflink);
@@ -1035,6 +1036,7 @@ const struct nla_policy ifla_policy[IFLA_MAX+1] = {
[IFLA_MAP] = { .len = sizeof(struct rtnl_link_ifmap) },
[IFLA_MTU] = { .type = NLA_U32 },
[IFLA_LINK] = { .type = NLA_U32 },
+ [IFLA_MASTER] = { .type = NLA_U32 },
[IFLA_TXQLEN] = { .type = NLA_U32 },
[IFLA_WEIGHT] = { .type = NLA_U32 },
[IFLA_OPERSTATE] = { .type = NLA_U8 },
@@ -1121,8 +1123,7 @@ static int validate_linkmsg(struct net_device *dev, struct nlattr *tb[])
return -EOPNOTSUPP;
if (af_ops->validate_link_af) {
- err = af_ops->validate_link_af(dev,
- tb[IFLA_AF_SPEC]);
+ err = af_ops->validate_link_af(dev, af);
if (err < 0)
return err;
}
@@ -1178,6 +1179,41 @@ static int do_setvfinfo(struct net_device *dev, struct nlattr *attr)
return err;
}
+static int do_set_master(struct net_device *dev, int ifindex)
+{
+ struct net_device *master_dev;
+ const struct net_device_ops *ops;
+ int err;
+
+ if (dev->master) {
+ if (dev->master->ifindex == ifindex)
+ return 0;
+ ops = dev->master->netdev_ops;
+ if (ops->ndo_del_slave) {
+ err = ops->ndo_del_slave(dev->master, dev);
+ if (err)
+ return err;
+ } else {
+ return -EOPNOTSUPP;
+ }
+ }
+
+ if (ifindex) {
+ master_dev = __dev_get_by_index(dev_net(dev), ifindex);
+ if (!master_dev)
+ return -EINVAL;
+ ops = master_dev->netdev_ops;
+ if (ops->ndo_add_slave) {
+ err = ops->ndo_add_slave(master_dev, dev);
+ if (err)
+ return err;
+ } else {
+ return -EOPNOTSUPP;
+ }
+ }
+ return 0;
+}
+
static int do_setlink(struct net_device *dev, struct ifinfomsg *ifm,
struct nlattr **tb, char *ifname, int modified)
{
@@ -1265,6 +1301,11 @@ static int do_setlink(struct net_device *dev, struct ifinfomsg *ifm,
modified = 1;
}
+ if (tb[IFLA_GROUP]) {
+ dev_set_group(dev, nla_get_u32(tb[IFLA_GROUP]));
+ modified = 1;
+ }
+
/*
* Interface selected by interface index but interface
* name provided implies that a name change has been
@@ -1296,6 +1337,13 @@ static int do_setlink(struct net_device *dev, struct ifinfomsg *ifm,
goto errout;
}
+ if (tb[IFLA_MASTER]) {
+ err = do_set_master(dev, nla_get_u32(tb[IFLA_MASTER]));
+ if (err)
+ goto errout;
+ modified = 1;
+ }
+
if (tb[IFLA_TXQLEN])
dev->tx_queue_len = nla_get_u32(tb[IFLA_TXQLEN]);
@@ -1542,6 +1590,8 @@ struct net_device *rtnl_create_link(struct net *src_net, struct net *net,
set_operstate(dev, nla_get_u8(tb[IFLA_OPERSTATE]));
if (tb[IFLA_LINKMODE])
dev->link_mode = nla_get_u8(tb[IFLA_LINKMODE]);
+ if (tb[IFLA_GROUP])
+ dev_set_group(dev, nla_get_u32(tb[IFLA_GROUP]));
return dev;
@@ -1552,6 +1602,24 @@ err:
}
EXPORT_SYMBOL(rtnl_create_link);
+static int rtnl_group_changelink(struct net *net, int group,
+ struct ifinfomsg *ifm,
+ struct nlattr **tb)
+{
+ struct net_device *dev;
+ int err;
+
+ for_each_netdev(net, dev) {
+ if (dev->group == group) {
+ err = do_setlink(dev, ifm, tb, NULL, 0);
+ if (err < 0)
+ return err;
+ }
+ }
+
+ return 0;
+}
+
static int rtnl_newlink(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
{
struct net *net = sock_net(skb->sk);
@@ -1579,10 +1647,12 @@ replay:
ifm = nlmsg_data(nlh);
if (ifm->ifi_index > 0)
dev = __dev_get_by_index(net, ifm->ifi_index);
- else if (ifname[0])
- dev = __dev_get_by_name(net, ifname);
- else
- dev = NULL;
+ else {
+ if (ifname[0])
+ dev = __dev_get_by_name(net, ifname);
+ else
+ dev = NULL;
+ }
err = validate_linkmsg(dev, tb);
if (err < 0)
@@ -1646,8 +1716,13 @@ replay:
return do_setlink(dev, ifm, tb, ifname, modified);
}
- if (!(nlh->nlmsg_flags & NLM_F_CREATE))
+ if (!(nlh->nlmsg_flags & NLM_F_CREATE)) {
+ if (ifm->ifi_index == 0 && tb[IFLA_GROUP])
+ return rtnl_group_changelink(net,
+ nla_get_u32(tb[IFLA_GROUP]),
+ ifm, tb);
return -ENODEV;
+ }
if (ifm->ifi_index)
return -EOPNOTSUPP;
@@ -1672,6 +1747,9 @@ replay:
snprintf(ifname, IFNAMSIZ, "%s%%d", ops->kind);
dest_net = rtnl_link_get_net(net, tb);
+ if (IS_ERR(dest_net))
+ return PTR_ERR(dest_net);
+
dev = rtnl_create_link(net, dest_net, ifname, ops, tb);
if (IS_ERR(dev))
@@ -1820,7 +1898,7 @@ static int rtnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
if (kind != 2 && security_netlink_recv(skb, CAP_NET_ADMIN))
return -EPERM;
- if (kind == 2 && (nlh->nlmsg_flags & NLM_F_DUMP) == NLM_F_DUMP) {
+ if (kind == 2 && nlh->nlmsg_flags&NLM_F_DUMP) {
struct sock *rtnl;
rtnl_dumpit_func dumpit;
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index d31bb36ae0d..14cf560b4a3 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -210,6 +210,7 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
shinfo = skb_shinfo(skb);
memset(shinfo, 0, offsetof(struct skb_shared_info, dataref));
atomic_set(&shinfo->dataref, 1);
+ kmemcheck_annotate_variable(shinfo->destructor_arg);
if (fclone) {
struct sk_buff *child = skb + 1;
@@ -2497,7 +2498,7 @@ EXPORT_SYMBOL_GPL(skb_pull_rcsum);
* a pointer to the first in a list of new skbs for the segments.
* In case of error it returns ERR_PTR(err).
*/
-struct sk_buff *skb_segment(struct sk_buff *skb, int features)
+struct sk_buff *skb_segment(struct sk_buff *skb, u32 features)
{
struct sk_buff *segs = NULL;
struct sk_buff *tail = NULL;
@@ -2507,7 +2508,7 @@ struct sk_buff *skb_segment(struct sk_buff *skb, int features)
unsigned int offset = doffset;
unsigned int headroom;
unsigned int len;
- int sg = features & NETIF_F_SG;
+ int sg = !!(features & NETIF_F_SG);
int nfrags = skb_shinfo(skb)->nr_frags;
int err = -ENOMEM;
int i = 0;
@@ -2744,8 +2745,12 @@ int skb_gro_receive(struct sk_buff **head, struct sk_buff *skb)
merge:
if (offset > headlen) {
- skbinfo->frags[0].page_offset += offset - headlen;
- skbinfo->frags[0].size -= offset - headlen;
+ unsigned int eat = offset - headlen;
+
+ skbinfo->frags[0].page_offset += eat;
+ skbinfo->frags[0].size -= eat;
+ skb->data_len -= eat;
+ skb->len -= eat;
offset = headlen;
}
diff --git a/net/dcb/dcbnl.c b/net/dcb/dcbnl.c
index d900ab99814..d5074a56728 100644
--- a/net/dcb/dcbnl.c
+++ b/net/dcb/dcbnl.c
@@ -583,7 +583,7 @@ static int dcbnl_getapp(struct net_device *netdev, struct nlattr **tb,
u8 up, idtype;
int ret = -EINVAL;
- if (!tb[DCB_ATTR_APP] || !netdev->dcbnl_ops->getapp)
+ if (!tb[DCB_ATTR_APP])
goto out;
ret = nla_parse_nested(app_tb, DCB_APP_ATTR_MAX, tb[DCB_ATTR_APP],
@@ -604,7 +604,16 @@ static int dcbnl_getapp(struct net_device *netdev, struct nlattr **tb,
goto out;
id = nla_get_u16(app_tb[DCB_APP_ATTR_ID]);
- up = netdev->dcbnl_ops->getapp(netdev, idtype, id);
+
+ if (netdev->dcbnl_ops->getapp) {
+ up = netdev->dcbnl_ops->getapp(netdev, idtype, id);
+ } else {
+ struct dcb_app app = {
+ .selector = idtype,
+ .protocol = id,
+ };
+ up = dcb_getapp(netdev, &app);
+ }
/* send this back */
dcbnl_skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
@@ -617,6 +626,9 @@ static int dcbnl_getapp(struct net_device *netdev, struct nlattr **tb,
dcb->cmd = DCB_CMD_GAPP;
app_nest = nla_nest_start(dcbnl_skb, DCB_ATTR_APP);
+ if (!app_nest)
+ goto out_cancel;
+
ret = nla_put_u8(dcbnl_skb, DCB_APP_ATTR_IDTYPE, idtype);
if (ret)
goto out_cancel;
@@ -1604,6 +1616,10 @@ EXPORT_SYMBOL(dcb_getapp);
u8 dcb_setapp(struct net_device *dev, struct dcb_app *new)
{
struct dcb_app_type *itr;
+ struct dcb_app_type event;
+
+ memcpy(&event.name, dev->name, sizeof(event.name));
+ memcpy(&event.app, new, sizeof(event.app));
spin_lock(&dcb_lock);
/* Search for existing match and replace */
@@ -1635,7 +1651,7 @@ u8 dcb_setapp(struct net_device *dev, struct dcb_app *new)
}
out:
spin_unlock(&dcb_lock);
- call_dcbevent_notifiers(DCB_APP_EVENT, new);
+ call_dcbevent_notifiers(DCB_APP_EVENT, &event);
return 0;
}
EXPORT_SYMBOL(dcb_setapp);
diff --git a/net/dccp/ccids/ccid2.c b/net/dccp/ccids/ccid2.c
index e96d5e81003..fadecd20d75 100644
--- a/net/dccp/ccids/ccid2.c
+++ b/net/dccp/ccids/ccid2.c
@@ -583,6 +583,15 @@ done:
dccp_ackvec_parsed_cleanup(&hc->tx_av_chunks);
}
+/*
+ * Convert RFC 3390 larger initial window into an equivalent number of packets.
+ * This is based on the numbers specified in RFC 5681, 3.1.
+ */
+static inline u32 rfc3390_bytes_to_packets(const u32 smss)
+{
+ return smss <= 1095 ? 4 : (smss > 2190 ? 2 : 3);
+}
+
static int ccid2_hc_tx_init(struct ccid *ccid, struct sock *sk)
{
struct ccid2_hc_tx_sock *hc = ccid_priv(ccid);
diff --git a/net/decnet/dn_route.c b/net/decnet/dn_route.c
index 5e636365d33..06c054d5ccb 100644
--- a/net/decnet/dn_route.c
+++ b/net/decnet/dn_route.c
@@ -112,6 +112,7 @@ static int dn_dst_gc(struct dst_ops *ops);
static struct dst_entry *dn_dst_check(struct dst_entry *, __u32);
static unsigned int dn_dst_default_advmss(const struct dst_entry *dst);
static unsigned int dn_dst_default_mtu(const struct dst_entry *dst);
+static void dn_dst_destroy(struct dst_entry *);
static struct dst_entry *dn_dst_negative_advice(struct dst_entry *);
static void dn_dst_link_failure(struct sk_buff *);
static void dn_dst_update_pmtu(struct dst_entry *dst, u32 mtu);
@@ -133,11 +134,18 @@ static struct dst_ops dn_dst_ops = {
.check = dn_dst_check,
.default_advmss = dn_dst_default_advmss,
.default_mtu = dn_dst_default_mtu,
+ .cow_metrics = dst_cow_metrics_generic,
+ .destroy = dn_dst_destroy,
.negative_advice = dn_dst_negative_advice,
.link_failure = dn_dst_link_failure,
.update_pmtu = dn_dst_update_pmtu,
};
+static void dn_dst_destroy(struct dst_entry *dst)
+{
+ dst_destroy_metrics_generic(dst);
+}
+
static __inline__ unsigned dn_hash(__le16 src, __le16 dst)
{
__u16 tmp = (__u16 __force)(src ^ dst);
@@ -814,14 +822,14 @@ static int dn_rt_set_next_hop(struct dn_route *rt, struct dn_fib_res *res)
{
struct dn_fib_info *fi = res->fi;
struct net_device *dev = rt->dst.dev;
+ unsigned int mss_metric;
struct neighbour *n;
- unsigned int metric;
if (fi) {
if (DN_FIB_RES_GW(*res) &&
DN_FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
rt->rt_gateway = DN_FIB_RES_GW(*res);
- dst_import_metrics(&rt->dst, fi->fib_metrics);
+ dst_init_metrics(&rt->dst, fi->fib_metrics, true);
}
rt->rt_type = res->type;
@@ -834,10 +842,10 @@ static int dn_rt_set_next_hop(struct dn_route *rt, struct dn_fib_res *res)
if (dst_metric(&rt->dst, RTAX_MTU) > rt->dst.dev->mtu)
dst_metric_set(&rt->dst, RTAX_MTU, rt->dst.dev->mtu);
- metric = dst_metric_raw(&rt->dst, RTAX_ADVMSS);
- if (metric) {
+ mss_metric = dst_metric_raw(&rt->dst, RTAX_ADVMSS);
+ if (mss_metric) {
unsigned int mss = dn_mss_from_pmtu(dev, dst_mtu(&rt->dst));
- if (metric > mss)
+ if (mss_metric > mss)
dst_metric_set(&rt->dst, RTAX_ADVMSS, mss);
}
return 0;
@@ -1114,7 +1122,7 @@ make_route:
if (dev_out->flags & IFF_LOOPBACK)
flags |= RTCF_LOCAL;
- rt = dst_alloc(&dn_dst_ops);
+ rt = dst_alloc(&dn_dst_ops, 0);
if (rt == NULL)
goto e_nobufs;
@@ -1375,7 +1383,7 @@ static int dn_route_input_slow(struct sk_buff *skb)
}
make_route:
- rt = dst_alloc(&dn_dst_ops);
+ rt = dst_alloc(&dn_dst_ops, 0);
if (rt == NULL)
goto e_nobufs;
diff --git a/net/decnet/dn_table.c b/net/decnet/dn_table.c
index f2abd375569..b66600b3f4b 100644
--- a/net/decnet/dn_table.c
+++ b/net/decnet/dn_table.c
@@ -59,7 +59,6 @@ struct dn_hash
};
#define dz_key_0(key) ((key).datum = 0)
-#define dz_prefix(key,dz) ((key).datum)
#define for_nexthops(fi) { int nhsel; const struct dn_fib_nh *nh;\
for(nhsel = 0, nh = (fi)->fib_nh; nhsel < (fi)->fib_nhs; nh++, nhsel++)
diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c
index 0c877a74e1f..3fb14b7c13c 100644
--- a/net/dsa/dsa.c
+++ b/net/dsa/dsa.c
@@ -428,7 +428,7 @@ static void __exit dsa_cleanup_module(void)
}
module_exit(dsa_cleanup_module);
-MODULE_AUTHOR("Lennert Buytenhek <buytenh@wantstofly.org>")
+MODULE_AUTHOR("Lennert Buytenhek <buytenh@wantstofly.org>");
MODULE_DESCRIPTION("Driver for Distributed Switch Architecture switch chips");
MODULE_LICENSE("GPL");
MODULE_ALIAS("platform:dsa");
diff --git a/net/econet/af_econet.c b/net/econet/af_econet.c
index 15dcc1a586b..0c282633791 100644
--- a/net/econet/af_econet.c
+++ b/net/econet/af_econet.c
@@ -265,13 +265,13 @@ static void ec_tx_done(struct sk_buff *skb, int result)
static int econet_sendmsg(struct kiocb *iocb, struct socket *sock,
struct msghdr *msg, size_t len)
{
- struct sock *sk = sock->sk;
struct sockaddr_ec *saddr=(struct sockaddr_ec *)msg->msg_name;
struct net_device *dev;
struct ec_addr addr;
int err;
unsigned char port, cb;
#if defined(CONFIG_ECONET_AUNUDP) || defined(CONFIG_ECONET_NATIVE)
+ struct sock *sk = sock->sk;
struct sk_buff *skb;
struct ec_cb *eb;
#endif
@@ -488,10 +488,10 @@ static int econet_sendmsg(struct kiocb *iocb, struct socket *sock,
error_free_buf:
vfree(userbuf);
+error:
#else
err = -EPROTOTYPE;
#endif
- error:
mutex_unlock(&econet_mutex);
return err;
diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
index a5a1050595d..cbb505ba932 100644
--- a/net/ipv4/Kconfig
+++ b/net/ipv4/Kconfig
@@ -55,45 +55,9 @@ config IP_ADVANCED_ROUTER
If unsure, say N here.
-choice
- prompt "Choose IP: FIB lookup algorithm (choose FIB_HASH if unsure)"
- depends on IP_ADVANCED_ROUTER
- default ASK_IP_FIB_HASH
-
-config ASK_IP_FIB_HASH
- bool "FIB_HASH"
- ---help---
- Current FIB is very proven and good enough for most users.
-
-config IP_FIB_TRIE
- bool "FIB_TRIE"
- ---help---
- Use new experimental LC-trie as FIB lookup algorithm.
- This improves lookup performance if you have a large
- number of routes.
-
- LC-trie is a longest matching prefix lookup algorithm which
- performs better than FIB_HASH for large routing tables.
- But, it consumes more memory and is more complex.
-
- LC-trie is described in:
-
- IP-address lookup using LC-tries. Stefan Nilsson and Gunnar Karlsson
- IEEE Journal on Selected Areas in Communications, 17(6):1083-1092,
- June 1999
-
- An experimental study of compression methods for dynamic tries
- Stefan Nilsson and Matti Tikkanen. Algorithmica, 33(1):19-33, 2002.
- <http://www.csc.kth.se/~snilsson/software/dyntrie2/>
-
-endchoice
-
-config IP_FIB_HASH
- def_bool ASK_IP_FIB_HASH || !IP_ADVANCED_ROUTER
-
config IP_FIB_TRIE_STATS
bool "FIB TRIE statistics"
- depends on IP_FIB_TRIE
+ depends on IP_ADVANCED_ROUTER
---help---
Keep track of statistics on structure of FIB TRIE table.
Useful for testing and measuring TRIE performance.
@@ -140,6 +104,9 @@ config IP_ROUTE_VERBOSE
handled by the klogd daemon which is responsible for kernel messages
("man klogd").
+config IP_ROUTE_CLASSID
+ bool
+
config IP_PNP
bool "IP: kernel level autoconfiguration"
help
@@ -657,4 +624,3 @@ config TCP_MD5SIG
on the Internet.
If unsure, say N.
-
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
index 4978d22f9a7..0dc772d0d12 100644
--- a/net/ipv4/Makefile
+++ b/net/ipv4/Makefile
@@ -10,12 +10,10 @@ obj-y := route.o inetpeer.o protocol.o \
tcp_minisocks.o tcp_cong.o \
datagram.o raw.o udp.o udplite.o \
arp.o icmp.o devinet.o af_inet.o igmp.o \
- fib_frontend.o fib_semantics.o \
+ fib_frontend.o fib_semantics.o fib_trie.o \
inet_fragment.o
obj-$(CONFIG_SYSCTL) += sysctl_net_ipv4.o
-obj-$(CONFIG_IP_FIB_HASH) += fib_hash.o
-obj-$(CONFIG_IP_FIB_TRIE) += fib_trie.o
obj-$(CONFIG_PROC_FS) += proc.o
obj-$(CONFIG_IP_MULTIPLE_TABLES) += fib_rules.o
obj-$(CONFIG_IP_MROUTE) += ipmr.o
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index f2b61107df6..7ceb8044763 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -880,6 +880,19 @@ int inet_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
}
EXPORT_SYMBOL(inet_ioctl);
+#ifdef CONFIG_COMPAT
+int inet_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
+{
+ struct sock *sk = sock->sk;
+ int err = -ENOIOCTLCMD;
+
+ if (sk->sk_prot->compat_ioctl)
+ err = sk->sk_prot->compat_ioctl(sk, cmd, arg);
+
+ return err;
+}
+#endif
+
const struct proto_ops inet_stream_ops = {
.family = PF_INET,
.owner = THIS_MODULE,
@@ -903,6 +916,7 @@ const struct proto_ops inet_stream_ops = {
#ifdef CONFIG_COMPAT
.compat_setsockopt = compat_sock_common_setsockopt,
.compat_getsockopt = compat_sock_common_getsockopt,
+ .compat_ioctl = inet_compat_ioctl,
#endif
};
EXPORT_SYMBOL(inet_stream_ops);
@@ -929,6 +943,7 @@ const struct proto_ops inet_dgram_ops = {
#ifdef CONFIG_COMPAT
.compat_setsockopt = compat_sock_common_setsockopt,
.compat_getsockopt = compat_sock_common_getsockopt,
+ .compat_ioctl = inet_compat_ioctl,
#endif
};
EXPORT_SYMBOL(inet_dgram_ops);
@@ -959,6 +974,7 @@ static const struct proto_ops inet_sockraw_ops = {
#ifdef CONFIG_COMPAT
.compat_setsockopt = compat_sock_common_setsockopt,
.compat_getsockopt = compat_sock_common_getsockopt,
+ .compat_ioctl = inet_compat_ioctl,
#endif
};
@@ -1215,7 +1231,7 @@ out:
return err;
}
-static struct sk_buff *inet_gso_segment(struct sk_buff *skb, int features)
+static struct sk_buff *inet_gso_segment(struct sk_buff *skb, u32 features)
{
struct sk_buff *segs = ERR_PTR(-EINVAL);
struct iphdr *iph;
diff --git a/net/ipv4/ah4.c b/net/ipv4/ah4.c
index 86961bec70a..325053df6e7 100644
--- a/net/ipv4/ah4.c
+++ b/net/ipv4/ah4.c
@@ -201,7 +201,10 @@ static int ah_output(struct xfrm_state *x, struct sk_buff *skb)
top_iph->ttl = 0;
top_iph->check = 0;
- ah->hdrlen = (XFRM_ALIGN8(sizeof(*ah) + ahp->icv_trunc_len) >> 2) - 2;
+ if (x->props.flags & XFRM_STATE_ALIGN4)
+ ah->hdrlen = (XFRM_ALIGN4(sizeof(*ah) + ahp->icv_trunc_len) >> 2) - 2;
+ else
+ ah->hdrlen = (XFRM_ALIGN8(sizeof(*ah) + ahp->icv_trunc_len) >> 2) - 2;
ah->reserved = 0;
ah->spi = x->id.spi;
@@ -299,9 +302,15 @@ static int ah_input(struct xfrm_state *x, struct sk_buff *skb)
nexthdr = ah->nexthdr;
ah_hlen = (ah->hdrlen + 2) << 2;
- if (ah_hlen != XFRM_ALIGN8(sizeof(*ah) + ahp->icv_full_len) &&
- ah_hlen != XFRM_ALIGN8(sizeof(*ah) + ahp->icv_trunc_len))
- goto out;
+ if (x->props.flags & XFRM_STATE_ALIGN4) {
+ if (ah_hlen != XFRM_ALIGN4(sizeof(*ah) + ahp->icv_full_len) &&
+ ah_hlen != XFRM_ALIGN4(sizeof(*ah) + ahp->icv_trunc_len))
+ goto out;
+ } else {
+ if (ah_hlen != XFRM_ALIGN8(sizeof(*ah) + ahp->icv_full_len) &&
+ ah_hlen != XFRM_ALIGN8(sizeof(*ah) + ahp->icv_trunc_len))
+ goto out;
+ }
if (!pskb_may_pull(skb, ah_hlen))
goto out;
@@ -450,8 +459,12 @@ static int ah_init_state(struct xfrm_state *x)
BUG_ON(ahp->icv_trunc_len > MAX_AH_AUTH_LEN);
- x->props.header_len = XFRM_ALIGN8(sizeof(struct ip_auth_hdr) +
- ahp->icv_trunc_len);
+ if (x->props.flags & XFRM_STATE_ALIGN4)
+ x->props.header_len = XFRM_ALIGN4(sizeof(struct ip_auth_hdr) +
+ ahp->icv_trunc_len);
+ else
+ x->props.header_len = XFRM_ALIGN8(sizeof(struct ip_auth_hdr) +
+ ahp->icv_trunc_len);
if (x->props.mode == XFRM_MODE_TUNNEL)
x->props.header_len += sizeof(struct iphdr);
x->data = ahp;
diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c
index 04c8b69fd42..7927589813b 100644
--- a/net/ipv4/arp.c
+++ b/net/ipv4/arp.c
@@ -1017,14 +1017,13 @@ static int arp_req_set_proxy(struct net *net, struct net_device *dev, int on)
IPV4_DEVCONF_ALL(net, PROXY_ARP) = on;
return 0;
}
- if (__in_dev_get_rcu(dev)) {
- IN_DEV_CONF_SET(__in_dev_get_rcu(dev), PROXY_ARP, on);
+ if (__in_dev_get_rtnl(dev)) {
+ IN_DEV_CONF_SET(__in_dev_get_rtnl(dev), PROXY_ARP, on);
return 0;
}
return -ENXIO;
}
-/* must be called with rcu_read_lock() */
static int arp_req_set_public(struct net *net, struct arpreq *r,
struct net_device *dev)
{
@@ -1233,10 +1232,10 @@ int arp_ioctl(struct net *net, unsigned int cmd, void __user *arg)
if (!(r.arp_flags & ATF_NETMASK))
((struct sockaddr_in *)&r.arp_netmask)->sin_addr.s_addr =
htonl(0xFFFFFFFFUL);
- rcu_read_lock();
+ rtnl_lock();
if (r.arp_dev[0]) {
err = -ENODEV;
- dev = dev_get_by_name_rcu(net, r.arp_dev);
+ dev = __dev_get_by_name(net, r.arp_dev);
if (dev == NULL)
goto out;
@@ -1263,7 +1262,7 @@ int arp_ioctl(struct net *net, unsigned int cmd, void __user *arg)
break;
}
out:
- rcu_read_unlock();
+ rtnl_unlock();
if (cmd == SIOCGARP && !err && copy_to_user(arg, &r, sizeof(r)))
err = -EFAULT;
return err;
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index 748cb5b337b..90389281d97 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -51,6 +51,7 @@
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/slab.h>
+#include <linux/hash.h>
#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
#endif
@@ -92,6 +93,71 @@ static const struct nla_policy ifa_ipv4_policy[IFA_MAX+1] = {
[IFA_LABEL] = { .type = NLA_STRING, .len = IFNAMSIZ - 1 },
};
+/* inet_addr_hash's shifting is dependent upon this IN4_ADDR_HSIZE
+ * value. So if you change this define, make appropriate changes to
+ * inet_addr_hash as well.
+ */
+#define IN4_ADDR_HSIZE 256
+static struct hlist_head inet_addr_lst[IN4_ADDR_HSIZE];
+static DEFINE_SPINLOCK(inet_addr_hash_lock);
+
+static inline unsigned int inet_addr_hash(struct net *net, __be32 addr)
+{
+ u32 val = (__force u32) addr ^ hash_ptr(net, 8);
+
+ return ((val ^ (val >> 8) ^ (val >> 16) ^ (val >> 24)) &
+ (IN4_ADDR_HSIZE - 1));
+}
+
+static void inet_hash_insert(struct net *net, struct in_ifaddr *ifa)
+{
+ unsigned int hash = inet_addr_hash(net, ifa->ifa_address);
+
+ spin_lock(&inet_addr_hash_lock);
+ hlist_add_head_rcu(&ifa->hash, &inet_addr_lst[hash]);
+ spin_unlock(&inet_addr_hash_lock);
+}
+
+static void inet_hash_remove(struct in_ifaddr *ifa)
+{
+ spin_lock(&inet_addr_hash_lock);
+ hlist_del_init_rcu(&ifa->hash);
+ spin_unlock(&inet_addr_hash_lock);
+}
+
+/**
+ * __ip_dev_find - find the first device with a given source address.
+ * @net: the net namespace
+ * @addr: the source address
+ * @devref: if true, take a reference on the found device
+ *
+ * If a caller uses devref=false, it should be protected by RCU, or RTNL
+ */
+struct net_device *__ip_dev_find(struct net *net, __be32 addr, bool devref)
+{
+ unsigned int hash = inet_addr_hash(net, addr);
+ struct net_device *result = NULL;
+ struct in_ifaddr *ifa;
+ struct hlist_node *node;
+
+ rcu_read_lock();
+ hlist_for_each_entry_rcu(ifa, node, &inet_addr_lst[hash], hash) {
+ struct net_device *dev = ifa->ifa_dev->dev;
+
+ if (!net_eq(dev_net(dev), net))
+ continue;
+ if (ifa->ifa_address == addr) {
+ result = dev;
+ break;
+ }
+ }
+ if (result && devref)
+ dev_hold(result);
+ rcu_read_unlock();
+ return result;
+}
+EXPORT_SYMBOL(__ip_dev_find);
+
static void rtmsg_ifa(int event, struct in_ifaddr *, struct nlmsghdr *, u32);
static BLOCKING_NOTIFIER_HEAD(inetaddr_chain);
@@ -265,6 +331,7 @@ static void __inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap,
}
if (!do_promote) {
+ inet_hash_remove(ifa);
*ifap1 = ifa->ifa_next;
rtmsg_ifa(RTM_DELADDR, ifa, nlh, pid);
@@ -281,6 +348,7 @@ static void __inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap,
/* 2. Unlink it */
*ifap = ifa1->ifa_next;
+ inet_hash_remove(ifa1);
/* 3. Announce address deletion */
@@ -368,6 +436,8 @@ static int __inet_insert_ifa(struct in_ifaddr *ifa, struct nlmsghdr *nlh,
ifa->ifa_next = *ifap;
*ifap = ifa;
+ inet_hash_insert(dev_net(in_dev->dev), ifa);
+
/* Send message first, then call notifier.
Notifier will trigger FIB update, so that
listeners of netlink will know about new ifaddr */
@@ -521,6 +591,7 @@ static struct in_ifaddr *rtm_to_ifaddr(struct net *net, struct nlmsghdr *nlh)
if (tb[IFA_ADDRESS] == NULL)
tb[IFA_ADDRESS] = tb[IFA_LOCAL];
+ INIT_HLIST_NODE(&ifa->hash);
ifa->ifa_prefixlen = ifm->ifa_prefixlen;
ifa->ifa_mask = inet_make_mask(ifm->ifa_prefixlen);
ifa->ifa_flags = ifm->ifa_flags;
@@ -728,6 +799,7 @@ int devinet_ioctl(struct net *net, unsigned int cmd, void __user *arg)
if (!ifa) {
ret = -ENOBUFS;
ifa = inet_alloc_ifa();
+ INIT_HLIST_NODE(&ifa->hash);
if (!ifa)
break;
if (colon)
@@ -1030,6 +1102,21 @@ static inline bool inetdev_valid_mtu(unsigned mtu)
return mtu >= 68;
}
+static void inetdev_send_gratuitous_arp(struct net_device *dev,
+ struct in_device *in_dev)
+
+{
+ struct in_ifaddr *ifa = in_dev->ifa_list;
+
+ if (!ifa)
+ return;
+
+ arp_send(ARPOP_REQUEST, ETH_P_ARP,
+ ifa->ifa_address, dev,
+ ifa->ifa_address, NULL,
+ dev->dev_addr, NULL);
+}
+
/* Called only under RTNL semaphore */
static int inetdev_event(struct notifier_block *this, unsigned long event,
@@ -1069,6 +1156,7 @@ static int inetdev_event(struct notifier_block *this, unsigned long event,
struct in_ifaddr *ifa = inet_alloc_ifa();
if (ifa) {
+ INIT_HLIST_NODE(&ifa->hash);
ifa->ifa_local =
ifa->ifa_address = htonl(INADDR_LOOPBACK);
ifa->ifa_prefixlen = 8;
@@ -1082,18 +1170,13 @@ static int inetdev_event(struct notifier_block *this, unsigned long event,
}
ip_mc_up(in_dev);
/* fall through */
- case NETDEV_NOTIFY_PEERS:
case NETDEV_CHANGEADDR:
+ if (!IN_DEV_ARP_NOTIFY(in_dev))
+ break;
+ /* fall through */
+ case NETDEV_NOTIFY_PEERS:
/* Send gratuitous ARP to notify of link change */
- if (IN_DEV_ARP_NOTIFY(in_dev)) {
- struct in_ifaddr *ifa = in_dev->ifa_list;
-
- if (ifa)
- arp_send(ARPOP_REQUEST, ETH_P_ARP,
- ifa->ifa_address, dev,
- ifa->ifa_address, NULL,
- dev->dev_addr, NULL);
- }
+ inetdev_send_gratuitous_arp(dev, in_dev);
break;
case NETDEV_DOWN:
ip_mc_down(in_dev);
@@ -1710,6 +1793,11 @@ static struct rtnl_af_ops inet_af_ops = {
void __init devinet_init(void)
{
+ int i;
+
+ for (i = 0; i < IN4_ADDR_HSIZE; i++)
+ INIT_HLIST_HEAD(&inet_addr_lst[i]);
+
register_pernet_subsys(&devinet_ops);
register_gifconf(PF_INET, inet_gifconf);
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index 1d2cdd43a87..ad0778a3fa5 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -51,11 +51,11 @@ static int __net_init fib4_rules_init(struct net *net)
{
struct fib_table *local_table, *main_table;
- local_table = fib_hash_table(RT_TABLE_LOCAL);
+ local_table = fib_trie_table(RT_TABLE_LOCAL);
if (local_table == NULL)
return -ENOMEM;
- main_table = fib_hash_table(RT_TABLE_MAIN);
+ main_table = fib_trie_table(RT_TABLE_MAIN);
if (main_table == NULL)
goto fail;
@@ -82,7 +82,7 @@ struct fib_table *fib_new_table(struct net *net, u32 id)
if (tb)
return tb;
- tb = fib_hash_table(id);
+ tb = fib_trie_table(id);
if (!tb)
return NULL;
h = id & (FIB_TABLE_HASHSZ - 1);
@@ -114,21 +114,6 @@ struct fib_table *fib_get_table(struct net *net, u32 id)
}
#endif /* CONFIG_IP_MULTIPLE_TABLES */
-void fib_select_default(struct net *net,
- const struct flowi *flp, struct fib_result *res)
-{
- struct fib_table *tb;
- int table = RT_TABLE_MAIN;
-#ifdef CONFIG_IP_MULTIPLE_TABLES
- if (res->r == NULL || res->r->action != FR_ACT_TO_TBL)
- return;
- table = res->r->table;
-#endif
- tb = fib_get_table(net, table);
- if (FIB_RES_GW(*res) && FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
- fib_table_select_default(tb, flp, res);
-}
-
static void fib_flush(struct net *net)
{
int flushed = 0;
@@ -147,46 +132,6 @@ static void fib_flush(struct net *net)
rt_cache_flush(net, -1);
}
-/**
- * __ip_dev_find - find the first device with a given source address.
- * @net: the net namespace
- * @addr: the source address
- * @devref: if true, take a reference on the found device
- *
- * If a caller uses devref=false, it should be protected by RCU, or RTNL
- */
-struct net_device *__ip_dev_find(struct net *net, __be32 addr, bool devref)
-{
- struct flowi fl = {
- .fl4_dst = addr,
- };
- struct fib_result res = { 0 };
- struct net_device *dev = NULL;
- struct fib_table *local_table;
-
-#ifdef CONFIG_IP_MULTIPLE_TABLES
- res.r = NULL;
-#endif
-
- rcu_read_lock();
- local_table = fib_get_table(net, RT_TABLE_LOCAL);
- if (!local_table ||
- fib_table_lookup(local_table, &fl, &res, FIB_LOOKUP_NOREF)) {
- rcu_read_unlock();
- return NULL;
- }
- if (res.type != RTN_LOCAL)
- goto out;
- dev = FIB_RES_DEV(res);
-
- if (dev && devref)
- dev_hold(dev);
-out:
- rcu_read_unlock();
- return dev;
-}
-EXPORT_SYMBOL(__ip_dev_find);
-
/*
* Find address type as if only "dev" was present in the system. If
* on_dev is NULL then all interfaces are taken into consideration.
@@ -1101,5 +1046,5 @@ void __init ip_fib_init(void)
register_netdevice_notifier(&fib_netdev_notifier);
register_inetaddr_notifier(&fib_inetaddr_notifier);
- fib_hash_init();
+ fib_trie_init();
}
diff --git a/net/ipv4/fib_hash.c b/net/ipv4/fib_hash.c
deleted file mode 100644
index b3acb0417b2..00000000000
--- a/net/ipv4/fib_hash.c
+++ /dev/null
@@ -1,1133 +0,0 @@
-/*
- * INET An implementation of the TCP/IP protocol suite for the LINUX
- * operating system. INET is implemented using the BSD Socket
- * interface as the means of communication with the user level.
- *
- * IPv4 FIB: lookup engine and maintenance routines.
- *
- * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- */
-
-#include <asm/uaccess.h>
-#include <asm/system.h>
-#include <linux/bitops.h>
-#include <linux/types.h>
-#include <linux/kernel.h>
-#include <linux/mm.h>
-#include <linux/string.h>
-#include <linux/socket.h>
-#include <linux/sockios.h>
-#include <linux/errno.h>
-#include <linux/in.h>
-#include <linux/inet.h>
-#include <linux/inetdevice.h>
-#include <linux/netdevice.h>
-#include <linux/if_arp.h>
-#include <linux/proc_fs.h>
-#include <linux/skbuff.h>
-#include <linux/netlink.h>
-#include <linux/init.h>
-#include <linux/slab.h>
-
-#include <net/net_namespace.h>
-#include <net/ip.h>
-#include <net/protocol.h>
-#include <net/route.h>
-#include <net/tcp.h>
-#include <net/sock.h>
-#include <net/ip_fib.h>
-
-#include "fib_lookup.h"
-
-static struct kmem_cache *fn_hash_kmem __read_mostly;
-static struct kmem_cache *fn_alias_kmem __read_mostly;
-
-struct fib_node {
- struct hlist_node fn_hash;
- struct list_head fn_alias;
- __be32 fn_key;
- struct fib_alias fn_embedded_alias;
-};
-
-#define EMBEDDED_HASH_SIZE (L1_CACHE_BYTES / sizeof(struct hlist_head))
-
-struct fn_zone {
- struct fn_zone __rcu *fz_next; /* Next not empty zone */
- struct hlist_head __rcu *fz_hash; /* Hash table pointer */
- seqlock_t fz_lock;
- u32 fz_hashmask; /* (fz_divisor - 1) */
-
- u8 fz_order; /* Zone order (0..32) */
- u8 fz_revorder; /* 32 - fz_order */
- __be32 fz_mask; /* inet_make_mask(order) */
-#define FZ_MASK(fz) ((fz)->fz_mask)
-
- struct hlist_head fz_embedded_hash[EMBEDDED_HASH_SIZE];
-
- int fz_nent; /* Number of entries */
- int fz_divisor; /* Hash size (mask+1) */
-};
-
-struct fn_hash {
- struct fn_zone *fn_zones[33];
- struct fn_zone __rcu *fn_zone_list;
-};
-
-static inline u32 fn_hash(__be32 key, struct fn_zone *fz)
-{
- u32 h = ntohl(key) >> fz->fz_revorder;
- h ^= (h>>20);
- h ^= (h>>10);
- h ^= (h>>5);
- h &= fz->fz_hashmask;
- return h;
-}
-
-static inline __be32 fz_key(__be32 dst, struct fn_zone *fz)
-{
- return dst & FZ_MASK(fz);
-}
-
-static unsigned int fib_hash_genid;
-
-#define FZ_MAX_DIVISOR ((PAGE_SIZE<<MAX_ORDER) / sizeof(struct hlist_head))
-
-static struct hlist_head *fz_hash_alloc(int divisor)
-{
- unsigned long size = divisor * sizeof(struct hlist_head);
-
- if (size <= PAGE_SIZE)
- return kzalloc(size, GFP_KERNEL);
-
- return (struct hlist_head *)
- __get_free_pages(GFP_KERNEL | __GFP_ZERO, get_order(size));
-}
-
-/* The fib hash lock must be held when this is called. */
-static inline void fn_rebuild_zone(struct fn_zone *fz,
- struct hlist_head *old_ht,
- int old_divisor)
-{
- int i;
-
- for (i = 0; i < old_divisor; i++) {
- struct hlist_node *node, *n;
- struct fib_node *f;
-
- hlist_for_each_entry_safe(f, node, n, &old_ht[i], fn_hash) {
- struct hlist_head *new_head;
-
- hlist_del_rcu(&f->fn_hash);
-
- new_head = rcu_dereference_protected(fz->fz_hash, 1) +
- fn_hash(f->fn_key, fz);
- hlist_add_head_rcu(&f->fn_hash, new_head);
- }
- }
-}
-
-static void fz_hash_free(struct hlist_head *hash, int divisor)
-{
- unsigned long size = divisor * sizeof(struct hlist_head);
-
- if (size <= PAGE_SIZE)
- kfree(hash);
- else
- free_pages((unsigned long)hash, get_order(size));
-}
-
-static void fn_rehash_zone(struct fn_zone *fz)
-{
- struct hlist_head *ht, *old_ht;
- int old_divisor, new_divisor;
- u32 new_hashmask;
-
- new_divisor = old_divisor = fz->fz_divisor;
-
- switch (old_divisor) {
- case EMBEDDED_HASH_SIZE:
- new_divisor *= EMBEDDED_HASH_SIZE;
- break;
- case EMBEDDED_HASH_SIZE*EMBEDDED_HASH_SIZE:
- new_divisor *= (EMBEDDED_HASH_SIZE/2);
- break;
- default:
- if ((old_divisor << 1) > FZ_MAX_DIVISOR) {
- printk(KERN_CRIT "route.c: bad divisor %d!\n", old_divisor);
- return;
- }
- new_divisor = (old_divisor << 1);
- break;
- }
-
- new_hashmask = (new_divisor - 1);
-
-#if RT_CACHE_DEBUG >= 2
- printk(KERN_DEBUG "fn_rehash_zone: hash for zone %d grows from %d\n",
- fz->fz_order, old_divisor);
-#endif
-
- ht = fz_hash_alloc(new_divisor);
-
- if (ht) {
- struct fn_zone nfz;
-
- memcpy(&nfz, fz, sizeof(nfz));
-
- write_seqlock_bh(&fz->fz_lock);
- old_ht = rcu_dereference_protected(fz->fz_hash, 1);
- RCU_INIT_POINTER(nfz.fz_hash, ht);
- nfz.fz_hashmask = new_hashmask;
- nfz.fz_divisor = new_divisor;
- fn_rebuild_zone(&nfz, old_ht, old_divisor);
- fib_hash_genid++;
- rcu_assign_pointer(fz->fz_hash, ht);
- fz->fz_hashmask = new_hashmask;
- fz->fz_divisor = new_divisor;
- write_sequnlock_bh(&fz->fz_lock);
-
- if (old_ht != fz->fz_embedded_hash) {
- synchronize_rcu();
- fz_hash_free(old_ht, old_divisor);
- }
- }
-}
-
-static void fn_free_node_rcu(struct rcu_head *head)
-{
- struct fib_node *f = container_of(head, struct fib_node, fn_embedded_alias.rcu);
-
- kmem_cache_free(fn_hash_kmem, f);
-}
-
-static inline void fn_free_node(struct fib_node *f)
-{
- call_rcu(&f->fn_embedded_alias.rcu, fn_free_node_rcu);
-}
-
-static void fn_free_alias_rcu(struct rcu_head *head)
-{
- struct fib_alias *fa = container_of(head, struct fib_alias, rcu);
-
- kmem_cache_free(fn_alias_kmem, fa);
-}
-
-static inline void fn_free_alias(struct fib_alias *fa, struct fib_node *f)
-{
- fib_release_info(fa->fa_info);
- if (fa == &f->fn_embedded_alias)
- fa->fa_info = NULL;
- else
- call_rcu(&fa->rcu, fn_free_alias_rcu);
-}
-
-static struct fn_zone *
-fn_new_zone(struct fn_hash *table, int z)
-{
- int i;
- struct fn_zone *fz = kzalloc(sizeof(struct fn_zone), GFP_KERNEL);
- if (!fz)
- return NULL;
-
- seqlock_init(&fz->fz_lock);
- fz->fz_divisor = z ? EMBEDDED_HASH_SIZE : 1;
- fz->fz_hashmask = fz->fz_divisor - 1;
- RCU_INIT_POINTER(fz->fz_hash, fz->fz_embedded_hash);
- fz->fz_order = z;
- fz->fz_revorder = 32 - z;
- fz->fz_mask = inet_make_mask(z);
-
- /* Find the first not empty zone with more specific mask */
- for (i = z + 1; i <= 32; i++)
- if (table->fn_zones[i])
- break;
- if (i > 32) {
- /* No more specific masks, we are the first. */
- rcu_assign_pointer(fz->fz_next,
- rtnl_dereference(table->fn_zone_list));
- rcu_assign_pointer(table->fn_zone_list, fz);
- } else {
- rcu_assign_pointer(fz->fz_next,
- rtnl_dereference(table->fn_zones[i]->fz_next));
- rcu_assign_pointer(table->fn_zones[i]->fz_next, fz);
- }
- table->fn_zones[z] = fz;
- fib_hash_genid++;
- return fz;
-}
-
-int fib_table_lookup(struct fib_table *tb,
- const struct flowi *flp, struct fib_result *res,
- int fib_flags)
-{
- int err;
- struct fn_zone *fz;
- struct fn_hash *t = (struct fn_hash *)tb->tb_data;
-
- rcu_read_lock();
- for (fz = rcu_dereference(t->fn_zone_list);
- fz != NULL;
- fz = rcu_dereference(fz->fz_next)) {
- struct hlist_head *head;
- struct hlist_node *node;
- struct fib_node *f;
- __be32 k;
- unsigned int seq;
-
- do {
- seq = read_seqbegin(&fz->fz_lock);
- k = fz_key(flp->fl4_dst, fz);
-
- head = rcu_dereference(fz->fz_hash) + fn_hash(k, fz);
- hlist_for_each_entry_rcu(f, node, head, fn_hash) {
- if (f->fn_key != k)
- continue;
-
- err = fib_semantic_match(&f->fn_alias,
- flp, res,
- fz->fz_order, fib_flags);
- if (err <= 0)
- goto out;
- }
- } while (read_seqretry(&fz->fz_lock, seq));
- }
- err = 1;
-out:
- rcu_read_unlock();
- return err;
-}
-
-void fib_table_select_default(struct fib_table *tb,
- const struct flowi *flp, struct fib_result *res)
-{
- int order, last_idx;
- struct hlist_node *node;
- struct fib_node *f;
- struct fib_info *fi = NULL;
- struct fib_info *last_resort;
- struct fn_hash *t = (struct fn_hash *)tb->tb_data;
- struct fn_zone *fz = t->fn_zones[0];
- struct hlist_head *head;
-
- if (fz == NULL)
- return;
-
- last_idx = -1;
- last_resort = NULL;
- order = -1;
-
- rcu_read_lock();
- head = rcu_dereference(fz->fz_hash);
- hlist_for_each_entry_rcu(f, node, head, fn_hash) {
- struct fib_alias *fa;
-
- list_for_each_entry_rcu(fa, &f->fn_alias, fa_list) {
- struct fib_info *next_fi = fa->fa_info;
-
- if (fa->fa_scope != res->scope ||
- fa->fa_type != RTN_UNICAST)
- continue;
-
- if (next_fi->fib_priority > res->fi->fib_priority)
- break;
- if (!next_fi->fib_nh[0].nh_gw ||
- next_fi->fib_nh[0].nh_scope != RT_SCOPE_LINK)
- continue;
-
- fib_alias_accessed(fa);
-
- if (fi == NULL) {
- if (next_fi != res->fi)
- break;
- } else if (!fib_detect_death(fi, order, &last_resort,
- &last_idx, tb->tb_default)) {
- fib_result_assign(res, fi);
- tb->tb_default = order;
- goto out;
- }
- fi = next_fi;
- order++;
- }
- }
-
- if (order <= 0 || fi == NULL) {
- tb->tb_default = -1;
- goto out;
- }
-
- if (!fib_detect_death(fi, order, &last_resort, &last_idx,
- tb->tb_default)) {
- fib_result_assign(res, fi);
- tb->tb_default = order;
- goto out;
- }
-
- if (last_idx >= 0)
- fib_result_assign(res, last_resort);
- tb->tb_default = last_idx;
-out:
- rcu_read_unlock();
-}
-
-/* Insert node F to FZ. */
-static inline void fib_insert_node(struct fn_zone *fz, struct fib_node *f)
-{
- struct hlist_head *head = rtnl_dereference(fz->fz_hash) + fn_hash(f->fn_key, fz);
-
- hlist_add_head_rcu(&f->fn_hash, head);
-}
-
-/* Return the node in FZ matching KEY. */
-static struct fib_node *fib_find_node(struct fn_zone *fz, __be32 key)
-{
- struct hlist_head *head = rtnl_dereference(fz->fz_hash) + fn_hash(key, fz);
- struct hlist_node *node;
- struct fib_node *f;
-
- hlist_for_each_entry_rcu(f, node, head, fn_hash) {
- if (f->fn_key == key)
- return f;
- }
-
- return NULL;
-}
-
-
-static struct fib_alias *fib_fast_alloc(struct fib_node *f)
-{
- struct fib_alias *fa = &f->fn_embedded_alias;
-
- if (fa->fa_info != NULL)
- fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL);
- return fa;
-}
-
-/* Caller must hold RTNL. */
-int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)
-{
- struct fn_hash *table = (struct fn_hash *) tb->tb_data;
- struct fib_node *new_f = NULL;
- struct fib_node *f;
- struct fib_alias *fa, *new_fa;
- struct fn_zone *fz;
- struct fib_info *fi;
- u8 tos = cfg->fc_tos;
- __be32 key;
- int err;
-
- if (cfg->fc_dst_len > 32)
- return -EINVAL;
-
- fz = table->fn_zones[cfg->fc_dst_len];
- if (!fz && !(fz = fn_new_zone(table, cfg->fc_dst_len)))
- return -ENOBUFS;
-
- key = 0;
- if (cfg->fc_dst) {
- if (cfg->fc_dst & ~FZ_MASK(fz))
- return -EINVAL;
- key = fz_key(cfg->fc_dst, fz);
- }
-
- fi = fib_create_info(cfg);
- if (IS_ERR(fi))
- return PTR_ERR(fi);
-
- if (fz->fz_nent > (fz->fz_divisor<<1) &&
- fz->fz_divisor < FZ_MAX_DIVISOR &&
- (cfg->fc_dst_len == 32 ||
- (1 << cfg->fc_dst_len) > fz->fz_divisor))
- fn_rehash_zone(fz);
-
- f = fib_find_node(fz, key);
-
- if (!f)
- fa = NULL;
- else
- fa = fib_find_alias(&f->fn_alias, tos, fi->fib_priority);
-
- /* Now fa, if non-NULL, points to the first fib alias
- * with the same keys [prefix,tos,priority], if such key already
- * exists or to the node before which we will insert new one.
- *
- * If fa is NULL, we will need to allocate a new one and
- * insert to the head of f.
- *
- * If f is NULL, no fib node matched the destination key
- * and we need to allocate a new one of those as well.
- */
-
- if (fa && fa->fa_tos == tos &&
- fa->fa_info->fib_priority == fi->fib_priority) {
- struct fib_alias *fa_first, *fa_match;
-
- err = -EEXIST;
- if (cfg->fc_nlflags & NLM_F_EXCL)
- goto out;
-
- /* We have 2 goals:
- * 1. Find exact match for type, scope, fib_info to avoid
- * duplicate routes
- * 2. Find next 'fa' (or head), NLM_F_APPEND inserts before it
- */
- fa_match = NULL;
- fa_first = fa;
- fa = list_entry(fa->fa_list.prev, struct fib_alias, fa_list);
- list_for_each_entry_continue(fa, &f->fn_alias, fa_list) {
- if (fa->fa_tos != tos)
- break;
- if (fa->fa_info->fib_priority != fi->fib_priority)
- break;
- if (fa->fa_type == cfg->fc_type &&
- fa->fa_scope == cfg->fc_scope &&
- fa->fa_info == fi) {
- fa_match = fa;
- break;
- }
- }
-
- if (cfg->fc_nlflags & NLM_F_REPLACE) {
- u8 state;
-
- fa = fa_first;
- if (fa_match) {
- if (fa == fa_match)
- err = 0;
- goto out;
- }
- err = -ENOBUFS;
- new_fa = fib_fast_alloc(f);
- if (new_fa == NULL)
- goto out;
-
- new_fa->fa_tos = fa->fa_tos;
- new_fa->fa_info = fi;
- new_fa->fa_type = cfg->fc_type;
- new_fa->fa_scope = cfg->fc_scope;
- state = fa->fa_state;
- new_fa->fa_state = state & ~FA_S_ACCESSED;
- fib_hash_genid++;
- list_replace_rcu(&fa->fa_list, &new_fa->fa_list);
-
- fn_free_alias(fa, f);
- if (state & FA_S_ACCESSED)
- rt_cache_flush(cfg->fc_nlinfo.nl_net, -1);
- rtmsg_fib(RTM_NEWROUTE, key, new_fa, cfg->fc_dst_len,
- tb->tb_id, &cfg->fc_nlinfo, NLM_F_REPLACE);
- return 0;
- }
-
- /* Error if we find a perfect match which
- * uses the same scope, type, and nexthop
- * information.
- */
- if (fa_match)
- goto out;
-
- if (!(cfg->fc_nlflags & NLM_F_APPEND))
- fa = fa_first;
- }
-
- err = -ENOENT;
- if (!(cfg->fc_nlflags & NLM_F_CREATE))
- goto out;
-
- err = -ENOBUFS;
-
- if (!f) {
- new_f = kmem_cache_zalloc(fn_hash_kmem, GFP_KERNEL);
- if (new_f == NULL)
- goto out;
-
- INIT_HLIST_NODE(&new_f->fn_hash);
- INIT_LIST_HEAD(&new_f->fn_alias);
- new_f->fn_key = key;
- f = new_f;
- }
-
- new_fa = fib_fast_alloc(f);
- if (new_fa == NULL)
- goto out;
-
- new_fa->fa_info = fi;
- new_fa->fa_tos = tos;
- new_fa->fa_type = cfg->fc_type;
- new_fa->fa_scope = cfg->fc_scope;
- new_fa->fa_state = 0;
-
- /*
- * Insert new entry to the list.
- */
-
- if (new_f)
- fib_insert_node(fz, new_f);
- list_add_tail_rcu(&new_fa->fa_list,
- (fa ? &fa->fa_list : &f->fn_alias));
- fib_hash_genid++;
-
- if (new_f)
- fz->fz_nent++;
- rt_cache_flush(cfg->fc_nlinfo.nl_net, -1);
-
- rtmsg_fib(RTM_NEWROUTE, key, new_fa, cfg->fc_dst_len, tb->tb_id,
- &cfg->fc_nlinfo, 0);
- return 0;
-
-out:
- if (new_f)
- kmem_cache_free(fn_hash_kmem, new_f);
- fib_release_info(fi);
- return err;
-}
-
-int fib_table_delete(struct fib_table *tb, struct fib_config *cfg)
-{
- struct fn_hash *table = (struct fn_hash *)tb->tb_data;
- struct fib_node *f;
- struct fib_alias *fa, *fa_to_delete;
- struct fn_zone *fz;
- __be32 key;
-
- if (cfg->fc_dst_len > 32)
- return -EINVAL;
-
- if ((fz = table->fn_zones[cfg->fc_dst_len]) == NULL)
- return -ESRCH;
-
- key = 0;
- if (cfg->fc_dst) {
- if (cfg->fc_dst & ~FZ_MASK(fz))
- return -EINVAL;
- key = fz_key(cfg->fc_dst, fz);
- }
-
- f = fib_find_node(fz, key);
-
- if (!f)
- fa = NULL;
- else
- fa = fib_find_alias(&f->fn_alias, cfg->fc_tos, 0);
- if (!fa)
- return -ESRCH;
-
- fa_to_delete = NULL;
- fa = list_entry(fa->fa_list.prev, struct fib_alias, fa_list);
- list_for_each_entry_continue(fa, &f->fn_alias, fa_list) {
- struct fib_info *fi = fa->fa_info;
-
- if (fa->fa_tos != cfg->fc_tos)
- break;
-
- if ((!cfg->fc_type ||
- fa->fa_type == cfg->fc_type) &&
- (cfg->fc_scope == RT_SCOPE_NOWHERE ||
- fa->fa_scope == cfg->fc_scope) &&
- (!cfg->fc_protocol ||
- fi->fib_protocol == cfg->fc_protocol) &&
- fib_nh_match(cfg, fi) == 0) {
- fa_to_delete = fa;
- break;
- }
- }
-
- if (fa_to_delete) {
- int kill_fn;
-
- fa = fa_to_delete;
- rtmsg_fib(RTM_DELROUTE, key, fa, cfg->fc_dst_len,
- tb->tb_id, &cfg->fc_nlinfo, 0);
-
- kill_fn = 0;
- list_del_rcu(&fa->fa_list);
- if (list_empty(&f->fn_alias)) {
- hlist_del_rcu(&f->fn_hash);
- kill_fn = 1;
- }
- fib_hash_genid++;
-
- if (fa->fa_state & FA_S_ACCESSED)
- rt_cache_flush(cfg->fc_nlinfo.nl_net, -1);
- fn_free_alias(fa, f);
- if (kill_fn) {
- fn_free_node(f);
- fz->fz_nent--;
- }
-
- return 0;
- }
- return -ESRCH;
-}
-
-static int fn_flush_list(struct fn_zone *fz, int idx)
-{
- struct hlist_head *head = rtnl_dereference(fz->fz_hash) + idx;
- struct hlist_node *node, *n;
- struct fib_node *f;
- int found = 0;
-
- hlist_for_each_entry_safe(f, node, n, head, fn_hash) {
- struct fib_alias *fa, *fa_node;
- int kill_f;
-
- kill_f = 0;
- list_for_each_entry_safe(fa, fa_node, &f->fn_alias, fa_list) {
- struct fib_info *fi = fa->fa_info;
-
- if (fi && (fi->fib_flags&RTNH_F_DEAD)) {
- list_del_rcu(&fa->fa_list);
- if (list_empty(&f->fn_alias)) {
- hlist_del_rcu(&f->fn_hash);
- kill_f = 1;
- }
- fib_hash_genid++;
-
- fn_free_alias(fa, f);
- found++;
- }
- }
- if (kill_f) {
- fn_free_node(f);
- fz->fz_nent--;
- }
- }
- return found;
-}
-
-/* caller must hold RTNL. */
-int fib_table_flush(struct fib_table *tb)
-{
- struct fn_hash *table = (struct fn_hash *) tb->tb_data;
- struct fn_zone *fz;
- int found = 0;
-
- for (fz = rtnl_dereference(table->fn_zone_list);
- fz != NULL;
- fz = rtnl_dereference(fz->fz_next)) {
- int i;
-
- for (i = fz->fz_divisor - 1; i >= 0; i--)
- found += fn_flush_list(fz, i);
- }
- return found;
-}
-
-void fib_free_table(struct fib_table *tb)
-{
- struct fn_hash *table = (struct fn_hash *) tb->tb_data;
- struct fn_zone *fz, *next;
-
- next = table->fn_zone_list;
- while (next != NULL) {
- fz = next;
- next = fz->fz_next;
-
- if (fz->fz_hash != fz->fz_embedded_hash)
- fz_hash_free(fz->fz_hash, fz->fz_divisor);
-
- kfree(fz);
- }
-
- kfree(tb);
-}
-
-static inline int
-fn_hash_dump_bucket(struct sk_buff *skb, struct netlink_callback *cb,
- struct fib_table *tb,
- struct fn_zone *fz,
- struct hlist_head *head)
-{
- struct hlist_node *node;
- struct fib_node *f;
- int i, s_i;
-
- s_i = cb->args[4];
- i = 0;
- hlist_for_each_entry_rcu(f, node, head, fn_hash) {
- struct fib_alias *fa;
-
- list_for_each_entry_rcu(fa, &f->fn_alias, fa_list) {
- if (i < s_i)
- goto next;
-
- if (fib_dump_info(skb, NETLINK_CB(cb->skb).pid,
- cb->nlh->nlmsg_seq,
- RTM_NEWROUTE,
- tb->tb_id,
- fa->fa_type,
- fa->fa_scope,
- f->fn_key,
- fz->fz_order,
- fa->fa_tos,
- fa->fa_info,
- NLM_F_MULTI) < 0) {
- cb->args[4] = i;
- return -1;
- }
-next:
- i++;
- }
- }
- cb->args[4] = i;
- return skb->len;
-}
-
-static inline int
-fn_hash_dump_zone(struct sk_buff *skb, struct netlink_callback *cb,
- struct fib_table *tb,
- struct fn_zone *fz)
-{
- int h, s_h;
- struct hlist_head *head = rcu_dereference(fz->fz_hash);
-
- if (head == NULL)
- return skb->len;
- s_h = cb->args[3];
- for (h = s_h; h < fz->fz_divisor; h++) {
- if (hlist_empty(head + h))
- continue;
- if (fn_hash_dump_bucket(skb, cb, tb, fz, head + h) < 0) {
- cb->args[3] = h;
- return -1;
- }
- memset(&cb->args[4], 0,
- sizeof(cb->args) - 4*sizeof(cb->args[0]));
- }
- cb->args[3] = h;
- return skb->len;
-}
-
-int fib_table_dump(struct fib_table *tb, struct sk_buff *skb,
- struct netlink_callback *cb)
-{
- int m = 0, s_m;
- struct fn_zone *fz;
- struct fn_hash *table = (struct fn_hash *)tb->tb_data;
-
- s_m = cb->args[2];
- rcu_read_lock();
- for (fz = rcu_dereference(table->fn_zone_list);
- fz != NULL;
- fz = rcu_dereference(fz->fz_next), m++) {
- if (m < s_m)
- continue;
- if (fn_hash_dump_zone(skb, cb, tb, fz) < 0) {
- cb->args[2] = m;
- rcu_read_unlock();
- return -1;
- }
- memset(&cb->args[3], 0,
- sizeof(cb->args) - 3*sizeof(cb->args[0]));
- }
- rcu_read_unlock();
- cb->args[2] = m;
- return skb->len;
-}
-
-void __init fib_hash_init(void)
-{
- fn_hash_kmem = kmem_cache_create("ip_fib_hash", sizeof(struct fib_node),
- 0, SLAB_PANIC, NULL);
-
- fn_alias_kmem = kmem_cache_create("ip_fib_alias", sizeof(struct fib_alias),
- 0, SLAB_PANIC, NULL);
-
-}
-
-struct fib_table *fib_hash_table(u32 id)
-{
- struct fib_table *tb;
-
- tb = kmalloc(sizeof(struct fib_table) + sizeof(struct fn_hash),
- GFP_KERNEL);
- if (tb == NULL)
- return NULL;
-
- tb->tb_id = id;
- tb->tb_default = -1;
-
- memset(tb->tb_data, 0, sizeof(struct fn_hash));
- return tb;
-}
-
-/* ------------------------------------------------------------------------ */
-#ifdef CONFIG_PROC_FS
-
-struct fib_iter_state {
- struct seq_net_private p;
- struct fn_zone *zone;
- int bucket;
- struct hlist_head *hash_head;
- struct fib_node *fn;
- struct fib_alias *fa;
- loff_t pos;
- unsigned int genid;
- int valid;
-};
-
-static struct fib_alias *fib_get_first(struct seq_file *seq)
-{
- struct fib_iter_state *iter = seq->private;
- struct fib_table *main_table;
- struct fn_hash *table;
-
- main_table = fib_get_table(seq_file_net(seq), RT_TABLE_MAIN);
- table = (struct fn_hash *)main_table->tb_data;
-
- iter->bucket = 0;
- iter->hash_head = NULL;
- iter->fn = NULL;
- iter->fa = NULL;
- iter->pos = 0;
- iter->genid = fib_hash_genid;
- iter->valid = 1;
-
- for (iter->zone = rcu_dereference(table->fn_zone_list);
- iter->zone != NULL;
- iter->zone = rcu_dereference(iter->zone->fz_next)) {
- int maxslot;
-
- if (!iter->zone->fz_nent)
- continue;
-
- iter->hash_head = rcu_dereference(iter->zone->fz_hash);
- maxslot = iter->zone->fz_divisor;
-
- for (iter->bucket = 0; iter->bucket < maxslot;
- ++iter->bucket, ++iter->hash_head) {
- struct hlist_node *node;
- struct fib_node *fn;
-
- hlist_for_each_entry(fn, node, iter->hash_head, fn_hash) {
- struct fib_alias *fa;
-
- list_for_each_entry(fa, &fn->fn_alias, fa_list) {
- iter->fn = fn;
- iter->fa = fa;
- goto out;
- }
- }
- }
- }
-out:
- return iter->fa;
-}
-
-static struct fib_alias *fib_get_next(struct seq_file *seq)
-{
- struct fib_iter_state *iter = seq->private;
- struct fib_node *fn;
- struct fib_alias *fa;
-
- /* Advance FA, if any. */
- fn = iter->fn;
- fa = iter->fa;
- if (fa) {
- BUG_ON(!fn);
- list_for_each_entry_continue(fa, &fn->fn_alias, fa_list) {
- iter->fa = fa;
- goto out;
- }
- }
-
- fa = iter->fa = NULL;
-
- /* Advance FN. */
- if (fn) {
- struct hlist_node *node = &fn->fn_hash;
- hlist_for_each_entry_continue(fn, node, fn_hash) {
- iter->fn = fn;
-
- list_for_each_entry(fa, &fn->fn_alias, fa_list) {
- iter->fa = fa;
- goto out;
- }
- }
- }
-
- fn = iter->fn = NULL;
-
- /* Advance hash chain. */
- if (!iter->zone)
- goto out;
-
- for (;;) {
- struct hlist_node *node;
- int maxslot;
-
- maxslot = iter->zone->fz_divisor;
-
- while (++iter->bucket < maxslot) {
- iter->hash_head++;
-
- hlist_for_each_entry(fn, node, iter->hash_head, fn_hash) {
- list_for_each_entry(fa, &fn->fn_alias, fa_list) {
- iter->fn = fn;
- iter->fa = fa;
- goto out;
- }
- }
- }
-
- iter->zone = rcu_dereference(iter->zone->fz_next);
-
- if (!iter->zone)
- goto out;
-
- iter->bucket = 0;
- iter->hash_head = rcu_dereference(iter->zone->fz_hash);
-
- hlist_for_each_entry(fn, node, iter->hash_head, fn_hash) {
- list_for_each_entry(fa, &fn->fn_alias, fa_list) {
- iter->fn = fn;
- iter->fa = fa;
- goto out;
- }
- }
- }
-out:
- iter->pos++;
- return fa;
-}
-
-static struct fib_alias *fib_get_idx(struct seq_file *seq, loff_t pos)
-{
- struct fib_iter_state *iter = seq->private;
- struct fib_alias *fa;
-
- if (iter->valid && pos >= iter->pos && iter->genid == fib_hash_genid) {
- fa = iter->fa;
- pos -= iter->pos;
- } else
- fa = fib_get_first(seq);
-
- if (fa)
- while (pos && (fa = fib_get_next(seq)))
- --pos;
- return pos ? NULL : fa;
-}
-
-static void *fib_seq_start(struct seq_file *seq, loff_t *pos)
- __acquires(RCU)
-{
- void *v = NULL;
-
- rcu_read_lock();
- if (fib_get_table(seq_file_net(seq), RT_TABLE_MAIN))
- v = *pos ? fib_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
- return v;
-}
-
-static void *fib_seq_next(struct seq_file *seq, void *v, loff_t *pos)
-{
- ++*pos;
- return v == SEQ_START_TOKEN ? fib_get_first(seq) : fib_get_next(seq);
-}
-
-static void fib_seq_stop(struct seq_file *seq, void *v)
- __releases(RCU)
-{
- rcu_read_unlock();
-}
-
-static unsigned fib_flag_trans(int type, __be32 mask, struct fib_info *fi)
-{
- static const unsigned type2flags[RTN_MAX + 1] = {
- [7] = RTF_REJECT,
- [8] = RTF_REJECT,
- };
- unsigned flags = type2flags[type];
-
- if (fi && fi->fib_nh->nh_gw)
- flags |= RTF_GATEWAY;
- if (mask == htonl(0xFFFFFFFF))
- flags |= RTF_HOST;
- flags |= RTF_UP;
- return flags;
-}
-
-/*
- * This outputs /proc/net/route.
- *
- * It always works in backward compatibility mode.
- * The format of the file is not supposed to be changed.
- */
-static int fib_seq_show(struct seq_file *seq, void *v)
-{
- struct fib_iter_state *iter;
- int len;
- __be32 prefix, mask;
- unsigned flags;
- struct fib_node *f;
- struct fib_alias *fa;
- struct fib_info *fi;
-
- if (v == SEQ_START_TOKEN) {
- seq_printf(seq, "%-127s\n", "Iface\tDestination\tGateway "
- "\tFlags\tRefCnt\tUse\tMetric\tMask\t\tMTU"
- "\tWindow\tIRTT");
- goto out;
- }
-
- iter = seq->private;
- f = iter->fn;
- fa = iter->fa;
- fi = fa->fa_info;
- prefix = f->fn_key;
- mask = FZ_MASK(iter->zone);
- flags = fib_flag_trans(fa->fa_type, mask, fi);
- if (fi)
- seq_printf(seq,
- "%s\t%08X\t%08X\t%04X\t%d\t%u\t%d\t%08X\t%d\t%u\t%u%n",
- fi->fib_dev ? fi->fib_dev->name : "*", prefix,
- fi->fib_nh->nh_gw, flags, 0, 0, fi->fib_priority,
- mask, (fi->fib_advmss ? fi->fib_advmss + 40 : 0),
- fi->fib_window,
- fi->fib_rtt >> 3, &len);
- else
- seq_printf(seq,
- "*\t%08X\t%08X\t%04X\t%d\t%u\t%d\t%08X\t%d\t%u\t%u%n",
- prefix, 0, flags, 0, 0, 0, mask, 0, 0, 0, &len);
-
- seq_printf(seq, "%*s\n", 127 - len, "");
-out:
- return 0;
-}
-
-static const struct seq_operations fib_seq_ops = {
- .start = fib_seq_start,
- .next = fib_seq_next,
- .stop = fib_seq_stop,
- .show = fib_seq_show,
-};
-
-static int fib_seq_open(struct inode *inode, struct file *file)
-{
- return seq_open_net(inode, file, &fib_seq_ops,
- sizeof(struct fib_iter_state));
-}
-
-static const struct file_operations fib_seq_fops = {
- .owner = THIS_MODULE,
- .open = fib_seq_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = seq_release_net,
-};
-
-int __net_init fib_proc_init(struct net *net)
-{
- if (!proc_net_fops_create(net, "route", S_IRUGO, &fib_seq_fops))
- return -ENOMEM;
- return 0;
-}
-
-void __net_exit fib_proc_exit(struct net *net)
-{
- proc_net_remove(net, "route");
-}
-#endif /* CONFIG_PROC_FS */
diff --git a/net/ipv4/fib_lookup.h b/net/ipv4/fib_lookup.h
index c079cc0ec65..d5c40d8f663 100644
--- a/net/ipv4/fib_lookup.h
+++ b/net/ipv4/fib_lookup.h
@@ -25,7 +25,7 @@ static inline void fib_alias_accessed(struct fib_alias *fa)
}
/* Exported by fib_semantics.c */
-extern int fib_semantic_match(struct list_head *head,
+extern int fib_semantic_match(struct fib_table *tb, struct list_head *head,
const struct flowi *flp,
struct fib_result *res, int prefixlen, int fib_flags);
extern void fib_release_info(struct fib_info *);
diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c
index 7981a24f5c7..3018efbaea7 100644
--- a/net/ipv4/fib_rules.c
+++ b/net/ipv4/fib_rules.c
@@ -41,13 +41,13 @@ struct fib4_rule {
__be32 srcmask;
__be32 dst;
__be32 dstmask;
-#ifdef CONFIG_NET_CLS_ROUTE
+#ifdef CONFIG_IP_ROUTE_CLASSID
u32 tclassid;
#endif
};
-#ifdef CONFIG_NET_CLS_ROUTE
-u32 fib_rules_tclass(struct fib_result *res)
+#ifdef CONFIG_IP_ROUTE_CLASSID
+u32 fib_rules_tclass(const struct fib_result *res)
{
return res->r ? ((struct fib4_rule *) res->r)->tclassid : 0;
}
@@ -165,7 +165,7 @@ static int fib4_rule_configure(struct fib_rule *rule, struct sk_buff *skb,
if (frh->dst_len)
rule4->dst = nla_get_be32(tb[FRA_DST]);
-#ifdef CONFIG_NET_CLS_ROUTE
+#ifdef CONFIG_IP_ROUTE_CLASSID
if (tb[FRA_FLOW])
rule4->tclassid = nla_get_u32(tb[FRA_FLOW]);
#endif
@@ -195,7 +195,7 @@ static int fib4_rule_compare(struct fib_rule *rule, struct fib_rule_hdr *frh,
if (frh->tos && (rule4->tos != frh->tos))
return 0;
-#ifdef CONFIG_NET_CLS_ROUTE
+#ifdef CONFIG_IP_ROUTE_CLASSID
if (tb[FRA_FLOW] && (rule4->tclassid != nla_get_u32(tb[FRA_FLOW])))
return 0;
#endif
@@ -224,7 +224,7 @@ static int fib4_rule_fill(struct fib_rule *rule, struct sk_buff *skb,
if (rule4->src_len)
NLA_PUT_BE32(skb, FRA_SRC, rule4->src);
-#ifdef CONFIG_NET_CLS_ROUTE
+#ifdef CONFIG_IP_ROUTE_CLASSID
if (rule4->tclassid)
NLA_PUT_U32(skb, FRA_FLOW, rule4->tclassid);
#endif
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index 12d3dc3df1b..562f34cd930 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -49,7 +49,7 @@
static DEFINE_SPINLOCK(fib_info_lock);
static struct hlist_head *fib_info_hash;
static struct hlist_head *fib_info_laddrhash;
-static unsigned int fib_hash_size;
+static unsigned int fib_info_hash_size;
static unsigned int fib_info_cnt;
#define DEVINDEX_HASHBITS 8
@@ -152,6 +152,8 @@ static void free_fib_info_rcu(struct rcu_head *head)
{
struct fib_info *fi = container_of(head, struct fib_info, rcu);
+ if (fi->fib_metrics != (u32 *) dst_default_metrics)
+ kfree(fi->fib_metrics);
kfree(fi);
}
@@ -200,7 +202,7 @@ static inline int nh_comp(const struct fib_info *fi, const struct fib_info *ofi)
#ifdef CONFIG_IP_ROUTE_MULTIPATH
nh->nh_weight != onh->nh_weight ||
#endif
-#ifdef CONFIG_NET_CLS_ROUTE
+#ifdef CONFIG_IP_ROUTE_CLASSID
nh->nh_tclassid != onh->nh_tclassid ||
#endif
((nh->nh_flags ^ onh->nh_flags) & ~RTNH_F_DEAD))
@@ -221,7 +223,7 @@ static inline unsigned int fib_devindex_hashfn(unsigned int val)
static inline unsigned int fib_info_hashfn(const struct fib_info *fi)
{
- unsigned int mask = (fib_hash_size - 1);
+ unsigned int mask = (fib_info_hash_size - 1);
unsigned int val = fi->fib_nhs;
val ^= fi->fib_protocol;
@@ -422,7 +424,7 @@ static int fib_get_nhs(struct fib_info *fi, struct rtnexthop *rtnh,
nla = nla_find(attrs, attrlen, RTA_GATEWAY);
nexthop_nh->nh_gw = nla ? nla_get_be32(nla) : 0;
-#ifdef CONFIG_NET_CLS_ROUTE
+#ifdef CONFIG_IP_ROUTE_CLASSID
nla = nla_find(attrs, attrlen, RTA_FLOW);
nexthop_nh->nh_tclassid = nla ? nla_get_u32(nla) : 0;
#endif
@@ -476,7 +478,7 @@ int fib_nh_match(struct fib_config *cfg, struct fib_info *fi)
nla = nla_find(attrs, attrlen, RTA_GATEWAY);
if (nla && nla_get_be32(nla) != nh->nh_gw)
return 1;
-#ifdef CONFIG_NET_CLS_ROUTE
+#ifdef CONFIG_IP_ROUTE_CLASSID
nla = nla_find(attrs, attrlen, RTA_FLOW);
if (nla && nla_get_u32(nla) != nh->nh_tclassid)
return 1;
@@ -613,14 +615,14 @@ out:
static inline unsigned int fib_laddr_hashfn(__be32 val)
{
- unsigned int mask = (fib_hash_size - 1);
+ unsigned int mask = (fib_info_hash_size - 1);
return ((__force u32)val ^
((__force u32)val >> 7) ^
((__force u32)val >> 14)) & mask;
}
-static struct hlist_head *fib_hash_alloc(int bytes)
+static struct hlist_head *fib_info_hash_alloc(int bytes)
{
if (bytes <= PAGE_SIZE)
return kzalloc(bytes, GFP_KERNEL);
@@ -630,7 +632,7 @@ static struct hlist_head *fib_hash_alloc(int bytes)
get_order(bytes));
}
-static void fib_hash_free(struct hlist_head *hash, int bytes)
+static void fib_info_hash_free(struct hlist_head *hash, int bytes)
{
if (!hash)
return;
@@ -641,18 +643,18 @@ static void fib_hash_free(struct hlist_head *hash, int bytes)
free_pages((unsigned long) hash, get_order(bytes));
}
-static void fib_hash_move(struct hlist_head *new_info_hash,
- struct hlist_head *new_laddrhash,
- unsigned int new_size)
+static void fib_info_hash_move(struct hlist_head *new_info_hash,
+ struct hlist_head *new_laddrhash,
+ unsigned int new_size)
{
struct hlist_head *old_info_hash, *old_laddrhash;
- unsigned int old_size = fib_hash_size;
+ unsigned int old_size = fib_info_hash_size;
unsigned int i, bytes;
spin_lock_bh(&fib_info_lock);
old_info_hash = fib_info_hash;
old_laddrhash = fib_info_laddrhash;
- fib_hash_size = new_size;
+ fib_info_hash_size = new_size;
for (i = 0; i < old_size; i++) {
struct hlist_head *head = &fib_info_hash[i];
@@ -693,8 +695,8 @@ static void fib_hash_move(struct hlist_head *new_info_hash,
spin_unlock_bh(&fib_info_lock);
bytes = old_size * sizeof(struct hlist_head *);
- fib_hash_free(old_info_hash, bytes);
- fib_hash_free(old_laddrhash, bytes);
+ fib_info_hash_free(old_info_hash, bytes);
+ fib_info_hash_free(old_laddrhash, bytes);
}
struct fib_info *fib_create_info(struct fib_config *cfg)
@@ -718,8 +720,8 @@ struct fib_info *fib_create_info(struct fib_config *cfg)
#endif
err = -ENOBUFS;
- if (fib_info_cnt >= fib_hash_size) {
- unsigned int new_size = fib_hash_size << 1;
+ if (fib_info_cnt >= fib_info_hash_size) {
+ unsigned int new_size = fib_info_hash_size << 1;
struct hlist_head *new_info_hash;
struct hlist_head *new_laddrhash;
unsigned int bytes;
@@ -727,21 +729,27 @@ struct fib_info *fib_create_info(struct fib_config *cfg)
if (!new_size)
new_size = 1;
bytes = new_size * sizeof(struct hlist_head *);
- new_info_hash = fib_hash_alloc(bytes);
- new_laddrhash = fib_hash_alloc(bytes);
+ new_info_hash = fib_info_hash_alloc(bytes);
+ new_laddrhash = fib_info_hash_alloc(bytes);
if (!new_info_hash || !new_laddrhash) {
- fib_hash_free(new_info_hash, bytes);
- fib_hash_free(new_laddrhash, bytes);
+ fib_info_hash_free(new_info_hash, bytes);
+ fib_info_hash_free(new_laddrhash, bytes);
} else
- fib_hash_move(new_info_hash, new_laddrhash, new_size);
+ fib_info_hash_move(new_info_hash, new_laddrhash, new_size);
- if (!fib_hash_size)
+ if (!fib_info_hash_size)
goto failure;
}
fi = kzalloc(sizeof(*fi)+nhs*sizeof(struct fib_nh), GFP_KERNEL);
if (fi == NULL)
goto failure;
+ if (cfg->fc_mx) {
+ fi->fib_metrics = kzalloc(sizeof(u32) * RTAX_MAX, GFP_KERNEL);
+ if (!fi->fib_metrics)
+ goto failure;
+ } else
+ fi->fib_metrics = (u32 *) dst_default_metrics;
fib_info_cnt++;
fi->fib_net = hold_net(net);
@@ -779,7 +787,7 @@ struct fib_info *fib_create_info(struct fib_config *cfg)
goto err_inval;
if (cfg->fc_gw && fi->fib_nh->nh_gw != cfg->fc_gw)
goto err_inval;
-#ifdef CONFIG_NET_CLS_ROUTE
+#ifdef CONFIG_IP_ROUTE_CLASSID
if (cfg->fc_flow && fi->fib_nh->nh_tclassid != cfg->fc_flow)
goto err_inval;
#endif
@@ -792,7 +800,7 @@ struct fib_info *fib_create_info(struct fib_config *cfg)
nh->nh_oif = cfg->fc_oif;
nh->nh_gw = cfg->fc_gw;
nh->nh_flags = cfg->fc_flags;
-#ifdef CONFIG_NET_CLS_ROUTE
+#ifdef CONFIG_IP_ROUTE_CLASSID
nh->nh_tclassid = cfg->fc_flow;
#endif
#ifdef CONFIG_IP_ROUTE_MULTIPATH
@@ -881,8 +889,9 @@ failure:
}
/* Note! fib_semantic_match intentionally uses RCU list functions. */
-int fib_semantic_match(struct list_head *head, const struct flowi *flp,
- struct fib_result *res, int prefixlen, int fib_flags)
+int fib_semantic_match(struct fib_table *tb, struct list_head *head,
+ const struct flowi *flp, struct fib_result *res,
+ int prefixlen, int fib_flags)
{
struct fib_alias *fa;
int nh_sel = 0;
@@ -946,6 +955,8 @@ out_fill_res:
res->type = fa->fa_type;
res->scope = fa->fa_scope;
res->fi = fa->fa_info;
+ res->table = tb;
+ res->fa_head = head;
if (!(fib_flags & FIB_LOOKUP_NOREF))
atomic_inc(&res->fi->fib_clntref);
return 0;
@@ -1002,7 +1013,7 @@ int fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
if (fi->fib_nh->nh_oif)
NLA_PUT_U32(skb, RTA_OIF, fi->fib_nh->nh_oif);
-#ifdef CONFIG_NET_CLS_ROUTE
+#ifdef CONFIG_IP_ROUTE_CLASSID
if (fi->fib_nh[0].nh_tclassid)
NLA_PUT_U32(skb, RTA_FLOW, fi->fib_nh[0].nh_tclassid);
#endif
@@ -1027,7 +1038,7 @@ int fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
if (nh->nh_gw)
NLA_PUT_BE32(skb, RTA_GATEWAY, nh->nh_gw);
-#ifdef CONFIG_NET_CLS_ROUTE
+#ifdef CONFIG_IP_ROUTE_CLASSID
if (nh->nh_tclassid)
NLA_PUT_U32(skb, RTA_FLOW, nh->nh_tclassid);
#endif
@@ -1125,6 +1136,62 @@ int fib_sync_down_dev(struct net_device *dev, int force)
return ret;
}
+/* Must be invoked inside of an RCU protected region. */
+void fib_select_default(struct fib_result *res)
+{
+ struct fib_info *fi = NULL, *last_resort = NULL;
+ struct list_head *fa_head = res->fa_head;
+ struct fib_table *tb = res->table;
+ int order = -1, last_idx = -1;
+ struct fib_alias *fa;
+
+ list_for_each_entry_rcu(fa, fa_head, fa_list) {
+ struct fib_info *next_fi = fa->fa_info;
+
+ if (fa->fa_scope != res->scope ||
+ fa->fa_type != RTN_UNICAST)
+ continue;
+
+ if (next_fi->fib_priority > res->fi->fib_priority)
+ break;
+ if (!next_fi->fib_nh[0].nh_gw ||
+ next_fi->fib_nh[0].nh_scope != RT_SCOPE_LINK)
+ continue;
+
+ fib_alias_accessed(fa);
+
+ if (fi == NULL) {
+ if (next_fi != res->fi)
+ break;
+ } else if (!fib_detect_death(fi, order, &last_resort,
+ &last_idx, tb->tb_default)) {
+ fib_result_assign(res, fi);
+ tb->tb_default = order;
+ goto out;
+ }
+ fi = next_fi;
+ order++;
+ }
+
+ if (order <= 0 || fi == NULL) {
+ tb->tb_default = -1;
+ goto out;
+ }
+
+ if (!fib_detect_death(fi, order, &last_resort, &last_idx,
+ tb->tb_default)) {
+ fib_result_assign(res, fi);
+ tb->tb_default = order;
+ goto out;
+ }
+
+ if (last_idx >= 0)
+ fib_result_assign(res, last_resort);
+ tb->tb_default = last_idx;
+out:
+ return;
+}
+
#ifdef CONFIG_IP_ROUTE_MULTIPATH
/*
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index 0f280348e0f..edf3b0997e0 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -95,7 +95,7 @@ typedef unsigned int t_key;
#define IS_TNODE(n) (!(n->parent & T_LEAF))
#define IS_LEAF(n) (n->parent & T_LEAF)
-struct node {
+struct rt_trie_node {
unsigned long parent;
t_key key;
};
@@ -126,7 +126,7 @@ struct tnode {
struct work_struct work;
struct tnode *tnode_free;
};
- struct node *child[0];
+ struct rt_trie_node *child[0];
};
#ifdef CONFIG_IP_FIB_TRIE_STATS
@@ -151,16 +151,16 @@ struct trie_stat {
};
struct trie {
- struct node *trie;
+ struct rt_trie_node *trie;
#ifdef CONFIG_IP_FIB_TRIE_STATS
struct trie_use_stats stats;
#endif
};
-static void put_child(struct trie *t, struct tnode *tn, int i, struct node *n);
-static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n,
+static void put_child(struct trie *t, struct tnode *tn, int i, struct rt_trie_node *n);
+static void tnode_put_child_reorg(struct tnode *tn, int i, struct rt_trie_node *n,
int wasfull);
-static struct node *resize(struct trie *t, struct tnode *tn);
+static struct rt_trie_node *resize(struct trie *t, struct tnode *tn);
static struct tnode *inflate(struct trie *t, struct tnode *tn);
static struct tnode *halve(struct trie *t, struct tnode *tn);
/* tnodes to free after resize(); protected by RTNL */
@@ -177,12 +177,12 @@ static const int sync_pages = 128;
static struct kmem_cache *fn_alias_kmem __read_mostly;
static struct kmem_cache *trie_leaf_kmem __read_mostly;
-static inline struct tnode *node_parent(struct node *node)
+static inline struct tnode *node_parent(struct rt_trie_node *node)
{
return (struct tnode *)(node->parent & ~NODE_TYPE_MASK);
}
-static inline struct tnode *node_parent_rcu(struct node *node)
+static inline struct tnode *node_parent_rcu(struct rt_trie_node *node)
{
struct tnode *ret = node_parent(node);
@@ -192,22 +192,22 @@ static inline struct tnode *node_parent_rcu(struct node *node)
/* Same as rcu_assign_pointer
* but that macro() assumes that value is a pointer.
*/
-static inline void node_set_parent(struct node *node, struct tnode *ptr)
+static inline void node_set_parent(struct rt_trie_node *node, struct tnode *ptr)
{
smp_wmb();
node->parent = (unsigned long)ptr | NODE_TYPE(node);
}
-static inline struct node *tnode_get_child(struct tnode *tn, unsigned int i)
+static inline struct rt_trie_node *tnode_get_child(struct tnode *tn, unsigned int i)
{
BUG_ON(i >= 1U << tn->bits);
return tn->child[i];
}
-static inline struct node *tnode_get_child_rcu(struct tnode *tn, unsigned int i)
+static inline struct rt_trie_node *tnode_get_child_rcu(struct tnode *tn, unsigned int i)
{
- struct node *ret = tnode_get_child(tn, i);
+ struct rt_trie_node *ret = tnode_get_child(tn, i);
return rcu_dereference_rtnl(ret);
}
@@ -217,12 +217,12 @@ static inline int tnode_child_length(const struct tnode *tn)
return 1 << tn->bits;
}
-static inline t_key mask_pfx(t_key k, unsigned short l)
+static inline t_key mask_pfx(t_key k, unsigned int l)
{
return (l == 0) ? 0 : k >> (KEYLENGTH-l) << (KEYLENGTH-l);
}
-static inline t_key tkey_extract_bits(t_key a, int offset, int bits)
+static inline t_key tkey_extract_bits(t_key a, unsigned int offset, unsigned int bits)
{
if (offset < KEYLENGTH)
return ((t_key)(a << offset)) >> (KEYLENGTH - bits);
@@ -378,7 +378,7 @@ static void __tnode_free_rcu(struct rcu_head *head)
{
struct tnode *tn = container_of(head, struct tnode, rcu);
size_t size = sizeof(struct tnode) +
- (sizeof(struct node *) << tn->bits);
+ (sizeof(struct rt_trie_node *) << tn->bits);
if (size <= PAGE_SIZE)
kfree(tn);
@@ -402,7 +402,7 @@ static void tnode_free_safe(struct tnode *tn)
tn->tnode_free = tnode_free_head;
tnode_free_head = tn;
tnode_free_size += sizeof(struct tnode) +
- (sizeof(struct node *) << tn->bits);
+ (sizeof(struct rt_trie_node *) << tn->bits);
}
static void tnode_free_flush(void)
@@ -443,7 +443,7 @@ static struct leaf_info *leaf_info_new(int plen)
static struct tnode *tnode_new(t_key key, int pos, int bits)
{
- size_t sz = sizeof(struct tnode) + (sizeof(struct node *) << bits);
+ size_t sz = sizeof(struct tnode) + (sizeof(struct rt_trie_node *) << bits);
struct tnode *tn = tnode_alloc(sz);
if (tn) {
@@ -456,7 +456,7 @@ static struct tnode *tnode_new(t_key key, int pos, int bits)
}
pr_debug("AT %p s=%zu %zu\n", tn, sizeof(struct tnode),
- sizeof(struct node) << bits);
+ sizeof(struct rt_trie_node) << bits);
return tn;
}
@@ -465,7 +465,7 @@ static struct tnode *tnode_new(t_key key, int pos, int bits)
* and no bits are skipped. See discussion in dyntree paper p. 6
*/
-static inline int tnode_full(const struct tnode *tn, const struct node *n)
+static inline int tnode_full(const struct tnode *tn, const struct rt_trie_node *n)
{
if (n == NULL || IS_LEAF(n))
return 0;
@@ -474,7 +474,7 @@ static inline int tnode_full(const struct tnode *tn, const struct node *n)
}
static inline void put_child(struct trie *t, struct tnode *tn, int i,
- struct node *n)
+ struct rt_trie_node *n)
{
tnode_put_child_reorg(tn, i, n, -1);
}
@@ -484,10 +484,10 @@ static inline void put_child(struct trie *t, struct tnode *tn, int i,
* Update the value of full_children and empty_children.
*/
-static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n,
+static void tnode_put_child_reorg(struct tnode *tn, int i, struct rt_trie_node *n,
int wasfull)
{
- struct node *chi = tn->child[i];
+ struct rt_trie_node *chi = tn->child[i];
int isfull;
BUG_ON(i >= 1<<tn->bits);
@@ -515,7 +515,7 @@ static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n,
}
#define MAX_WORK 10
-static struct node *resize(struct trie *t, struct tnode *tn)
+static struct rt_trie_node *resize(struct trie *t, struct tnode *tn)
{
int i;
struct tnode *old_tn;
@@ -605,7 +605,7 @@ static struct node *resize(struct trie *t, struct tnode *tn)
/* Keep root node larger */
- if (!node_parent((struct node *)tn)) {
+ if (!node_parent((struct rt_trie_node *)tn)) {
inflate_threshold_use = inflate_threshold_root;
halve_threshold_use = halve_threshold_root;
} else {
@@ -635,7 +635,7 @@ static struct node *resize(struct trie *t, struct tnode *tn)
/* Return if at least one inflate is run */
if (max_work != MAX_WORK)
- return (struct node *) tn;
+ return (struct rt_trie_node *) tn;
/*
* Halve as long as the number of empty children in this
@@ -663,7 +663,7 @@ static struct node *resize(struct trie *t, struct tnode *tn)
if (tn->empty_children == tnode_child_length(tn) - 1) {
one_child:
for (i = 0; i < tnode_child_length(tn); i++) {
- struct node *n;
+ struct rt_trie_node *n;
n = tn->child[i];
if (!n)
@@ -676,7 +676,7 @@ one_child:
return n;
}
}
- return (struct node *) tn;
+ return (struct rt_trie_node *) tn;
}
static struct tnode *inflate(struct trie *t, struct tnode *tn)
@@ -723,14 +723,14 @@ static struct tnode *inflate(struct trie *t, struct tnode *tn)
goto nomem;
}
- put_child(t, tn, 2*i, (struct node *) left);
- put_child(t, tn, 2*i+1, (struct node *) right);
+ put_child(t, tn, 2*i, (struct rt_trie_node *) left);
+ put_child(t, tn, 2*i+1, (struct rt_trie_node *) right);
}
}
for (i = 0; i < olen; i++) {
struct tnode *inode;
- struct node *node = tnode_get_child(oldtnode, i);
+ struct rt_trie_node *node = tnode_get_child(oldtnode, i);
struct tnode *left, *right;
int size, j;
@@ -825,7 +825,7 @@ nomem:
static struct tnode *halve(struct trie *t, struct tnode *tn)
{
struct tnode *oldtnode = tn;
- struct node *left, *right;
+ struct rt_trie_node *left, *right;
int i;
int olen = tnode_child_length(tn);
@@ -856,7 +856,7 @@ static struct tnode *halve(struct trie *t, struct tnode *tn)
if (!newn)
goto nomem;
- put_child(t, tn, i/2, (struct node *)newn);
+ put_child(t, tn, i/2, (struct rt_trie_node *)newn);
}
}
@@ -958,7 +958,7 @@ fib_find_node(struct trie *t, u32 key)
{
int pos;
struct tnode *tn;
- struct node *n;
+ struct rt_trie_node *n;
pos = 0;
n = rcu_dereference_rtnl(t->trie);
@@ -993,17 +993,17 @@ static void trie_rebalance(struct trie *t, struct tnode *tn)
key = tn->key;
- while (tn != NULL && (tp = node_parent((struct node *)tn)) != NULL) {
+ while (tn != NULL && (tp = node_parent((struct rt_trie_node *)tn)) != NULL) {
cindex = tkey_extract_bits(key, tp->pos, tp->bits);
wasfull = tnode_full(tp, tnode_get_child(tp, cindex));
tn = (struct tnode *) resize(t, (struct tnode *)tn);
tnode_put_child_reorg((struct tnode *)tp, cindex,
- (struct node *)tn, wasfull);
+ (struct rt_trie_node *)tn, wasfull);
- tp = node_parent((struct node *) tn);
+ tp = node_parent((struct rt_trie_node *) tn);
if (!tp)
- rcu_assign_pointer(t->trie, (struct node *)tn);
+ rcu_assign_pointer(t->trie, (struct rt_trie_node *)tn);
tnode_free_flush();
if (!tp)
@@ -1015,7 +1015,7 @@ static void trie_rebalance(struct trie *t, struct tnode *tn)
if (IS_TNODE(tn))
tn = (struct tnode *)resize(t, (struct tnode *)tn);
- rcu_assign_pointer(t->trie, (struct node *)tn);
+ rcu_assign_pointer(t->trie, (struct rt_trie_node *)tn);
tnode_free_flush();
}
@@ -1025,7 +1025,7 @@ static struct list_head *fib_insert_node(struct trie *t, u32 key, int plen)
{
int pos, newpos;
struct tnode *tp = NULL, *tn = NULL;
- struct node *n;
+ struct rt_trie_node *n;
struct leaf *l;
int missbit;
struct list_head *fa_head = NULL;
@@ -1111,10 +1111,10 @@ static struct list_head *fib_insert_node(struct trie *t, u32 key, int plen)
if (t->trie && n == NULL) {
/* Case 2: n is NULL, and will just insert a new leaf */
- node_set_parent((struct node *)l, tp);
+ node_set_parent((struct rt_trie_node *)l, tp);
cindex = tkey_extract_bits(key, tp->pos, tp->bits);
- put_child(t, (struct tnode *)tp, cindex, (struct node *)l);
+ put_child(t, (struct tnode *)tp, cindex, (struct rt_trie_node *)l);
} else {
/* Case 3: n is a LEAF or a TNODE and the key doesn't match. */
/*
@@ -1141,18 +1141,18 @@ static struct list_head *fib_insert_node(struct trie *t, u32 key, int plen)
return NULL;
}
- node_set_parent((struct node *)tn, tp);
+ node_set_parent((struct rt_trie_node *)tn, tp);
missbit = tkey_extract_bits(key, newpos, 1);
- put_child(t, tn, missbit, (struct node *)l);
+ put_child(t, tn, missbit, (struct rt_trie_node *)l);
put_child(t, tn, 1-missbit, n);
if (tp) {
cindex = tkey_extract_bits(key, tp->pos, tp->bits);
put_child(t, (struct tnode *)tp, cindex,
- (struct node *)tn);
+ (struct rt_trie_node *)tn);
} else {
- rcu_assign_pointer(t->trie, (struct node *)tn);
+ rcu_assign_pointer(t->trie, (struct rt_trie_node *)tn);
tp = tn;
}
}
@@ -1340,7 +1340,7 @@ err:
}
/* should be called with rcu_read_lock */
-static int check_leaf(struct trie *t, struct leaf *l,
+static int check_leaf(struct fib_table *tb, struct trie *t, struct leaf *l,
t_key key, const struct flowi *flp,
struct fib_result *res, int fib_flags)
{
@@ -1356,7 +1356,7 @@ static int check_leaf(struct trie *t, struct leaf *l,
if (l->key != (key & ntohl(mask)))
continue;
- err = fib_semantic_match(&li->falh, flp, res, plen, fib_flags);
+ err = fib_semantic_match(tb, &li->falh, flp, res, plen, fib_flags);
#ifdef CONFIG_IP_FIB_TRIE_STATS
if (err <= 0)
@@ -1376,13 +1376,13 @@ int fib_table_lookup(struct fib_table *tb, const struct flowi *flp,
{
struct trie *t = (struct trie *) tb->tb_data;
int ret;
- struct node *n;
+ struct rt_trie_node *n;
struct tnode *pn;
- int pos, bits;
+ unsigned int pos, bits;
t_key key = ntohl(flp->fl4_dst);
- int chopped_off;
+ unsigned int chopped_off;
t_key cindex = 0;
- int current_prefix_length = KEYLENGTH;
+ unsigned int current_prefix_length = KEYLENGTH;
struct tnode *cn;
t_key pref_mismatch;
@@ -1398,7 +1398,7 @@ int fib_table_lookup(struct fib_table *tb, const struct flowi *flp,
/* Just a leaf? */
if (IS_LEAF(n)) {
- ret = check_leaf(t, (struct leaf *)n, key, flp, res, fib_flags);
+ ret = check_leaf(tb, t, (struct leaf *)n, key, flp, res, fib_flags);
goto found;
}
@@ -1423,7 +1423,7 @@ int fib_table_lookup(struct fib_table *tb, const struct flowi *flp,
}
if (IS_LEAF(n)) {
- ret = check_leaf(t, (struct leaf *)n, key, flp, res, fib_flags);
+ ret = check_leaf(tb, t, (struct leaf *)n, key, flp, res, fib_flags);
if (ret > 0)
goto backtrace;
goto found;
@@ -1541,7 +1541,7 @@ backtrace:
if (chopped_off <= pn->bits) {
cindex &= ~(1 << (chopped_off-1));
} else {
- struct tnode *parent = node_parent_rcu((struct node *) pn);
+ struct tnode *parent = node_parent_rcu((struct rt_trie_node *) pn);
if (!parent)
goto failed;
@@ -1568,7 +1568,7 @@ found:
*/
static void trie_leaf_remove(struct trie *t, struct leaf *l)
{
- struct tnode *tp = node_parent((struct node *) l);
+ struct tnode *tp = node_parent((struct rt_trie_node *) l);
pr_debug("entering trie_leaf_remove(%p)\n", l);
@@ -1706,7 +1706,7 @@ static int trie_flush_leaf(struct leaf *l)
* Scan for the next right leaf starting at node p->child[idx]
* Since we have back pointer, no recursion necessary.
*/
-static struct leaf *leaf_walk_rcu(struct tnode *p, struct node *c)
+static struct leaf *leaf_walk_rcu(struct tnode *p, struct rt_trie_node *c)
{
do {
t_key idx;
@@ -1732,7 +1732,7 @@ static struct leaf *leaf_walk_rcu(struct tnode *p, struct node *c)
}
/* Node empty, walk back up to parent */
- c = (struct node *) p;
+ c = (struct rt_trie_node *) p;
} while ((p = node_parent_rcu(c)) != NULL);
return NULL; /* Root of trie */
@@ -1753,7 +1753,7 @@ static struct leaf *trie_firstleaf(struct trie *t)
static struct leaf *trie_nextleaf(struct leaf *l)
{
- struct node *c = (struct node *) l;
+ struct rt_trie_node *c = (struct rt_trie_node *) l;
struct tnode *p = node_parent_rcu(c);
if (!p)
@@ -1802,80 +1802,6 @@ void fib_free_table(struct fib_table *tb)
kfree(tb);
}
-void fib_table_select_default(struct fib_table *tb,
- const struct flowi *flp,
- struct fib_result *res)
-{
- struct trie *t = (struct trie *) tb->tb_data;
- int order, last_idx;
- struct fib_info *fi = NULL;
- struct fib_info *last_resort;
- struct fib_alias *fa = NULL;
- struct list_head *fa_head;
- struct leaf *l;
-
- last_idx = -1;
- last_resort = NULL;
- order = -1;
-
- rcu_read_lock();
-
- l = fib_find_node(t, 0);
- if (!l)
- goto out;
-
- fa_head = get_fa_head(l, 0);
- if (!fa_head)
- goto out;
-
- if (list_empty(fa_head))
- goto out;
-
- list_for_each_entry_rcu(fa, fa_head, fa_list) {
- struct fib_info *next_fi = fa->fa_info;
-
- if (fa->fa_scope != res->scope ||
- fa->fa_type != RTN_UNICAST)
- continue;
-
- if (next_fi->fib_priority > res->fi->fib_priority)
- break;
- if (!next_fi->fib_nh[0].nh_gw ||
- next_fi->fib_nh[0].nh_scope != RT_SCOPE_LINK)
- continue;
-
- fib_alias_accessed(fa);
-
- if (fi == NULL) {
- if (next_fi != res->fi)
- break;
- } else if (!fib_detect_death(fi, order, &last_resort,
- &last_idx, tb->tb_default)) {
- fib_result_assign(res, fi);
- tb->tb_default = order;
- goto out;
- }
- fi = next_fi;
- order++;
- }
- if (order <= 0 || fi == NULL) {
- tb->tb_default = -1;
- goto out;
- }
-
- if (!fib_detect_death(fi, order, &last_resort, &last_idx,
- tb->tb_default)) {
- fib_result_assign(res, fi);
- tb->tb_default = order;
- goto out;
- }
- if (last_idx >= 0)
- fib_result_assign(res, last_resort);
- tb->tb_default = last_idx;
-out:
- rcu_read_unlock();
-}
-
static int fn_trie_dump_fa(t_key key, int plen, struct list_head *fah,
struct fib_table *tb,
struct sk_buff *skb, struct netlink_callback *cb)
@@ -1990,7 +1916,7 @@ int fib_table_dump(struct fib_table *tb, struct sk_buff *skb,
return skb->len;
}
-void __init fib_hash_init(void)
+void __init fib_trie_init(void)
{
fn_alias_kmem = kmem_cache_create("ip_fib_alias",
sizeof(struct fib_alias),
@@ -2003,8 +1929,7 @@ void __init fib_hash_init(void)
}
-/* Fix more generic FIB names for init later */
-struct fib_table *fib_hash_table(u32 id)
+struct fib_table *fib_trie_table(u32 id)
{
struct fib_table *tb;
struct trie *t;
@@ -2036,7 +1961,7 @@ struct fib_trie_iter {
unsigned int depth;
};
-static struct node *fib_trie_get_next(struct fib_trie_iter *iter)
+static struct rt_trie_node *fib_trie_get_next(struct fib_trie_iter *iter)
{
struct tnode *tn = iter->tnode;
unsigned int cindex = iter->index;
@@ -2050,7 +1975,7 @@ static struct node *fib_trie_get_next(struct fib_trie_iter *iter)
iter->tnode, iter->index, iter->depth);
rescan:
while (cindex < (1<<tn->bits)) {
- struct node *n = tnode_get_child_rcu(tn, cindex);
+ struct rt_trie_node *n = tnode_get_child_rcu(tn, cindex);
if (n) {
if (IS_LEAF(n)) {
@@ -2069,7 +1994,7 @@ rescan:
}
/* Current node exhausted, pop back up */
- p = node_parent_rcu((struct node *)tn);
+ p = node_parent_rcu((struct rt_trie_node *)tn);
if (p) {
cindex = tkey_extract_bits(tn->key, p->pos, p->bits)+1;
tn = p;
@@ -2081,10 +2006,10 @@ rescan:
return NULL;
}
-static struct node *fib_trie_get_first(struct fib_trie_iter *iter,
+static struct rt_trie_node *fib_trie_get_first(struct fib_trie_iter *iter,
struct trie *t)
{
- struct node *n;
+ struct rt_trie_node *n;
if (!t)
return NULL;
@@ -2108,7 +2033,7 @@ static struct node *fib_trie_get_first(struct fib_trie_iter *iter,
static void trie_collect_stats(struct trie *t, struct trie_stat *s)
{
- struct node *n;
+ struct rt_trie_node *n;
struct fib_trie_iter iter;
memset(s, 0, sizeof(*s));
@@ -2181,7 +2106,7 @@ static void trie_show_stats(struct seq_file *seq, struct trie_stat *stat)
seq_putc(seq, '\n');
seq_printf(seq, "\tPointers: %u\n", pointers);
- bytes += sizeof(struct node *) * pointers;
+ bytes += sizeof(struct rt_trie_node *) * pointers;
seq_printf(seq, "Null ptrs: %u\n", stat->nullpointers);
seq_printf(seq, "Total size: %u kB\n", (bytes + 1023) / 1024);
}
@@ -2262,7 +2187,7 @@ static const struct file_operations fib_triestat_fops = {
.release = single_release_net,
};
-static struct node *fib_trie_get_idx(struct seq_file *seq, loff_t pos)
+static struct rt_trie_node *fib_trie_get_idx(struct seq_file *seq, loff_t pos)
{
struct fib_trie_iter *iter = seq->private;
struct net *net = seq_file_net(seq);
@@ -2275,7 +2200,7 @@ static struct node *fib_trie_get_idx(struct seq_file *seq, loff_t pos)
struct fib_table *tb;
hlist_for_each_entry_rcu(tb, node, head, tb_hlist) {
- struct node *n;
+ struct rt_trie_node *n;
for (n = fib_trie_get_first(iter,
(struct trie *) tb->tb_data);
@@ -2304,7 +2229,7 @@ static void *fib_trie_seq_next(struct seq_file *seq, void *v, loff_t *pos)
struct fib_table *tb = iter->tb;
struct hlist_node *tb_node;
unsigned int h;
- struct node *n;
+ struct rt_trie_node *n;
++*pos;
/* next node in same table */
@@ -2390,7 +2315,7 @@ static inline const char *rtn_type(char *buf, size_t len, unsigned int t)
static int fib_trie_seq_show(struct seq_file *seq, void *v)
{
const struct fib_trie_iter *iter = seq->private;
- struct node *n = v;
+ struct rt_trie_node *n = v;
if (!node_parent_rcu(n))
fib_table_print(seq, iter->tb);
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index 4aa1b7f01ea..ad2bcf1b69a 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -233,48 +233,11 @@ static inline void icmp_xmit_unlock(struct sock *sk)
* Send an ICMP frame.
*/
-/*
- * Check transmit rate limitation for given message.
- * The rate information is held in the destination cache now.
- * This function is generic and could be used for other purposes
- * too. It uses a Token bucket filter as suggested by Alexey Kuznetsov.
- *
- * Note that the same dst_entry fields are modified by functions in
- * route.c too, but these work for packet destinations while xrlim_allow
- * works for icmp destinations. This means the rate limiting information
- * for one "ip object" is shared - and these ICMPs are twice limited:
- * by source and by destination.
- *
- * RFC 1812: 4.3.2.8 SHOULD be able to limit error message rate
- * SHOULD allow setting of rate limits
- *
- * Shared between ICMPv4 and ICMPv6.
- */
-#define XRLIM_BURST_FACTOR 6
-int xrlim_allow(struct dst_entry *dst, int timeout)
-{
- unsigned long now, token = dst->rate_tokens;
- int rc = 0;
-
- now = jiffies;
- token += now - dst->rate_last;
- dst->rate_last = now;
- if (token > XRLIM_BURST_FACTOR * timeout)
- token = XRLIM_BURST_FACTOR * timeout;
- if (token >= timeout) {
- token -= timeout;
- rc = 1;
- }
- dst->rate_tokens = token;
- return rc;
-}
-EXPORT_SYMBOL(xrlim_allow);
-
-static inline int icmpv4_xrlim_allow(struct net *net, struct rtable *rt,
+static inline bool icmpv4_xrlim_allow(struct net *net, struct rtable *rt,
int type, int code)
{
struct dst_entry *dst = &rt->dst;
- int rc = 1;
+ bool rc = true;
if (type > NR_ICMP_TYPES)
goto out;
@@ -288,8 +251,12 @@ static inline int icmpv4_xrlim_allow(struct net *net, struct rtable *rt,
goto out;
/* Limit if icmp type is enabled in ratemask. */
- if ((1 << type) & net->ipv4.sysctl_icmp_ratemask)
- rc = xrlim_allow(dst, net->ipv4.sysctl_icmp_ratelimit);
+ if ((1 << type) & net->ipv4.sysctl_icmp_ratemask) {
+ if (!rt->peer)
+ rt_bind_peer(rt, 1);
+ rc = inet_peer_xrlim_allow(rt->peer,
+ net->ipv4.sysctl_icmp_ratelimit);
+ }
out:
return rc;
}
diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c
index 2746c1fa641..2ada17129fc 100644
--- a/net/ipv4/inet_diag.c
+++ b/net/ipv4/inet_diag.c
@@ -858,7 +858,7 @@ static int inet_diag_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
nlmsg_len(nlh) < hdrlen)
return -EINVAL;
- if ((nlh->nlmsg_flags & NLM_F_DUMP) == NLM_F_DUMP) {
+ if (nlh->nlmsg_flags & NLM_F_DUMP) {
if (nlmsg_attrlen(nlh, hdrlen)) {
struct nlattr *attr;
diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c
index c5af909cf70..3c8dfa16614 100644
--- a/net/ipv4/inet_timewait_sock.c
+++ b/net/ipv4/inet_timewait_sock.c
@@ -505,7 +505,9 @@ restart:
}
rcu_read_unlock();
+ local_bh_disable();
inet_twsk_deschedule(tw, twdr);
+ local_bh_enable();
inet_twsk_put(tw);
goto restart_rcu;
}
diff --git a/net/ipv4/inetpeer.c b/net/ipv4/inetpeer.c
index d9bc85751c7..48f8d4592cc 100644
--- a/net/ipv4/inetpeer.c
+++ b/net/ipv4/inetpeer.c
@@ -167,9 +167,9 @@ static int addr_compare(const struct inetpeer_addr *a,
int i, n = (a->family == AF_INET ? 1 : 4);
for (i = 0; i < n; i++) {
- if (a->a6[i] == b->a6[i])
+ if (a->addr.a6[i] == b->addr.a6[i])
continue;
- if (a->a6[i] < b->a6[i])
+ if (a->addr.a6[i] < b->addr.a6[i])
return -1;
return 1;
}
@@ -475,7 +475,7 @@ static int cleanup_once(unsigned long ttl)
struct inet_peer *inet_getpeer(struct inetpeer_addr *daddr, int create)
{
struct inet_peer __rcu **stack[PEER_MAXDEPTH], ***stackptr;
- struct inet_peer_base *base = family_to_base(AF_INET);
+ struct inet_peer_base *base = family_to_base(daddr->family);
struct inet_peer *p;
/* Look up for the address quickly, lockless.
@@ -510,8 +510,13 @@ struct inet_peer *inet_getpeer(struct inetpeer_addr *daddr, int create)
p->daddr = *daddr;
atomic_set(&p->refcnt, 1);
atomic_set(&p->rid, 0);
- atomic_set(&p->ip_id_count, secure_ip_id(daddr->a4));
+ atomic_set(&p->ip_id_count, secure_ip_id(daddr->addr.a4));
p->tcp_ts_stamp = 0;
+ p->metrics[RTAX_LOCK-1] = INETPEER_METRICS_NEW;
+ p->rate_tokens = 0;
+ p->rate_last = 0;
+ p->pmtu_expires = 0;
+ memset(&p->redirect_learned, 0, sizeof(p->redirect_learned));
INIT_LIST_HEAD(&p->unused);
@@ -579,3 +584,44 @@ void inet_putpeer(struct inet_peer *p)
local_bh_enable();
}
EXPORT_SYMBOL_GPL(inet_putpeer);
+
+/*
+ * Check transmit rate limitation for given message.
+ * The rate information is held in the inet_peer entries now.
+ * This function is generic and could be used for other purposes
+ * too. It uses a Token bucket filter as suggested by Alexey Kuznetsov.
+ *
+ * Note that the same inet_peer fields are modified by functions in
+ * route.c too, but these work for packet destinations while xrlim_allow
+ * works for icmp destinations. This means the rate limiting information
+ * for one "ip object" is shared - and these ICMPs are twice limited:
+ * by source and by destination.
+ *
+ * RFC 1812: 4.3.2.8 SHOULD be able to limit error message rate
+ * SHOULD allow setting of rate limits
+ *
+ * Shared between ICMPv4 and ICMPv6.
+ */
+#define XRLIM_BURST_FACTOR 6
+bool inet_peer_xrlim_allow(struct inet_peer *peer, int timeout)
+{
+ unsigned long now, token;
+ bool rc = false;
+
+ if (!peer)
+ return true;
+
+ token = peer->rate_tokens;
+ now = jiffies;
+ token += now - peer->rate_last;
+ peer->rate_last = now;
+ if (token > XRLIM_BURST_FACTOR * timeout)
+ token = XRLIM_BURST_FACTOR * timeout;
+ if (token >= timeout) {
+ token -= timeout;
+ rc = true;
+ }
+ peer->rate_tokens = token;
+ return rc;
+}
+EXPORT_SYMBOL(inet_peer_xrlim_allow);
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index eb68a0e34e4..6613edfac28 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -775,6 +775,7 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev
.fl4_dst = dst,
.fl4_src = tiph->saddr,
.fl4_tos = RT_TOS(tos),
+ .proto = IPPROTO_GRE,
.fl_gre_key = tunnel->parms.o_key
};
if (ip_route_output_key(dev_net(dev), &rt, &fl)) {
diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c
index d859bcc26cb..d7b2b0987a3 100644
--- a/net/ipv4/ip_input.c
+++ b/net/ipv4/ip_input.c
@@ -340,7 +340,7 @@ static int ip_rcv_finish(struct sk_buff *skb)
}
}
-#ifdef CONFIG_NET_CLS_ROUTE
+#ifdef CONFIG_IP_ROUTE_CLASSID
if (unlikely(skb_dst(skb)->tclassid)) {
struct ip_rt_acct *st = this_cpu_ptr(ip_rt_acct);
u32 idx = skb_dst(skb)->tclassid;
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index 3f3a9afd73e..8b65a12654e 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -60,6 +60,7 @@
#include <linux/notifier.h>
#include <linux/if_arp.h>
#include <linux/netfilter_ipv4.h>
+#include <linux/compat.h>
#include <net/ipip.h>
#include <net/checksum.h>
#include <net/netlink.h>
@@ -1434,6 +1435,81 @@ int ipmr_ioctl(struct sock *sk, int cmd, void __user *arg)
}
}
+#ifdef CONFIG_COMPAT
+struct compat_sioc_sg_req {
+ struct in_addr src;
+ struct in_addr grp;
+ compat_ulong_t pktcnt;
+ compat_ulong_t bytecnt;
+ compat_ulong_t wrong_if;
+};
+
+struct compat_sioc_vif_req {
+ vifi_t vifi; /* Which iface */
+ compat_ulong_t icount;
+ compat_ulong_t ocount;
+ compat_ulong_t ibytes;
+ compat_ulong_t obytes;
+};
+
+int ipmr_compat_ioctl(struct sock *sk, unsigned int cmd, void __user *arg)
+{
+ struct compat_sioc_sg_req sr;
+ struct compat_sioc_vif_req vr;
+ struct vif_device *vif;
+ struct mfc_cache *c;
+ struct net *net = sock_net(sk);
+ struct mr_table *mrt;
+
+ mrt = ipmr_get_table(net, raw_sk(sk)->ipmr_table ? : RT_TABLE_DEFAULT);
+ if (mrt == NULL)
+ return -ENOENT;
+
+ switch (cmd) {
+ case SIOCGETVIFCNT:
+ if (copy_from_user(&vr, arg, sizeof(vr)))
+ return -EFAULT;
+ if (vr.vifi >= mrt->maxvif)
+ return -EINVAL;
+ read_lock(&mrt_lock);
+ vif = &mrt->vif_table[vr.vifi];
+ if (VIF_EXISTS(mrt, vr.vifi)) {
+ vr.icount = vif->pkt_in;
+ vr.ocount = vif->pkt_out;
+ vr.ibytes = vif->bytes_in;
+ vr.obytes = vif->bytes_out;
+ read_unlock(&mrt_lock);
+
+ if (copy_to_user(arg, &vr, sizeof(vr)))
+ return -EFAULT;
+ return 0;
+ }
+ read_unlock(&mrt_lock);
+ return -EADDRNOTAVAIL;
+ case SIOCGETSGCNT:
+ if (copy_from_user(&sr, arg, sizeof(sr)))
+ return -EFAULT;
+
+ rcu_read_lock();
+ c = ipmr_cache_find(mrt, sr.src.s_addr, sr.grp.s_addr);
+ if (c) {
+ sr.pktcnt = c->mfc_un.res.pkt;
+ sr.bytecnt = c->mfc_un.res.bytes;
+ sr.wrong_if = c->mfc_un.res.wrong_if;
+ rcu_read_unlock();
+
+ if (copy_to_user(arg, &sr, sizeof(sr)))
+ return -EFAULT;
+ return 0;
+ }
+ rcu_read_unlock();
+ return -EADDRNOTAVAIL;
+ default:
+ return -ENOIOCTLCMD;
+ }
+}
+#endif
+
static int ipmr_device_event(struct notifier_block *this, unsigned long event, void *ptr)
{
diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig
index babd1a2bae5..f926a310075 100644
--- a/net/ipv4/netfilter/Kconfig
+++ b/net/ipv4/netfilter/Kconfig
@@ -206,8 +206,9 @@ config IP_NF_TARGET_REDIRECT
config NF_NAT_SNMP_BASIC
tristate "Basic SNMP-ALG support"
- depends on NF_NAT
+ depends on NF_CONNTRACK_SNMP && NF_NAT
depends on NETFILTER_ADVANCED
+ default NF_NAT && NF_CONNTRACK_SNMP
---help---
This module implements an Application Layer Gateway (ALG) for
diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c
index e855fffaed9..e95054c690c 100644
--- a/net/ipv4/netfilter/arp_tables.c
+++ b/net/ipv4/netfilter/arp_tables.c
@@ -866,6 +866,7 @@ static int compat_table_info(const struct xt_table_info *info,
memcpy(newinfo, info, offsetof(struct xt_table_info, entries));
newinfo->initial_entries = 0;
loc_cpu_entry = info->entries[raw_smp_processor_id()];
+ xt_compat_init_offsets(NFPROTO_ARP, info->number);
xt_entry_foreach(iter, loc_cpu_entry, info->size) {
ret = compat_calc_entry(iter, info, loc_cpu_entry, newinfo);
if (ret != 0)
@@ -1333,6 +1334,7 @@ static int translate_compat_table(const char *name,
duprintf("translate_compat_table: size %u\n", info->size);
j = 0;
xt_compat_lock(NFPROTO_ARP);
+ xt_compat_init_offsets(NFPROTO_ARP, number);
/* Walk through entries, checking offsets. */
xt_entry_foreach(iter0, entry0, total_size) {
ret = check_compat_entry_size_and_hooks(iter0, info, &size,
diff --git a/net/ipv4/netfilter/arpt_mangle.c b/net/ipv4/netfilter/arpt_mangle.c
index b8ddcc480ed..a5e52a9f0a1 100644
--- a/net/ipv4/netfilter/arpt_mangle.c
+++ b/net/ipv4/netfilter/arpt_mangle.c
@@ -60,12 +60,12 @@ static int checkentry(const struct xt_tgchk_param *par)
if (mangle->flags & ~ARPT_MANGLE_MASK ||
!(mangle->flags & ARPT_MANGLE_MASK))
- return false;
+ return -EINVAL;
if (mangle->target != NF_DROP && mangle->target != NF_ACCEPT &&
mangle->target != XT_CONTINUE)
- return false;
- return true;
+ return -EINVAL;
+ return 0;
}
static struct xt_target arpt_mangle_reg __read_mostly = {
diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c
index 652efea013d..ef7d7b9680e 100644
--- a/net/ipv4/netfilter/ip_tables.c
+++ b/net/ipv4/netfilter/ip_tables.c
@@ -1063,6 +1063,7 @@ static int compat_table_info(const struct xt_table_info *info,
memcpy(newinfo, info, offsetof(struct xt_table_info, entries));
newinfo->initial_entries = 0;
loc_cpu_entry = info->entries[raw_smp_processor_id()];
+ xt_compat_init_offsets(AF_INET, info->number);
xt_entry_foreach(iter, loc_cpu_entry, info->size) {
ret = compat_calc_entry(iter, info, loc_cpu_entry, newinfo);
if (ret != 0)
@@ -1664,6 +1665,7 @@ translate_compat_table(struct net *net,
duprintf("translate_compat_table: size %u\n", info->size);
j = 0;
xt_compat_lock(AF_INET);
+ xt_compat_init_offsets(AF_INET, number);
/* Walk through entries, checking offsets. */
xt_entry_foreach(iter0, entry0, total_size) {
ret = check_compat_entry_size_and_hooks(iter0, info, &size,
diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c
index 1e26a489765..403ca57f601 100644
--- a/net/ipv4/netfilter/ipt_CLUSTERIP.c
+++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c
@@ -300,13 +300,8 @@ clusterip_tg(struct sk_buff *skb, const struct xt_action_param *par)
* that the ->target() function isn't called after ->destroy() */
ct = nf_ct_get(skb, &ctinfo);
- if (ct == NULL) {
- pr_info("no conntrack!\n");
- /* FIXME: need to drop invalid ones, since replies
- * to outgoing connections of other nodes will be
- * marked as INVALID */
+ if (ct == NULL)
return NF_DROP;
- }
/* special case: ICMP error handling. conntrack distinguishes between
* error messages (RELATED) and information requests (see below) */
diff --git a/net/ipv4/netfilter/ipt_LOG.c b/net/ipv4/netfilter/ipt_LOG.c
index 72ffc8fda2e..d76d6c9ed94 100644
--- a/net/ipv4/netfilter/ipt_LOG.c
+++ b/net/ipv4/netfilter/ipt_LOG.c
@@ -442,8 +442,7 @@ ipt_log_packet(u_int8_t pf,
}
#endif
- /* MAC logging for input path only. */
- if (in && !out)
+ if (in != NULL)
dump_mac_header(m, loginfo, skb);
dump_packet(m, loginfo, skb, 0);
diff --git a/net/ipv4/netfilter/iptable_mangle.c b/net/ipv4/netfilter/iptable_mangle.c
index 294a2a32f29..aef5d1fbe77 100644
--- a/net/ipv4/netfilter/iptable_mangle.c
+++ b/net/ipv4/netfilter/iptable_mangle.c
@@ -60,7 +60,7 @@ ipt_mangle_out(struct sk_buff *skb, const struct net_device *out)
ret = ipt_do_table(skb, NF_INET_LOCAL_OUT, NULL, out,
dev_net(out)->ipv4.iptable_mangle);
/* Reroute for ANY change. */
- if (ret != NF_DROP && ret != NF_STOLEN && ret != NF_QUEUE) {
+ if (ret != NF_DROP && ret != NF_STOLEN) {
iph = ip_hdr(skb);
if (iph->saddr != saddr ||
diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c
index 63f60fc5d26..5585980fce2 100644
--- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c
+++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c
@@ -20,6 +20,7 @@
#include <net/netfilter/nf_conntrack_l4proto.h>
#include <net/netfilter/nf_conntrack_expect.h>
#include <net/netfilter/nf_conntrack_acct.h>
+#include <linux/rculist_nulls.h>
struct ct_iter_state {
struct seq_net_private p;
@@ -35,7 +36,8 @@ static struct hlist_nulls_node *ct_get_first(struct seq_file *seq)
for (st->bucket = 0;
st->bucket < net->ct.htable_size;
st->bucket++) {
- n = rcu_dereference(net->ct.hash[st->bucket].first);
+ n = rcu_dereference(
+ hlist_nulls_first_rcu(&net->ct.hash[st->bucket]));
if (!is_a_nulls(n))
return n;
}
@@ -48,13 +50,14 @@ static struct hlist_nulls_node *ct_get_next(struct seq_file *seq,
struct net *net = seq_file_net(seq);
struct ct_iter_state *st = seq->private;
- head = rcu_dereference(head->next);
+ head = rcu_dereference(hlist_nulls_next_rcu(head));
while (is_a_nulls(head)) {
if (likely(get_nulls_value(head) == st->bucket)) {
if (++st->bucket >= net->ct.htable_size)
return NULL;
}
- head = rcu_dereference(net->ct.hash[st->bucket].first);
+ head = rcu_dereference(
+ hlist_nulls_first_rcu(&net->ct.hash[st->bucket]));
}
return head;
}
@@ -217,7 +220,8 @@ static struct hlist_node *ct_expect_get_first(struct seq_file *seq)
struct hlist_node *n;
for (st->bucket = 0; st->bucket < nf_ct_expect_hsize; st->bucket++) {
- n = rcu_dereference(net->ct.expect_hash[st->bucket].first);
+ n = rcu_dereference(
+ hlist_first_rcu(&net->ct.expect_hash[st->bucket]));
if (n)
return n;
}
@@ -230,11 +234,12 @@ static struct hlist_node *ct_expect_get_next(struct seq_file *seq,
struct net *net = seq_file_net(seq);
struct ct_expect_iter_state *st = seq->private;
- head = rcu_dereference(head->next);
+ head = rcu_dereference(hlist_next_rcu(head));
while (head == NULL) {
if (++st->bucket >= nf_ct_expect_hsize)
return NULL;
- head = rcu_dereference(net->ct.expect_hash[st->bucket].first);
+ head = rcu_dereference(
+ hlist_first_rcu(&net->ct.expect_hash[st->bucket]));
}
return head;
}
diff --git a/net/ipv4/netfilter/nf_nat_amanda.c b/net/ipv4/netfilter/nf_nat_amanda.c
index 0f23b3f06df..703f366fd23 100644
--- a/net/ipv4/netfilter/nf_nat_amanda.c
+++ b/net/ipv4/netfilter/nf_nat_amanda.c
@@ -44,13 +44,13 @@ static unsigned int help(struct sk_buff *skb,
/* Try to get same port: if not, try to change it. */
for (port = ntohs(exp->saved_proto.tcp.port); port != 0; port++) {
- int ret;
+ int res;
exp->tuple.dst.u.tcp.port = htons(port);
- ret = nf_ct_expect_related(exp);
- if (ret == 0)
+ res = nf_ct_expect_related(exp);
+ if (res == 0)
break;
- else if (ret != -EBUSY) {
+ else if (res != -EBUSY) {
port = 0;
break;
}
diff --git a/net/ipv4/netfilter/nf_nat_core.c b/net/ipv4/netfilter/nf_nat_core.c
index c04787ce1a7..21bcf471b25 100644
--- a/net/ipv4/netfilter/nf_nat_core.c
+++ b/net/ipv4/netfilter/nf_nat_core.c
@@ -221,7 +221,14 @@ get_unique_tuple(struct nf_conntrack_tuple *tuple,
manips not an issue. */
if (maniptype == IP_NAT_MANIP_SRC &&
!(range->flags & IP_NAT_RANGE_PROTO_RANDOM)) {
- if (find_appropriate_src(net, zone, orig_tuple, tuple, range)) {
+ /* try the original tuple first */
+ if (in_range(orig_tuple, range)) {
+ if (!nf_nat_used_tuple(orig_tuple, ct)) {
+ *tuple = *orig_tuple;
+ return;
+ }
+ } else if (find_appropriate_src(net, zone, orig_tuple, tuple,
+ range)) {
pr_debug("get_unique_tuple: Found current src map\n");
if (!nf_nat_used_tuple(tuple, ct))
return;
@@ -266,7 +273,6 @@ nf_nat_setup_info(struct nf_conn *ct,
struct net *net = nf_ct_net(ct);
struct nf_conntrack_tuple curr_tuple, new_tuple;
struct nf_conn_nat *nat;
- int have_to_hash = !(ct->status & IPS_NAT_DONE_MASK);
/* nat helper or nfctnetlink also setup binding */
nat = nfct_nat(ct);
@@ -306,8 +312,7 @@ nf_nat_setup_info(struct nf_conn *ct,
ct->status |= IPS_DST_NAT;
}
- /* Place in source hash if this is the first time. */
- if (have_to_hash) {
+ if (maniptype == IP_NAT_MANIP_SRC) {
unsigned int srchash;
srchash = hash_by_src(net, nf_ct_zone(ct),
@@ -323,9 +328,9 @@ nf_nat_setup_info(struct nf_conn *ct,
/* It's done. */
if (maniptype == IP_NAT_MANIP_DST)
- set_bit(IPS_DST_NAT_DONE_BIT, &ct->status);
+ ct->status |= IPS_DST_NAT_DONE;
else
- set_bit(IPS_SRC_NAT_DONE_BIT, &ct->status);
+ ct->status |= IPS_SRC_NAT_DONE;
return NF_ACCEPT;
}
@@ -502,7 +507,10 @@ int nf_nat_protocol_register(const struct nf_nat_protocol *proto)
int ret = 0;
spin_lock_bh(&nf_nat_lock);
- if (nf_nat_protos[proto->protonum] != &nf_nat_unknown_protocol) {
+ if (rcu_dereference_protected(
+ nf_nat_protos[proto->protonum],
+ lockdep_is_held(&nf_nat_lock)
+ ) != &nf_nat_unknown_protocol) {
ret = -EBUSY;
goto out;
}
@@ -532,7 +540,7 @@ static void nf_nat_cleanup_conntrack(struct nf_conn *ct)
if (nat == NULL || nat->ct == NULL)
return;
- NF_CT_ASSERT(nat->ct->status & IPS_NAT_DONE_MASK);
+ NF_CT_ASSERT(nat->ct->status & IPS_SRC_NAT_DONE);
spin_lock_bh(&nf_nat_lock);
hlist_del_rcu(&nat->bysource);
@@ -545,11 +553,10 @@ static void nf_nat_move_storage(void *new, void *old)
struct nf_conn_nat *old_nat = old;
struct nf_conn *ct = old_nat->ct;
- if (!ct || !(ct->status & IPS_NAT_DONE_MASK))
+ if (!ct || !(ct->status & IPS_SRC_NAT_DONE))
return;
spin_lock_bh(&nf_nat_lock);
- new_nat->ct = ct;
hlist_replace_rcu(&old_nat->bysource, &new_nat->bysource);
spin_unlock_bh(&nf_nat_lock);
}
@@ -679,8 +686,7 @@ static int __net_init nf_nat_net_init(struct net *net)
{
/* Leave them the same for the moment. */
net->ipv4.nat_htable_size = net->ct.htable_size;
- net->ipv4.nat_bysource = nf_ct_alloc_hashtable(&net->ipv4.nat_htable_size,
- &net->ipv4.nat_vmalloced, 0);
+ net->ipv4.nat_bysource = nf_ct_alloc_hashtable(&net->ipv4.nat_htable_size, 0);
if (!net->ipv4.nat_bysource)
return -ENOMEM;
return 0;
@@ -702,8 +708,7 @@ static void __net_exit nf_nat_net_exit(struct net *net)
{
nf_ct_iterate_cleanup(net, &clean_nat, NULL);
synchronize_rcu();
- nf_ct_free_hashtable(net->ipv4.nat_bysource, net->ipv4.nat_vmalloced,
- net->ipv4.nat_htable_size);
+ nf_ct_free_hashtable(net->ipv4.nat_bysource, net->ipv4.nat_htable_size);
}
static struct pernet_operations nf_nat_net_ops = {
diff --git a/net/ipv4/netfilter/nf_nat_snmp_basic.c b/net/ipv4/netfilter/nf_nat_snmp_basic.c
index ee5f419d0a5..8812a02078a 100644
--- a/net/ipv4/netfilter/nf_nat_snmp_basic.c
+++ b/net/ipv4/netfilter/nf_nat_snmp_basic.c
@@ -54,6 +54,7 @@
#include <net/netfilter/nf_conntrack_expect.h>
#include <net/netfilter/nf_conntrack_helper.h>
#include <net/netfilter/nf_nat_helper.h>
+#include <linux/netfilter/nf_conntrack_snmp.h>
MODULE_LICENSE("GPL");
MODULE_AUTHOR("James Morris <jmorris@intercode.com.au>");
@@ -1310,9 +1311,9 @@ static int __init nf_nat_snmp_basic_init(void)
{
int ret = 0;
- ret = nf_conntrack_helper_register(&snmp_helper);
- if (ret < 0)
- return ret;
+ BUG_ON(nf_nat_snmp_hook != NULL);
+ rcu_assign_pointer(nf_nat_snmp_hook, help);
+
ret = nf_conntrack_helper_register(&snmp_trap_helper);
if (ret < 0) {
nf_conntrack_helper_unregister(&snmp_helper);
@@ -1323,7 +1324,7 @@ static int __init nf_nat_snmp_basic_init(void)
static void __exit nf_nat_snmp_basic_fini(void)
{
- nf_conntrack_helper_unregister(&snmp_helper);
+ rcu_assign_pointer(nf_nat_snmp_hook, NULL);
nf_conntrack_helper_unregister(&snmp_trap_helper);
}
diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
index a3d5ab786e8..6390ba299b3 100644
--- a/net/ipv4/raw.c
+++ b/net/ipv4/raw.c
@@ -76,6 +76,7 @@
#include <linux/seq_file.h>
#include <linux/netfilter.h>
#include <linux/netfilter_ipv4.h>
+#include <linux/compat.h>
static struct raw_hashinfo raw_v4_hashinfo = {
.lock = __RW_LOCK_UNLOCKED(raw_v4_hashinfo.lock),
@@ -838,6 +839,23 @@ static int raw_ioctl(struct sock *sk, int cmd, unsigned long arg)
}
}
+#ifdef CONFIG_COMPAT
+static int compat_raw_ioctl(struct sock *sk, unsigned int cmd, unsigned long arg)
+{
+ switch (cmd) {
+ case SIOCOUTQ:
+ case SIOCINQ:
+ return -ENOIOCTLCMD;
+ default:
+#ifdef CONFIG_IP_MROUTE
+ return ipmr_compat_ioctl(sk, cmd, compat_ptr(arg));
+#else
+ return -ENOIOCTLCMD;
+#endif
+ }
+}
+#endif
+
struct proto raw_prot = {
.name = "RAW",
.owner = THIS_MODULE,
@@ -860,6 +878,7 @@ struct proto raw_prot = {
#ifdef CONFIG_COMPAT
.compat_setsockopt = compat_raw_setsockopt,
.compat_getsockopt = compat_raw_getsockopt,
+ .compat_ioctl = compat_raw_ioctl,
#endif
};
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 351dc4e8524..52b077d4520 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -131,9 +131,6 @@ static int ip_rt_min_pmtu __read_mostly = 512 + 20 + 20;
static int ip_rt_min_advmss __read_mostly = 256;
static int rt_chain_length_max __read_mostly = 20;
-static struct delayed_work expires_work;
-static unsigned long expires_ljiffies;
-
/*
* Interface to generic destination cache.
*/
@@ -152,6 +149,41 @@ static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
{
}
+static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
+{
+ struct rtable *rt = (struct rtable *) dst;
+ struct inet_peer *peer;
+ u32 *p = NULL;
+
+ if (!rt->peer)
+ rt_bind_peer(rt, 1);
+
+ peer = rt->peer;
+ if (peer) {
+ u32 *old_p = __DST_METRICS_PTR(old);
+ unsigned long prev, new;
+
+ p = peer->metrics;
+ if (inet_metrics_new(peer))
+ memcpy(p, old_p, sizeof(u32) * RTAX_MAX);
+
+ new = (unsigned long) p;
+ prev = cmpxchg(&dst->_metrics, old, new);
+
+ if (prev != old) {
+ p = __DST_METRICS_PTR(prev);
+ if (prev & DST_METRICS_READ_ONLY)
+ p = NULL;
+ } else {
+ if (rt->fi) {
+ fib_info_put(rt->fi);
+ rt->fi = NULL;
+ }
+ }
+ }
+ return p;
+}
+
static struct dst_ops ipv4_dst_ops = {
.family = AF_INET,
.protocol = cpu_to_be16(ETH_P_IP),
@@ -159,6 +191,7 @@ static struct dst_ops ipv4_dst_ops = {
.check = ipv4_dst_check,
.default_advmss = ipv4_default_advmss,
.default_mtu = ipv4_default_mtu,
+ .cow_metrics = ipv4_cow_metrics,
.destroy = ipv4_dst_destroy,
.ifdown = ipv4_dst_ifdown,
.negative_advice = ipv4_negative_advice,
@@ -514,7 +547,7 @@ static const struct file_operations rt_cpu_seq_fops = {
.release = seq_release,
};
-#ifdef CONFIG_NET_CLS_ROUTE
+#ifdef CONFIG_IP_ROUTE_CLASSID
static int rt_acct_proc_show(struct seq_file *m, void *v)
{
struct ip_rt_acct *dst, *src;
@@ -567,14 +600,14 @@ static int __net_init ip_rt_do_proc_init(struct net *net)
if (!pde)
goto err2;
-#ifdef CONFIG_NET_CLS_ROUTE
+#ifdef CONFIG_IP_ROUTE_CLASSID
pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
if (!pde)
goto err3;
#endif
return 0;
-#ifdef CONFIG_NET_CLS_ROUTE
+#ifdef CONFIG_IP_ROUTE_CLASSID
err3:
remove_proc_entry("rt_cache", net->proc_net_stat);
#endif
@@ -588,7 +621,7 @@ static void __net_exit ip_rt_do_proc_exit(struct net *net)
{
remove_proc_entry("rt_cache", net->proc_net_stat);
remove_proc_entry("rt_cache", net->proc_net);
-#ifdef CONFIG_NET_CLS_ROUTE
+#ifdef CONFIG_IP_ROUTE_CLASSID
remove_proc_entry("rt_acct", net->proc_net);
#endif
}
@@ -632,7 +665,7 @@ static inline int rt_fast_clean(struct rtable *rth)
static inline int rt_valuable(struct rtable *rth)
{
return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
- rth->dst.expires;
+ (rth->peer && rth->peer->pmtu_expires);
}
static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
@@ -643,13 +676,7 @@ static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long t
if (atomic_read(&rth->dst.__refcnt))
goto out;
- ret = 1;
- if (rth->dst.expires &&
- time_after_eq(jiffies, rth->dst.expires))
- goto out;
-
age = jiffies - rth->dst.lastuse;
- ret = 0;
if ((age <= tmo1 && !rt_fast_clean(rth)) ||
(age <= tmo2 && rt_valuable(rth)))
goto out;
@@ -793,97 +820,6 @@ static int has_noalias(const struct rtable *head, const struct rtable *rth)
return ONE;
}
-static void rt_check_expire(void)
-{
- static unsigned int rover;
- unsigned int i = rover, goal;
- struct rtable *rth;
- struct rtable __rcu **rthp;
- unsigned long samples = 0;
- unsigned long sum = 0, sum2 = 0;
- unsigned long delta;
- u64 mult;
-
- delta = jiffies - expires_ljiffies;
- expires_ljiffies = jiffies;
- mult = ((u64)delta) << rt_hash_log;
- if (ip_rt_gc_timeout > 1)
- do_div(mult, ip_rt_gc_timeout);
- goal = (unsigned int)mult;
- if (goal > rt_hash_mask)
- goal = rt_hash_mask + 1;
- for (; goal > 0; goal--) {
- unsigned long tmo = ip_rt_gc_timeout;
- unsigned long length;
-
- i = (i + 1) & rt_hash_mask;
- rthp = &rt_hash_table[i].chain;
-
- if (need_resched())
- cond_resched();
-
- samples++;
-
- if (rcu_dereference_raw(*rthp) == NULL)
- continue;
- length = 0;
- spin_lock_bh(rt_hash_lock_addr(i));
- while ((rth = rcu_dereference_protected(*rthp,
- lockdep_is_held(rt_hash_lock_addr(i)))) != NULL) {
- prefetch(rth->dst.rt_next);
- if (rt_is_expired(rth)) {
- *rthp = rth->dst.rt_next;
- rt_free(rth);
- continue;
- }
- if (rth->dst.expires) {
- /* Entry is expired even if it is in use */
- if (time_before_eq(jiffies, rth->dst.expires)) {
-nofree:
- tmo >>= 1;
- rthp = &rth->dst.rt_next;
- /*
- * We only count entries on
- * a chain with equal hash inputs once
- * so that entries for different QOS
- * levels, and other non-hash input
- * attributes don't unfairly skew
- * the length computation
- */
- length += has_noalias(rt_hash_table[i].chain, rth);
- continue;
- }
- } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout))
- goto nofree;
-
- /* Cleanup aged off entries. */
- *rthp = rth->dst.rt_next;
- rt_free(rth);
- }
- spin_unlock_bh(rt_hash_lock_addr(i));
- sum += length;
- sum2 += length*length;
- }
- if (samples) {
- unsigned long avg = sum / samples;
- unsigned long sd = int_sqrt(sum2 / samples - avg*avg);
- rt_chain_length_max = max_t(unsigned long,
- ip_rt_gc_elasticity,
- (avg + 4*sd) >> FRACT_BITS);
- }
- rover = i;
-}
-
-/*
- * rt_worker_func() is run in process context.
- * we call rt_check_expire() to scan part of the hash table
- */
-static void rt_worker_func(struct work_struct *work)
-{
- rt_check_expire();
- schedule_delayed_work(&expires_work, ip_rt_gc_interval);
-}
-
/*
* Pertubation of rt_genid by a small quantity [1..256]
* Using 8 bits of shuffling ensure we can call rt_cache_invalidate()
@@ -1272,6 +1208,13 @@ skip_hashing:
return 0;
}
+static atomic_t __rt_peer_genid = ATOMIC_INIT(0);
+
+static u32 rt_peer_genid(void)
+{
+ return atomic_read(&__rt_peer_genid);
+}
+
void rt_bind_peer(struct rtable *rt, int create)
{
struct inet_peer *peer;
@@ -1280,6 +1223,8 @@ void rt_bind_peer(struct rtable *rt, int create)
if (peer && cmpxchg(&rt->peer, NULL, peer) != NULL)
inet_putpeer(peer);
+ else
+ rt->rt_peer_genid = rt_peer_genid();
}
/*
@@ -1349,13 +1294,8 @@ static void rt_del(unsigned hash, struct rtable *rt)
void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
__be32 saddr, struct net_device *dev)
{
- int i, k;
struct in_device *in_dev = __in_dev_get_rcu(dev);
- struct rtable *rth;
- struct rtable __rcu **rthp;
- __be32 skeys[2] = { saddr, 0 };
- int ikeys[2] = { dev->ifindex, 0 };
- struct netevent_redirect netevent;
+ struct inet_peer *peer;
struct net *net;
if (!in_dev)
@@ -1367,9 +1307,6 @@ void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
ipv4_is_zeronet(new_gw))
goto reject_redirect;
- if (!rt_caching(net))
- goto reject_redirect;
-
if (!IN_DEV_SHARED_MEDIA(in_dev)) {
if (!inet_addr_onlink(in_dev, new_gw, old_gw))
goto reject_redirect;
@@ -1380,91 +1317,13 @@ void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
goto reject_redirect;
}
- for (i = 0; i < 2; i++) {
- for (k = 0; k < 2; k++) {
- unsigned hash = rt_hash(daddr, skeys[i], ikeys[k],
- rt_genid(net));
-
- rthp = &rt_hash_table[hash].chain;
-
- while ((rth = rcu_dereference(*rthp)) != NULL) {
- struct rtable *rt;
-
- if (rth->fl.fl4_dst != daddr ||
- rth->fl.fl4_src != skeys[i] ||
- rth->fl.oif != ikeys[k] ||
- rt_is_input_route(rth) ||
- rt_is_expired(rth) ||
- !net_eq(dev_net(rth->dst.dev), net)) {
- rthp = &rth->dst.rt_next;
- continue;
- }
-
- if (rth->rt_dst != daddr ||
- rth->rt_src != saddr ||
- rth->dst.error ||
- rth->rt_gateway != old_gw ||
- rth->dst.dev != dev)
- break;
-
- dst_hold(&rth->dst);
-
- rt = dst_alloc(&ipv4_dst_ops);
- if (rt == NULL) {
- ip_rt_put(rth);
- return;
- }
-
- /* Copy all the information. */
- *rt = *rth;
- rt->dst.__use = 1;
- atomic_set(&rt->dst.__refcnt, 1);
- rt->dst.child = NULL;
- if (rt->dst.dev)
- dev_hold(rt->dst.dev);
- rt->dst.obsolete = -1;
- rt->dst.lastuse = jiffies;
- rt->dst.path = &rt->dst;
- rt->dst.neighbour = NULL;
- rt->dst.hh = NULL;
-#ifdef CONFIG_XFRM
- rt->dst.xfrm = NULL;
-#endif
- rt->rt_genid = rt_genid(net);
- rt->rt_flags |= RTCF_REDIRECTED;
-
- /* Gateway is different ... */
- rt->rt_gateway = new_gw;
-
- /* Redirect received -> path was valid */
- dst_confirm(&rth->dst);
-
- if (rt->peer)
- atomic_inc(&rt->peer->refcnt);
-
- if (arp_bind_neighbour(&rt->dst) ||
- !(rt->dst.neighbour->nud_state &
- NUD_VALID)) {
- if (rt->dst.neighbour)
- neigh_event_send(rt->dst.neighbour, NULL);
- ip_rt_put(rth);
- rt_drop(rt);
- goto do_next;
- }
+ peer = inet_getpeer_v4(daddr, 1);
+ if (peer) {
+ peer->redirect_learned.a4 = new_gw;
- netevent.old = &rth->dst;
- netevent.new = &rt->dst;
- call_netevent_notifiers(NETEVENT_REDIRECT,
- &netevent);
+ inet_putpeer(peer);
- rt_del(hash, rth);
- if (!rt_intern_hash(hash, rt, &rt, NULL, rt->fl.oif))
- ip_rt_put(rt);
- goto do_next;
- }
- do_next:
- ;
- }
+ atomic_inc(&__rt_peer_genid);
}
return;
@@ -1488,9 +1347,7 @@ static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
if (dst->obsolete > 0) {
ip_rt_put(rt);
ret = NULL;
- } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
- (rt->dst.expires &&
- time_after_eq(jiffies, rt->dst.expires))) {
+ } else if (rt->rt_flags & RTCF_REDIRECTED) {
unsigned hash = rt_hash(rt->fl.fl4_dst, rt->fl.fl4_src,
rt->fl.oif,
rt_genid(dev_net(dst->dev)));
@@ -1500,6 +1357,14 @@ static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
#endif
rt_del(hash, rt);
ret = NULL;
+ } else if (rt->peer &&
+ rt->peer->pmtu_expires &&
+ time_after_eq(jiffies, rt->peer->pmtu_expires)) {
+ unsigned long orig = rt->peer->pmtu_expires;
+
+ if (cmpxchg(&rt->peer->pmtu_expires, orig, 0) == orig)
+ dst_metric_set(dst, RTAX_MTU,
+ rt->peer->pmtu_orig);
}
}
return ret;
@@ -1525,6 +1390,7 @@ void ip_rt_send_redirect(struct sk_buff *skb)
{
struct rtable *rt = skb_rtable(skb);
struct in_device *in_dev;
+ struct inet_peer *peer;
int log_martians;
rcu_read_lock();
@@ -1536,33 +1402,41 @@ void ip_rt_send_redirect(struct sk_buff *skb)
log_martians = IN_DEV_LOG_MARTIANS(in_dev);
rcu_read_unlock();
+ if (!rt->peer)
+ rt_bind_peer(rt, 1);
+ peer = rt->peer;
+ if (!peer) {
+ icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
+ return;
+ }
+
/* No redirected packets during ip_rt_redirect_silence;
* reset the algorithm.
*/
- if (time_after(jiffies, rt->dst.rate_last + ip_rt_redirect_silence))
- rt->dst.rate_tokens = 0;
+ if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence))
+ peer->rate_tokens = 0;
/* Too many ignored redirects; do not send anything
* set dst.rate_last to the last seen redirected packet.
*/
- if (rt->dst.rate_tokens >= ip_rt_redirect_number) {
- rt->dst.rate_last = jiffies;
+ if (peer->rate_tokens >= ip_rt_redirect_number) {
+ peer->rate_last = jiffies;
return;
}
/* Check for load limit; set rate_last to the latest sent
* redirect.
*/
- if (rt->dst.rate_tokens == 0 ||
+ if (peer->rate_tokens == 0 ||
time_after(jiffies,
- (rt->dst.rate_last +
- (ip_rt_redirect_load << rt->dst.rate_tokens)))) {
+ (peer->rate_last +
+ (ip_rt_redirect_load << peer->rate_tokens)))) {
icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
- rt->dst.rate_last = jiffies;
- ++rt->dst.rate_tokens;
+ peer->rate_last = jiffies;
+ ++peer->rate_tokens;
#ifdef CONFIG_IP_ROUTE_VERBOSE
if (log_martians &&
- rt->dst.rate_tokens == ip_rt_redirect_number &&
+ peer->rate_tokens == ip_rt_redirect_number &&
net_ratelimit())
printk(KERN_WARNING "host %pI4/if%d ignores redirects for %pI4 to %pI4.\n",
&rt->rt_src, rt->rt_iif,
@@ -1574,7 +1448,9 @@ void ip_rt_send_redirect(struct sk_buff *skb)
static int ip_error(struct sk_buff *skb)
{
struct rtable *rt = skb_rtable(skb);
+ struct inet_peer *peer;
unsigned long now;
+ bool send;
int code;
switch (rt->dst.error) {
@@ -1594,15 +1470,24 @@ static int ip_error(struct sk_buff *skb)
break;
}
- now = jiffies;
- rt->dst.rate_tokens += now - rt->dst.rate_last;
- if (rt->dst.rate_tokens > ip_rt_error_burst)
- rt->dst.rate_tokens = ip_rt_error_burst;
- rt->dst.rate_last = now;
- if (rt->dst.rate_tokens >= ip_rt_error_cost) {
- rt->dst.rate_tokens -= ip_rt_error_cost;
- icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
+ if (!rt->peer)
+ rt_bind_peer(rt, 1);
+ peer = rt->peer;
+
+ send = true;
+ if (peer) {
+ now = jiffies;
+ peer->rate_tokens += now - peer->rate_last;
+ if (peer->rate_tokens > ip_rt_error_burst)
+ peer->rate_tokens = ip_rt_error_burst;
+ peer->rate_last = now;
+ if (peer->rate_tokens >= ip_rt_error_cost)
+ peer->rate_tokens -= ip_rt_error_cost;
+ else
+ send = false;
}
+ if (send)
+ icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
out: kfree_skb(skb);
return 0;
@@ -1630,88 +1515,130 @@ unsigned short ip_rt_frag_needed(struct net *net, struct iphdr *iph,
unsigned short new_mtu,
struct net_device *dev)
{
- int i, k;
unsigned short old_mtu = ntohs(iph->tot_len);
- struct rtable *rth;
- int ikeys[2] = { dev->ifindex, 0 };
- __be32 skeys[2] = { iph->saddr, 0, };
- __be32 daddr = iph->daddr;
unsigned short est_mtu = 0;
+ struct inet_peer *peer;
- for (k = 0; k < 2; k++) {
- for (i = 0; i < 2; i++) {
- unsigned hash = rt_hash(daddr, skeys[i], ikeys[k],
- rt_genid(net));
-
- rcu_read_lock();
- for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
- rth = rcu_dereference(rth->dst.rt_next)) {
- unsigned short mtu = new_mtu;
-
- if (rth->fl.fl4_dst != daddr ||
- rth->fl.fl4_src != skeys[i] ||
- rth->rt_dst != daddr ||
- rth->rt_src != iph->saddr ||
- rth->fl.oif != ikeys[k] ||
- rt_is_input_route(rth) ||
- dst_metric_locked(&rth->dst, RTAX_MTU) ||
- !net_eq(dev_net(rth->dst.dev), net) ||
- rt_is_expired(rth))
- continue;
-
- if (new_mtu < 68 || new_mtu >= old_mtu) {
+ peer = inet_getpeer_v4(iph->daddr, 1);
+ if (peer) {
+ unsigned short mtu = new_mtu;
- /* BSD 4.2 compatibility hack :-( */
- if (mtu == 0 &&
- old_mtu >= dst_mtu(&rth->dst) &&
- old_mtu >= 68 + (iph->ihl << 2))
- old_mtu -= iph->ihl << 2;
+ if (new_mtu < 68 || new_mtu >= old_mtu) {
+ /* BSD 4.2 derived systems incorrectly adjust
+ * tot_len by the IP header length, and report
+ * a zero MTU in the ICMP message.
+ */
+ if (mtu == 0 &&
+ old_mtu >= 68 + (iph->ihl << 2))
+ old_mtu -= iph->ihl << 2;
+ mtu = guess_mtu(old_mtu);
+ }
- mtu = guess_mtu(old_mtu);
- }
- if (mtu <= dst_mtu(&rth->dst)) {
- if (mtu < dst_mtu(&rth->dst)) {
- dst_confirm(&rth->dst);
- if (mtu < ip_rt_min_pmtu) {
- u32 lock = dst_metric(&rth->dst,
- RTAX_LOCK);
- mtu = ip_rt_min_pmtu;
- lock |= (1 << RTAX_MTU);
- dst_metric_set(&rth->dst, RTAX_LOCK,
- lock);
- }
- dst_metric_set(&rth->dst, RTAX_MTU, mtu);
- dst_set_expires(&rth->dst,
- ip_rt_mtu_expires);
- }
- est_mtu = mtu;
- }
- }
- rcu_read_unlock();
+ if (mtu < ip_rt_min_pmtu)
+ mtu = ip_rt_min_pmtu;
+ if (!peer->pmtu_expires || mtu < peer->pmtu_learned) {
+ est_mtu = mtu;
+ peer->pmtu_learned = mtu;
+ peer->pmtu_expires = jiffies + ip_rt_mtu_expires;
}
+
+ inet_putpeer(peer);
+
+ atomic_inc(&__rt_peer_genid);
}
return est_mtu ? : new_mtu;
}
+static void check_peer_pmtu(struct dst_entry *dst, struct inet_peer *peer)
+{
+ unsigned long expires = peer->pmtu_expires;
+
+ if (time_before(expires, jiffies)) {
+ u32 orig_dst_mtu = dst_mtu(dst);
+ if (peer->pmtu_learned < orig_dst_mtu) {
+ if (!peer->pmtu_orig)
+ peer->pmtu_orig = dst_metric_raw(dst, RTAX_MTU);
+ dst_metric_set(dst, RTAX_MTU, peer->pmtu_learned);
+ }
+ } else if (cmpxchg(&peer->pmtu_expires, expires, 0) == expires)
+ dst_metric_set(dst, RTAX_MTU, peer->pmtu_orig);
+}
+
static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
{
- if (dst_mtu(dst) > mtu && mtu >= 68 &&
- !(dst_metric_locked(dst, RTAX_MTU))) {
- if (mtu < ip_rt_min_pmtu) {
- u32 lock = dst_metric(dst, RTAX_LOCK);
+ struct rtable *rt = (struct rtable *) dst;
+ struct inet_peer *peer;
+
+ dst_confirm(dst);
+
+ if (!rt->peer)
+ rt_bind_peer(rt, 1);
+ peer = rt->peer;
+ if (peer) {
+ if (mtu < ip_rt_min_pmtu)
mtu = ip_rt_min_pmtu;
- dst_metric_set(dst, RTAX_LOCK, lock | (1 << RTAX_MTU));
+ if (!peer->pmtu_expires || mtu < peer->pmtu_learned) {
+ peer->pmtu_learned = mtu;
+ peer->pmtu_expires = jiffies + ip_rt_mtu_expires;
+
+ atomic_inc(&__rt_peer_genid);
+ rt->rt_peer_genid = rt_peer_genid();
+
+ check_peer_pmtu(dst, peer);
}
- dst_metric_set(dst, RTAX_MTU, mtu);
- dst_set_expires(dst, ip_rt_mtu_expires);
- call_netevent_notifiers(NETEVENT_PMTU_UPDATE, dst);
+ inet_putpeer(peer);
+ }
+}
+
+static int check_peer_redir(struct dst_entry *dst, struct inet_peer *peer)
+{
+ struct rtable *rt = (struct rtable *) dst;
+ __be32 orig_gw = rt->rt_gateway;
+
+ dst_confirm(&rt->dst);
+
+ neigh_release(rt->dst.neighbour);
+ rt->dst.neighbour = NULL;
+
+ rt->rt_gateway = peer->redirect_learned.a4;
+ if (arp_bind_neighbour(&rt->dst) ||
+ !(rt->dst.neighbour->nud_state & NUD_VALID)) {
+ if (rt->dst.neighbour)
+ neigh_event_send(rt->dst.neighbour, NULL);
+ rt->rt_gateway = orig_gw;
+ return -EAGAIN;
+ } else {
+ rt->rt_flags |= RTCF_REDIRECTED;
+ call_netevent_notifiers(NETEVENT_NEIGH_UPDATE,
+ rt->dst.neighbour);
}
+ return 0;
}
static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
{
- if (rt_is_expired((struct rtable *)dst))
+ struct rtable *rt = (struct rtable *) dst;
+
+ if (rt_is_expired(rt))
return NULL;
+ if (rt->rt_peer_genid != rt_peer_genid()) {
+ struct inet_peer *peer;
+
+ if (!rt->peer)
+ rt_bind_peer(rt, 0);
+
+ peer = rt->peer;
+ if (peer && peer->pmtu_expires)
+ check_peer_pmtu(dst, peer);
+
+ if (peer && peer->redirect_learned.a4 &&
+ peer->redirect_learned.a4 != rt->rt_gateway) {
+ if (check_peer_redir(dst, peer))
+ return NULL;
+ }
+
+ rt->rt_peer_genid = rt_peer_genid();
+ }
return dst;
}
@@ -1720,6 +1647,10 @@ static void ipv4_dst_destroy(struct dst_entry *dst)
struct rtable *rt = (struct rtable *) dst;
struct inet_peer *peer = rt->peer;
+ if (rt->fi) {
+ fib_info_put(rt->fi);
+ rt->fi = NULL;
+ }
if (peer) {
rt->peer = NULL;
inet_putpeer(peer);
@@ -1734,8 +1665,14 @@ static void ipv4_link_failure(struct sk_buff *skb)
icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
rt = skb_rtable(skb);
- if (rt)
- dst_set_expires(&rt->dst, 0);
+ if (rt &&
+ rt->peer &&
+ rt->peer->pmtu_expires) {
+ unsigned long orig = rt->peer->pmtu_expires;
+
+ if (cmpxchg(&rt->peer->pmtu_expires, orig, 0) == orig)
+ dst_metric_set(&rt->dst, RTAX_MTU, rt->peer->pmtu_orig);
+ }
}
static int ip_rt_bug(struct sk_buff *skb)
@@ -1775,7 +1712,7 @@ void ip_rt_get_source(u8 *addr, struct rtable *rt)
memcpy(addr, &src, 4);
}
-#ifdef CONFIG_NET_CLS_ROUTE
+#ifdef CONFIG_IP_ROUTE_CLASSID
static void set_class_tag(struct rtable *rt, u32 tag)
{
if (!(rt->dst.tclassid & 0xFFFF))
@@ -1815,17 +1752,52 @@ static unsigned int ipv4_default_mtu(const struct dst_entry *dst)
return mtu;
}
-static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag)
+static void rt_init_metrics(struct rtable *rt, struct fib_info *fi)
+{
+ struct inet_peer *peer;
+ int create = 0;
+
+ /* If a peer entry exists for this destination, we must hook
+ * it up in order to get at cached metrics.
+ */
+ if (rt->fl.flags & FLOWI_FLAG_PRECOW_METRICS)
+ create = 1;
+
+ rt_bind_peer(rt, create);
+ peer = rt->peer;
+ if (peer) {
+ if (inet_metrics_new(peer))
+ memcpy(peer->metrics, fi->fib_metrics,
+ sizeof(u32) * RTAX_MAX);
+ dst_init_metrics(&rt->dst, peer->metrics, false);
+
+ if (peer->pmtu_expires)
+ check_peer_pmtu(&rt->dst, peer);
+ if (peer->redirect_learned.a4 &&
+ peer->redirect_learned.a4 != rt->rt_gateway) {
+ rt->rt_gateway = peer->redirect_learned.a4;
+ rt->rt_flags |= RTCF_REDIRECTED;
+ }
+ } else {
+ if (fi->fib_metrics != (u32 *) dst_default_metrics) {
+ rt->fi = fi;
+ atomic_inc(&fi->fib_clntref);
+ }
+ dst_init_metrics(&rt->dst, fi->fib_metrics, true);
+ }
+}
+
+static void rt_set_nexthop(struct rtable *rt, const struct fib_result *res,
+ struct fib_info *fi, u16 type, u32 itag)
{
struct dst_entry *dst = &rt->dst;
- struct fib_info *fi = res->fi;
if (fi) {
if (FIB_RES_GW(*res) &&
FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
rt->rt_gateway = FIB_RES_GW(*res);
- dst_import_metrics(dst, fi->fib_metrics);
-#ifdef CONFIG_NET_CLS_ROUTE
+ rt_init_metrics(rt, fi);
+#ifdef CONFIG_IP_ROUTE_CLASSID
dst->tclassid = FIB_RES_NH(*res).nh_tclassid;
#endif
}
@@ -1835,13 +1807,26 @@ static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag)
if (dst_metric_raw(dst, RTAX_ADVMSS) > 65535 - 40)
dst_metric_set(dst, RTAX_ADVMSS, 65535 - 40);
-#ifdef CONFIG_NET_CLS_ROUTE
+#ifdef CONFIG_IP_ROUTE_CLASSID
#ifdef CONFIG_IP_MULTIPLE_TABLES
set_class_tag(rt, fib_rules_tclass(res));
#endif
set_class_tag(rt, itag);
#endif
- rt->rt_type = res->type;
+ rt->rt_type = type;
+}
+
+static struct rtable *rt_dst_alloc(bool nopolicy, bool noxfrm)
+{
+ struct rtable *rt = dst_alloc(&ipv4_dst_ops, 1);
+ if (rt) {
+ rt->dst.obsolete = -1;
+
+ rt->dst.flags = DST_HOST |
+ (nopolicy ? DST_NOPOLICY : 0) |
+ (noxfrm ? DST_NOXFRM : 0);
+ }
+ return rt;
}
/* called in rcu_read_lock() section */
@@ -1874,24 +1859,19 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
if (err < 0)
goto e_err;
}
- rth = dst_alloc(&ipv4_dst_ops);
+ rth = rt_dst_alloc(IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
if (!rth)
goto e_nobufs;
rth->dst.output = ip_rt_bug;
- rth->dst.obsolete = -1;
- atomic_set(&rth->dst.__refcnt, 1);
- rth->dst.flags= DST_HOST;
- if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
- rth->dst.flags |= DST_NOPOLICY;
rth->fl.fl4_dst = daddr;
rth->rt_dst = daddr;
rth->fl.fl4_tos = tos;
rth->fl.mark = skb->mark;
rth->fl.fl4_src = saddr;
rth->rt_src = saddr;
-#ifdef CONFIG_NET_CLS_ROUTE
+#ifdef CONFIG_IP_ROUTE_CLASSID
rth->dst.tclassid = itag;
#endif
rth->rt_iif =
@@ -1959,7 +1939,7 @@ static void ip_handle_martian_source(struct net_device *dev,
/* called in rcu_read_lock() section */
static int __mkroute_input(struct sk_buff *skb,
- struct fib_result *res,
+ const struct fib_result *res,
struct in_device *in_dev,
__be32 daddr, __be32 saddr, u32 tos,
struct rtable **result)
@@ -2013,19 +1993,13 @@ static int __mkroute_input(struct sk_buff *skb,
}
}
-
- rth = dst_alloc(&ipv4_dst_ops);
+ rth = rt_dst_alloc(IN_DEV_CONF_GET(in_dev, NOPOLICY),
+ IN_DEV_CONF_GET(out_dev, NOXFRM));
if (!rth) {
err = -ENOBUFS;
goto cleanup;
}
- atomic_set(&rth->dst.__refcnt, 1);
- rth->dst.flags= DST_HOST;
- if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
- rth->dst.flags |= DST_NOPOLICY;
- if (IN_DEV_CONF_GET(out_dev, NOXFRM))
- rth->dst.flags |= DST_NOXFRM;
rth->fl.fl4_dst = daddr;
rth->rt_dst = daddr;
rth->fl.fl4_tos = tos;
@@ -2040,12 +2014,11 @@ static int __mkroute_input(struct sk_buff *skb,
rth->fl.oif = 0;
rth->rt_spec_dst= spec_dst;
- rth->dst.obsolete = -1;
rth->dst.input = ip_forward;
rth->dst.output = ip_output;
rth->rt_genid = rt_genid(dev_net(rth->dst.dev));
- rt_set_nexthop(rth, res, itag);
+ rt_set_nexthop(rth, res, res->fi, res->type, itag);
rth->rt_flags = flags;
@@ -2190,25 +2163,20 @@ brd_input:
RT_CACHE_STAT_INC(in_brd);
local_input:
- rth = dst_alloc(&ipv4_dst_ops);
+ rth = rt_dst_alloc(IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
if (!rth)
goto e_nobufs;
rth->dst.output= ip_rt_bug;
- rth->dst.obsolete = -1;
rth->rt_genid = rt_genid(net);
- atomic_set(&rth->dst.__refcnt, 1);
- rth->dst.flags= DST_HOST;
- if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
- rth->dst.flags |= DST_NOPOLICY;
rth->fl.fl4_dst = daddr;
rth->rt_dst = daddr;
rth->fl.fl4_tos = tos;
rth->fl.mark = skb->mark;
rth->fl.fl4_src = saddr;
rth->rt_src = saddr;
-#ifdef CONFIG_NET_CLS_ROUTE
+#ifdef CONFIG_IP_ROUTE_CLASSID
rth->dst.tclassid = itag;
#endif
rth->rt_iif =
@@ -2351,38 +2319,39 @@ skip_cache:
EXPORT_SYMBOL(ip_route_input_common);
/* called with rcu_read_lock() */
-static int __mkroute_output(struct rtable **result,
- struct fib_result *res,
- const struct flowi *fl,
- const struct flowi *oldflp,
- struct net_device *dev_out,
- unsigned flags)
+static struct rtable *__mkroute_output(const struct fib_result *res,
+ const struct flowi *fl,
+ const struct flowi *oldflp,
+ struct net_device *dev_out,
+ unsigned int flags)
{
- struct rtable *rth;
- struct in_device *in_dev;
+ struct fib_info *fi = res->fi;
u32 tos = RT_FL_TOS(oldflp);
+ struct in_device *in_dev;
+ u16 type = res->type;
+ struct rtable *rth;
if (ipv4_is_loopback(fl->fl4_src) && !(dev_out->flags & IFF_LOOPBACK))
- return -EINVAL;
+ return ERR_PTR(-EINVAL);
if (ipv4_is_lbcast(fl->fl4_dst))
- res->type = RTN_BROADCAST;
+ type = RTN_BROADCAST;
else if (ipv4_is_multicast(fl->fl4_dst))
- res->type = RTN_MULTICAST;
+ type = RTN_MULTICAST;
else if (ipv4_is_zeronet(fl->fl4_dst))
- return -EINVAL;
+ return ERR_PTR(-EINVAL);
if (dev_out->flags & IFF_LOOPBACK)
flags |= RTCF_LOCAL;
in_dev = __in_dev_get_rcu(dev_out);
if (!in_dev)
- return -EINVAL;
+ return ERR_PTR(-EINVAL);
- if (res->type == RTN_BROADCAST) {
+ if (type == RTN_BROADCAST) {
flags |= RTCF_BROADCAST | RTCF_LOCAL;
- res->fi = NULL;
- } else if (res->type == RTN_MULTICAST) {
+ fi = NULL;
+ } else if (type == RTN_MULTICAST) {
flags |= RTCF_MULTICAST | RTCF_LOCAL;
if (!ip_check_mc(in_dev, oldflp->fl4_dst, oldflp->fl4_src,
oldflp->proto))
@@ -2391,21 +2360,14 @@ static int __mkroute_output(struct rtable **result,
* default one, but do not gateway in this case.
* Yes, it is hack.
*/
- if (res->fi && res->prefixlen < 4)
- res->fi = NULL;
+ if (fi && res->prefixlen < 4)
+ fi = NULL;
}
-
- rth = dst_alloc(&ipv4_dst_ops);
+ rth = rt_dst_alloc(IN_DEV_CONF_GET(in_dev, NOPOLICY),
+ IN_DEV_CONF_GET(in_dev, NOXFRM));
if (!rth)
- return -ENOBUFS;
-
- atomic_set(&rth->dst.__refcnt, 1);
- rth->dst.flags= DST_HOST;
- if (IN_DEV_CONF_GET(in_dev, NOXFRM))
- rth->dst.flags |= DST_NOXFRM;
- if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
- rth->dst.flags |= DST_NOPOLICY;
+ return ERR_PTR(-ENOBUFS);
rth->fl.fl4_dst = oldflp->fl4_dst;
rth->fl.fl4_tos = tos;
@@ -2423,7 +2385,6 @@ static int __mkroute_output(struct rtable **result,
rth->rt_spec_dst= fl->fl4_src;
rth->dst.output=ip_output;
- rth->dst.obsolete = -1;
rth->rt_genid = rt_genid(dev_net(dev_out));
RT_CACHE_STAT_INC(out_slow_tot);
@@ -2440,7 +2401,7 @@ static int __mkroute_output(struct rtable **result,
RT_CACHE_STAT_INC(out_slow_mc);
}
#ifdef CONFIG_IP_MROUTE
- if (res->type == RTN_MULTICAST) {
+ if (type == RTN_MULTICAST) {
if (IN_DEV_MFORWARD(in_dev) &&
!ipv4_is_local_multicast(oldflp->fl4_dst)) {
rth->dst.input = ip_mr_input;
@@ -2450,31 +2411,10 @@ static int __mkroute_output(struct rtable **result,
#endif
}
- rt_set_nexthop(rth, res, 0);
+ rt_set_nexthop(rth, res, fi, type, 0);
rth->rt_flags = flags;
- *result = rth;
- return 0;
-}
-
-/* called with rcu_read_lock() */
-static int ip_mkroute_output(struct rtable **rp,
- struct fib_result *res,
- const struct flowi *fl,
- const struct flowi *oldflp,
- struct net_device *dev_out,
- unsigned flags)
-{
- struct rtable *rth = NULL;
- int err = __mkroute_output(&rth, res, fl, oldflp, dev_out, flags);
- unsigned hash;
- if (err == 0) {
- hash = rt_hash(oldflp->fl4_dst, oldflp->fl4_src, oldflp->oif,
- rt_genid(dev_net(dev_out)));
- err = rt_intern_hash(hash, rth, rp, NULL, oldflp->oif);
- }
-
- return err;
+ return rth;
}
/*
@@ -2497,6 +2437,7 @@ static int ip_route_output_slow(struct net *net, struct rtable **rp,
struct fib_result res;
unsigned int flags = 0;
struct net_device *dev_out = NULL;
+ struct rtable *rth;
int err;
@@ -2505,6 +2446,7 @@ static int ip_route_output_slow(struct net *net, struct rtable **rp,
res.r = NULL;
#endif
+ rcu_read_lock();
if (oldflp->fl4_src) {
err = -EINVAL;
if (ipv4_is_multicast(oldflp->fl4_src) ||
@@ -2645,7 +2587,7 @@ static int ip_route_output_slow(struct net *net, struct rtable **rp,
else
#endif
if (!res.prefixlen && res.type == RTN_UNICAST && !fl.oif)
- fib_select_default(net, &fl, &res);
+ fib_select_default(&res);
if (!fl.fl4_src)
fl.fl4_src = FIB_RES_PREFSRC(res);
@@ -2655,17 +2597,27 @@ static int ip_route_output_slow(struct net *net, struct rtable **rp,
make_route:
- err = ip_mkroute_output(rp, &res, &fl, oldflp, dev_out, flags);
+ rth = __mkroute_output(&res, &fl, oldflp, dev_out, flags);
+ if (IS_ERR(rth))
+ err = PTR_ERR(rth);
+ else {
+ unsigned int hash;
-out: return err;
+ hash = rt_hash(oldflp->fl4_dst, oldflp->fl4_src, oldflp->oif,
+ rt_genid(dev_net(dev_out)));
+ err = rt_intern_hash(hash, rth, rp, NULL, oldflp->oif);
+ }
+
+out:
+ rcu_read_unlock();
+ return err;
}
int __ip_route_output_key(struct net *net, struct rtable **rp,
const struct flowi *flp)
{
- unsigned int hash;
- int res;
struct rtable *rth;
+ unsigned int hash;
if (!rt_caching(net))
goto slow_output;
@@ -2695,10 +2647,7 @@ int __ip_route_output_key(struct net *net, struct rtable **rp,
rcu_read_unlock_bh();
slow_output:
- rcu_read_lock();
- res = ip_route_output_slow(net, rp, flp);
- rcu_read_unlock();
- return res;
+ return ip_route_output_slow(net, rp, flp);
}
EXPORT_SYMBOL_GPL(__ip_route_output_key);
@@ -2707,6 +2656,11 @@ static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 coo
return NULL;
}
+static unsigned int ipv4_blackhole_default_mtu(const struct dst_entry *dst)
+{
+ return 0;
+}
+
static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
{
}
@@ -2716,6 +2670,8 @@ static struct dst_ops ipv4_dst_blackhole_ops = {
.protocol = cpu_to_be16(ETH_P_IP),
.destroy = ipv4_dst_destroy,
.check = ipv4_blackhole_dst_check,
+ .default_mtu = ipv4_blackhole_default_mtu,
+ .default_advmss = ipv4_default_advmss,
.update_pmtu = ipv4_rt_blackhole_update_pmtu,
};
@@ -2724,12 +2680,11 @@ static int ipv4_dst_blackhole(struct net *net, struct rtable **rp, struct flowi
{
struct rtable *ort = *rp;
struct rtable *rt = (struct rtable *)
- dst_alloc(&ipv4_dst_blackhole_ops);
+ dst_alloc(&ipv4_dst_blackhole_ops, 1);
if (rt) {
struct dst_entry *new = &rt->dst;
- atomic_set(&new->__refcnt, 1);
new->__use = 1;
new->input = dst_discard;
new->output = dst_discard;
@@ -2752,6 +2707,9 @@ static int ipv4_dst_blackhole(struct net *net, struct rtable **rp, struct flowi
rt->peer = ort->peer;
if (rt->peer)
atomic_inc(&rt->peer->refcnt);
+ rt->fi = ort->fi;
+ if (rt->fi)
+ atomic_inc(&rt->fi->fib_clntref);
dst_free(new);
}
@@ -2828,7 +2786,7 @@ static int rt_fill_info(struct net *net,
}
if (rt->dst.dev)
NLA_PUT_U32(skb, RTA_OIF, rt->dst.dev->ifindex);
-#ifdef CONFIG_NET_CLS_ROUTE
+#ifdef CONFIG_IP_ROUTE_CLASSID
if (rt->dst.tclassid)
NLA_PUT_U32(skb, RTA_FLOW, rt->dst.tclassid);
#endif
@@ -2847,7 +2805,8 @@ static int rt_fill_info(struct net *net,
NLA_PUT_BE32(skb, RTA_MARK, rt->fl.mark);
error = rt->dst.error;
- expires = rt->dst.expires ? rt->dst.expires - jiffies : 0;
+ expires = (rt->peer && rt->peer->pmtu_expires) ?
+ rt->peer->pmtu_expires - jiffies : 0;
if (rt->peer) {
inet_peer_refcheck(rt->peer);
id = atomic_read(&rt->peer->ip_id_count) & 0xffff;
@@ -3249,9 +3208,9 @@ static __net_initdata struct pernet_operations rt_genid_ops = {
};
-#ifdef CONFIG_NET_CLS_ROUTE
+#ifdef CONFIG_IP_ROUTE_CLASSID
struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
-#endif /* CONFIG_NET_CLS_ROUTE */
+#endif /* CONFIG_IP_ROUTE_CLASSID */
static __initdata unsigned long rhash_entries;
static int __init set_rhash_entries(char *str)
@@ -3267,7 +3226,7 @@ int __init ip_rt_init(void)
{
int rc = 0;
-#ifdef CONFIG_NET_CLS_ROUTE
+#ifdef CONFIG_IP_ROUTE_CLASSID
ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
if (!ip_rt_acct)
panic("IP: failed to allocate ip_rt_acct\n");
@@ -3304,14 +3263,6 @@ int __init ip_rt_init(void)
devinet_init();
ip_fib_init();
- /* All the timers, started at system startup tend
- to synchronize. Perturb it a bit.
- */
- INIT_DELAYED_WORK_DEFERRABLE(&expires_work, rt_worker_func);
- expires_ljiffies = jiffies;
- schedule_delayed_work(&expires_work,
- net_random() % ip_rt_gc_interval + ip_rt_gc_interval);
-
if (ip_rt_proc_init())
printk(KERN_ERR "Unable to create route proc files\n");
#ifdef CONFIG_XFRM
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 6c11eece262..a17a5a72b98 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -873,9 +873,7 @@ int tcp_sendpage(struct sock *sk, struct page *page, int offset,
flags);
lock_sock(sk);
- TCP_CHECK_TIMER(sk);
res = do_tcp_sendpages(sk, &page, offset, size, flags);
- TCP_CHECK_TIMER(sk);
release_sock(sk);
return res;
}
@@ -916,7 +914,6 @@ int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
long timeo;
lock_sock(sk);
- TCP_CHECK_TIMER(sk);
flags = msg->msg_flags;
timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
@@ -1104,7 +1101,6 @@ wait_for_memory:
out:
if (copied)
tcp_push(sk, flags, mss_now, tp->nonagle);
- TCP_CHECK_TIMER(sk);
release_sock(sk);
return copied;
@@ -1123,7 +1119,6 @@ do_error:
goto out;
out_err:
err = sk_stream_error(sk, flags, err);
- TCP_CHECK_TIMER(sk);
release_sock(sk);
return err;
}
@@ -1415,8 +1410,6 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
lock_sock(sk);
- TCP_CHECK_TIMER(sk);
-
err = -ENOTCONN;
if (sk->sk_state == TCP_LISTEN)
goto out;
@@ -1767,12 +1760,10 @@ skip_copy:
/* Clean up data we have read: This will do ACK frames. */
tcp_cleanup_rbuf(sk, copied);
- TCP_CHECK_TIMER(sk);
release_sock(sk);
return copied;
out:
- TCP_CHECK_TIMER(sk);
release_sock(sk);
return err;
@@ -2653,7 +2644,7 @@ int compat_tcp_getsockopt(struct sock *sk, int level, int optname,
EXPORT_SYMBOL(compat_tcp_getsockopt);
#endif
-struct sk_buff *tcp_tso_segment(struct sk_buff *skb, int features)
+struct sk_buff *tcp_tso_segment(struct sk_buff *skb, u32 features)
{
struct sk_buff *segs = ERR_PTR(-EINVAL);
struct tcphdr *th;
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 2549b29b062..2f692cefd3b 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -817,7 +817,7 @@ __u32 tcp_init_cwnd(struct tcp_sock *tp, struct dst_entry *dst)
__u32 cwnd = (dst ? dst_metric(dst, RTAX_INITCWND) : 0);
if (!cwnd)
- cwnd = rfc3390_bytes_to_packets(tp->mss_cache);
+ cwnd = TCP_INIT_CWND;
return min_t(__u32, cwnd, tp->snd_cwnd_clamp);
}
@@ -4399,7 +4399,7 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
if (!skb_copy_datagram_iovec(skb, 0, tp->ucopy.iov, chunk)) {
tp->ucopy.len -= chunk;
tp->copied_seq += chunk;
- eaten = (chunk == skb->len && !th->fin);
+ eaten = (chunk == skb->len);
tcp_rcv_space_adjust(sk);
}
local_bh_disable();
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 856f68466d4..ef5a90beb9b 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1341,7 +1341,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
tcp_death_row.sysctl_tw_recycle &&
(dst = inet_csk_route_req(sk, req)) != NULL &&
(peer = rt_get_peer((struct rtable *)dst)) != NULL &&
- peer->daddr.a4 == saddr) {
+ peer->daddr.addr.a4 == saddr) {
inet_peer_refcheck(peer);
if ((u32)get_seconds() - peer->tcp_ts_stamp < TCP_PAWS_MSL &&
(s32)(peer->tcp_ts - req->ts_recent) >
@@ -1556,12 +1556,10 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
sock_rps_save_rxhash(sk, skb->rxhash);
- TCP_CHECK_TIMER(sk);
if (tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len)) {
rsk = sk;
goto reset;
}
- TCP_CHECK_TIMER(sk);
return 0;
}
@@ -1583,13 +1581,10 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
} else
sock_rps_save_rxhash(sk, skb->rxhash);
-
- TCP_CHECK_TIMER(sk);
if (tcp_rcv_state_process(sk, skb, tcp_hdr(skb), skb->len)) {
rsk = sk;
goto reset;
}
- TCP_CHECK_TIMER(sk);
return 0;
reset:
@@ -1994,7 +1989,6 @@ static void *listening_get_next(struct seq_file *seq, void *cur)
}
req = req->dl_next;
}
- st->offset = 0;
if (++st->sbucket >= icsk->icsk_accept_queue.listen_opt->nr_table_entries)
break;
get_req:
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index 74a6aa00365..ecd44b0c45f 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -259,7 +259,6 @@ static void tcp_delack_timer(unsigned long data)
tcp_send_ack(sk);
NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_DELAYEDACKS);
}
- TCP_CHECK_TIMER(sk);
out:
if (tcp_memory_pressure)
@@ -481,7 +480,6 @@ static void tcp_write_timer(unsigned long data)
tcp_probe_timer(sk);
break;
}
- TCP_CHECK_TIMER(sk);
out:
sk_mem_reclaim(sk);
@@ -589,7 +587,6 @@ static void tcp_keepalive_timer (unsigned long data)
elapsed = keepalive_time_when(tp) - elapsed;
}
- TCP_CHECK_TIMER(sk);
sk_mem_reclaim(sk);
resched:
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 8157b17959e..d37baaa1dbe 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -2199,7 +2199,7 @@ int udp4_ufo_send_check(struct sk_buff *skb)
return 0;
}
-struct sk_buff *udp4_ufo_fragment(struct sk_buff *skb, int features)
+struct sk_buff *udp4_ufo_fragment(struct sk_buff *skb, u32 features)
{
struct sk_buff *segs = ERR_PTR(-EINVAL);
unsigned int mss;
diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c
index b057d40adde..19fbdec6baa 100644
--- a/net/ipv4/xfrm4_policy.c
+++ b/net/ipv4/xfrm4_policy.c
@@ -196,8 +196,11 @@ static void xfrm4_dst_destroy(struct dst_entry *dst)
{
struct xfrm_dst *xdst = (struct xfrm_dst *)dst;
+ dst_destroy_metrics_generic(dst);
+
if (likely(xdst->u.rt.peer))
inet_putpeer(xdst->u.rt.peer);
+
xfrm_dst_destroy(xdst);
}
@@ -215,6 +218,7 @@ static struct dst_ops xfrm4_dst_ops = {
.protocol = cpu_to_be16(ETH_P_IP),
.gc = xfrm4_garbage_collect,
.update_pmtu = xfrm4_update_pmtu,
+ .cow_metrics = dst_cow_metrics_generic,
.destroy = xfrm4_dst_destroy,
.ifdown = xfrm4_dst_ifdown,
.local_out = __ip_local_out,
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index 5b189c97c2f..fd6782e3a03 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -420,9 +420,6 @@ static struct inet6_dev * ipv6_add_dev(struct net_device *dev)
dev->type == ARPHRD_TUNNEL6 ||
dev->type == ARPHRD_SIT ||
dev->type == ARPHRD_NONE) {
- printk(KERN_INFO
- "%s: Disabled Privacy Extensions\n",
- dev->name);
ndev->cnf.use_tempaddr = -1;
} else {
in6_dev_hold(ndev);
@@ -2664,14 +2661,12 @@ static int addrconf_ifdown(struct net_device *dev, int how)
struct net *net = dev_net(dev);
struct inet6_dev *idev;
struct inet6_ifaddr *ifa;
- LIST_HEAD(keep_list);
- int state;
+ int state, i;
ASSERT_RTNL();
- /* Flush routes if device is being removed or it is not loopback */
- if (how || !(dev->flags & IFF_LOOPBACK))
- rt6_ifdown(net, dev);
+ rt6_ifdown(net, dev);
+ neigh_ifdown(&nd_tbl, dev);
idev = __in6_dev_get(dev);
if (idev == NULL)
@@ -2692,6 +2687,23 @@ static int addrconf_ifdown(struct net_device *dev, int how)
}
+ /* Step 2: clear hash table */
+ for (i = 0; i < IN6_ADDR_HSIZE; i++) {
+ struct hlist_head *h = &inet6_addr_lst[i];
+ struct hlist_node *n;
+
+ spin_lock_bh(&addrconf_hash_lock);
+ restart:
+ hlist_for_each_entry_rcu(ifa, n, h, addr_lst) {
+ if (ifa->idev == idev) {
+ hlist_del_init_rcu(&ifa->addr_lst);
+ addrconf_del_timer(ifa);
+ goto restart;
+ }
+ }
+ spin_unlock_bh(&addrconf_hash_lock);
+ }
+
write_lock_bh(&idev->lock);
/* Step 2: clear flags for stateless addrconf */
@@ -2725,52 +2737,23 @@ static int addrconf_ifdown(struct net_device *dev, int how)
struct inet6_ifaddr, if_list);
addrconf_del_timer(ifa);
- /* If just doing link down, and address is permanent
- and not link-local, then retain it. */
- if (!how &&
- (ifa->flags&IFA_F_PERMANENT) &&
- !(ipv6_addr_type(&ifa->addr) & IPV6_ADDR_LINKLOCAL)) {
- list_move_tail(&ifa->if_list, &keep_list);
-
- /* If not doing DAD on this address, just keep it. */
- if ((dev->flags&(IFF_NOARP|IFF_LOOPBACK)) ||
- idev->cnf.accept_dad <= 0 ||
- (ifa->flags & IFA_F_NODAD))
- continue;
+ list_del(&ifa->if_list);
- /* If it was tentative already, no need to notify */
- if (ifa->flags & IFA_F_TENTATIVE)
- continue;
+ write_unlock_bh(&idev->lock);
- /* Flag it for later restoration when link comes up */
- ifa->flags |= IFA_F_TENTATIVE;
- ifa->state = INET6_IFADDR_STATE_DAD;
- } else {
- list_del(&ifa->if_list);
-
- /* clear hash table */
- spin_lock_bh(&addrconf_hash_lock);
- hlist_del_init_rcu(&ifa->addr_lst);
- spin_unlock_bh(&addrconf_hash_lock);
-
- write_unlock_bh(&idev->lock);
- spin_lock_bh(&ifa->state_lock);
- state = ifa->state;
- ifa->state = INET6_IFADDR_STATE_DEAD;
- spin_unlock_bh(&ifa->state_lock);
-
- if (state != INET6_IFADDR_STATE_DEAD) {
- __ipv6_ifa_notify(RTM_DELADDR, ifa);
- atomic_notifier_call_chain(&inet6addr_chain,
- NETDEV_DOWN, ifa);
- }
+ spin_lock_bh(&ifa->state_lock);
+ state = ifa->state;
+ ifa->state = INET6_IFADDR_STATE_DEAD;
+ spin_unlock_bh(&ifa->state_lock);
- in6_ifa_put(ifa);
- write_lock_bh(&idev->lock);
+ if (state != INET6_IFADDR_STATE_DEAD) {
+ __ipv6_ifa_notify(RTM_DELADDR, ifa);
+ atomic_notifier_call_chain(&inet6addr_chain, NETDEV_DOWN, ifa);
}
- }
+ in6_ifa_put(ifa);
- list_splice(&keep_list, &idev->addr_list);
+ write_lock_bh(&idev->lock);
+ }
write_unlock_bh(&idev->lock);
@@ -4159,8 +4142,7 @@ static void __ipv6_ifa_notify(int event, struct inet6_ifaddr *ifp)
addrconf_leave_solict(ifp->idev, &ifp->addr);
dst_hold(&ifp->rt->dst);
- if (ifp->state == INET6_IFADDR_STATE_DEAD &&
- ip6_del_rt(ifp->rt))
+ if (ip6_del_rt(ifp->rt))
dst_free(&ifp->rt->dst);
break;
}
diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
index 978e80e2c4a..3194aa90987 100644
--- a/net/ipv6/af_inet6.c
+++ b/net/ipv6/af_inet6.c
@@ -772,7 +772,7 @@ out:
return err;
}
-static struct sk_buff *ipv6_gso_segment(struct sk_buff *skb, int features)
+static struct sk_buff *ipv6_gso_segment(struct sk_buff *skb, u32 features)
{
struct sk_buff *segs = ERR_PTR(-EINVAL);
struct ipv6hdr *ipv6h;
diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c
index 03e62f94ff8..a31d91b04c8 100644
--- a/net/ipv6/icmp.c
+++ b/net/ipv6/icmp.c
@@ -157,20 +157,20 @@ static int is_ineligible(struct sk_buff *skb)
/*
* Check the ICMP output rate limit
*/
-static inline int icmpv6_xrlim_allow(struct sock *sk, u8 type,
- struct flowi *fl)
+static inline bool icmpv6_xrlim_allow(struct sock *sk, u8 type,
+ struct flowi *fl)
{
struct dst_entry *dst;
struct net *net = sock_net(sk);
- int res = 0;
+ bool res = false;
/* Informational messages are not limited. */
if (type & ICMPV6_INFOMSG_MASK)
- return 1;
+ return true;
/* Do not limit pmtu discovery, it would break it. */
if (type == ICMPV6_PKT_TOOBIG)
- return 1;
+ return true;
/*
* Look up the output route.
@@ -182,7 +182,7 @@ static inline int icmpv6_xrlim_allow(struct sock *sk, u8 type,
IP6_INC_STATS(net, ip6_dst_idev(dst),
IPSTATS_MIB_OUTNOROUTES);
} else if (dst->dev && (dst->dev->flags&IFF_LOOPBACK)) {
- res = 1;
+ res = true;
} else {
struct rt6_info *rt = (struct rt6_info *)dst;
int tmo = net->ipv6.sysctl.icmpv6_time;
@@ -191,7 +191,9 @@ static inline int icmpv6_xrlim_allow(struct sock *sk, u8 type,
if (rt->rt6i_dst.plen < 128)
tmo >>= ((128 - rt->rt6i_dst.plen)>>5);
- res = xrlim_allow(dst, tmo);
+ if (!rt->rt6i_peer)
+ rt6_bind_peer(rt, 1);
+ res = inet_peer_xrlim_allow(rt->rt6i_peer, tmo);
}
dst_release(dst);
return res;
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index 5f8d242be3f..2600e228872 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -479,10 +479,13 @@ int ip6_forward(struct sk_buff *skb)
else
target = &hdr->daddr;
+ if (!rt->rt6i_peer)
+ rt6_bind_peer(rt, 1);
+
/* Limit redirects both by destination (here)
and by source (inside ndisc_send_redirect)
*/
- if (xrlim_allow(dst, 1*HZ))
+ if (inet_peer_xrlim_allow(rt->rt6i_peer, 1*HZ))
ndisc_send_redirect(skb, n, target);
} else {
int addrtype = ipv6_addr_type(&hdr->saddr);
diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c
index 9fab274019c..0e1d53bcf1e 100644
--- a/net/ipv6/ip6mr.c
+++ b/net/ipv6/ip6mr.c
@@ -34,6 +34,7 @@
#include <linux/seq_file.h>
#include <linux/init.h>
#include <linux/slab.h>
+#include <linux/compat.h>
#include <net/protocol.h>
#include <linux/skbuff.h>
#include <net/sock.h>
@@ -1804,6 +1805,80 @@ int ip6mr_ioctl(struct sock *sk, int cmd, void __user *arg)
}
}
+#ifdef CONFIG_COMPAT
+struct compat_sioc_sg_req6 {
+ struct sockaddr_in6 src;
+ struct sockaddr_in6 grp;
+ compat_ulong_t pktcnt;
+ compat_ulong_t bytecnt;
+ compat_ulong_t wrong_if;
+};
+
+struct compat_sioc_mif_req6 {
+ mifi_t mifi;
+ compat_ulong_t icount;
+ compat_ulong_t ocount;
+ compat_ulong_t ibytes;
+ compat_ulong_t obytes;
+};
+
+int ip6mr_compat_ioctl(struct sock *sk, unsigned int cmd, void __user *arg)
+{
+ struct compat_sioc_sg_req6 sr;
+ struct compat_sioc_mif_req6 vr;
+ struct mif_device *vif;
+ struct mfc6_cache *c;
+ struct net *net = sock_net(sk);
+ struct mr6_table *mrt;
+
+ mrt = ip6mr_get_table(net, raw6_sk(sk)->ip6mr_table ? : RT6_TABLE_DFLT);
+ if (mrt == NULL)
+ return -ENOENT;
+
+ switch (cmd) {
+ case SIOCGETMIFCNT_IN6:
+ if (copy_from_user(&vr, arg, sizeof(vr)))
+ return -EFAULT;
+ if (vr.mifi >= mrt->maxvif)
+ return -EINVAL;
+ read_lock(&mrt_lock);
+ vif = &mrt->vif6_table[vr.mifi];
+ if (MIF_EXISTS(mrt, vr.mifi)) {
+ vr.icount = vif->pkt_in;
+ vr.ocount = vif->pkt_out;
+ vr.ibytes = vif->bytes_in;
+ vr.obytes = vif->bytes_out;
+ read_unlock(&mrt_lock);
+
+ if (copy_to_user(arg, &vr, sizeof(vr)))
+ return -EFAULT;
+ return 0;
+ }
+ read_unlock(&mrt_lock);
+ return -EADDRNOTAVAIL;
+ case SIOCGETSGCNT_IN6:
+ if (copy_from_user(&sr, arg, sizeof(sr)))
+ return -EFAULT;
+
+ read_lock(&mrt_lock);
+ c = ip6mr_cache_find(mrt, &sr.src.sin6_addr, &sr.grp.sin6_addr);
+ if (c) {
+ sr.pktcnt = c->mfc_un.res.pkt;
+ sr.bytecnt = c->mfc_un.res.bytes;
+ sr.wrong_if = c->mfc_un.res.wrong_if;
+ read_unlock(&mrt_lock);
+
+ if (copy_to_user(arg, &sr, sizeof(sr)))
+ return -EFAULT;
+ return 0;
+ }
+ read_unlock(&mrt_lock);
+ return -EADDRNOTAVAIL;
+ default:
+ return -ENOIOCTLCMD;
+ }
+}
+#endif
static inline int ip6mr_forward2_finish(struct sk_buff *skb)
{
diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c
index 2342545a5ee..7254ce36400 100644
--- a/net/ipv6/ndisc.c
+++ b/net/ipv6/ndisc.c
@@ -1553,7 +1553,9 @@ void ndisc_send_redirect(struct sk_buff *skb, struct neighbour *neigh,
"ICMPv6 Redirect: destination is not a neighbour.\n");
goto release;
}
- if (!xrlim_allow(dst, 1*HZ))
+ if (!rt->rt6i_peer)
+ rt6_bind_peer(rt, 1);
+ if (inet_peer_xrlim_allow(rt->rt6i_peer, 1*HZ))
goto release;
if (dev->addr_len) {
diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c
index 7d227c644f7..47b7b8df7fa 100644
--- a/net/ipv6/netfilter/ip6_tables.c
+++ b/net/ipv6/netfilter/ip6_tables.c
@@ -1076,6 +1076,7 @@ static int compat_table_info(const struct xt_table_info *info,
memcpy(newinfo, info, offsetof(struct xt_table_info, entries));
newinfo->initial_entries = 0;
loc_cpu_entry = info->entries[raw_smp_processor_id()];
+ xt_compat_init_offsets(AF_INET6, info->number);
xt_entry_foreach(iter, loc_cpu_entry, info->size) {
ret = compat_calc_entry(iter, info, loc_cpu_entry, newinfo);
if (ret != 0)
@@ -1679,6 +1680,7 @@ translate_compat_table(struct net *net,
duprintf("translate_compat_table: size %u\n", info->size);
j = 0;
xt_compat_lock(AF_INET6);
+ xt_compat_init_offsets(AF_INET6, number);
/* Walk through entries, checking offsets. */
xt_entry_foreach(iter0, entry0, total_size) {
ret = check_compat_entry_size_and_hooks(iter0, info, &size,
diff --git a/net/ipv6/netfilter/ip6t_LOG.c b/net/ipv6/netfilter/ip6t_LOG.c
index 09c88891a75..e6af8d72f26 100644
--- a/net/ipv6/netfilter/ip6t_LOG.c
+++ b/net/ipv6/netfilter/ip6t_LOG.c
@@ -410,7 +410,7 @@ fallback:
if (p != NULL) {
sb_add(m, "%02x", *p++);
for (i = 1; i < len; i++)
- sb_add(m, ":%02x", p[i]);
+ sb_add(m, ":%02x", *p++);
}
sb_add(m, " ");
@@ -452,8 +452,7 @@ ip6t_log_packet(u_int8_t pf,
in ? in->name : "",
out ? out->name : "");
- /* MAC logging for input path only. */
- if (in && !out)
+ if (in != NULL)
dump_mac_header(m, loginfo, skb);
dump_packet(m, loginfo, skb, skb_network_offset(skb), 1);
diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c
index 79d43aa8fa8..08572726381 100644
--- a/net/ipv6/netfilter/nf_conntrack_reasm.c
+++ b/net/ipv6/netfilter/nf_conntrack_reasm.c
@@ -45,6 +45,7 @@
#include <linux/netfilter_ipv6.h>
#include <linux/kernel.h>
#include <linux/module.h>
+#include <net/netfilter/ipv6/nf_defrag_ipv6.h>
struct nf_ct_frag6_skb_cb
@@ -73,7 +74,7 @@ static struct inet_frags nf_frags;
static struct netns_frags nf_init_frags;
#ifdef CONFIG_SYSCTL
-struct ctl_table nf_ct_frag6_sysctl_table[] = {
+static struct ctl_table nf_ct_frag6_sysctl_table[] = {
{
.procname = "nf_conntrack_frag6_timeout",
.data = &nf_init_frags.timeout,
diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c
index 86c39526ba5..364e8668338 100644
--- a/net/ipv6/raw.c
+++ b/net/ipv6/raw.c
@@ -31,6 +31,7 @@
#include <linux/netfilter.h>
#include <linux/netfilter_ipv6.h>
#include <linux/skbuff.h>
+#include <linux/compat.h>
#include <asm/uaccess.h>
#include <asm/ioctls.h>
@@ -123,18 +124,18 @@ static __inline__ int icmpv6_filter(struct sock *sk, struct sk_buff *skb)
}
#if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE)
-static int (*mh_filter)(struct sock *sock, struct sk_buff *skb);
+typedef int mh_filter_t(struct sock *sock, struct sk_buff *skb);
-int rawv6_mh_filter_register(int (*filter)(struct sock *sock,
- struct sk_buff *skb))
+static mh_filter_t __rcu *mh_filter __read_mostly;
+
+int rawv6_mh_filter_register(mh_filter_t filter)
{
rcu_assign_pointer(mh_filter, filter);
return 0;
}
EXPORT_SYMBOL(rawv6_mh_filter_register);
-int rawv6_mh_filter_unregister(int (*filter)(struct sock *sock,
- struct sk_buff *skb))
+int rawv6_mh_filter_unregister(mh_filter_t filter)
{
rcu_assign_pointer(mh_filter, NULL);
synchronize_rcu();
@@ -192,10 +193,10 @@ static int ipv6_raw_deliver(struct sk_buff *skb, int nexthdr)
* policy is placed in rawv6_rcv() because it is
* required for each socket.
*/
- int (*filter)(struct sock *sock, struct sk_buff *skb);
+ mh_filter_t *filter;
filter = rcu_dereference(mh_filter);
- filtered = filter ? filter(sk, skb) : 0;
+ filtered = filter ? (*filter)(sk, skb) : 0;
break;
}
#endif
@@ -1157,6 +1158,23 @@ static int rawv6_ioctl(struct sock *sk, int cmd, unsigned long arg)
}
}
+#ifdef CONFIG_COMPAT
+static int compat_rawv6_ioctl(struct sock *sk, unsigned int cmd, unsigned long arg)
+{
+ switch (cmd) {
+ case SIOCOUTQ:
+ case SIOCINQ:
+ return -ENOIOCTLCMD;
+ default:
+#ifdef CONFIG_IPV6_MROUTE
+ return ip6mr_compat_ioctl(sk, cmd, compat_ptr(arg));
+#else
+ return -ENOIOCTLCMD;
+#endif
+ }
+}
+#endif
+
static void rawv6_close(struct sock *sk, long timeout)
{
if (inet_sk(sk)->inet_num == IPPROTO_RAW)
@@ -1215,6 +1233,7 @@ struct proto rawv6_prot = {
#ifdef CONFIG_COMPAT
.compat_setsockopt = compat_rawv6_setsockopt,
.compat_getsockopt = compat_rawv6_getsockopt,
+ .compat_ioctl = compat_rawv6_ioctl,
#endif
};
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 373bd0416f6..f786aed02a6 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -72,8 +72,6 @@
#define RT6_TRACE(x...) do { ; } while (0)
#endif
-#define CLONE_OFFLINK_ROUTE 0
-
static struct rt6_info * ip6_rt_copy(struct rt6_info *ort);
static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
static unsigned int ip6_default_advmss(const struct dst_entry *dst);
@@ -99,6 +97,36 @@ static struct rt6_info *rt6_get_route_info(struct net *net,
struct in6_addr *gwaddr, int ifindex);
#endif
+static u32 *ipv6_cow_metrics(struct dst_entry *dst, unsigned long old)
+{
+ struct rt6_info *rt = (struct rt6_info *) dst;
+ struct inet_peer *peer;
+ u32 *p = NULL;
+
+ if (!rt->rt6i_peer)
+ rt6_bind_peer(rt, 1);
+
+ peer = rt->rt6i_peer;
+ if (peer) {
+ u32 *old_p = __DST_METRICS_PTR(old);
+ unsigned long prev, new;
+
+ p = peer->metrics;
+ if (inet_metrics_new(peer))
+ memcpy(p, old_p, sizeof(u32) * RTAX_MAX);
+
+ new = (unsigned long) p;
+ prev = cmpxchg(&dst->_metrics, old, new);
+
+ if (prev != old) {
+ p = __DST_METRICS_PTR(prev);
+ if (prev & DST_METRICS_READ_ONLY)
+ p = NULL;
+ }
+ }
+ return p;
+}
+
static struct dst_ops ip6_dst_ops_template = {
.family = AF_INET6,
.protocol = cpu_to_be16(ETH_P_IPV6),
@@ -107,6 +135,7 @@ static struct dst_ops ip6_dst_ops_template = {
.check = ip6_dst_check,
.default_advmss = ip6_default_advmss,
.default_mtu = ip6_default_mtu,
+ .cow_metrics = ipv6_cow_metrics,
.destroy = ip6_dst_destroy,
.ifdown = ip6_dst_ifdown,
.negative_advice = ip6_negative_advice,
@@ -115,6 +144,11 @@ static struct dst_ops ip6_dst_ops_template = {
.local_out = __ip6_local_out,
};
+static unsigned int ip6_blackhole_default_mtu(const struct dst_entry *dst)
+{
+ return 0;
+}
+
static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
{
}
@@ -124,9 +158,15 @@ static struct dst_ops ip6_dst_blackhole_ops = {
.protocol = cpu_to_be16(ETH_P_IPV6),
.destroy = ip6_dst_destroy,
.check = ip6_dst_check,
+ .default_mtu = ip6_blackhole_default_mtu,
+ .default_advmss = ip6_default_advmss,
.update_pmtu = ip6_rt_blackhole_update_pmtu,
};
+static const u32 ip6_template_metrics[RTAX_MAX] = {
+ [RTAX_HOPLIMIT - 1] = 255,
+};
+
static struct rt6_info ip6_null_entry_template = {
.dst = {
.__refcnt = ATOMIC_INIT(1),
@@ -182,7 +222,7 @@ static struct rt6_info ip6_blk_hole_entry_template = {
/* allocate dst with ip6_dst_ops */
static inline struct rt6_info *ip6_dst_alloc(struct dst_ops *ops)
{
- return (struct rt6_info *)dst_alloc(ops);
+ return (struct rt6_info *)dst_alloc(ops, 0);
}
static void ip6_dst_destroy(struct dst_entry *dst)
@@ -196,22 +236,27 @@ static void ip6_dst_destroy(struct dst_entry *dst)
in6_dev_put(idev);
}
if (peer) {
- BUG_ON(!(rt->rt6i_flags & RTF_CACHE));
rt->rt6i_peer = NULL;
inet_putpeer(peer);
}
}
+static atomic_t __rt6_peer_genid = ATOMIC_INIT(0);
+
+static u32 rt6_peer_genid(void)
+{
+ return atomic_read(&__rt6_peer_genid);
+}
+
void rt6_bind_peer(struct rt6_info *rt, int create)
{
struct inet_peer *peer;
- if (WARN_ON(!(rt->rt6i_flags & RTF_CACHE)))
- return;
-
peer = inet_getpeer_v6(&rt->rt6i_dst.addr, create);
if (peer && cmpxchg(&rt->rt6i_peer, NULL, peer) != NULL)
inet_putpeer(peer);
+ else
+ rt->rt6i_peer_genid = rt6_peer_genid();
}
static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
@@ -738,13 +783,8 @@ restart:
if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP))
nrt = rt6_alloc_cow(rt, &fl->fl6_dst, &fl->fl6_src);
- else {
-#if CLONE_OFFLINK_ROUTE
+ else
nrt = rt6_alloc_clone(rt, &fl->fl6_dst);
-#else
- goto out2;
-#endif
- }
dst_release(&rt->dst);
rt = nrt ? : net->ipv6.ip6_null_entry;
@@ -834,13 +874,12 @@ int ip6_dst_blackhole(struct sock *sk, struct dst_entry **dstp, struct flowi *fl
{
struct rt6_info *ort = (struct rt6_info *) *dstp;
struct rt6_info *rt = (struct rt6_info *)
- dst_alloc(&ip6_dst_blackhole_ops);
+ dst_alloc(&ip6_dst_blackhole_ops, 1);
struct dst_entry *new = NULL;
if (rt) {
new = &rt->dst;
- atomic_set(&new->__refcnt, 1);
new->__use = 1;
new->input = dst_discard;
new->output = dst_discard;
@@ -882,9 +921,14 @@ static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
rt = (struct rt6_info *) dst;
- if (rt->rt6i_node && (rt->rt6i_node->fn_sernum == cookie))
+ if (rt->rt6i_node && (rt->rt6i_node->fn_sernum == cookie)) {
+ if (rt->rt6i_peer_genid != rt6_peer_genid()) {
+ if (!rt->rt6i_peer)
+ rt6_bind_peer(rt, 0);
+ rt->rt6i_peer_genid = rt6_peer_genid();
+ }
return dst;
-
+ }
return NULL;
}
@@ -935,7 +979,6 @@ static void ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
dst_metric_set(dst, RTAX_FEATURES, features);
}
dst_metric_set(dst, RTAX_MTU, mtu);
- call_netevent_notifiers(NETEVENT_PMTU_UPDATE, dst);
}
}
@@ -2688,7 +2731,8 @@ static int __net_init ip6_route_net_init(struct net *net)
net->ipv6.ip6_null_entry->dst.path =
(struct dst_entry *)net->ipv6.ip6_null_entry;
net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
- dst_metric_set(&net->ipv6.ip6_null_entry->dst, RTAX_HOPLIMIT, 255);
+ dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
+ ip6_template_metrics, true);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
@@ -2699,7 +2743,8 @@ static int __net_init ip6_route_net_init(struct net *net)
net->ipv6.ip6_prohibit_entry->dst.path =
(struct dst_entry *)net->ipv6.ip6_prohibit_entry;
net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
- dst_metric_set(&net->ipv6.ip6_prohibit_entry->dst, RTAX_HOPLIMIT, 255);
+ dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
+ ip6_template_metrics, true);
net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
sizeof(*net->ipv6.ip6_blk_hole_entry),
@@ -2709,7 +2754,8 @@ static int __net_init ip6_route_net_init(struct net *net)
net->ipv6.ip6_blk_hole_entry->dst.path =
(struct dst_entry *)net->ipv6.ip6_blk_hole_entry;
net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
- dst_metric_set(&net->ipv6.ip6_blk_hole_entry->dst, RTAX_HOPLIMIT, 255);
+ dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
+ ip6_template_metrics, true);
#endif
net->ipv6.sysctl.flush_delay = 0;
diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c
index 8ce38f10a54..b1599a345c1 100644
--- a/net/ipv6/sit.c
+++ b/net/ipv6/sit.c
@@ -412,7 +412,7 @@ static void prl_list_destroy_rcu(struct rcu_head *head)
p = container_of(head, struct ip_tunnel_prl_entry, rcu_head);
do {
- n = p->next;
+ n = rcu_dereference_protected(p->next, 1);
kfree(p);
p = n;
} while (p);
@@ -421,15 +421,17 @@ static void prl_list_destroy_rcu(struct rcu_head *head)
static int
ipip6_tunnel_del_prl(struct ip_tunnel *t, struct ip_tunnel_prl *a)
{
- struct ip_tunnel_prl_entry *x, **p;
+ struct ip_tunnel_prl_entry *x;
+ struct ip_tunnel_prl_entry __rcu **p;
int err = 0;
ASSERT_RTNL();
if (a && a->addr != htonl(INADDR_ANY)) {
- for (p = &t->prl; *p; p = &(*p)->next) {
- if ((*p)->addr == a->addr) {
- x = *p;
+ for (p = &t->prl;
+ (x = rtnl_dereference(*p)) != NULL;
+ p = &x->next) {
+ if (x->addr == a->addr) {
*p = x->next;
call_rcu(&x->rcu_head, prl_entry_destroy_rcu);
t->prl_count--;
@@ -438,9 +440,9 @@ ipip6_tunnel_del_prl(struct ip_tunnel *t, struct ip_tunnel_prl *a)
}
err = -ENXIO;
} else {
- if (t->prl) {
+ x = rtnl_dereference(t->prl);
+ if (x) {
t->prl_count = 0;
- x = t->prl;
call_rcu(&x->rcu_head, prl_list_destroy_rcu);
t->prl = NULL;
}
@@ -1179,7 +1181,7 @@ static int __net_init ipip6_fb_tunnel_init(struct net_device *dev)
if (!dev->tstats)
return -ENOMEM;
dev_hold(dev);
- sitn->tunnels_wc[0] = tunnel;
+ rcu_assign_pointer(sitn->tunnels_wc[0], tunnel);
return 0;
}
@@ -1196,11 +1198,12 @@ static void __net_exit sit_destroy_tunnels(struct sit_net *sitn, struct list_hea
for (prio = 1; prio < 4; prio++) {
int h;
for (h = 0; h < HASH_SIZE; h++) {
- struct ip_tunnel *t = sitn->tunnels[prio][h];
+ struct ip_tunnel *t;
+ t = rtnl_dereference(sitn->tunnels[prio][h]);
while (t != NULL) {
unregister_netdevice_queue(t->dev, head);
- t = t->next;
+ t = rtnl_dereference(t->next);
}
}
}
diff --git a/net/ipv6/sysctl_net_ipv6.c b/net/ipv6/sysctl_net_ipv6.c
index fa1d8f4e005..7cb65ef79f9 100644
--- a/net/ipv6/sysctl_net_ipv6.c
+++ b/net/ipv6/sysctl_net_ipv6.c
@@ -15,6 +15,8 @@
#include <net/addrconf.h>
#include <net/inet_frag.h>
+static struct ctl_table empty[1];
+
static ctl_table ipv6_table_template[] = {
{
.procname = "route",
@@ -35,6 +37,12 @@ static ctl_table ipv6_table_template[] = {
.mode = 0644,
.proc_handler = proc_dointvec
},
+ {
+ .procname = "neigh",
+ .maxlen = 0,
+ .mode = 0555,
+ .child = empty,
+ },
{ }
};
@@ -152,7 +160,6 @@ static struct ctl_table_header *ip6_base;
int ipv6_static_sysctl_register(void)
{
- static struct ctl_table empty[1];
ip6_base = register_sysctl_paths(net_ipv6_ctl_path, empty);
if (ip6_base == NULL)
return -ENOMEM;
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 20aa95e3735..1d0ab557090 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -1323,7 +1323,7 @@ static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb)
tcp_death_row.sysctl_tw_recycle &&
(dst = inet6_csk_route_req(sk, req)) != NULL &&
(peer = rt6_get_peer((struct rt6_info *)dst)) != NULL &&
- ipv6_addr_equal((struct in6_addr *)peer->daddr.a6,
+ ipv6_addr_equal((struct in6_addr *)peer->daddr.addr.a6,
&treq->rmt_addr)) {
inet_peer_refcheck(peer);
if ((u32)get_seconds() - peer->tcp_ts_stamp < TCP_PAWS_MSL &&
@@ -1636,10 +1636,8 @@ static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb)
opt_skb = skb_clone(skb, GFP_ATOMIC);
if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
- TCP_CHECK_TIMER(sk);
if (tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len))
goto reset;
- TCP_CHECK_TIMER(sk);
if (opt_skb)
goto ipv6_pktoptions;
return 0;
@@ -1667,10 +1665,8 @@ static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb)
}
}
- TCP_CHECK_TIMER(sk);
if (tcp_rcv_state_process(sk, skb, tcp_hdr(skb), skb->len))
goto reset;
- TCP_CHECK_TIMER(sk);
if (opt_skb)
goto ipv6_pktoptions;
return 0;
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index 9a009c66c8a..a419a787eb6 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -1299,7 +1299,7 @@ static int udp6_ufo_send_check(struct sk_buff *skb)
return 0;
}
-static struct sk_buff *udp6_ufo_fragment(struct sk_buff *skb, int features)
+static struct sk_buff *udp6_ufo_fragment(struct sk_buff *skb, u32 features)
{
struct sk_buff *segs = ERR_PTR(-EINVAL);
unsigned int mss;
diff --git a/net/ipv6/xfrm6_policy.c b/net/ipv6/xfrm6_policy.c
index 7e74023ea6e..834dc02f1d4 100644
--- a/net/ipv6/xfrm6_policy.c
+++ b/net/ipv6/xfrm6_policy.c
@@ -98,6 +98,10 @@ static int xfrm6_fill_dst(struct xfrm_dst *xdst, struct net_device *dev,
if (!xdst->u.rt6.rt6i_idev)
return -ENODEV;
+ xdst->u.rt6.rt6i_peer = rt->rt6i_peer;
+ if (rt->rt6i_peer)
+ atomic_inc(&rt->rt6i_peer->refcnt);
+
/* Sheit... I remember I did this right. Apparently,
* it was magically lost, so this code needs audit */
xdst->u.rt6.rt6i_flags = rt->rt6i_flags & (RTF_ANYCAST |
@@ -216,6 +220,9 @@ static void xfrm6_dst_destroy(struct dst_entry *dst)
if (likely(xdst->u.rt6.rt6i_idev))
in6_dev_put(xdst->u.rt6.rt6i_idev);
+ dst_destroy_metrics_generic(dst);
+ if (likely(xdst->u.rt6.rt6i_peer))
+ inet_putpeer(xdst->u.rt6.rt6i_peer);
xfrm_dst_destroy(xdst);
}
@@ -251,6 +258,7 @@ static struct dst_ops xfrm6_dst_ops = {
.protocol = cpu_to_be16(ETH_P_IPV6),
.gc = xfrm6_garbage_collect,
.update_pmtu = xfrm6_update_pmtu,
+ .cow_metrics = dst_cow_metrics_generic,
.destroy = xfrm6_dst_destroy,
.ifdown = xfrm6_dst_ifdown,
.local_out = __ip6_local_out,
diff --git a/net/mac80211/Kconfig b/net/mac80211/Kconfig
index 4c57a9c3729..dbf5e4006bc 100644
--- a/net/mac80211/Kconfig
+++ b/net/mac80211/Kconfig
@@ -20,7 +20,7 @@ config MAC80211_HAS_RC
bool
config MAC80211_RC_PID
- bool "PID controller based rate control algorithm" if EMBEDDED
+ bool "PID controller based rate control algorithm" if EXPERT
select MAC80211_HAS_RC
---help---
This option enables a TX rate control algorithm for
@@ -28,14 +28,14 @@ config MAC80211_RC_PID
rate.
config MAC80211_RC_MINSTREL
- bool "Minstrel" if EMBEDDED
+ bool "Minstrel" if EXPERT
select MAC80211_HAS_RC
default y
---help---
This option enables the 'minstrel' TX rate control algorithm
config MAC80211_RC_MINSTREL_HT
- bool "Minstrel 802.11n support" if EMBEDDED
+ bool "Minstrel 802.11n support" if EXPERT
depends on MAC80211_RC_MINSTREL
default y
---help---
diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig
index 1534f2b44ca..82a6e0d80f0 100644
--- a/net/netfilter/Kconfig
+++ b/net/netfilter/Kconfig
@@ -85,6 +85,17 @@ config NF_CONNTRACK_EVENTS
If unsure, say `N'.
+config NF_CONNTRACK_TIMESTAMP
+ bool 'Connection tracking timestamping'
+ depends on NETFILTER_ADVANCED
+ help
+ This option enables support for connection tracking timestamping.
+ This allows you to store the flow start-time and to obtain
+ the flow-stop time (once it has been destroyed) via Connection
+ tracking events.
+
+ If unsure, say `N'.
+
config NF_CT_PROTO_DCCP
tristate 'DCCP protocol connection tracking support (EXPERIMENTAL)'
depends on EXPERIMENTAL
@@ -185,9 +196,13 @@ config NF_CONNTRACK_IRC
To compile it as a module, choose M here. If unsure, say N.
+config NF_CONNTRACK_BROADCAST
+ tristate
+
config NF_CONNTRACK_NETBIOS_NS
tristate "NetBIOS name service protocol support"
depends on NETFILTER_ADVANCED
+ select NF_CONNTRACK_BROADCAST
help
NetBIOS name service requests are sent as broadcast messages from an
unprivileged port and responded to with unicast messages to the
@@ -204,6 +219,21 @@ config NF_CONNTRACK_NETBIOS_NS
To compile it as a module, choose M here. If unsure, say N.
+config NF_CONNTRACK_SNMP
+ tristate "SNMP service protocol support"
+ depends on NETFILTER_ADVANCED
+ select NF_CONNTRACK_BROADCAST
+ help
+ SNMP service requests are sent as broadcast messages from an
+ unprivileged port and responded to with unicast messages to the
+ same port. This make them hard to firewall properly because connection
+ tracking doesn't deal with broadcasts. This helper tracks locally
+ originating SNMP service requests and the corresponding
+ responses. It relies on correct IP address configuration, specifically
+ netmask and broadcast address.
+
+ To compile it as a module, choose M here. If unsure, say N.
+
config NF_CONNTRACK_PPTP
tristate "PPtP protocol support"
depends on NETFILTER_ADVANCED
@@ -322,10 +352,32 @@ config NETFILTER_XT_CONNMARK
ctmark), similarly to the packet mark (nfmark). Using this
target and match, you can set and match on this mark.
+config NETFILTER_XT_SET
+ tristate 'set target and match support'
+ depends on IP_SET
+ depends on NETFILTER_ADVANCED
+ help
+ This option adds the "SET" target and "set" match.
+
+ Using this target and match, you can add/delete and match
+ elements in the sets created by ipset(8).
+
+ To compile it as a module, choose M here. If unsure, say N.
+
# alphabetically ordered list of targets
comment "Xtables targets"
+config NETFILTER_XT_TARGET_AUDIT
+ tristate "AUDIT target support"
+ depends on AUDIT
+ depends on NETFILTER_ADVANCED
+ ---help---
+ This option adds a 'AUDIT' target, which can be used to create
+ audit records for packets dropped/accepted.
+
+ To compileit as a module, choose M here. If unsure, say N.
+
config NETFILTER_XT_TARGET_CHECKSUM
tristate "CHECKSUM target support"
depends on IP_NF_MANGLE || IP6_NF_MANGLE
@@ -477,6 +529,7 @@ config NETFILTER_XT_TARGET_NFLOG
config NETFILTER_XT_TARGET_NFQUEUE
tristate '"NFQUEUE" target Support'
depends on NETFILTER_ADVANCED
+ select NETFILTER_NETLINK_QUEUE
help
This target replaced the old obsolete QUEUE target.
@@ -685,6 +738,15 @@ config NETFILTER_XT_MATCH_DCCP
If you want to compile it as a module, say M here and read
<file:Documentation/kbuild/modules.txt>. If unsure, say `N'.
+config NETFILTER_XT_MATCH_DEVGROUP
+ tristate '"devgroup" match support'
+ depends on NETFILTER_ADVANCED
+ help
+ This options adds a `devgroup' match, which allows to match on the
+ device group a network device is assigned to.
+
+ To compile it as a module, choose M here. If unsure, say N.
+
config NETFILTER_XT_MATCH_DSCP
tristate '"dscp" and "tos" match support'
depends on NETFILTER_ADVANCED
@@ -886,7 +948,7 @@ config NETFILTER_XT_MATCH_RATEEST
config NETFILTER_XT_MATCH_REALM
tristate '"realm" match support'
depends on NETFILTER_ADVANCED
- select NET_CLS_ROUTE
+ select IP_ROUTE_CLASSID
help
This option adds a `realm' match, which allows you to use the realm
key from the routing subsystem inside iptables.
@@ -1011,4 +1073,6 @@ endif # NETFILTER_XTABLES
endmenu
+source "net/netfilter/ipset/Kconfig"
+
source "net/netfilter/ipvs/Kconfig"
diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile
index 441050f3111..d57a890eaee 100644
--- a/net/netfilter/Makefile
+++ b/net/netfilter/Makefile
@@ -1,6 +1,7 @@
netfilter-objs := core.o nf_log.o nf_queue.o nf_sockopt.o
nf_conntrack-y := nf_conntrack_core.o nf_conntrack_standalone.o nf_conntrack_expect.o nf_conntrack_helper.o nf_conntrack_proto.o nf_conntrack_l3proto_generic.o nf_conntrack_proto_generic.o nf_conntrack_proto_tcp.o nf_conntrack_proto_udp.o nf_conntrack_extend.o nf_conntrack_acct.o
+nf_conntrack-$(CONFIG_NF_CONNTRACK_TIMESTAMP) += nf_conntrack_timestamp.o
nf_conntrack-$(CONFIG_NF_CONNTRACK_EVENTS) += nf_conntrack_ecache.o
obj-$(CONFIG_NETFILTER) = netfilter.o
@@ -28,7 +29,9 @@ obj-$(CONFIG_NF_CONNTRACK_AMANDA) += nf_conntrack_amanda.o
obj-$(CONFIG_NF_CONNTRACK_FTP) += nf_conntrack_ftp.o
obj-$(CONFIG_NF_CONNTRACK_H323) += nf_conntrack_h323.o
obj-$(CONFIG_NF_CONNTRACK_IRC) += nf_conntrack_irc.o
+obj-$(CONFIG_NF_CONNTRACK_BROADCAST) += nf_conntrack_broadcast.o
obj-$(CONFIG_NF_CONNTRACK_NETBIOS_NS) += nf_conntrack_netbios_ns.o
+obj-$(CONFIG_NF_CONNTRACK_SNMP) += nf_conntrack_snmp.o
obj-$(CONFIG_NF_CONNTRACK_PPTP) += nf_conntrack_pptp.o
obj-$(CONFIG_NF_CONNTRACK_SANE) += nf_conntrack_sane.o
obj-$(CONFIG_NF_CONNTRACK_SIP) += nf_conntrack_sip.o
@@ -43,8 +46,10 @@ obj-$(CONFIG_NETFILTER_XTABLES) += x_tables.o xt_tcpudp.o
# combos
obj-$(CONFIG_NETFILTER_XT_MARK) += xt_mark.o
obj-$(CONFIG_NETFILTER_XT_CONNMARK) += xt_connmark.o
+obj-$(CONFIG_NETFILTER_XT_SET) += xt_set.o
# targets
+obj-$(CONFIG_NETFILTER_XT_TARGET_AUDIT) += xt_AUDIT.o
obj-$(CONFIG_NETFILTER_XT_TARGET_CHECKSUM) += xt_CHECKSUM.o
obj-$(CONFIG_NETFILTER_XT_TARGET_CLASSIFY) += xt_CLASSIFY.o
obj-$(CONFIG_NETFILTER_XT_TARGET_CONNSECMARK) += xt_CONNSECMARK.o
@@ -72,6 +77,7 @@ obj-$(CONFIG_NETFILTER_XT_MATCH_CONNLIMIT) += xt_connlimit.o
obj-$(CONFIG_NETFILTER_XT_MATCH_CONNTRACK) += xt_conntrack.o
obj-$(CONFIG_NETFILTER_XT_MATCH_CPU) += xt_cpu.o
obj-$(CONFIG_NETFILTER_XT_MATCH_DCCP) += xt_dccp.o
+obj-$(CONFIG_NETFILTER_XT_MATCH_DEVGROUP) += xt_devgroup.o
obj-$(CONFIG_NETFILTER_XT_MATCH_DSCP) += xt_dscp.o
obj-$(CONFIG_NETFILTER_XT_MATCH_ESP) += xt_esp.o
obj-$(CONFIG_NETFILTER_XT_MATCH_HASHLIMIT) += xt_hashlimit.o
@@ -101,5 +107,8 @@ obj-$(CONFIG_NETFILTER_XT_MATCH_TCPMSS) += xt_tcpmss.o
obj-$(CONFIG_NETFILTER_XT_MATCH_TIME) += xt_time.o
obj-$(CONFIG_NETFILTER_XT_MATCH_U32) += xt_u32.o
+# ipset
+obj-$(CONFIG_IP_SET) += ipset/
+
# IPVS
obj-$(CONFIG_IP_VS) += ipvs/
diff --git a/net/netfilter/core.c b/net/netfilter/core.c
index 32fcbe290c0..899b71c0ff5 100644
--- a/net/netfilter/core.c
+++ b/net/netfilter/core.c
@@ -133,6 +133,7 @@ unsigned int nf_iterate(struct list_head *head,
/* Optimization: we don't need to hold module
reference here, since function can't sleep. --RR */
+repeat:
verdict = elem->hook(hook, skb, indev, outdev, okfn);
if (verdict != NF_ACCEPT) {
#ifdef CONFIG_NETFILTER_DEBUG
@@ -145,7 +146,7 @@ unsigned int nf_iterate(struct list_head *head,
#endif
if (verdict != NF_REPEAT)
return verdict;
- *i = (*i)->prev;
+ goto repeat;
}
}
return NF_ACCEPT;
@@ -175,13 +176,21 @@ next_hook:
ret = 1;
} else if ((verdict & NF_VERDICT_MASK) == NF_DROP) {
kfree_skb(skb);
- ret = -(verdict >> NF_VERDICT_BITS);
+ ret = NF_DROP_GETERR(verdict);
if (ret == 0)
ret = -EPERM;
} else if ((verdict & NF_VERDICT_MASK) == NF_QUEUE) {
- if (!nf_queue(skb, elem, pf, hook, indev, outdev, okfn,
- verdict >> NF_VERDICT_BITS))
- goto next_hook;
+ ret = nf_queue(skb, elem, pf, hook, indev, outdev, okfn,
+ verdict >> NF_VERDICT_QBITS);
+ if (ret < 0) {
+ if (ret == -ECANCELED)
+ goto next_hook;
+ if (ret == -ESRCH &&
+ (verdict & NF_VERDICT_FLAG_QUEUE_BYPASS))
+ goto next_hook;
+ kfree_skb(skb);
+ }
+ ret = 0;
}
rcu_read_unlock();
return ret;
@@ -214,7 +223,7 @@ EXPORT_SYMBOL(skb_make_writable);
/* This does not belong here, but locally generated errors need it if connection
tracking in use: without this, connection may not be in hash table, and hence
manufactured ICMP or RST packets will not be associated with it. */
-void (*ip_ct_attach)(struct sk_buff *, struct sk_buff *);
+void (*ip_ct_attach)(struct sk_buff *, struct sk_buff *) __rcu __read_mostly;
EXPORT_SYMBOL(ip_ct_attach);
void nf_ct_attach(struct sk_buff *new, struct sk_buff *skb)
@@ -231,7 +240,7 @@ void nf_ct_attach(struct sk_buff *new, struct sk_buff *skb)
}
EXPORT_SYMBOL(nf_ct_attach);
-void (*nf_ct_destroy)(struct nf_conntrack *);
+void (*nf_ct_destroy)(struct nf_conntrack *) __rcu __read_mostly;
EXPORT_SYMBOL(nf_ct_destroy);
void nf_conntrack_destroy(struct nf_conntrack *nfct)
diff --git a/net/netfilter/ipset/Kconfig b/net/netfilter/ipset/Kconfig
new file mode 100644
index 00000000000..3b970d34302
--- /dev/null
+++ b/net/netfilter/ipset/Kconfig
@@ -0,0 +1,121 @@
+menuconfig IP_SET
+ tristate "IP set support"
+ depends on INET && NETFILTER
+ help
+ This option adds IP set support to the kernel.
+ In order to define and use the sets, you need the userspace utility
+ ipset(8). You can use the sets in netfilter via the "set" match
+ and "SET" target.
+
+ To compile it as a module, choose M here. If unsure, say N.
+
+if IP_SET
+
+config IP_SET_MAX
+ int "Maximum number of IP sets"
+ default 256
+ range 2 65534
+ depends on IP_SET
+ help
+ You can define here default value of the maximum number
+ of IP sets for the kernel.
+
+ The value can be overriden by the 'max_sets' module
+ parameter of the 'ip_set' module.
+
+config IP_SET_BITMAP_IP
+ tristate "bitmap:ip set support"
+ depends on IP_SET
+ help
+ This option adds the bitmap:ip set type support, by which one
+ can store IPv4 addresses (or network addresse) from a range.
+
+ To compile it as a module, choose M here. If unsure, say N.
+
+config IP_SET_BITMAP_IPMAC
+ tristate "bitmap:ip,mac set support"
+ depends on IP_SET
+ help
+ This option adds the bitmap:ip,mac set type support, by which one
+ can store IPv4 address and (source) MAC address pairs from a range.
+
+ To compile it as a module, choose M here. If unsure, say N.
+
+config IP_SET_BITMAP_PORT
+ tristate "bitmap:port set support"
+ depends on IP_SET
+ help
+ This option adds the bitmap:port set type support, by which one
+ can store TCP/UDP port numbers from a range.
+
+ To compile it as a module, choose M here. If unsure, say N.
+
+config IP_SET_HASH_IP
+ tristate "hash:ip set support"
+ depends on IP_SET
+ help
+ This option adds the hash:ip set type support, by which one
+ can store arbitrary IPv4 or IPv6 addresses (or network addresses)
+ in a set.
+
+ To compile it as a module, choose M here. If unsure, say N.
+
+config IP_SET_HASH_IPPORT
+ tristate "hash:ip,port set support"
+ depends on IP_SET
+ help
+ This option adds the hash:ip,port set type support, by which one
+ can store IPv4/IPv6 address and protocol/port pairs.
+
+ To compile it as a module, choose M here. If unsure, say N.
+
+config IP_SET_HASH_IPPORTIP
+ tristate "hash:ip,port,ip set support"
+ depends on IP_SET
+ help
+ This option adds the hash:ip,port,ip set type support, by which
+ one can store IPv4/IPv6 address, protocol/port, and IPv4/IPv6
+ address triples in a set.
+
+ To compile it as a module, choose M here. If unsure, say N.
+
+config IP_SET_HASH_IPPORTNET
+ tristate "hash:ip,port,net set support"
+ depends on IP_SET
+ help
+ This option adds the hash:ip,port,net set type support, by which
+ one can store IPv4/IPv6 address, protocol/port, and IPv4/IPv6
+ network address/prefix triples in a set.
+
+ To compile it as a module, choose M here. If unsure, say N.
+
+config IP_SET_HASH_NET
+ tristate "hash:net set support"
+ depends on IP_SET
+ help
+ This option adds the hash:net set type support, by which
+ one can store IPv4/IPv6 network address/prefix elements in a set.
+
+ To compile it as a module, choose M here. If unsure, say N.
+
+config IP_SET_HASH_NETPORT
+ tristate "hash:net,port set support"
+ depends on IP_SET
+ help
+ This option adds the hash:net,port set type support, by which
+ one can store IPv4/IPv6 network address/prefix and
+ protocol/port pairs as elements in a set.
+
+ To compile it as a module, choose M here. If unsure, say N.
+
+config IP_SET_LIST_SET
+ tristate "list:set set support"
+ depends on IP_SET
+ help
+ This option adds the list:set set type support. In this
+ kind of set one can store the name of other sets and it forms
+ an ordered union of the member sets.
+
+ To compile it as a module, choose M here. If unsure, say N.
+
+endif # IP_SET
diff --git a/net/netfilter/ipset/Makefile b/net/netfilter/ipset/Makefile
new file mode 100644
index 00000000000..5adbdab67bd
--- /dev/null
+++ b/net/netfilter/ipset/Makefile
@@ -0,0 +1,24 @@
+#
+# Makefile for the ipset modules
+#
+
+ip_set-y := ip_set_core.o ip_set_getport.o pfxlen.o
+
+# ipset core
+obj-$(CONFIG_IP_SET) += ip_set.o
+
+# bitmap types
+obj-$(CONFIG_IP_SET_BITMAP_IP) += ip_set_bitmap_ip.o
+obj-$(CONFIG_IP_SET_BITMAP_IPMAC) += ip_set_bitmap_ipmac.o
+obj-$(CONFIG_IP_SET_BITMAP_PORT) += ip_set_bitmap_port.o
+
+# hash types
+obj-$(CONFIG_IP_SET_HASH_IP) += ip_set_hash_ip.o
+obj-$(CONFIG_IP_SET_HASH_IPPORT) += ip_set_hash_ipport.o
+obj-$(CONFIG_IP_SET_HASH_IPPORTIP) += ip_set_hash_ipportip.o
+obj-$(CONFIG_IP_SET_HASH_IPPORTNET) += ip_set_hash_ipportnet.o
+obj-$(CONFIG_IP_SET_HASH_NET) += ip_set_hash_net.o
+obj-$(CONFIG_IP_SET_HASH_NETPORT) += ip_set_hash_netport.o
+
+# list types
+obj-$(CONFIG_IP_SET_LIST_SET) += ip_set_list_set.o
diff --git a/net/netfilter/ipset/ip_set_bitmap_ip.c b/net/netfilter/ipset/ip_set_bitmap_ip.c
new file mode 100644
index 00000000000..bca96990218
--- /dev/null
+++ b/net/netfilter/ipset/ip_set_bitmap_ip.c
@@ -0,0 +1,587 @@
+/* Copyright (C) 2000-2002 Joakim Axelsson <gozem@linux.nu>
+ * Patrick Schaaf <bof@bof.de>
+ * Copyright (C) 2003-2011 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+/* Kernel module implementing an IP set type: the bitmap:ip type */
+
+#include <linux/module.h>
+#include <linux/ip.h>
+#include <linux/skbuff.h>
+#include <linux/errno.h>
+#include <linux/bitops.h>
+#include <linux/spinlock.h>
+#include <linux/netlink.h>
+#include <linux/jiffies.h>
+#include <linux/timer.h>
+#include <net/netlink.h>
+#include <net/tcp.h>
+
+#include <linux/netfilter/ipset/pfxlen.h>
+#include <linux/netfilter/ipset/ip_set.h>
+#include <linux/netfilter/ipset/ip_set_bitmap.h>
+#define IP_SET_BITMAP_TIMEOUT
+#include <linux/netfilter/ipset/ip_set_timeout.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>");
+MODULE_DESCRIPTION("bitmap:ip type of IP sets");
+MODULE_ALIAS("ip_set_bitmap:ip");
+
+/* Type structure */
+struct bitmap_ip {
+ void *members; /* the set members */
+ u32 first_ip; /* host byte order, included in range */
+ u32 last_ip; /* host byte order, included in range */
+ u32 elements; /* number of max elements in the set */
+ u32 hosts; /* number of hosts in a subnet */
+ size_t memsize; /* members size */
+ u8 netmask; /* subnet netmask */
+ u32 timeout; /* timeout parameter */
+ struct timer_list gc; /* garbage collection */
+};
+
+/* Base variant */
+
+static inline u32
+ip_to_id(const struct bitmap_ip *m, u32 ip)
+{
+ return ((ip & ip_set_hostmask(m->netmask)) - m->first_ip)/m->hosts;
+}
+
+static int
+bitmap_ip_test(struct ip_set *set, void *value, u32 timeout)
+{
+ const struct bitmap_ip *map = set->data;
+ u16 id = *(u16 *)value;
+
+ return !!test_bit(id, map->members);
+}
+
+static int
+bitmap_ip_add(struct ip_set *set, void *value, u32 timeout)
+{
+ struct bitmap_ip *map = set->data;
+ u16 id = *(u16 *)value;
+
+ if (test_and_set_bit(id, map->members))
+ return -IPSET_ERR_EXIST;
+
+ return 0;
+}
+
+static int
+bitmap_ip_del(struct ip_set *set, void *value, u32 timeout)
+{
+ struct bitmap_ip *map = set->data;
+ u16 id = *(u16 *)value;
+
+ if (!test_and_clear_bit(id, map->members))
+ return -IPSET_ERR_EXIST;
+
+ return 0;
+}
+
+static int
+bitmap_ip_list(const struct ip_set *set,
+ struct sk_buff *skb, struct netlink_callback *cb)
+{
+ const struct bitmap_ip *map = set->data;
+ struct nlattr *atd, *nested;
+ u32 id, first = cb->args[2];
+
+ atd = ipset_nest_start(skb, IPSET_ATTR_ADT);
+ if (!atd)
+ return -EMSGSIZE;
+ for (; cb->args[2] < map->elements; cb->args[2]++) {
+ id = cb->args[2];
+ if (!test_bit(id, map->members))
+ continue;
+ nested = ipset_nest_start(skb, IPSET_ATTR_DATA);
+ if (!nested) {
+ if (id == first) {
+ nla_nest_cancel(skb, atd);
+ return -EMSGSIZE;
+ } else
+ goto nla_put_failure;
+ }
+ NLA_PUT_IPADDR4(skb, IPSET_ATTR_IP,
+ htonl(map->first_ip + id * map->hosts));
+ ipset_nest_end(skb, nested);
+ }
+ ipset_nest_end(skb, atd);
+ /* Set listing finished */
+ cb->args[2] = 0;
+ return 0;
+
+nla_put_failure:
+ nla_nest_cancel(skb, nested);
+ ipset_nest_end(skb, atd);
+ if (unlikely(id == first)) {
+ cb->args[2] = 0;
+ return -EMSGSIZE;
+ }
+ return 0;
+}
+
+/* Timeout variant */
+
+static int
+bitmap_ip_ttest(struct ip_set *set, void *value, u32 timeout)
+{
+ const struct bitmap_ip *map = set->data;
+ const unsigned long *members = map->members;
+ u16 id = *(u16 *)value;
+
+ return ip_set_timeout_test(members[id]);
+}
+
+static int
+bitmap_ip_tadd(struct ip_set *set, void *value, u32 timeout)
+{
+ struct bitmap_ip *map = set->data;
+ unsigned long *members = map->members;
+ u16 id = *(u16 *)value;
+
+ if (ip_set_timeout_test(members[id]))
+ return -IPSET_ERR_EXIST;
+
+ members[id] = ip_set_timeout_set(timeout);
+
+ return 0;
+}
+
+static int
+bitmap_ip_tdel(struct ip_set *set, void *value, u32 timeout)
+{
+ struct bitmap_ip *map = set->data;
+ unsigned long *members = map->members;
+ u16 id = *(u16 *)value;
+ int ret = -IPSET_ERR_EXIST;
+
+ if (ip_set_timeout_test(members[id]))
+ ret = 0;
+
+ members[id] = IPSET_ELEM_UNSET;
+ return ret;
+}
+
+static int
+bitmap_ip_tlist(const struct ip_set *set,
+ struct sk_buff *skb, struct netlink_callback *cb)
+{
+ const struct bitmap_ip *map = set->data;
+ struct nlattr *adt, *nested;
+ u32 id, first = cb->args[2];
+ const unsigned long *members = map->members;
+
+ adt = ipset_nest_start(skb, IPSET_ATTR_ADT);
+ if (!adt)
+ return -EMSGSIZE;
+ for (; cb->args[2] < map->elements; cb->args[2]++) {
+ id = cb->args[2];
+ if (!ip_set_timeout_test(members[id]))
+ continue;
+ nested = ipset_nest_start(skb, IPSET_ATTR_DATA);
+ if (!nested) {
+ if (id == first) {
+ nla_nest_cancel(skb, adt);
+ return -EMSGSIZE;
+ } else
+ goto nla_put_failure;
+ }
+ NLA_PUT_IPADDR4(skb, IPSET_ATTR_IP,
+ htonl(map->first_ip + id * map->hosts));
+ NLA_PUT_NET32(skb, IPSET_ATTR_TIMEOUT,
+ htonl(ip_set_timeout_get(members[id])));
+ ipset_nest_end(skb, nested);
+ }
+ ipset_nest_end(skb, adt);
+
+ /* Set listing finished */
+ cb->args[2] = 0;
+
+ return 0;
+
+nla_put_failure:
+ nla_nest_cancel(skb, nested);
+ ipset_nest_end(skb, adt);
+ if (unlikely(id == first)) {
+ cb->args[2] = 0;
+ return -EMSGSIZE;
+ }
+ return 0;
+}
+
+static int
+bitmap_ip_kadt(struct ip_set *set, const struct sk_buff *skb,
+ enum ipset_adt adt, u8 pf, u8 dim, u8 flags)
+{
+ struct bitmap_ip *map = set->data;
+ ipset_adtfn adtfn = set->variant->adt[adt];
+ u32 ip;
+
+ ip = ntohl(ip4addr(skb, flags & IPSET_DIM_ONE_SRC));
+ if (ip < map->first_ip || ip > map->last_ip)
+ return -IPSET_ERR_BITMAP_RANGE;
+
+ ip = ip_to_id(map, ip);
+
+ return adtfn(set, &ip, map->timeout);
+}
+
+static int
+bitmap_ip_uadt(struct ip_set *set, struct nlattr *tb[],
+ enum ipset_adt adt, u32 *lineno, u32 flags)
+{
+ struct bitmap_ip *map = set->data;
+ ipset_adtfn adtfn = set->variant->adt[adt];
+ u32 timeout = map->timeout;
+ u32 ip, ip_to, id;
+ int ret = 0;
+
+ if (unlikely(!tb[IPSET_ATTR_IP] ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT)))
+ return -IPSET_ERR_PROTOCOL;
+
+ if (tb[IPSET_ATTR_LINENO])
+ *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]);
+
+ ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &ip);
+ if (ret)
+ return ret;
+
+ if (ip < map->first_ip || ip > map->last_ip)
+ return -IPSET_ERR_BITMAP_RANGE;
+
+ if (tb[IPSET_ATTR_TIMEOUT]) {
+ if (!with_timeout(map->timeout))
+ return -IPSET_ERR_TIMEOUT;
+ timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]);
+ }
+
+ if (adt == IPSET_TEST) {
+ id = ip_to_id(map, ip);
+ return adtfn(set, &id, timeout);
+ }
+
+ if (tb[IPSET_ATTR_IP_TO]) {
+ ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP_TO], &ip_to);
+ if (ret)
+ return ret;
+ if (ip > ip_to) {
+ swap(ip, ip_to);
+ if (ip < map->first_ip)
+ return -IPSET_ERR_BITMAP_RANGE;
+ }
+ } else if (tb[IPSET_ATTR_CIDR]) {
+ u8 cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]);
+
+ if (cidr > 32)
+ return -IPSET_ERR_INVALID_CIDR;
+ ip &= ip_set_hostmask(cidr);
+ ip_to = ip | ~ip_set_hostmask(cidr);
+ } else
+ ip_to = ip;
+
+ if (ip_to > map->last_ip)
+ return -IPSET_ERR_BITMAP_RANGE;
+
+ for (; !before(ip_to, ip); ip += map->hosts) {
+ id = ip_to_id(map, ip);
+ ret = adtfn(set, &id, timeout);;
+
+ if (ret && !ip_set_eexist(ret, flags))
+ return ret;
+ else
+ ret = 0;
+ }
+ return ret;
+}
+
+static void
+bitmap_ip_destroy(struct ip_set *set)
+{
+ struct bitmap_ip *map = set->data;
+
+ if (with_timeout(map->timeout))
+ del_timer_sync(&map->gc);
+
+ ip_set_free(map->members);
+ kfree(map);
+
+ set->data = NULL;
+}
+
+static void
+bitmap_ip_flush(struct ip_set *set)
+{
+ struct bitmap_ip *map = set->data;
+
+ memset(map->members, 0, map->memsize);
+}
+
+static int
+bitmap_ip_head(struct ip_set *set, struct sk_buff *skb)
+{
+ const struct bitmap_ip *map = set->data;
+ struct nlattr *nested;
+
+ nested = ipset_nest_start(skb, IPSET_ATTR_DATA);
+ if (!nested)
+ goto nla_put_failure;
+ NLA_PUT_IPADDR4(skb, IPSET_ATTR_IP, htonl(map->first_ip));
+ NLA_PUT_IPADDR4(skb, IPSET_ATTR_IP_TO, htonl(map->last_ip));
+ if (map->netmask != 32)
+ NLA_PUT_U8(skb, IPSET_ATTR_NETMASK, map->netmask);
+ NLA_PUT_NET32(skb, IPSET_ATTR_REFERENCES,
+ htonl(atomic_read(&set->ref) - 1));
+ NLA_PUT_NET32(skb, IPSET_ATTR_MEMSIZE,
+ htonl(sizeof(*map) + map->memsize));
+ if (with_timeout(map->timeout))
+ NLA_PUT_NET32(skb, IPSET_ATTR_TIMEOUT, htonl(map->timeout));
+ ipset_nest_end(skb, nested);
+
+ return 0;
+nla_put_failure:
+ return -EMSGSIZE;
+}
+
+static bool
+bitmap_ip_same_set(const struct ip_set *a, const struct ip_set *b)
+{
+ const struct bitmap_ip *x = a->data;
+ const struct bitmap_ip *y = b->data;
+
+ return x->first_ip == y->first_ip &&
+ x->last_ip == y->last_ip &&
+ x->netmask == y->netmask &&
+ x->timeout == y->timeout;
+}
+
+static const struct ip_set_type_variant bitmap_ip = {
+ .kadt = bitmap_ip_kadt,
+ .uadt = bitmap_ip_uadt,
+ .adt = {
+ [IPSET_ADD] = bitmap_ip_add,
+ [IPSET_DEL] = bitmap_ip_del,
+ [IPSET_TEST] = bitmap_ip_test,
+ },
+ .destroy = bitmap_ip_destroy,
+ .flush = bitmap_ip_flush,
+ .head = bitmap_ip_head,
+ .list = bitmap_ip_list,
+ .same_set = bitmap_ip_same_set,
+};
+
+static const struct ip_set_type_variant bitmap_tip = {
+ .kadt = bitmap_ip_kadt,
+ .uadt = bitmap_ip_uadt,
+ .adt = {
+ [IPSET_ADD] = bitmap_ip_tadd,
+ [IPSET_DEL] = bitmap_ip_tdel,
+ [IPSET_TEST] = bitmap_ip_ttest,
+ },
+ .destroy = bitmap_ip_destroy,
+ .flush = bitmap_ip_flush,
+ .head = bitmap_ip_head,
+ .list = bitmap_ip_tlist,
+ .same_set = bitmap_ip_same_set,
+};
+
+static void
+bitmap_ip_gc(unsigned long ul_set)
+{
+ struct ip_set *set = (struct ip_set *) ul_set;
+ struct bitmap_ip *map = set->data;
+ unsigned long *table = map->members;
+ u32 id;
+
+ /* We run parallel with other readers (test element)
+ * but adding/deleting new entries is locked out */
+ read_lock_bh(&set->lock);
+ for (id = 0; id < map->elements; id++)
+ if (ip_set_timeout_expired(table[id]))
+ table[id] = IPSET_ELEM_UNSET;
+ read_unlock_bh(&set->lock);
+
+ map->gc.expires = jiffies + IPSET_GC_PERIOD(map->timeout) * HZ;
+ add_timer(&map->gc);
+}
+
+static void
+bitmap_ip_gc_init(struct ip_set *set)
+{
+ struct bitmap_ip *map = set->data;
+
+ init_timer(&map->gc);
+ map->gc.data = (unsigned long) set;
+ map->gc.function = bitmap_ip_gc;
+ map->gc.expires = jiffies + IPSET_GC_PERIOD(map->timeout) * HZ;
+ add_timer(&map->gc);
+}
+
+/* Create bitmap:ip type of sets */
+
+static bool
+init_map_ip(struct ip_set *set, struct bitmap_ip *map,
+ u32 first_ip, u32 last_ip,
+ u32 elements, u32 hosts, u8 netmask)
+{
+ map->members = ip_set_alloc(map->memsize);
+ if (!map->members)
+ return false;
+ map->first_ip = first_ip;
+ map->last_ip = last_ip;
+ map->elements = elements;
+ map->hosts = hosts;
+ map->netmask = netmask;
+ map->timeout = IPSET_NO_TIMEOUT;
+
+ set->data = map;
+ set->family = AF_INET;
+
+ return true;
+}
+
+static int
+bitmap_ip_create(struct ip_set *set, struct nlattr *tb[], u32 flags)
+{
+ struct bitmap_ip *map;
+ u32 first_ip, last_ip, hosts, elements;
+ u8 netmask = 32;
+ int ret;
+
+ if (unlikely(!tb[IPSET_ATTR_IP] ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT)))
+ return -IPSET_ERR_PROTOCOL;
+
+ ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &first_ip);
+ if (ret)
+ return ret;
+
+ if (tb[IPSET_ATTR_IP_TO]) {
+ ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP_TO], &last_ip);
+ if (ret)
+ return ret;
+ if (first_ip > last_ip) {
+ u32 tmp = first_ip;
+
+ first_ip = last_ip;
+ last_ip = tmp;
+ }
+ } else if (tb[IPSET_ATTR_CIDR]) {
+ u8 cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]);
+
+ if (cidr >= 32)
+ return -IPSET_ERR_INVALID_CIDR;
+ last_ip = first_ip | ~ip_set_hostmask(cidr);
+ } else
+ return -IPSET_ERR_PROTOCOL;
+
+ if (tb[IPSET_ATTR_NETMASK]) {
+ netmask = nla_get_u8(tb[IPSET_ATTR_NETMASK]);
+
+ if (netmask > 32)
+ return -IPSET_ERR_INVALID_NETMASK;
+
+ first_ip &= ip_set_hostmask(netmask);
+ last_ip |= ~ip_set_hostmask(netmask);
+ }
+
+ if (netmask == 32) {
+ hosts = 1;
+ elements = last_ip - first_ip + 1;
+ } else {
+ u8 mask_bits;
+ u32 mask;
+
+ mask = range_to_mask(first_ip, last_ip, &mask_bits);
+
+ if ((!mask && (first_ip || last_ip != 0xFFFFFFFF)) ||
+ netmask <= mask_bits)
+ return -IPSET_ERR_BITMAP_RANGE;
+
+ pr_debug("mask_bits %u, netmask %u\n", mask_bits, netmask);
+ hosts = 2 << (32 - netmask - 1);
+ elements = 2 << (netmask - mask_bits - 1);
+ }
+ if (elements > IPSET_BITMAP_MAX_RANGE + 1)
+ return -IPSET_ERR_BITMAP_RANGE_SIZE;
+
+ pr_debug("hosts %u, elements %u\n", hosts, elements);
+
+ map = kzalloc(sizeof(*map), GFP_KERNEL);
+ if (!map)
+ return -ENOMEM;
+
+ if (tb[IPSET_ATTR_TIMEOUT]) {
+ map->memsize = elements * sizeof(unsigned long);
+
+ if (!init_map_ip(set, map, first_ip, last_ip,
+ elements, hosts, netmask)) {
+ kfree(map);
+ return -ENOMEM;
+ }
+
+ map->timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]);
+ set->variant = &bitmap_tip;
+
+ bitmap_ip_gc_init(set);
+ } else {
+ map->memsize = bitmap_bytes(0, elements - 1);
+
+ if (!init_map_ip(set, map, first_ip, last_ip,
+ elements, hosts, netmask)) {
+ kfree(map);
+ return -ENOMEM;
+ }
+
+ set->variant = &bitmap_ip;
+ }
+ return 0;
+}
+
+static struct ip_set_type bitmap_ip_type __read_mostly = {
+ .name = "bitmap:ip",
+ .protocol = IPSET_PROTOCOL,
+ .features = IPSET_TYPE_IP,
+ .dimension = IPSET_DIM_ONE,
+ .family = AF_INET,
+ .revision = 0,
+ .create = bitmap_ip_create,
+ .create_policy = {
+ [IPSET_ATTR_IP] = { .type = NLA_NESTED },
+ [IPSET_ATTR_IP_TO] = { .type = NLA_NESTED },
+ [IPSET_ATTR_CIDR] = { .type = NLA_U8 },
+ [IPSET_ATTR_NETMASK] = { .type = NLA_U8 },
+ [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 },
+ },
+ .adt_policy = {
+ [IPSET_ATTR_IP] = { .type = NLA_NESTED },
+ [IPSET_ATTR_IP_TO] = { .type = NLA_NESTED },
+ [IPSET_ATTR_CIDR] = { .type = NLA_U8 },
+ [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 },
+ [IPSET_ATTR_LINENO] = { .type = NLA_U32 },
+ },
+ .me = THIS_MODULE,
+};
+
+static int __init
+bitmap_ip_init(void)
+{
+ return ip_set_type_register(&bitmap_ip_type);
+}
+
+static void __exit
+bitmap_ip_fini(void)
+{
+ ip_set_type_unregister(&bitmap_ip_type);
+}
+
+module_init(bitmap_ip_init);
+module_exit(bitmap_ip_fini);
diff --git a/net/netfilter/ipset/ip_set_bitmap_ipmac.c b/net/netfilter/ipset/ip_set_bitmap_ipmac.c
new file mode 100644
index 00000000000..5e790172def
--- /dev/null
+++ b/net/netfilter/ipset/ip_set_bitmap_ipmac.c
@@ -0,0 +1,652 @@
+/* Copyright (C) 2000-2002 Joakim Axelsson <gozem@linux.nu>
+ * Patrick Schaaf <bof@bof.de>
+ * Martin Josefsson <gandalf@wlug.westbo.se>
+ * Copyright (C) 2003-2011 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+/* Kernel module implementing an IP set type: the bitmap:ip,mac type */
+
+#include <linux/module.h>
+#include <linux/ip.h>
+#include <linux/etherdevice.h>
+#include <linux/skbuff.h>
+#include <linux/errno.h>
+#include <linux/if_ether.h>
+#include <linux/netlink.h>
+#include <linux/jiffies.h>
+#include <linux/timer.h>
+#include <net/netlink.h>
+
+#include <linux/netfilter/ipset/pfxlen.h>
+#include <linux/netfilter/ipset/ip_set.h>
+#include <linux/netfilter/ipset/ip_set_timeout.h>
+#include <linux/netfilter/ipset/ip_set_bitmap.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>");
+MODULE_DESCRIPTION("bitmap:ip,mac type of IP sets");
+MODULE_ALIAS("ip_set_bitmap:ip,mac");
+
+enum {
+ MAC_EMPTY, /* element is not set */
+ MAC_FILLED, /* element is set with MAC */
+ MAC_UNSET, /* element is set, without MAC */
+};
+
+/* Type structure */
+struct bitmap_ipmac {
+ void *members; /* the set members */
+ u32 first_ip; /* host byte order, included in range */
+ u32 last_ip; /* host byte order, included in range */
+ u32 timeout; /* timeout value */
+ struct timer_list gc; /* garbage collector */
+ size_t dsize; /* size of element */
+};
+
+/* ADT structure for generic function args */
+struct ipmac {
+ u32 id; /* id in array */
+ unsigned char *ether; /* ethernet address */
+};
+
+/* Member element without and with timeout */
+
+struct ipmac_elem {
+ unsigned char ether[ETH_ALEN];
+ unsigned char match;
+} __attribute__ ((aligned));
+
+struct ipmac_telem {
+ unsigned char ether[ETH_ALEN];
+ unsigned char match;
+ unsigned long timeout;
+} __attribute__ ((aligned));
+
+static inline void *
+bitmap_ipmac_elem(const struct bitmap_ipmac *map, u32 id)
+{
+ return (void *)((char *)map->members + id * map->dsize);
+}
+
+static inline bool
+bitmap_timeout(const struct bitmap_ipmac *map, u32 id)
+{
+ const struct ipmac_telem *elem = bitmap_ipmac_elem(map, id);
+
+ return ip_set_timeout_test(elem->timeout);
+}
+
+static inline bool
+bitmap_expired(const struct bitmap_ipmac *map, u32 id)
+{
+ const struct ipmac_telem *elem = bitmap_ipmac_elem(map, id);
+
+ return ip_set_timeout_expired(elem->timeout);
+}
+
+static inline int
+bitmap_ipmac_exist(const struct ipmac_telem *elem)
+{
+ return elem->match == MAC_UNSET ||
+ (elem->match == MAC_FILLED &&
+ !ip_set_timeout_expired(elem->timeout));
+}
+
+/* Base variant */
+
+static int
+bitmap_ipmac_test(struct ip_set *set, void *value, u32 timeout)
+{
+ const struct bitmap_ipmac *map = set->data;
+ const struct ipmac *data = value;
+ const struct ipmac_elem *elem = bitmap_ipmac_elem(map, data->id);
+
+ switch (elem->match) {
+ case MAC_UNSET:
+ /* Trigger kernel to fill out the ethernet address */
+ return -EAGAIN;
+ case MAC_FILLED:
+ return data->ether == NULL ||
+ compare_ether_addr(data->ether, elem->ether) == 0;
+ }
+ return 0;
+}
+
+static int
+bitmap_ipmac_add(struct ip_set *set, void *value, u32 timeout)
+{
+ struct bitmap_ipmac *map = set->data;
+ const struct ipmac *data = value;
+ struct ipmac_elem *elem = bitmap_ipmac_elem(map, data->id);
+
+ switch (elem->match) {
+ case MAC_UNSET:
+ if (!data->ether)
+ /* Already added without ethernet address */
+ return -IPSET_ERR_EXIST;
+ /* Fill the MAC address */
+ memcpy(elem->ether, data->ether, ETH_ALEN);
+ elem->match = MAC_FILLED;
+ break;
+ case MAC_FILLED:
+ return -IPSET_ERR_EXIST;
+ case MAC_EMPTY:
+ if (data->ether) {
+ memcpy(elem->ether, data->ether, ETH_ALEN);
+ elem->match = MAC_FILLED;
+ } else
+ elem->match = MAC_UNSET;
+ }
+
+ return 0;
+}
+
+static int
+bitmap_ipmac_del(struct ip_set *set, void *value, u32 timeout)
+{
+ struct bitmap_ipmac *map = set->data;
+ const struct ipmac *data = value;
+ struct ipmac_elem *elem = bitmap_ipmac_elem(map, data->id);
+
+ if (elem->match == MAC_EMPTY)
+ return -IPSET_ERR_EXIST;
+
+ elem->match = MAC_EMPTY;
+
+ return 0;
+}
+
+static int
+bitmap_ipmac_list(const struct ip_set *set,
+ struct sk_buff *skb, struct netlink_callback *cb)
+{
+ const struct bitmap_ipmac *map = set->data;
+ const struct ipmac_elem *elem;
+ struct nlattr *atd, *nested;
+ u32 id, first = cb->args[2];
+ u32 last = map->last_ip - map->first_ip;
+
+ atd = ipset_nest_start(skb, IPSET_ATTR_ADT);
+ if (!atd)
+ return -EMSGSIZE;
+ for (; cb->args[2] <= last; cb->args[2]++) {
+ id = cb->args[2];
+ elem = bitmap_ipmac_elem(map, id);
+ if (elem->match == MAC_EMPTY)
+ continue;
+ nested = ipset_nest_start(skb, IPSET_ATTR_DATA);
+ if (!nested) {
+ if (id == first) {
+ nla_nest_cancel(skb, atd);
+ return -EMSGSIZE;
+ } else
+ goto nla_put_failure;
+ }
+ NLA_PUT_IPADDR4(skb, IPSET_ATTR_IP,
+ htonl(map->first_ip + id));
+ if (elem->match == MAC_FILLED)
+ NLA_PUT(skb, IPSET_ATTR_ETHER, ETH_ALEN,
+ elem->ether);
+ ipset_nest_end(skb, nested);
+ }
+ ipset_nest_end(skb, atd);
+ /* Set listing finished */
+ cb->args[2] = 0;
+
+ return 0;
+
+nla_put_failure:
+ nla_nest_cancel(skb, nested);
+ ipset_nest_end(skb, atd);
+ if (unlikely(id == first)) {
+ cb->args[2] = 0;
+ return -EMSGSIZE;
+ }
+ return 0;
+}
+
+/* Timeout variant */
+
+static int
+bitmap_ipmac_ttest(struct ip_set *set, void *value, u32 timeout)
+{
+ const struct bitmap_ipmac *map = set->data;
+ const struct ipmac *data = value;
+ const struct ipmac_elem *elem = bitmap_ipmac_elem(map, data->id);
+
+ switch (elem->match) {
+ case MAC_UNSET:
+ /* Trigger kernel to fill out the ethernet address */
+ return -EAGAIN;
+ case MAC_FILLED:
+ return (data->ether == NULL ||
+ compare_ether_addr(data->ether, elem->ether) == 0) &&
+ !bitmap_expired(map, data->id);
+ }
+ return 0;
+}
+
+static int
+bitmap_ipmac_tadd(struct ip_set *set, void *value, u32 timeout)
+{
+ struct bitmap_ipmac *map = set->data;
+ const struct ipmac *data = value;
+ struct ipmac_telem *elem = bitmap_ipmac_elem(map, data->id);
+
+ switch (elem->match) {
+ case MAC_UNSET:
+ if (!data->ether)
+ /* Already added without ethernet address */
+ return -IPSET_ERR_EXIST;
+ /* Fill the MAC address and activate the timer */
+ memcpy(elem->ether, data->ether, ETH_ALEN);
+ elem->match = MAC_FILLED;
+ if (timeout == map->timeout)
+ /* Timeout was not specified, get stored one */
+ timeout = elem->timeout;
+ elem->timeout = ip_set_timeout_set(timeout);
+ break;
+ case MAC_FILLED:
+ if (!bitmap_expired(map, data->id))
+ return -IPSET_ERR_EXIST;
+ /* Fall through */
+ case MAC_EMPTY:
+ if (data->ether) {
+ memcpy(elem->ether, data->ether, ETH_ALEN);
+ elem->match = MAC_FILLED;
+ } else
+ elem->match = MAC_UNSET;
+ /* If MAC is unset yet, we store plain timeout value
+ * because the timer is not activated yet
+ * and we can reuse it later when MAC is filled out,
+ * possibly by the kernel */
+ elem->timeout = data->ether ? ip_set_timeout_set(timeout)
+ : timeout;
+ break;
+ }
+
+ return 0;
+}
+
+static int
+bitmap_ipmac_tdel(struct ip_set *set, void *value, u32 timeout)
+{
+ struct bitmap_ipmac *map = set->data;
+ const struct ipmac *data = value;
+ struct ipmac_telem *elem = bitmap_ipmac_elem(map, data->id);
+
+ if (elem->match == MAC_EMPTY || bitmap_expired(map, data->id))
+ return -IPSET_ERR_EXIST;
+
+ elem->match = MAC_EMPTY;
+
+ return 0;
+}
+
+static int
+bitmap_ipmac_tlist(const struct ip_set *set,
+ struct sk_buff *skb, struct netlink_callback *cb)
+{
+ const struct bitmap_ipmac *map = set->data;
+ const struct ipmac_telem *elem;
+ struct nlattr *atd, *nested;
+ u32 id, first = cb->args[2];
+ u32 timeout, last = map->last_ip - map->first_ip;
+
+ atd = ipset_nest_start(skb, IPSET_ATTR_ADT);
+ if (!atd)
+ return -EMSGSIZE;
+ for (; cb->args[2] <= last; cb->args[2]++) {
+ id = cb->args[2];
+ elem = bitmap_ipmac_elem(map, id);
+ if (!bitmap_ipmac_exist(elem))
+ continue;
+ nested = ipset_nest_start(skb, IPSET_ATTR_DATA);
+ if (!nested) {
+ if (id == first) {
+ nla_nest_cancel(skb, atd);
+ return -EMSGSIZE;
+ } else
+ goto nla_put_failure;
+ }
+ NLA_PUT_IPADDR4(skb, IPSET_ATTR_IP,
+ htonl(map->first_ip + id));
+ if (elem->match == MAC_FILLED)
+ NLA_PUT(skb, IPSET_ATTR_ETHER, ETH_ALEN,
+ elem->ether);
+ timeout = elem->match == MAC_UNSET ? elem->timeout
+ : ip_set_timeout_get(elem->timeout);
+ NLA_PUT_NET32(skb, IPSET_ATTR_TIMEOUT, htonl(timeout));
+ ipset_nest_end(skb, nested);
+ }
+ ipset_nest_end(skb, atd);
+ /* Set listing finished */
+ cb->args[2] = 0;
+
+ return 0;
+
+nla_put_failure:
+ nla_nest_cancel(skb, nested);
+ ipset_nest_end(skb, atd);
+ return -EMSGSIZE;
+}
+
+static int
+bitmap_ipmac_kadt(struct ip_set *set, const struct sk_buff *skb,
+ enum ipset_adt adt, u8 pf, u8 dim, u8 flags)
+{
+ struct bitmap_ipmac *map = set->data;
+ ipset_adtfn adtfn = set->variant->adt[adt];
+ struct ipmac data;
+
+ data.id = ntohl(ip4addr(skb, flags & IPSET_DIM_ONE_SRC));
+ if (data.id < map->first_ip || data.id > map->last_ip)
+ return -IPSET_ERR_BITMAP_RANGE;
+
+ /* Backward compatibility: we don't check the second flag */
+ if (skb_mac_header(skb) < skb->head ||
+ (skb_mac_header(skb) + ETH_HLEN) > skb->data)
+ return -EINVAL;
+
+ data.id -= map->first_ip;
+ data.ether = eth_hdr(skb)->h_source;
+
+ return adtfn(set, &data, map->timeout);
+}
+
+static int
+bitmap_ipmac_uadt(struct ip_set *set, struct nlattr *tb[],
+ enum ipset_adt adt, u32 *lineno, u32 flags)
+{
+ const struct bitmap_ipmac *map = set->data;
+ ipset_adtfn adtfn = set->variant->adt[adt];
+ struct ipmac data;
+ u32 timeout = map->timeout;
+ int ret = 0;
+
+ if (unlikely(!tb[IPSET_ATTR_IP] ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT)))
+ return -IPSET_ERR_PROTOCOL;
+
+ if (tb[IPSET_ATTR_LINENO])
+ *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]);
+
+ ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &data.id);
+ if (ret)
+ return ret;
+
+ if (data.id < map->first_ip || data.id > map->last_ip)
+ return -IPSET_ERR_BITMAP_RANGE;
+
+ if (tb[IPSET_ATTR_ETHER])
+ data.ether = nla_data(tb[IPSET_ATTR_ETHER]);
+ else
+ data.ether = NULL;
+
+ if (tb[IPSET_ATTR_TIMEOUT]) {
+ if (!with_timeout(map->timeout))
+ return -IPSET_ERR_TIMEOUT;
+ timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]);
+ }
+
+ data.id -= map->first_ip;
+
+ ret = adtfn(set, &data, timeout);
+
+ return ip_set_eexist(ret, flags) ? 0 : ret;
+}
+
+static void
+bitmap_ipmac_destroy(struct ip_set *set)
+{
+ struct bitmap_ipmac *map = set->data;
+
+ if (with_timeout(map->timeout))
+ del_timer_sync(&map->gc);
+
+ ip_set_free(map->members);
+ kfree(map);
+
+ set->data = NULL;
+}
+
+static void
+bitmap_ipmac_flush(struct ip_set *set)
+{
+ struct bitmap_ipmac *map = set->data;
+
+ memset(map->members, 0,
+ (map->last_ip - map->first_ip + 1) * map->dsize);
+}
+
+static int
+bitmap_ipmac_head(struct ip_set *set, struct sk_buff *skb)
+{
+ const struct bitmap_ipmac *map = set->data;
+ struct nlattr *nested;
+
+ nested = ipset_nest_start(skb, IPSET_ATTR_DATA);
+ if (!nested)
+ goto nla_put_failure;
+ NLA_PUT_IPADDR4(skb, IPSET_ATTR_IP, htonl(map->first_ip));
+ NLA_PUT_IPADDR4(skb, IPSET_ATTR_IP_TO, htonl(map->last_ip));
+ NLA_PUT_NET32(skb, IPSET_ATTR_REFERENCES,
+ htonl(atomic_read(&set->ref) - 1));
+ NLA_PUT_NET32(skb, IPSET_ATTR_MEMSIZE,
+ htonl(sizeof(*map)
+ + (map->last_ip - map->first_ip + 1) * map->dsize));
+ if (with_timeout(map->timeout))
+ NLA_PUT_NET32(skb, IPSET_ATTR_TIMEOUT, htonl(map->timeout));
+ ipset_nest_end(skb, nested);
+
+ return 0;
+nla_put_failure:
+ return -EMSGSIZE;
+}
+
+static bool
+bitmap_ipmac_same_set(const struct ip_set *a, const struct ip_set *b)
+{
+ const struct bitmap_ipmac *x = a->data;
+ const struct bitmap_ipmac *y = b->data;
+
+ return x->first_ip == y->first_ip &&
+ x->last_ip == y->last_ip &&
+ x->timeout == y->timeout;
+}
+
+static const struct ip_set_type_variant bitmap_ipmac = {
+ .kadt = bitmap_ipmac_kadt,
+ .uadt = bitmap_ipmac_uadt,
+ .adt = {
+ [IPSET_ADD] = bitmap_ipmac_add,
+ [IPSET_DEL] = bitmap_ipmac_del,
+ [IPSET_TEST] = bitmap_ipmac_test,
+ },
+ .destroy = bitmap_ipmac_destroy,
+ .flush = bitmap_ipmac_flush,
+ .head = bitmap_ipmac_head,
+ .list = bitmap_ipmac_list,
+ .same_set = bitmap_ipmac_same_set,
+};
+
+static const struct ip_set_type_variant bitmap_tipmac = {
+ .kadt = bitmap_ipmac_kadt,
+ .uadt = bitmap_ipmac_uadt,
+ .adt = {
+ [IPSET_ADD] = bitmap_ipmac_tadd,
+ [IPSET_DEL] = bitmap_ipmac_tdel,
+ [IPSET_TEST] = bitmap_ipmac_ttest,
+ },
+ .destroy = bitmap_ipmac_destroy,
+ .flush = bitmap_ipmac_flush,
+ .head = bitmap_ipmac_head,
+ .list = bitmap_ipmac_tlist,
+ .same_set = bitmap_ipmac_same_set,
+};
+
+static void
+bitmap_ipmac_gc(unsigned long ul_set)
+{
+ struct ip_set *set = (struct ip_set *) ul_set;
+ struct bitmap_ipmac *map = set->data;
+ struct ipmac_telem *elem;
+ u32 id, last = map->last_ip - map->first_ip;
+
+ /* We run parallel with other readers (test element)
+ * but adding/deleting new entries is locked out */
+ read_lock_bh(&set->lock);
+ for (id = 0; id <= last; id++) {
+ elem = bitmap_ipmac_elem(map, id);
+ if (elem->match == MAC_FILLED &&
+ ip_set_timeout_expired(elem->timeout))
+ elem->match = MAC_EMPTY;
+ }
+ read_unlock_bh(&set->lock);
+
+ map->gc.expires = jiffies + IPSET_GC_PERIOD(map->timeout) * HZ;
+ add_timer(&map->gc);
+}
+
+static void
+bitmap_ipmac_gc_init(struct ip_set *set)
+{
+ struct bitmap_ipmac *map = set->data;
+
+ init_timer(&map->gc);
+ map->gc.data = (unsigned long) set;
+ map->gc.function = bitmap_ipmac_gc;
+ map->gc.expires = jiffies + IPSET_GC_PERIOD(map->timeout) * HZ;
+ add_timer(&map->gc);
+}
+
+/* Create bitmap:ip,mac type of sets */
+
+static bool
+init_map_ipmac(struct ip_set *set, struct bitmap_ipmac *map,
+ u32 first_ip, u32 last_ip)
+{
+ map->members = ip_set_alloc((last_ip - first_ip + 1) * map->dsize);
+ if (!map->members)
+ return false;
+ map->first_ip = first_ip;
+ map->last_ip = last_ip;
+ map->timeout = IPSET_NO_TIMEOUT;
+
+ set->data = map;
+ set->family = AF_INET;
+
+ return true;
+}
+
+static int
+bitmap_ipmac_create(struct ip_set *set, struct nlattr *tb[],
+ u32 flags)
+{
+ u32 first_ip, last_ip, elements;
+ struct bitmap_ipmac *map;
+ int ret;
+
+ if (unlikely(!tb[IPSET_ATTR_IP] ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT)))
+ return -IPSET_ERR_PROTOCOL;
+
+ ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &first_ip);
+ if (ret)
+ return ret;
+
+ if (tb[IPSET_ATTR_IP_TO]) {
+ ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP_TO], &last_ip);
+ if (ret)
+ return ret;
+ if (first_ip > last_ip) {
+ u32 tmp = first_ip;
+
+ first_ip = last_ip;
+ last_ip = tmp;
+ }
+ } else if (tb[IPSET_ATTR_CIDR]) {
+ u8 cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]);
+
+ if (cidr >= 32)
+ return -IPSET_ERR_INVALID_CIDR;
+ last_ip = first_ip | ~ip_set_hostmask(cidr);
+ } else
+ return -IPSET_ERR_PROTOCOL;
+
+ elements = last_ip - first_ip + 1;
+
+ if (elements > IPSET_BITMAP_MAX_RANGE + 1)
+ return -IPSET_ERR_BITMAP_RANGE_SIZE;
+
+ map = kzalloc(sizeof(*map), GFP_KERNEL);
+ if (!map)
+ return -ENOMEM;
+
+ if (tb[IPSET_ATTR_TIMEOUT]) {
+ map->dsize = sizeof(struct ipmac_telem);
+
+ if (!init_map_ipmac(set, map, first_ip, last_ip)) {
+ kfree(map);
+ return -ENOMEM;
+ }
+
+ map->timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]);
+
+ set->variant = &bitmap_tipmac;
+
+ bitmap_ipmac_gc_init(set);
+ } else {
+ map->dsize = sizeof(struct ipmac_elem);
+
+ if (!init_map_ipmac(set, map, first_ip, last_ip)) {
+ kfree(map);
+ return -ENOMEM;
+ }
+ set->variant = &bitmap_ipmac;
+
+ }
+ return 0;
+}
+
+static struct ip_set_type bitmap_ipmac_type = {
+ .name = "bitmap:ip,mac",
+ .protocol = IPSET_PROTOCOL,
+ .features = IPSET_TYPE_IP | IPSET_TYPE_MAC,
+ .dimension = IPSET_DIM_TWO,
+ .family = AF_INET,
+ .revision = 0,
+ .create = bitmap_ipmac_create,
+ .create_policy = {
+ [IPSET_ATTR_IP] = { .type = NLA_NESTED },
+ [IPSET_ATTR_IP_TO] = { .type = NLA_NESTED },
+ [IPSET_ATTR_CIDR] = { .type = NLA_U8 },
+ [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 },
+ },
+ .adt_policy = {
+ [IPSET_ATTR_IP] = { .type = NLA_NESTED },
+ [IPSET_ATTR_ETHER] = { .type = NLA_BINARY, .len = ETH_ALEN },
+ [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 },
+ [IPSET_ATTR_LINENO] = { .type = NLA_U32 },
+ },
+ .me = THIS_MODULE,
+};
+
+static int __init
+bitmap_ipmac_init(void)
+{
+ return ip_set_type_register(&bitmap_ipmac_type);
+}
+
+static void __exit
+bitmap_ipmac_fini(void)
+{
+ ip_set_type_unregister(&bitmap_ipmac_type);
+}
+
+module_init(bitmap_ipmac_init);
+module_exit(bitmap_ipmac_fini);
diff --git a/net/netfilter/ipset/ip_set_bitmap_port.c b/net/netfilter/ipset/ip_set_bitmap_port.c
new file mode 100644
index 00000000000..165f09b1a9c
--- /dev/null
+++ b/net/netfilter/ipset/ip_set_bitmap_port.c
@@ -0,0 +1,515 @@
+/* Copyright (C) 2003-2011 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+/* Kernel module implementing an IP set type: the bitmap:port type */
+
+#include <linux/module.h>
+#include <linux/ip.h>
+#include <linux/skbuff.h>
+#include <linux/errno.h>
+#include <linux/netlink.h>
+#include <linux/jiffies.h>
+#include <linux/timer.h>
+#include <net/netlink.h>
+
+#include <linux/netfilter/ipset/ip_set.h>
+#include <linux/netfilter/ipset/ip_set_bitmap.h>
+#include <linux/netfilter/ipset/ip_set_getport.h>
+#define IP_SET_BITMAP_TIMEOUT
+#include <linux/netfilter/ipset/ip_set_timeout.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>");
+MODULE_DESCRIPTION("bitmap:port type of IP sets");
+MODULE_ALIAS("ip_set_bitmap:port");
+
+/* Type structure */
+struct bitmap_port {
+ void *members; /* the set members */
+ u16 first_port; /* host byte order, included in range */
+ u16 last_port; /* host byte order, included in range */
+ size_t memsize; /* members size */
+ u32 timeout; /* timeout parameter */
+ struct timer_list gc; /* garbage collection */
+};
+
+/* Base variant */
+
+static int
+bitmap_port_test(struct ip_set *set, void *value, u32 timeout)
+{
+ const struct bitmap_port *map = set->data;
+ u16 id = *(u16 *)value;
+
+ return !!test_bit(id, map->members);
+}
+
+static int
+bitmap_port_add(struct ip_set *set, void *value, u32 timeout)
+{
+ struct bitmap_port *map = set->data;
+ u16 id = *(u16 *)value;
+
+ if (test_and_set_bit(id, map->members))
+ return -IPSET_ERR_EXIST;
+
+ return 0;
+}
+
+static int
+bitmap_port_del(struct ip_set *set, void *value, u32 timeout)
+{
+ struct bitmap_port *map = set->data;
+ u16 id = *(u16 *)value;
+
+ if (!test_and_clear_bit(id, map->members))
+ return -IPSET_ERR_EXIST;
+
+ return 0;
+}
+
+static int
+bitmap_port_list(const struct ip_set *set,
+ struct sk_buff *skb, struct netlink_callback *cb)
+{
+ const struct bitmap_port *map = set->data;
+ struct nlattr *atd, *nested;
+ u16 id, first = cb->args[2];
+ u16 last = map->last_port - map->first_port;
+
+ atd = ipset_nest_start(skb, IPSET_ATTR_ADT);
+ if (!atd)
+ return -EMSGSIZE;
+ for (; cb->args[2] <= last; cb->args[2]++) {
+ id = cb->args[2];
+ if (!test_bit(id, map->members))
+ continue;
+ nested = ipset_nest_start(skb, IPSET_ATTR_DATA);
+ if (!nested) {
+ if (id == first) {
+ nla_nest_cancel(skb, atd);
+ return -EMSGSIZE;
+ } else
+ goto nla_put_failure;
+ }
+ NLA_PUT_NET16(skb, IPSET_ATTR_PORT,
+ htons(map->first_port + id));
+ ipset_nest_end(skb, nested);
+ }
+ ipset_nest_end(skb, atd);
+ /* Set listing finished */
+ cb->args[2] = 0;
+
+ return 0;
+
+nla_put_failure:
+ nla_nest_cancel(skb, nested);
+ ipset_nest_end(skb, atd);
+ if (unlikely(id == first)) {
+ cb->args[2] = 0;
+ return -EMSGSIZE;
+ }
+ return 0;
+}
+
+/* Timeout variant */
+
+static int
+bitmap_port_ttest(struct ip_set *set, void *value, u32 timeout)
+{
+ const struct bitmap_port *map = set->data;
+ const unsigned long *members = map->members;
+ u16 id = *(u16 *)value;
+
+ return ip_set_timeout_test(members[id]);
+}
+
+static int
+bitmap_port_tadd(struct ip_set *set, void *value, u32 timeout)
+{
+ struct bitmap_port *map = set->data;
+ unsigned long *members = map->members;
+ u16 id = *(u16 *)value;
+
+ if (ip_set_timeout_test(members[id]))
+ return -IPSET_ERR_EXIST;
+
+ members[id] = ip_set_timeout_set(timeout);
+
+ return 0;
+}
+
+static int
+bitmap_port_tdel(struct ip_set *set, void *value, u32 timeout)
+{
+ struct bitmap_port *map = set->data;
+ unsigned long *members = map->members;
+ u16 id = *(u16 *)value;
+ int ret = -IPSET_ERR_EXIST;
+
+ if (ip_set_timeout_test(members[id]))
+ ret = 0;
+
+ members[id] = IPSET_ELEM_UNSET;
+ return ret;
+}
+
+static int
+bitmap_port_tlist(const struct ip_set *set,
+ struct sk_buff *skb, struct netlink_callback *cb)
+{
+ const struct bitmap_port *map = set->data;
+ struct nlattr *adt, *nested;
+ u16 id, first = cb->args[2];
+ u16 last = map->last_port - map->first_port;
+ const unsigned long *members = map->members;
+
+ adt = ipset_nest_start(skb, IPSET_ATTR_ADT);
+ if (!adt)
+ return -EMSGSIZE;
+ for (; cb->args[2] <= last; cb->args[2]++) {
+ id = cb->args[2];
+ if (!ip_set_timeout_test(members[id]))
+ continue;
+ nested = ipset_nest_start(skb, IPSET_ATTR_DATA);
+ if (!nested) {
+ if (id == first) {
+ nla_nest_cancel(skb, adt);
+ return -EMSGSIZE;
+ } else
+ goto nla_put_failure;
+ }
+ NLA_PUT_NET16(skb, IPSET_ATTR_PORT,
+ htons(map->first_port + id));
+ NLA_PUT_NET32(skb, IPSET_ATTR_TIMEOUT,
+ htonl(ip_set_timeout_get(members[id])));
+ ipset_nest_end(skb, nested);
+ }
+ ipset_nest_end(skb, adt);
+
+ /* Set listing finished */
+ cb->args[2] = 0;
+
+ return 0;
+
+nla_put_failure:
+ nla_nest_cancel(skb, nested);
+ ipset_nest_end(skb, adt);
+ if (unlikely(id == first)) {
+ cb->args[2] = 0;
+ return -EMSGSIZE;
+ }
+ return 0;
+}
+
+static int
+bitmap_port_kadt(struct ip_set *set, const struct sk_buff *skb,
+ enum ipset_adt adt, u8 pf, u8 dim, u8 flags)
+{
+ struct bitmap_port *map = set->data;
+ ipset_adtfn adtfn = set->variant->adt[adt];
+ __be16 __port;
+ u16 port = 0;
+
+ if (!ip_set_get_ip_port(skb, pf, flags & IPSET_DIM_ONE_SRC, &__port))
+ return -EINVAL;
+
+ port = ntohs(__port);
+
+ if (port < map->first_port || port > map->last_port)
+ return -IPSET_ERR_BITMAP_RANGE;
+
+ port -= map->first_port;
+
+ return adtfn(set, &port, map->timeout);
+}
+
+static int
+bitmap_port_uadt(struct ip_set *set, struct nlattr *tb[],
+ enum ipset_adt adt, u32 *lineno, u32 flags)
+{
+ struct bitmap_port *map = set->data;
+ ipset_adtfn adtfn = set->variant->adt[adt];
+ u32 timeout = map->timeout;
+ u32 port; /* wraparound */
+ u16 id, port_to;
+ int ret = 0;
+
+ if (unlikely(!ip_set_attr_netorder(tb, IPSET_ATTR_PORT) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT)))
+ return -IPSET_ERR_PROTOCOL;
+
+ if (tb[IPSET_ATTR_LINENO])
+ *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]);
+
+ port = ip_set_get_h16(tb[IPSET_ATTR_PORT]);
+ if (port < map->first_port || port > map->last_port)
+ return -IPSET_ERR_BITMAP_RANGE;
+
+ if (tb[IPSET_ATTR_TIMEOUT]) {
+ if (!with_timeout(map->timeout))
+ return -IPSET_ERR_TIMEOUT;
+ timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]);
+ }
+
+ if (adt == IPSET_TEST) {
+ id = port - map->first_port;
+ return adtfn(set, &id, timeout);
+ }
+
+ if (tb[IPSET_ATTR_PORT_TO]) {
+ port_to = ip_set_get_h16(tb[IPSET_ATTR_PORT_TO]);
+ if (port > port_to) {
+ swap(port, port_to);
+ if (port < map->first_port)
+ return -IPSET_ERR_BITMAP_RANGE;
+ }
+ } else
+ port_to = port;
+
+ if (port_to > map->last_port)
+ return -IPSET_ERR_BITMAP_RANGE;
+
+ for (; port <= port_to; port++) {
+ id = port - map->first_port;
+ ret = adtfn(set, &id, timeout);
+
+ if (ret && !ip_set_eexist(ret, flags))
+ return ret;
+ else
+ ret = 0;
+ }
+ return ret;
+}
+
+static void
+bitmap_port_destroy(struct ip_set *set)
+{
+ struct bitmap_port *map = set->data;
+
+ if (with_timeout(map->timeout))
+ del_timer_sync(&map->gc);
+
+ ip_set_free(map->members);
+ kfree(map);
+
+ set->data = NULL;
+}
+
+static void
+bitmap_port_flush(struct ip_set *set)
+{
+ struct bitmap_port *map = set->data;
+
+ memset(map->members, 0, map->memsize);
+}
+
+static int
+bitmap_port_head(struct ip_set *set, struct sk_buff *skb)
+{
+ const struct bitmap_port *map = set->data;
+ struct nlattr *nested;
+
+ nested = ipset_nest_start(skb, IPSET_ATTR_DATA);
+ if (!nested)
+ goto nla_put_failure;
+ NLA_PUT_NET16(skb, IPSET_ATTR_PORT, htons(map->first_port));
+ NLA_PUT_NET16(skb, IPSET_ATTR_PORT_TO, htons(map->last_port));
+ NLA_PUT_NET32(skb, IPSET_ATTR_REFERENCES,
+ htonl(atomic_read(&set->ref) - 1));
+ NLA_PUT_NET32(skb, IPSET_ATTR_MEMSIZE,
+ htonl(sizeof(*map) + map->memsize));
+ if (with_timeout(map->timeout))
+ NLA_PUT_NET32(skb, IPSET_ATTR_TIMEOUT, htonl(map->timeout));
+ ipset_nest_end(skb, nested);
+
+ return 0;
+nla_put_failure:
+ return -EMSGSIZE;
+}
+
+static bool
+bitmap_port_same_set(const struct ip_set *a, const struct ip_set *b)
+{
+ const struct bitmap_port *x = a->data;
+ const struct bitmap_port *y = b->data;
+
+ return x->first_port == y->first_port &&
+ x->last_port == y->last_port &&
+ x->timeout == y->timeout;
+}
+
+static const struct ip_set_type_variant bitmap_port = {
+ .kadt = bitmap_port_kadt,
+ .uadt = bitmap_port_uadt,
+ .adt = {
+ [IPSET_ADD] = bitmap_port_add,
+ [IPSET_DEL] = bitmap_port_del,
+ [IPSET_TEST] = bitmap_port_test,
+ },
+ .destroy = bitmap_port_destroy,
+ .flush = bitmap_port_flush,
+ .head = bitmap_port_head,
+ .list = bitmap_port_list,
+ .same_set = bitmap_port_same_set,
+};
+
+static const struct ip_set_type_variant bitmap_tport = {
+ .kadt = bitmap_port_kadt,
+ .uadt = bitmap_port_uadt,
+ .adt = {
+ [IPSET_ADD] = bitmap_port_tadd,
+ [IPSET_DEL] = bitmap_port_tdel,
+ [IPSET_TEST] = bitmap_port_ttest,
+ },
+ .destroy = bitmap_port_destroy,
+ .flush = bitmap_port_flush,
+ .head = bitmap_port_head,
+ .list = bitmap_port_tlist,
+ .same_set = bitmap_port_same_set,
+};
+
+static void
+bitmap_port_gc(unsigned long ul_set)
+{
+ struct ip_set *set = (struct ip_set *) ul_set;
+ struct bitmap_port *map = set->data;
+ unsigned long *table = map->members;
+ u32 id; /* wraparound */
+ u16 last = map->last_port - map->first_port;
+
+ /* We run parallel with other readers (test element)
+ * but adding/deleting new entries is locked out */
+ read_lock_bh(&set->lock);
+ for (id = 0; id <= last; id++)
+ if (ip_set_timeout_expired(table[id]))
+ table[id] = IPSET_ELEM_UNSET;
+ read_unlock_bh(&set->lock);
+
+ map->gc.expires = jiffies + IPSET_GC_PERIOD(map->timeout) * HZ;
+ add_timer(&map->gc);
+}
+
+static void
+bitmap_port_gc_init(struct ip_set *set)
+{
+ struct bitmap_port *map = set->data;
+
+ init_timer(&map->gc);
+ map->gc.data = (unsigned long) set;
+ map->gc.function = bitmap_port_gc;
+ map->gc.expires = jiffies + IPSET_GC_PERIOD(map->timeout) * HZ;
+ add_timer(&map->gc);
+}
+
+/* Create bitmap:ip type of sets */
+
+static bool
+init_map_port(struct ip_set *set, struct bitmap_port *map,
+ u16 first_port, u16 last_port)
+{
+ map->members = ip_set_alloc(map->memsize);
+ if (!map->members)
+ return false;
+ map->first_port = first_port;
+ map->last_port = last_port;
+ map->timeout = IPSET_NO_TIMEOUT;
+
+ set->data = map;
+ set->family = AF_UNSPEC;
+
+ return true;
+}
+
+static int
+bitmap_port_create(struct ip_set *set, struct nlattr *tb[],
+ u32 flags)
+{
+ struct bitmap_port *map;
+ u16 first_port, last_port;
+
+ if (unlikely(!ip_set_attr_netorder(tb, IPSET_ATTR_PORT) ||
+ !ip_set_attr_netorder(tb, IPSET_ATTR_PORT_TO) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT)))
+ return -IPSET_ERR_PROTOCOL;
+
+ first_port = ip_set_get_h16(tb[IPSET_ATTR_PORT]);
+ last_port = ip_set_get_h16(tb[IPSET_ATTR_PORT_TO]);
+ if (first_port > last_port) {
+ u16 tmp = first_port;
+
+ first_port = last_port;
+ last_port = tmp;
+ }
+
+ map = kzalloc(sizeof(*map), GFP_KERNEL);
+ if (!map)
+ return -ENOMEM;
+
+ if (tb[IPSET_ATTR_TIMEOUT]) {
+ map->memsize = (last_port - first_port + 1)
+ * sizeof(unsigned long);
+
+ if (!init_map_port(set, map, first_port, last_port)) {
+ kfree(map);
+ return -ENOMEM;
+ }
+
+ map->timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]);
+ set->variant = &bitmap_tport;
+
+ bitmap_port_gc_init(set);
+ } else {
+ map->memsize = bitmap_bytes(0, last_port - first_port);
+ pr_debug("memsize: %zu\n", map->memsize);
+ if (!init_map_port(set, map, first_port, last_port)) {
+ kfree(map);
+ return -ENOMEM;
+ }
+
+ set->variant = &bitmap_port;
+ }
+ return 0;
+}
+
+static struct ip_set_type bitmap_port_type = {
+ .name = "bitmap:port",
+ .protocol = IPSET_PROTOCOL,
+ .features = IPSET_TYPE_PORT,
+ .dimension = IPSET_DIM_ONE,
+ .family = AF_UNSPEC,
+ .revision = 0,
+ .create = bitmap_port_create,
+ .create_policy = {
+ [IPSET_ATTR_PORT] = { .type = NLA_U16 },
+ [IPSET_ATTR_PORT_TO] = { .type = NLA_U16 },
+ [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 },
+ },
+ .adt_policy = {
+ [IPSET_ATTR_PORT] = { .type = NLA_U16 },
+ [IPSET_ATTR_PORT_TO] = { .type = NLA_U16 },
+ [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 },
+ [IPSET_ATTR_LINENO] = { .type = NLA_U32 },
+ },
+ .me = THIS_MODULE,
+};
+
+static int __init
+bitmap_port_init(void)
+{
+ return ip_set_type_register(&bitmap_port_type);
+}
+
+static void __exit
+bitmap_port_fini(void)
+{
+ ip_set_type_unregister(&bitmap_port_type);
+}
+
+module_init(bitmap_port_init);
+module_exit(bitmap_port_fini);
diff --git a/net/netfilter/ipset/ip_set_core.c b/net/netfilter/ipset/ip_set_core.c
new file mode 100644
index 00000000000..8b1a54c1e40
--- /dev/null
+++ b/net/netfilter/ipset/ip_set_core.c
@@ -0,0 +1,1671 @@
+/* Copyright (C) 2000-2002 Joakim Axelsson <gozem@linux.nu>
+ * Patrick Schaaf <bof@bof.de>
+ * Copyright (C) 2003-2011 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+/* Kernel module for IP set management */
+
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/ip.h>
+#include <linux/skbuff.h>
+#include <linux/spinlock.h>
+#include <linux/netlink.h>
+#include <linux/rculist.h>
+#include <linux/version.h>
+#include <net/netlink.h>
+
+#include <linux/netfilter.h>
+#include <linux/netfilter/nfnetlink.h>
+#include <linux/netfilter/ipset/ip_set.h>
+
+static LIST_HEAD(ip_set_type_list); /* all registered set types */
+static DEFINE_MUTEX(ip_set_type_mutex); /* protects ip_set_type_list */
+
+static struct ip_set **ip_set_list; /* all individual sets */
+static ip_set_id_t ip_set_max = CONFIG_IP_SET_MAX; /* max number of sets */
+
+#define STREQ(a, b) (strncmp(a, b, IPSET_MAXNAMELEN) == 0)
+
+static unsigned int max_sets;
+
+module_param(max_sets, int, 0600);
+MODULE_PARM_DESC(max_sets, "maximal number of sets");
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>");
+MODULE_DESCRIPTION("core IP set support");
+MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_IPSET);
+
+/*
+ * The set types are implemented in modules and registered set types
+ * can be found in ip_set_type_list. Adding/deleting types is
+ * serialized by ip_set_type_mutex.
+ */
+
+static inline void
+ip_set_type_lock(void)
+{
+ mutex_lock(&ip_set_type_mutex);
+}
+
+static inline void
+ip_set_type_unlock(void)
+{
+ mutex_unlock(&ip_set_type_mutex);
+}
+
+/* Register and deregister settype */
+
+static struct ip_set_type *
+find_set_type(const char *name, u8 family, u8 revision)
+{
+ struct ip_set_type *type;
+
+ list_for_each_entry_rcu(type, &ip_set_type_list, list)
+ if (STREQ(type->name, name) &&
+ (type->family == family || type->family == AF_UNSPEC) &&
+ type->revision == revision)
+ return type;
+ return NULL;
+}
+
+/* Unlock, try to load a set type module and lock again */
+static int
+try_to_load_type(const char *name)
+{
+ nfnl_unlock();
+ pr_debug("try to load ip_set_%s\n", name);
+ if (request_module("ip_set_%s", name) < 0) {
+ pr_warning("Can't find ip_set type %s\n", name);
+ nfnl_lock();
+ return -IPSET_ERR_FIND_TYPE;
+ }
+ nfnl_lock();
+ return -EAGAIN;
+}
+
+/* Find a set type and reference it */
+static int
+find_set_type_get(const char *name, u8 family, u8 revision,
+ struct ip_set_type **found)
+{
+ rcu_read_lock();
+ *found = find_set_type(name, family, revision);
+ if (*found) {
+ int err = !try_module_get((*found)->me);
+ rcu_read_unlock();
+ return err ? -EFAULT : 0;
+ }
+ rcu_read_unlock();
+
+ return try_to_load_type(name);
+}
+
+/* Find a given set type by name and family.
+ * If we succeeded, the supported minimal and maximum revisions are
+ * filled out.
+ */
+static int
+find_set_type_minmax(const char *name, u8 family, u8 *min, u8 *max)
+{
+ struct ip_set_type *type;
+ bool found = false;
+
+ *min = *max = 0;
+ rcu_read_lock();
+ list_for_each_entry_rcu(type, &ip_set_type_list, list)
+ if (STREQ(type->name, name) &&
+ (type->family == family || type->family == AF_UNSPEC)) {
+ found = true;
+ if (type->revision < *min)
+ *min = type->revision;
+ else if (type->revision > *max)
+ *max = type->revision;
+ }
+ rcu_read_unlock();
+ if (found)
+ return 0;
+
+ return try_to_load_type(name);
+}
+
+#define family_name(f) ((f) == AF_INET ? "inet" : \
+ (f) == AF_INET6 ? "inet6" : "any")
+
+/* Register a set type structure. The type is identified by
+ * the unique triple of name, family and revision.
+ */
+int
+ip_set_type_register(struct ip_set_type *type)
+{
+ int ret = 0;
+
+ if (type->protocol != IPSET_PROTOCOL) {
+ pr_warning("ip_set type %s, family %s, revision %u uses "
+ "wrong protocol version %u (want %u)\n",
+ type->name, family_name(type->family),
+ type->revision, type->protocol, IPSET_PROTOCOL);
+ return -EINVAL;
+ }
+
+ ip_set_type_lock();
+ if (find_set_type(type->name, type->family, type->revision)) {
+ /* Duplicate! */
+ pr_warning("ip_set type %s, family %s, revision %u "
+ "already registered!\n", type->name,
+ family_name(type->family), type->revision);
+ ret = -EINVAL;
+ goto unlock;
+ }
+ list_add_rcu(&type->list, &ip_set_type_list);
+ pr_debug("type %s, family %s, revision %u registered.\n",
+ type->name, family_name(type->family), type->revision);
+unlock:
+ ip_set_type_unlock();
+ return ret;
+}
+EXPORT_SYMBOL_GPL(ip_set_type_register);
+
+/* Unregister a set type. There's a small race with ip_set_create */
+void
+ip_set_type_unregister(struct ip_set_type *type)
+{
+ ip_set_type_lock();
+ if (!find_set_type(type->name, type->family, type->revision)) {
+ pr_warning("ip_set type %s, family %s, revision %u "
+ "not registered\n", type->name,
+ family_name(type->family), type->revision);
+ goto unlock;
+ }
+ list_del_rcu(&type->list);
+ pr_debug("type %s, family %s, revision %u unregistered.\n",
+ type->name, family_name(type->family), type->revision);
+unlock:
+ ip_set_type_unlock();
+
+ synchronize_rcu();
+}
+EXPORT_SYMBOL_GPL(ip_set_type_unregister);
+
+/* Utility functions */
+void *
+ip_set_alloc(size_t size)
+{
+ void *members = NULL;
+
+ if (size < KMALLOC_MAX_SIZE)
+ members = kzalloc(size, GFP_KERNEL | __GFP_NOWARN);
+
+ if (members) {
+ pr_debug("%p: allocated with kmalloc\n", members);
+ return members;
+ }
+
+ members = vzalloc(size);
+ if (!members)
+ return NULL;
+ pr_debug("%p: allocated with vmalloc\n", members);
+
+ return members;
+}
+EXPORT_SYMBOL_GPL(ip_set_alloc);
+
+void
+ip_set_free(void *members)
+{
+ pr_debug("%p: free with %s\n", members,
+ is_vmalloc_addr(members) ? "vfree" : "kfree");
+ if (is_vmalloc_addr(members))
+ vfree(members);
+ else
+ kfree(members);
+}
+EXPORT_SYMBOL_GPL(ip_set_free);
+
+static inline bool
+flag_nested(const struct nlattr *nla)
+{
+ return nla->nla_type & NLA_F_NESTED;
+}
+
+static const struct nla_policy ipaddr_policy[IPSET_ATTR_IPADDR_MAX + 1] = {
+ [IPSET_ATTR_IPADDR_IPV4] = { .type = NLA_U32 },
+ [IPSET_ATTR_IPADDR_IPV6] = { .type = NLA_BINARY,
+ .len = sizeof(struct in6_addr) },
+};
+
+int
+ip_set_get_ipaddr4(struct nlattr *nla, __be32 *ipaddr)
+{
+ struct nlattr *tb[IPSET_ATTR_IPADDR_MAX+1];
+
+ if (unlikely(!flag_nested(nla)))
+ return -IPSET_ERR_PROTOCOL;
+ if (nla_parse_nested(tb, IPSET_ATTR_IPADDR_MAX, nla, ipaddr_policy))
+ return -IPSET_ERR_PROTOCOL;
+ if (unlikely(!ip_set_attr_netorder(tb, IPSET_ATTR_IPADDR_IPV4)))
+ return -IPSET_ERR_PROTOCOL;
+
+ *ipaddr = nla_get_be32(tb[IPSET_ATTR_IPADDR_IPV4]);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(ip_set_get_ipaddr4);
+
+int
+ip_set_get_ipaddr6(struct nlattr *nla, union nf_inet_addr *ipaddr)
+{
+ struct nlattr *tb[IPSET_ATTR_IPADDR_MAX+1];
+
+ if (unlikely(!flag_nested(nla)))
+ return -IPSET_ERR_PROTOCOL;
+
+ if (nla_parse_nested(tb, IPSET_ATTR_IPADDR_MAX, nla, ipaddr_policy))
+ return -IPSET_ERR_PROTOCOL;
+ if (unlikely(!ip_set_attr_netorder(tb, IPSET_ATTR_IPADDR_IPV6)))
+ return -IPSET_ERR_PROTOCOL;
+
+ memcpy(ipaddr, nla_data(tb[IPSET_ATTR_IPADDR_IPV6]),
+ sizeof(struct in6_addr));
+ return 0;
+}
+EXPORT_SYMBOL_GPL(ip_set_get_ipaddr6);
+
+/*
+ * Creating/destroying/renaming/swapping affect the existence and
+ * the properties of a set. All of these can be executed from userspace
+ * only and serialized by the nfnl mutex indirectly from nfnetlink.
+ *
+ * Sets are identified by their index in ip_set_list and the index
+ * is used by the external references (set/SET netfilter modules).
+ *
+ * The set behind an index may change by swapping only, from userspace.
+ */
+
+static inline void
+__ip_set_get(ip_set_id_t index)
+{
+ atomic_inc(&ip_set_list[index]->ref);
+}
+
+static inline void
+__ip_set_put(ip_set_id_t index)
+{
+ atomic_dec(&ip_set_list[index]->ref);
+}
+
+/*
+ * Add, del and test set entries from kernel.
+ *
+ * The set behind the index must exist and must be referenced
+ * so it can't be destroyed (or changed) under our foot.
+ */
+
+int
+ip_set_test(ip_set_id_t index, const struct sk_buff *skb,
+ u8 family, u8 dim, u8 flags)
+{
+ struct ip_set *set = ip_set_list[index];
+ int ret = 0;
+
+ BUG_ON(set == NULL || atomic_read(&set->ref) == 0);
+ pr_debug("set %s, index %u\n", set->name, index);
+
+ if (dim < set->type->dimension ||
+ !(family == set->family || set->family == AF_UNSPEC))
+ return 0;
+
+ read_lock_bh(&set->lock);
+ ret = set->variant->kadt(set, skb, IPSET_TEST, family, dim, flags);
+ read_unlock_bh(&set->lock);
+
+ if (ret == -EAGAIN) {
+ /* Type requests element to be completed */
+ pr_debug("element must be competed, ADD is triggered\n");
+ write_lock_bh(&set->lock);
+ set->variant->kadt(set, skb, IPSET_ADD, family, dim, flags);
+ write_unlock_bh(&set->lock);
+ ret = 1;
+ }
+
+ /* Convert error codes to nomatch */
+ return (ret < 0 ? 0 : ret);
+}
+EXPORT_SYMBOL_GPL(ip_set_test);
+
+int
+ip_set_add(ip_set_id_t index, const struct sk_buff *skb,
+ u8 family, u8 dim, u8 flags)
+{
+ struct ip_set *set = ip_set_list[index];
+ int ret;
+
+ BUG_ON(set == NULL || atomic_read(&set->ref) == 0);
+ pr_debug("set %s, index %u\n", set->name, index);
+
+ if (dim < set->type->dimension ||
+ !(family == set->family || set->family == AF_UNSPEC))
+ return 0;
+
+ write_lock_bh(&set->lock);
+ ret = set->variant->kadt(set, skb, IPSET_ADD, family, dim, flags);
+ write_unlock_bh(&set->lock);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(ip_set_add);
+
+int
+ip_set_del(ip_set_id_t index, const struct sk_buff *skb,
+ u8 family, u8 dim, u8 flags)
+{
+ struct ip_set *set = ip_set_list[index];
+ int ret = 0;
+
+ BUG_ON(set == NULL || atomic_read(&set->ref) == 0);
+ pr_debug("set %s, index %u\n", set->name, index);
+
+ if (dim < set->type->dimension ||
+ !(family == set->family || set->family == AF_UNSPEC))
+ return 0;
+
+ write_lock_bh(&set->lock);
+ ret = set->variant->kadt(set, skb, IPSET_DEL, family, dim, flags);
+ write_unlock_bh(&set->lock);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(ip_set_del);
+
+/*
+ * Find set by name, reference it once. The reference makes sure the
+ * thing pointed to, does not go away under our feet.
+ *
+ * The nfnl mutex must already be activated.
+ */
+ip_set_id_t
+ip_set_get_byname(const char *name, struct ip_set **set)
+{
+ ip_set_id_t i, index = IPSET_INVALID_ID;
+ struct ip_set *s;
+
+ for (i = 0; i < ip_set_max; i++) {
+ s = ip_set_list[i];
+ if (s != NULL && STREQ(s->name, name)) {
+ __ip_set_get(i);
+ index = i;
+ *set = s;
+ }
+ }
+
+ return index;
+}
+EXPORT_SYMBOL_GPL(ip_set_get_byname);
+
+/*
+ * If the given set pointer points to a valid set, decrement
+ * reference count by 1. The caller shall not assume the index
+ * to be valid, after calling this function.
+ *
+ * The nfnl mutex must already be activated.
+ */
+void
+ip_set_put_byindex(ip_set_id_t index)
+{
+ if (ip_set_list[index] != NULL) {
+ BUG_ON(atomic_read(&ip_set_list[index]->ref) == 0);
+ __ip_set_put(index);
+ }
+}
+EXPORT_SYMBOL_GPL(ip_set_put_byindex);
+
+/*
+ * Get the name of a set behind a set index.
+ * We assume the set is referenced, so it does exist and
+ * can't be destroyed. The set cannot be renamed due to
+ * the referencing either.
+ *
+ * The nfnl mutex must already be activated.
+ */
+const char *
+ip_set_name_byindex(ip_set_id_t index)
+{
+ const struct ip_set *set = ip_set_list[index];
+
+ BUG_ON(set == NULL);
+ BUG_ON(atomic_read(&set->ref) == 0);
+
+ /* Referenced, so it's safe */
+ return set->name;
+}
+EXPORT_SYMBOL_GPL(ip_set_name_byindex);
+
+/*
+ * Routines to call by external subsystems, which do not
+ * call nfnl_lock for us.
+ */
+
+/*
+ * Find set by name, reference it once. The reference makes sure the
+ * thing pointed to, does not go away under our feet.
+ *
+ * The nfnl mutex is used in the function.
+ */
+ip_set_id_t
+ip_set_nfnl_get(const char *name)
+{
+ struct ip_set *s;
+ ip_set_id_t index;
+
+ nfnl_lock();
+ index = ip_set_get_byname(name, &s);
+ nfnl_unlock();
+
+ return index;
+}
+EXPORT_SYMBOL_GPL(ip_set_nfnl_get);
+
+/*
+ * Find set by index, reference it once. The reference makes sure the
+ * thing pointed to, does not go away under our feet.
+ *
+ * The nfnl mutex is used in the function.
+ */
+ip_set_id_t
+ip_set_nfnl_get_byindex(ip_set_id_t index)
+{
+ if (index > ip_set_max)
+ return IPSET_INVALID_ID;
+
+ nfnl_lock();
+ if (ip_set_list[index])
+ __ip_set_get(index);
+ else
+ index = IPSET_INVALID_ID;
+ nfnl_unlock();
+
+ return index;
+}
+EXPORT_SYMBOL_GPL(ip_set_nfnl_get_byindex);
+
+/*
+ * If the given set pointer points to a valid set, decrement
+ * reference count by 1. The caller shall not assume the index
+ * to be valid, after calling this function.
+ *
+ * The nfnl mutex is used in the function.
+ */
+void
+ip_set_nfnl_put(ip_set_id_t index)
+{
+ nfnl_lock();
+ if (ip_set_list[index] != NULL) {
+ BUG_ON(atomic_read(&ip_set_list[index]->ref) == 0);
+ __ip_set_put(index);
+ }
+ nfnl_unlock();
+}
+EXPORT_SYMBOL_GPL(ip_set_nfnl_put);
+
+/*
+ * Communication protocol with userspace over netlink.
+ *
+ * We already locked by nfnl_lock.
+ */
+
+static inline bool
+protocol_failed(const struct nlattr * const tb[])
+{
+ return !tb[IPSET_ATTR_PROTOCOL] ||
+ nla_get_u8(tb[IPSET_ATTR_PROTOCOL]) != IPSET_PROTOCOL;
+}
+
+static inline u32
+flag_exist(const struct nlmsghdr *nlh)
+{
+ return nlh->nlmsg_flags & NLM_F_EXCL ? 0 : IPSET_FLAG_EXIST;
+}
+
+static struct nlmsghdr *
+start_msg(struct sk_buff *skb, u32 pid, u32 seq, unsigned int flags,
+ enum ipset_cmd cmd)
+{
+ struct nlmsghdr *nlh;
+ struct nfgenmsg *nfmsg;
+
+ nlh = nlmsg_put(skb, pid, seq, cmd | (NFNL_SUBSYS_IPSET << 8),
+ sizeof(*nfmsg), flags);
+ if (nlh == NULL)
+ return NULL;
+
+ nfmsg = nlmsg_data(nlh);
+ nfmsg->nfgen_family = AF_INET;
+ nfmsg->version = NFNETLINK_V0;
+ nfmsg->res_id = 0;
+
+ return nlh;
+}
+
+/* Create a set */
+
+static const struct nla_policy ip_set_create_policy[IPSET_ATTR_CMD_MAX + 1] = {
+ [IPSET_ATTR_PROTOCOL] = { .type = NLA_U8 },
+ [IPSET_ATTR_SETNAME] = { .type = NLA_NUL_STRING,
+ .len = IPSET_MAXNAMELEN - 1 },
+ [IPSET_ATTR_TYPENAME] = { .type = NLA_NUL_STRING,
+ .len = IPSET_MAXNAMELEN - 1},
+ [IPSET_ATTR_REVISION] = { .type = NLA_U8 },
+ [IPSET_ATTR_FAMILY] = { .type = NLA_U8 },
+ [IPSET_ATTR_DATA] = { .type = NLA_NESTED },
+};
+
+static ip_set_id_t
+find_set_id(const char *name)
+{
+ ip_set_id_t i, index = IPSET_INVALID_ID;
+ const struct ip_set *set;
+
+ for (i = 0; index == IPSET_INVALID_ID && i < ip_set_max; i++) {
+ set = ip_set_list[i];
+ if (set != NULL && STREQ(set->name, name))
+ index = i;
+ }
+ return index;
+}
+
+static inline struct ip_set *
+find_set(const char *name)
+{
+ ip_set_id_t index = find_set_id(name);
+
+ return index == IPSET_INVALID_ID ? NULL : ip_set_list[index];
+}
+
+static int
+find_free_id(const char *name, ip_set_id_t *index, struct ip_set **set)
+{
+ ip_set_id_t i;
+
+ *index = IPSET_INVALID_ID;
+ for (i = 0; i < ip_set_max; i++) {
+ if (ip_set_list[i] == NULL) {
+ if (*index == IPSET_INVALID_ID)
+ *index = i;
+ } else if (STREQ(name, ip_set_list[i]->name)) {
+ /* Name clash */
+ *set = ip_set_list[i];
+ return -EEXIST;
+ }
+ }
+ if (*index == IPSET_INVALID_ID)
+ /* No free slot remained */
+ return -IPSET_ERR_MAX_SETS;
+ return 0;
+}
+
+static int
+ip_set_create(struct sock *ctnl, struct sk_buff *skb,
+ const struct nlmsghdr *nlh,
+ const struct nlattr * const attr[])
+{
+ struct ip_set *set, *clash;
+ ip_set_id_t index = IPSET_INVALID_ID;
+ struct nlattr *tb[IPSET_ATTR_CREATE_MAX+1] = {};
+ const char *name, *typename;
+ u8 family, revision;
+ u32 flags = flag_exist(nlh);
+ int ret = 0;
+
+ if (unlikely(protocol_failed(attr) ||
+ attr[IPSET_ATTR_SETNAME] == NULL ||
+ attr[IPSET_ATTR_TYPENAME] == NULL ||
+ attr[IPSET_ATTR_REVISION] == NULL ||
+ attr[IPSET_ATTR_FAMILY] == NULL ||
+ (attr[IPSET_ATTR_DATA] != NULL &&
+ !flag_nested(attr[IPSET_ATTR_DATA]))))
+ return -IPSET_ERR_PROTOCOL;
+
+ name = nla_data(attr[IPSET_ATTR_SETNAME]);
+ typename = nla_data(attr[IPSET_ATTR_TYPENAME]);
+ family = nla_get_u8(attr[IPSET_ATTR_FAMILY]);
+ revision = nla_get_u8(attr[IPSET_ATTR_REVISION]);
+ pr_debug("setname: %s, typename: %s, family: %s, revision: %u\n",
+ name, typename, family_name(family), revision);
+
+ /*
+ * First, and without any locks, allocate and initialize
+ * a normal base set structure.
+ */
+ set = kzalloc(sizeof(struct ip_set), GFP_KERNEL);
+ if (!set)
+ return -ENOMEM;
+ rwlock_init(&set->lock);
+ strlcpy(set->name, name, IPSET_MAXNAMELEN);
+ atomic_set(&set->ref, 0);
+ set->family = family;
+
+ /*
+ * Next, check that we know the type, and take
+ * a reference on the type, to make sure it stays available
+ * while constructing our new set.
+ *
+ * After referencing the type, we try to create the type
+ * specific part of the set without holding any locks.
+ */
+ ret = find_set_type_get(typename, family, revision, &(set->type));
+ if (ret)
+ goto out;
+
+ /*
+ * Without holding any locks, create private part.
+ */
+ if (attr[IPSET_ATTR_DATA] &&
+ nla_parse_nested(tb, IPSET_ATTR_CREATE_MAX, attr[IPSET_ATTR_DATA],
+ set->type->create_policy)) {
+ ret = -IPSET_ERR_PROTOCOL;
+ goto put_out;
+ }
+
+ ret = set->type->create(set, tb, flags);
+ if (ret != 0)
+ goto put_out;
+
+ /* BTW, ret==0 here. */
+
+ /*
+ * Here, we have a valid, constructed set and we are protected
+ * by nfnl_lock. Find the first free index in ip_set_list and
+ * check clashing.
+ */
+ if ((ret = find_free_id(set->name, &index, &clash)) != 0) {
+ /* If this is the same set and requested, ignore error */
+ if (ret == -EEXIST &&
+ (flags & IPSET_FLAG_EXIST) &&
+ STREQ(set->type->name, clash->type->name) &&
+ set->type->family == clash->type->family &&
+ set->type->revision == clash->type->revision &&
+ set->variant->same_set(set, clash))
+ ret = 0;
+ goto cleanup;
+ }
+
+ /*
+ * Finally! Add our shiny new set to the list, and be done.
+ */
+ pr_debug("create: '%s' created with index %u!\n", set->name, index);
+ ip_set_list[index] = set;
+
+ return ret;
+
+cleanup:
+ set->variant->destroy(set);
+put_out:
+ module_put(set->type->me);
+out:
+ kfree(set);
+ return ret;
+}
+
+/* Destroy sets */
+
+static const struct nla_policy
+ip_set_setname_policy[IPSET_ATTR_CMD_MAX + 1] = {
+ [IPSET_ATTR_PROTOCOL] = { .type = NLA_U8 },
+ [IPSET_ATTR_SETNAME] = { .type = NLA_NUL_STRING,
+ .len = IPSET_MAXNAMELEN - 1 },
+};
+
+static void
+ip_set_destroy_set(ip_set_id_t index)
+{
+ struct ip_set *set = ip_set_list[index];
+
+ pr_debug("set: %s\n", set->name);
+ ip_set_list[index] = NULL;
+
+ /* Must call it without holding any lock */
+ set->variant->destroy(set);
+ module_put(set->type->me);
+ kfree(set);
+}
+
+static int
+ip_set_destroy(struct sock *ctnl, struct sk_buff *skb,
+ const struct nlmsghdr *nlh,
+ const struct nlattr * const attr[])
+{
+ ip_set_id_t i;
+
+ if (unlikely(protocol_failed(attr)))
+ return -IPSET_ERR_PROTOCOL;
+
+ /* References are protected by the nfnl mutex */
+ if (!attr[IPSET_ATTR_SETNAME]) {
+ for (i = 0; i < ip_set_max; i++) {
+ if (ip_set_list[i] != NULL &&
+ (atomic_read(&ip_set_list[i]->ref)))
+ return -IPSET_ERR_BUSY;
+ }
+ for (i = 0; i < ip_set_max; i++) {
+ if (ip_set_list[i] != NULL)
+ ip_set_destroy_set(i);
+ }
+ } else {
+ i = find_set_id(nla_data(attr[IPSET_ATTR_SETNAME]));
+ if (i == IPSET_INVALID_ID)
+ return -ENOENT;
+ else if (atomic_read(&ip_set_list[i]->ref))
+ return -IPSET_ERR_BUSY;
+
+ ip_set_destroy_set(i);
+ }
+ return 0;
+}
+
+/* Flush sets */
+
+static void
+ip_set_flush_set(struct ip_set *set)
+{
+ pr_debug("set: %s\n", set->name);
+
+ write_lock_bh(&set->lock);
+ set->variant->flush(set);
+ write_unlock_bh(&set->lock);
+}
+
+static int
+ip_set_flush(struct sock *ctnl, struct sk_buff *skb,
+ const struct nlmsghdr *nlh,
+ const struct nlattr * const attr[])
+{
+ ip_set_id_t i;
+
+ if (unlikely(protocol_failed(attr)))
+ return -EPROTO;
+
+ if (!attr[IPSET_ATTR_SETNAME]) {
+ for (i = 0; i < ip_set_max; i++)
+ if (ip_set_list[i] != NULL)
+ ip_set_flush_set(ip_set_list[i]);
+ } else {
+ i = find_set_id(nla_data(attr[IPSET_ATTR_SETNAME]));
+ if (i == IPSET_INVALID_ID)
+ return -ENOENT;
+
+ ip_set_flush_set(ip_set_list[i]);
+ }
+
+ return 0;
+}
+
+/* Rename a set */
+
+static const struct nla_policy
+ip_set_setname2_policy[IPSET_ATTR_CMD_MAX + 1] = {
+ [IPSET_ATTR_PROTOCOL] = { .type = NLA_U8 },
+ [IPSET_ATTR_SETNAME] = { .type = NLA_NUL_STRING,
+ .len = IPSET_MAXNAMELEN - 1 },
+ [IPSET_ATTR_SETNAME2] = { .type = NLA_NUL_STRING,
+ .len = IPSET_MAXNAMELEN - 1 },
+};
+
+static int
+ip_set_rename(struct sock *ctnl, struct sk_buff *skb,
+ const struct nlmsghdr *nlh,
+ const struct nlattr * const attr[])
+{
+ struct ip_set *set;
+ const char *name2;
+ ip_set_id_t i;
+
+ if (unlikely(protocol_failed(attr) ||
+ attr[IPSET_ATTR_SETNAME] == NULL ||
+ attr[IPSET_ATTR_SETNAME2] == NULL))
+ return -IPSET_ERR_PROTOCOL;
+
+ set = find_set(nla_data(attr[IPSET_ATTR_SETNAME]));
+ if (set == NULL)
+ return -ENOENT;
+ if (atomic_read(&set->ref) != 0)
+ return -IPSET_ERR_REFERENCED;
+
+ name2 = nla_data(attr[IPSET_ATTR_SETNAME2]);
+ for (i = 0; i < ip_set_max; i++) {
+ if (ip_set_list[i] != NULL &&
+ STREQ(ip_set_list[i]->name, name2))
+ return -IPSET_ERR_EXIST_SETNAME2;
+ }
+ strncpy(set->name, name2, IPSET_MAXNAMELEN);
+
+ return 0;
+}
+
+/* Swap two sets so that name/index points to the other.
+ * References and set names are also swapped.
+ *
+ * We are protected by the nfnl mutex and references are
+ * manipulated only by holding the mutex. The kernel interfaces
+ * do not hold the mutex but the pointer settings are atomic
+ * so the ip_set_list always contains valid pointers to the sets.
+ */
+
+static int
+ip_set_swap(struct sock *ctnl, struct sk_buff *skb,
+ const struct nlmsghdr *nlh,
+ const struct nlattr * const attr[])
+{
+ struct ip_set *from, *to;
+ ip_set_id_t from_id, to_id;
+ char from_name[IPSET_MAXNAMELEN];
+ u32 from_ref;
+
+ if (unlikely(protocol_failed(attr) ||
+ attr[IPSET_ATTR_SETNAME] == NULL ||
+ attr[IPSET_ATTR_SETNAME2] == NULL))
+ return -IPSET_ERR_PROTOCOL;
+
+ from_id = find_set_id(nla_data(attr[IPSET_ATTR_SETNAME]));
+ if (from_id == IPSET_INVALID_ID)
+ return -ENOENT;
+
+ to_id = find_set_id(nla_data(attr[IPSET_ATTR_SETNAME2]));
+ if (to_id == IPSET_INVALID_ID)
+ return -IPSET_ERR_EXIST_SETNAME2;
+
+ from = ip_set_list[from_id];
+ to = ip_set_list[to_id];
+
+ /* Features must not change.
+ * Not an artifical restriction anymore, as we must prevent
+ * possible loops created by swapping in setlist type of sets. */
+ if (!(from->type->features == to->type->features &&
+ from->type->family == to->type->family))
+ return -IPSET_ERR_TYPE_MISMATCH;
+
+ /* No magic here: ref munging protected by the nfnl_lock */
+ strncpy(from_name, from->name, IPSET_MAXNAMELEN);
+ from_ref = atomic_read(&from->ref);
+
+ strncpy(from->name, to->name, IPSET_MAXNAMELEN);
+ atomic_set(&from->ref, atomic_read(&to->ref));
+ strncpy(to->name, from_name, IPSET_MAXNAMELEN);
+ atomic_set(&to->ref, from_ref);
+
+ ip_set_list[from_id] = to;
+ ip_set_list[to_id] = from;
+
+ return 0;
+}
+
+/* List/save set data */
+
+#define DUMP_INIT 0L
+#define DUMP_ALL 1L
+#define DUMP_ONE 2L
+#define DUMP_LAST 3L
+
+static int
+ip_set_dump_done(struct netlink_callback *cb)
+{
+ if (cb->args[2]) {
+ pr_debug("release set %s\n", ip_set_list[cb->args[1]]->name);
+ __ip_set_put((ip_set_id_t) cb->args[1]);
+ }
+ return 0;
+}
+
+static inline void
+dump_attrs(struct nlmsghdr *nlh)
+{
+ const struct nlattr *attr;
+ int rem;
+
+ pr_debug("dump nlmsg\n");
+ nlmsg_for_each_attr(attr, nlh, sizeof(struct nfgenmsg), rem) {
+ pr_debug("type: %u, len %u\n", nla_type(attr), attr->nla_len);
+ }
+}
+
+static int
+dump_init(struct netlink_callback *cb)
+{
+ struct nlmsghdr *nlh = nlmsg_hdr(cb->skb);
+ int min_len = NLMSG_SPACE(sizeof(struct nfgenmsg));
+ struct nlattr *cda[IPSET_ATTR_CMD_MAX+1];
+ struct nlattr *attr = (void *)nlh + min_len;
+ ip_set_id_t index;
+
+ /* Second pass, so parser can't fail */
+ nla_parse(cda, IPSET_ATTR_CMD_MAX,
+ attr, nlh->nlmsg_len - min_len, ip_set_setname_policy);
+
+ /* cb->args[0] : dump single set/all sets
+ * [1] : set index
+ * [..]: type specific
+ */
+
+ if (!cda[IPSET_ATTR_SETNAME]) {
+ cb->args[0] = DUMP_ALL;
+ return 0;
+ }
+
+ index = find_set_id(nla_data(cda[IPSET_ATTR_SETNAME]));
+ if (index == IPSET_INVALID_ID)
+ return -ENOENT;
+
+ cb->args[0] = DUMP_ONE;
+ cb->args[1] = index;
+ return 0;
+}
+
+static int
+ip_set_dump_start(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ ip_set_id_t index = IPSET_INVALID_ID, max;
+ struct ip_set *set = NULL;
+ struct nlmsghdr *nlh = NULL;
+ unsigned int flags = NETLINK_CB(cb->skb).pid ? NLM_F_MULTI : 0;
+ int ret = 0;
+
+ if (cb->args[0] == DUMP_INIT) {
+ ret = dump_init(cb);
+ if (ret < 0) {
+ nlh = nlmsg_hdr(cb->skb);
+ /* We have to create and send the error message
+ * manually :-( */
+ if (nlh->nlmsg_flags & NLM_F_ACK)
+ netlink_ack(cb->skb, nlh, ret);
+ return ret;
+ }
+ }
+
+ if (cb->args[1] >= ip_set_max)
+ goto out;
+
+ pr_debug("args[0]: %ld args[1]: %ld\n", cb->args[0], cb->args[1]);
+ max = cb->args[0] == DUMP_ONE ? cb->args[1] + 1 : ip_set_max;
+ for (; cb->args[1] < max; cb->args[1]++) {
+ index = (ip_set_id_t) cb->args[1];
+ set = ip_set_list[index];
+ if (set == NULL) {
+ if (cb->args[0] == DUMP_ONE) {
+ ret = -ENOENT;
+ goto out;
+ }
+ continue;
+ }
+ /* When dumping all sets, we must dump "sorted"
+ * so that lists (unions of sets) are dumped last.
+ */
+ if (cb->args[0] != DUMP_ONE &&
+ !((cb->args[0] == DUMP_ALL) ^
+ (set->type->features & IPSET_DUMP_LAST)))
+ continue;
+ pr_debug("List set: %s\n", set->name);
+ if (!cb->args[2]) {
+ /* Start listing: make sure set won't be destroyed */
+ pr_debug("reference set\n");
+ __ip_set_get(index);
+ }
+ nlh = start_msg(skb, NETLINK_CB(cb->skb).pid,
+ cb->nlh->nlmsg_seq, flags,
+ IPSET_CMD_LIST);
+ if (!nlh) {
+ ret = -EMSGSIZE;
+ goto release_refcount;
+ }
+ NLA_PUT_U8(skb, IPSET_ATTR_PROTOCOL, IPSET_PROTOCOL);
+ NLA_PUT_STRING(skb, IPSET_ATTR_SETNAME, set->name);
+ switch (cb->args[2]) {
+ case 0:
+ /* Core header data */
+ NLA_PUT_STRING(skb, IPSET_ATTR_TYPENAME,
+ set->type->name);
+ NLA_PUT_U8(skb, IPSET_ATTR_FAMILY,
+ set->family);
+ NLA_PUT_U8(skb, IPSET_ATTR_REVISION,
+ set->type->revision);
+ ret = set->variant->head(set, skb);
+ if (ret < 0)
+ goto release_refcount;
+ /* Fall through and add elements */
+ default:
+ read_lock_bh(&set->lock);
+ ret = set->variant->list(set, skb, cb);
+ read_unlock_bh(&set->lock);
+ if (!cb->args[2]) {
+ /* Set is done, proceed with next one */
+ if (cb->args[0] == DUMP_ONE)
+ cb->args[1] = IPSET_INVALID_ID;
+ else
+ cb->args[1]++;
+ }
+ goto release_refcount;
+ }
+ }
+ goto out;
+
+nla_put_failure:
+ ret = -EFAULT;
+release_refcount:
+ /* If there was an error or set is done, release set */
+ if (ret || !cb->args[2]) {
+ pr_debug("release set %s\n", ip_set_list[index]->name);
+ __ip_set_put(index);
+ }
+
+ /* If we dump all sets, continue with dumping last ones */
+ if (cb->args[0] == DUMP_ALL && cb->args[1] >= max && !cb->args[2])
+ cb->args[0] = DUMP_LAST;
+
+out:
+ if (nlh) {
+ nlmsg_end(skb, nlh);
+ pr_debug("nlmsg_len: %u\n", nlh->nlmsg_len);
+ dump_attrs(nlh);
+ }
+
+ return ret < 0 ? ret : skb->len;
+}
+
+static int
+ip_set_dump(struct sock *ctnl, struct sk_buff *skb,
+ const struct nlmsghdr *nlh,
+ const struct nlattr * const attr[])
+{
+ if (unlikely(protocol_failed(attr)))
+ return -IPSET_ERR_PROTOCOL;
+
+ return netlink_dump_start(ctnl, skb, nlh,
+ ip_set_dump_start,
+ ip_set_dump_done);
+}
+
+/* Add, del and test */
+
+static const struct nla_policy ip_set_adt_policy[IPSET_ATTR_CMD_MAX + 1] = {
+ [IPSET_ATTR_PROTOCOL] = { .type = NLA_U8 },
+ [IPSET_ATTR_SETNAME] = { .type = NLA_NUL_STRING,
+ .len = IPSET_MAXNAMELEN - 1 },
+ [IPSET_ATTR_LINENO] = { .type = NLA_U32 },
+ [IPSET_ATTR_DATA] = { .type = NLA_NESTED },
+ [IPSET_ATTR_ADT] = { .type = NLA_NESTED },
+};
+
+static int
+call_ad(struct sock *ctnl, struct sk_buff *skb, struct ip_set *set,
+ struct nlattr *tb[], enum ipset_adt adt,
+ u32 flags, bool use_lineno)
+{
+ int ret, retried = 0;
+ u32 lineno = 0;
+ bool eexist = flags & IPSET_FLAG_EXIST;
+
+ do {
+ write_lock_bh(&set->lock);
+ ret = set->variant->uadt(set, tb, adt, &lineno, flags);
+ write_unlock_bh(&set->lock);
+ } while (ret == -EAGAIN &&
+ set->variant->resize &&
+ (ret = set->variant->resize(set, retried++)) == 0);
+
+ if (!ret || (ret == -IPSET_ERR_EXIST && eexist))
+ return 0;
+ if (lineno && use_lineno) {
+ /* Error in restore/batch mode: send back lineno */
+ struct nlmsghdr *rep, *nlh = nlmsg_hdr(skb);
+ struct sk_buff *skb2;
+ struct nlmsgerr *errmsg;
+ size_t payload = sizeof(*errmsg) + nlmsg_len(nlh);
+ int min_len = NLMSG_SPACE(sizeof(struct nfgenmsg));
+ struct nlattr *cda[IPSET_ATTR_CMD_MAX+1];
+ struct nlattr *cmdattr;
+ u32 *errline;
+
+ skb2 = nlmsg_new(payload, GFP_KERNEL);
+ if (skb2 == NULL)
+ return -ENOMEM;
+ rep = __nlmsg_put(skb2, NETLINK_CB(skb).pid,
+ nlh->nlmsg_seq, NLMSG_ERROR, payload, 0);
+ errmsg = nlmsg_data(rep);
+ errmsg->error = ret;
+ memcpy(&errmsg->msg, nlh, nlh->nlmsg_len);
+ cmdattr = (void *)&errmsg->msg + min_len;
+
+ nla_parse(cda, IPSET_ATTR_CMD_MAX,
+ cmdattr, nlh->nlmsg_len - min_len,
+ ip_set_adt_policy);
+
+ errline = nla_data(cda[IPSET_ATTR_LINENO]);
+
+ *errline = lineno;
+
+ netlink_unicast(ctnl, skb2, NETLINK_CB(skb).pid, MSG_DONTWAIT);
+ /* Signal netlink not to send its ACK/errmsg. */
+ return -EINTR;
+ }
+
+ return ret;
+}
+
+static int
+ip_set_uadd(struct sock *ctnl, struct sk_buff *skb,
+ const struct nlmsghdr *nlh,
+ const struct nlattr * const attr[])
+{
+ struct ip_set *set;
+ struct nlattr *tb[IPSET_ATTR_ADT_MAX+1] = {};
+ const struct nlattr *nla;
+ u32 flags = flag_exist(nlh);
+ bool use_lineno;
+ int ret = 0;
+
+ if (unlikely(protocol_failed(attr) ||
+ attr[IPSET_ATTR_SETNAME] == NULL ||
+ !((attr[IPSET_ATTR_DATA] != NULL) ^
+ (attr[IPSET_ATTR_ADT] != NULL)) ||
+ (attr[IPSET_ATTR_DATA] != NULL &&
+ !flag_nested(attr[IPSET_ATTR_DATA])) ||
+ (attr[IPSET_ATTR_ADT] != NULL &&
+ (!flag_nested(attr[IPSET_ATTR_ADT]) ||
+ attr[IPSET_ATTR_LINENO] == NULL))))
+ return -IPSET_ERR_PROTOCOL;
+
+ set = find_set(nla_data(attr[IPSET_ATTR_SETNAME]));
+ if (set == NULL)
+ return -ENOENT;
+
+ use_lineno = !!attr[IPSET_ATTR_LINENO];
+ if (attr[IPSET_ATTR_DATA]) {
+ if (nla_parse_nested(tb, IPSET_ATTR_ADT_MAX,
+ attr[IPSET_ATTR_DATA],
+ set->type->adt_policy))
+ return -IPSET_ERR_PROTOCOL;
+ ret = call_ad(ctnl, skb, set, tb, IPSET_ADD, flags,
+ use_lineno);
+ } else {
+ int nla_rem;
+
+ nla_for_each_nested(nla, attr[IPSET_ATTR_ADT], nla_rem) {
+ memset(tb, 0, sizeof(tb));
+ if (nla_type(nla) != IPSET_ATTR_DATA ||
+ !flag_nested(nla) ||
+ nla_parse_nested(tb, IPSET_ATTR_ADT_MAX, nla,
+ set->type->adt_policy))
+ return -IPSET_ERR_PROTOCOL;
+ ret = call_ad(ctnl, skb, set, tb, IPSET_ADD,
+ flags, use_lineno);
+ if (ret < 0)
+ return ret;
+ }
+ }
+ return ret;
+}
+
+static int
+ip_set_udel(struct sock *ctnl, struct sk_buff *skb,
+ const struct nlmsghdr *nlh,
+ const struct nlattr * const attr[])
+{
+ struct ip_set *set;
+ struct nlattr *tb[IPSET_ATTR_ADT_MAX+1] = {};
+ const struct nlattr *nla;
+ u32 flags = flag_exist(nlh);
+ bool use_lineno;
+ int ret = 0;
+
+ if (unlikely(protocol_failed(attr) ||
+ attr[IPSET_ATTR_SETNAME] == NULL ||
+ !((attr[IPSET_ATTR_DATA] != NULL) ^
+ (attr[IPSET_ATTR_ADT] != NULL)) ||
+ (attr[IPSET_ATTR_DATA] != NULL &&
+ !flag_nested(attr[IPSET_ATTR_DATA])) ||
+ (attr[IPSET_ATTR_ADT] != NULL &&
+ (!flag_nested(attr[IPSET_ATTR_ADT]) ||
+ attr[IPSET_ATTR_LINENO] == NULL))))
+ return -IPSET_ERR_PROTOCOL;
+
+ set = find_set(nla_data(attr[IPSET_ATTR_SETNAME]));
+ if (set == NULL)
+ return -ENOENT;
+
+ use_lineno = !!attr[IPSET_ATTR_LINENO];
+ if (attr[IPSET_ATTR_DATA]) {
+ if (nla_parse_nested(tb, IPSET_ATTR_ADT_MAX,
+ attr[IPSET_ATTR_DATA],
+ set->type->adt_policy))
+ return -IPSET_ERR_PROTOCOL;
+ ret = call_ad(ctnl, skb, set, tb, IPSET_DEL, flags,
+ use_lineno);
+ } else {
+ int nla_rem;
+
+ nla_for_each_nested(nla, attr[IPSET_ATTR_ADT], nla_rem) {
+ memset(tb, 0, sizeof(*tb));
+ if (nla_type(nla) != IPSET_ATTR_DATA ||
+ !flag_nested(nla) ||
+ nla_parse_nested(tb, IPSET_ATTR_ADT_MAX, nla,
+ set->type->adt_policy))
+ return -IPSET_ERR_PROTOCOL;
+ ret = call_ad(ctnl, skb, set, tb, IPSET_DEL,
+ flags, use_lineno);
+ if (ret < 0)
+ return ret;
+ }
+ }
+ return ret;
+}
+
+static int
+ip_set_utest(struct sock *ctnl, struct sk_buff *skb,
+ const struct nlmsghdr *nlh,
+ const struct nlattr * const attr[])
+{
+ struct ip_set *set;
+ struct nlattr *tb[IPSET_ATTR_ADT_MAX+1] = {};
+ int ret = 0;
+
+ if (unlikely(protocol_failed(attr) ||
+ attr[IPSET_ATTR_SETNAME] == NULL ||
+ attr[IPSET_ATTR_DATA] == NULL ||
+ !flag_nested(attr[IPSET_ATTR_DATA])))
+ return -IPSET_ERR_PROTOCOL;
+
+ set = find_set(nla_data(attr[IPSET_ATTR_SETNAME]));
+ if (set == NULL)
+ return -ENOENT;
+
+ if (nla_parse_nested(tb, IPSET_ATTR_ADT_MAX, attr[IPSET_ATTR_DATA],
+ set->type->adt_policy))
+ return -IPSET_ERR_PROTOCOL;
+
+ read_lock_bh(&set->lock);
+ ret = set->variant->uadt(set, tb, IPSET_TEST, NULL, 0);
+ read_unlock_bh(&set->lock);
+ /* Userspace can't trigger element to be re-added */
+ if (ret == -EAGAIN)
+ ret = 1;
+
+ return ret < 0 ? ret : ret > 0 ? 0 : -IPSET_ERR_EXIST;
+}
+
+/* Get headed data of a set */
+
+static int
+ip_set_header(struct sock *ctnl, struct sk_buff *skb,
+ const struct nlmsghdr *nlh,
+ const struct nlattr * const attr[])
+{
+ const struct ip_set *set;
+ struct sk_buff *skb2;
+ struct nlmsghdr *nlh2;
+ ip_set_id_t index;
+ int ret = 0;
+
+ if (unlikely(protocol_failed(attr) ||
+ attr[IPSET_ATTR_SETNAME] == NULL))
+ return -IPSET_ERR_PROTOCOL;
+
+ index = find_set_id(nla_data(attr[IPSET_ATTR_SETNAME]));
+ if (index == IPSET_INVALID_ID)
+ return -ENOENT;
+ set = ip_set_list[index];
+
+ skb2 = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (skb2 == NULL)
+ return -ENOMEM;
+
+ nlh2 = start_msg(skb2, NETLINK_CB(skb).pid, nlh->nlmsg_seq, 0,
+ IPSET_CMD_HEADER);
+ if (!nlh2)
+ goto nlmsg_failure;
+ NLA_PUT_U8(skb2, IPSET_ATTR_PROTOCOL, IPSET_PROTOCOL);
+ NLA_PUT_STRING(skb2, IPSET_ATTR_SETNAME, set->name);
+ NLA_PUT_STRING(skb2, IPSET_ATTR_TYPENAME, set->type->name);
+ NLA_PUT_U8(skb2, IPSET_ATTR_FAMILY, set->family);
+ NLA_PUT_U8(skb2, IPSET_ATTR_REVISION, set->type->revision);
+ nlmsg_end(skb2, nlh2);
+
+ ret = netlink_unicast(ctnl, skb2, NETLINK_CB(skb).pid, MSG_DONTWAIT);
+ if (ret < 0)
+ return ret;
+
+ return 0;
+
+nla_put_failure:
+ nlmsg_cancel(skb2, nlh2);
+nlmsg_failure:
+ kfree_skb(skb2);
+ return -EMSGSIZE;
+}
+
+/* Get type data */
+
+static const struct nla_policy ip_set_type_policy[IPSET_ATTR_CMD_MAX + 1] = {
+ [IPSET_ATTR_PROTOCOL] = { .type = NLA_U8 },
+ [IPSET_ATTR_TYPENAME] = { .type = NLA_NUL_STRING,
+ .len = IPSET_MAXNAMELEN - 1 },
+ [IPSET_ATTR_FAMILY] = { .type = NLA_U8 },
+};
+
+static int
+ip_set_type(struct sock *ctnl, struct sk_buff *skb,
+ const struct nlmsghdr *nlh,
+ const struct nlattr * const attr[])
+{
+ struct sk_buff *skb2;
+ struct nlmsghdr *nlh2;
+ u8 family, min, max;
+ const char *typename;
+ int ret = 0;
+
+ if (unlikely(protocol_failed(attr) ||
+ attr[IPSET_ATTR_TYPENAME] == NULL ||
+ attr[IPSET_ATTR_FAMILY] == NULL))
+ return -IPSET_ERR_PROTOCOL;
+
+ family = nla_get_u8(attr[IPSET_ATTR_FAMILY]);
+ typename = nla_data(attr[IPSET_ATTR_TYPENAME]);
+ ret = find_set_type_minmax(typename, family, &min, &max);
+ if (ret)
+ return ret;
+
+ skb2 = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (skb2 == NULL)
+ return -ENOMEM;
+
+ nlh2 = start_msg(skb2, NETLINK_CB(skb).pid, nlh->nlmsg_seq, 0,
+ IPSET_CMD_TYPE);
+ if (!nlh2)
+ goto nlmsg_failure;
+ NLA_PUT_U8(skb2, IPSET_ATTR_PROTOCOL, IPSET_PROTOCOL);
+ NLA_PUT_STRING(skb2, IPSET_ATTR_TYPENAME, typename);
+ NLA_PUT_U8(skb2, IPSET_ATTR_FAMILY, family);
+ NLA_PUT_U8(skb2, IPSET_ATTR_REVISION, max);
+ NLA_PUT_U8(skb2, IPSET_ATTR_REVISION_MIN, min);
+ nlmsg_end(skb2, nlh2);
+
+ pr_debug("Send TYPE, nlmsg_len: %u\n", nlh2->nlmsg_len);
+ ret = netlink_unicast(ctnl, skb2, NETLINK_CB(skb).pid, MSG_DONTWAIT);
+ if (ret < 0)
+ return ret;
+
+ return 0;
+
+nla_put_failure:
+ nlmsg_cancel(skb2, nlh2);
+nlmsg_failure:
+ kfree_skb(skb2);
+ return -EMSGSIZE;
+}
+
+/* Get protocol version */
+
+static const struct nla_policy
+ip_set_protocol_policy[IPSET_ATTR_CMD_MAX + 1] = {
+ [IPSET_ATTR_PROTOCOL] = { .type = NLA_U8 },
+};
+
+static int
+ip_set_protocol(struct sock *ctnl, struct sk_buff *skb,
+ const struct nlmsghdr *nlh,
+ const struct nlattr * const attr[])
+{
+ struct sk_buff *skb2;
+ struct nlmsghdr *nlh2;
+ int ret = 0;
+
+ if (unlikely(attr[IPSET_ATTR_PROTOCOL] == NULL))
+ return -IPSET_ERR_PROTOCOL;
+
+ skb2 = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (skb2 == NULL)
+ return -ENOMEM;
+
+ nlh2 = start_msg(skb2, NETLINK_CB(skb).pid, nlh->nlmsg_seq, 0,
+ IPSET_CMD_PROTOCOL);
+ if (!nlh2)
+ goto nlmsg_failure;
+ NLA_PUT_U8(skb2, IPSET_ATTR_PROTOCOL, IPSET_PROTOCOL);
+ nlmsg_end(skb2, nlh2);
+
+ ret = netlink_unicast(ctnl, skb2, NETLINK_CB(skb).pid, MSG_DONTWAIT);
+ if (ret < 0)
+ return ret;
+
+ return 0;
+
+nla_put_failure:
+ nlmsg_cancel(skb2, nlh2);
+nlmsg_failure:
+ kfree_skb(skb2);
+ return -EMSGSIZE;
+}
+
+static const struct nfnl_callback ip_set_netlink_subsys_cb[IPSET_MSG_MAX] = {
+ [IPSET_CMD_CREATE] = {
+ .call = ip_set_create,
+ .attr_count = IPSET_ATTR_CMD_MAX,
+ .policy = ip_set_create_policy,
+ },
+ [IPSET_CMD_DESTROY] = {
+ .call = ip_set_destroy,
+ .attr_count = IPSET_ATTR_CMD_MAX,
+ .policy = ip_set_setname_policy,
+ },
+ [IPSET_CMD_FLUSH] = {
+ .call = ip_set_flush,
+ .attr_count = IPSET_ATTR_CMD_MAX,
+ .policy = ip_set_setname_policy,
+ },
+ [IPSET_CMD_RENAME] = {
+ .call = ip_set_rename,
+ .attr_count = IPSET_ATTR_CMD_MAX,
+ .policy = ip_set_setname2_policy,
+ },
+ [IPSET_CMD_SWAP] = {
+ .call = ip_set_swap,
+ .attr_count = IPSET_ATTR_CMD_MAX,
+ .policy = ip_set_setname2_policy,
+ },
+ [IPSET_CMD_LIST] = {
+ .call = ip_set_dump,
+ .attr_count = IPSET_ATTR_CMD_MAX,
+ .policy = ip_set_setname_policy,
+ },
+ [IPSET_CMD_SAVE] = {
+ .call = ip_set_dump,
+ .attr_count = IPSET_ATTR_CMD_MAX,
+ .policy = ip_set_setname_policy,
+ },
+ [IPSET_CMD_ADD] = {
+ .call = ip_set_uadd,
+ .attr_count = IPSET_ATTR_CMD_MAX,
+ .policy = ip_set_adt_policy,
+ },
+ [IPSET_CMD_DEL] = {
+ .call = ip_set_udel,
+ .attr_count = IPSET_ATTR_CMD_MAX,
+ .policy = ip_set_adt_policy,
+ },
+ [IPSET_CMD_TEST] = {
+ .call = ip_set_utest,
+ .attr_count = IPSET_ATTR_CMD_MAX,
+ .policy = ip_set_adt_policy,
+ },
+ [IPSET_CMD_HEADER] = {
+ .call = ip_set_header,
+ .attr_count = IPSET_ATTR_CMD_MAX,
+ .policy = ip_set_setname_policy,
+ },
+ [IPSET_CMD_TYPE] = {
+ .call = ip_set_type,
+ .attr_count = IPSET_ATTR_CMD_MAX,
+ .policy = ip_set_type_policy,
+ },
+ [IPSET_CMD_PROTOCOL] = {
+ .call = ip_set_protocol,
+ .attr_count = IPSET_ATTR_CMD_MAX,
+ .policy = ip_set_protocol_policy,
+ },
+};
+
+static struct nfnetlink_subsystem ip_set_netlink_subsys __read_mostly = {
+ .name = "ip_set",
+ .subsys_id = NFNL_SUBSYS_IPSET,
+ .cb_count = IPSET_MSG_MAX,
+ .cb = ip_set_netlink_subsys_cb,
+};
+
+/* Interface to iptables/ip6tables */
+
+static int
+ip_set_sockfn_get(struct sock *sk, int optval, void __user *user, int *len)
+{
+ unsigned *op;
+ void *data;
+ int copylen = *len, ret = 0;
+
+ if (!capable(CAP_NET_ADMIN))
+ return -EPERM;
+ if (optval != SO_IP_SET)
+ return -EBADF;
+ if (*len < sizeof(unsigned))
+ return -EINVAL;
+
+ data = vmalloc(*len);
+ if (!data)
+ return -ENOMEM;
+ if (copy_from_user(data, user, *len) != 0) {
+ ret = -EFAULT;
+ goto done;
+ }
+ op = (unsigned *) data;
+
+ if (*op < IP_SET_OP_VERSION) {
+ /* Check the version at the beginning of operations */
+ struct ip_set_req_version *req_version = data;
+ if (req_version->version != IPSET_PROTOCOL) {
+ ret = -EPROTO;
+ goto done;
+ }
+ }
+
+ switch (*op) {
+ case IP_SET_OP_VERSION: {
+ struct ip_set_req_version *req_version = data;
+
+ if (*len != sizeof(struct ip_set_req_version)) {
+ ret = -EINVAL;
+ goto done;
+ }
+
+ req_version->version = IPSET_PROTOCOL;
+ ret = copy_to_user(user, req_version,
+ sizeof(struct ip_set_req_version));
+ goto done;
+ }
+ case IP_SET_OP_GET_BYNAME: {
+ struct ip_set_req_get_set *req_get = data;
+
+ if (*len != sizeof(struct ip_set_req_get_set)) {
+ ret = -EINVAL;
+ goto done;
+ }
+ req_get->set.name[IPSET_MAXNAMELEN - 1] = '\0';
+ nfnl_lock();
+ req_get->set.index = find_set_id(req_get->set.name);
+ nfnl_unlock();
+ goto copy;
+ }
+ case IP_SET_OP_GET_BYINDEX: {
+ struct ip_set_req_get_set *req_get = data;
+
+ if (*len != sizeof(struct ip_set_req_get_set) ||
+ req_get->set.index >= ip_set_max) {
+ ret = -EINVAL;
+ goto done;
+ }
+ nfnl_lock();
+ strncpy(req_get->set.name,
+ ip_set_list[req_get->set.index]
+ ? ip_set_list[req_get->set.index]->name : "",
+ IPSET_MAXNAMELEN);
+ nfnl_unlock();
+ goto copy;
+ }
+ default:
+ ret = -EBADMSG;
+ goto done;
+ } /* end of switch(op) */
+
+copy:
+ ret = copy_to_user(user, data, copylen);
+
+done:
+ vfree(data);
+ if (ret > 0)
+ ret = 0;
+ return ret;
+}
+
+static struct nf_sockopt_ops so_set __read_mostly = {
+ .pf = PF_INET,
+ .get_optmin = SO_IP_SET,
+ .get_optmax = SO_IP_SET + 1,
+ .get = &ip_set_sockfn_get,
+ .owner = THIS_MODULE,
+};
+
+static int __init
+ip_set_init(void)
+{
+ int ret;
+
+ if (max_sets)
+ ip_set_max = max_sets;
+ if (ip_set_max >= IPSET_INVALID_ID)
+ ip_set_max = IPSET_INVALID_ID - 1;
+
+ ip_set_list = kzalloc(sizeof(struct ip_set *) * ip_set_max,
+ GFP_KERNEL);
+ if (!ip_set_list) {
+ pr_err("ip_set: Unable to create ip_set_list\n");
+ return -ENOMEM;
+ }
+
+ ret = nfnetlink_subsys_register(&ip_set_netlink_subsys);
+ if (ret != 0) {
+ pr_err("ip_set: cannot register with nfnetlink.\n");
+ kfree(ip_set_list);
+ return ret;
+ }
+ ret = nf_register_sockopt(&so_set);
+ if (ret != 0) {
+ pr_err("SO_SET registry failed: %d\n", ret);
+ nfnetlink_subsys_unregister(&ip_set_netlink_subsys);
+ kfree(ip_set_list);
+ return ret;
+ }
+
+ pr_notice("ip_set: protocol %u\n", IPSET_PROTOCOL);
+ return 0;
+}
+
+static void __exit
+ip_set_fini(void)
+{
+ /* There can't be any existing set */
+ nf_unregister_sockopt(&so_set);
+ nfnetlink_subsys_unregister(&ip_set_netlink_subsys);
+ kfree(ip_set_list);
+ pr_debug("these are the famous last words\n");
+}
+
+module_init(ip_set_init);
+module_exit(ip_set_fini);
diff --git a/net/netfilter/ipset/ip_set_getport.c b/net/netfilter/ipset/ip_set_getport.c
new file mode 100644
index 00000000000..8d522721268
--- /dev/null
+++ b/net/netfilter/ipset/ip_set_getport.c
@@ -0,0 +1,141 @@
+/* Copyright (C) 2003-2011 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+/* Get Layer-4 data from the packets */
+
+#include <linux/ip.h>
+#include <linux/skbuff.h>
+#include <linux/icmp.h>
+#include <linux/icmpv6.h>
+#include <linux/netfilter_ipv6/ip6_tables.h>
+#include <net/ip.h>
+#include <net/ipv6.h>
+
+#include <linux/netfilter/ipset/ip_set_getport.h>
+
+/* We must handle non-linear skbs */
+static bool
+get_port(const struct sk_buff *skb, int protocol, unsigned int protooff,
+ bool src, __be16 *port, u8 *proto)
+{
+ switch (protocol) {
+ case IPPROTO_TCP: {
+ struct tcphdr _tcph;
+ const struct tcphdr *th;
+
+ th = skb_header_pointer(skb, protooff, sizeof(_tcph), &_tcph);
+ if (th == NULL)
+ /* No choice either */
+ return false;
+
+ *port = src ? th->source : th->dest;
+ break;
+ }
+ case IPPROTO_UDP: {
+ struct udphdr _udph;
+ const struct udphdr *uh;
+
+ uh = skb_header_pointer(skb, protooff, sizeof(_udph), &_udph);
+ if (uh == NULL)
+ /* No choice either */
+ return false;
+
+ *port = src ? uh->source : uh->dest;
+ break;
+ }
+ case IPPROTO_ICMP: {
+ struct icmphdr _ich;
+ const struct icmphdr *ic;
+
+ ic = skb_header_pointer(skb, protooff, sizeof(_ich), &_ich);
+ if (ic == NULL)
+ return false;
+
+ *port = (__force __be16)htons((ic->type << 8) | ic->code);
+ break;
+ }
+ case IPPROTO_ICMPV6: {
+ struct icmp6hdr _ich;
+ const struct icmp6hdr *ic;
+
+ ic = skb_header_pointer(skb, protooff, sizeof(_ich), &_ich);
+ if (ic == NULL)
+ return false;
+
+ *port = (__force __be16)
+ htons((ic->icmp6_type << 8) | ic->icmp6_code);
+ break;
+ }
+ default:
+ break;
+ }
+ *proto = protocol;
+
+ return true;
+}
+
+bool
+ip_set_get_ip4_port(const struct sk_buff *skb, bool src,
+ __be16 *port, u8 *proto)
+{
+ const struct iphdr *iph = ip_hdr(skb);
+ unsigned int protooff = ip_hdrlen(skb);
+ int protocol = iph->protocol;
+
+ /* See comments at tcp_match in ip_tables.c */
+ if (protocol <= 0 || (ntohs(iph->frag_off) & IP_OFFSET))
+ return false;
+
+ return get_port(skb, protocol, protooff, src, port, proto);
+}
+EXPORT_SYMBOL_GPL(ip_set_get_ip4_port);
+
+#if defined(CONFIG_IP6_NF_IPTABLES) || defined(CONFIG_IP6_NF_IPTABLES_MODULE)
+bool
+ip_set_get_ip6_port(const struct sk_buff *skb, bool src,
+ __be16 *port, u8 *proto)
+{
+ int protoff;
+ u8 nexthdr;
+
+ nexthdr = ipv6_hdr(skb)->nexthdr;
+ protoff = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &nexthdr);
+ if (protoff < 0)
+ return false;
+
+ return get_port(skb, nexthdr, protoff, src, port, proto);
+}
+EXPORT_SYMBOL_GPL(ip_set_get_ip6_port);
+#endif
+
+bool
+ip_set_get_ip_port(const struct sk_buff *skb, u8 pf, bool src, __be16 *port)
+{
+ bool ret;
+ u8 proto;
+
+ switch (pf) {
+ case AF_INET:
+ ret = ip_set_get_ip4_port(skb, src, port, &proto);
+ break;
+ case AF_INET6:
+ ret = ip_set_get_ip6_port(skb, src, port, &proto);
+ break;
+ default:
+ return false;
+ }
+ if (!ret)
+ return ret;
+ switch (proto) {
+ case IPPROTO_TCP:
+ case IPPROTO_UDP:
+ return true;
+ default:
+ return false;
+ }
+}
+EXPORT_SYMBOL_GPL(ip_set_get_ip_port);
diff --git a/net/netfilter/ipset/ip_set_hash_ip.c b/net/netfilter/ipset/ip_set_hash_ip.c
new file mode 100644
index 00000000000..43bcce20012
--- /dev/null
+++ b/net/netfilter/ipset/ip_set_hash_ip.c
@@ -0,0 +1,464 @@
+/* Copyright (C) 2003-2011 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+/* Kernel module implementing an IP set type: the hash:ip type */
+
+#include <linux/jhash.h>
+#include <linux/module.h>
+#include <linux/ip.h>
+#include <linux/skbuff.h>
+#include <linux/errno.h>
+#include <linux/random.h>
+#include <net/ip.h>
+#include <net/ipv6.h>
+#include <net/netlink.h>
+#include <net/tcp.h>
+
+#include <linux/netfilter.h>
+#include <linux/netfilter/ipset/pfxlen.h>
+#include <linux/netfilter/ipset/ip_set.h>
+#include <linux/netfilter/ipset/ip_set_timeout.h>
+#include <linux/netfilter/ipset/ip_set_hash.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>");
+MODULE_DESCRIPTION("hash:ip type of IP sets");
+MODULE_ALIAS("ip_set_hash:ip");
+
+/* Type specific function prefix */
+#define TYPE hash_ip
+
+static bool
+hash_ip_same_set(const struct ip_set *a, const struct ip_set *b);
+
+#define hash_ip4_same_set hash_ip_same_set
+#define hash_ip6_same_set hash_ip_same_set
+
+/* The type variant functions: IPv4 */
+
+/* Member elements without timeout */
+struct hash_ip4_elem {
+ __be32 ip;
+};
+
+/* Member elements with timeout support */
+struct hash_ip4_telem {
+ __be32 ip;
+ unsigned long timeout;
+};
+
+static inline bool
+hash_ip4_data_equal(const struct hash_ip4_elem *ip1,
+ const struct hash_ip4_elem *ip2)
+{
+ return ip1->ip == ip2->ip;
+}
+
+static inline bool
+hash_ip4_data_isnull(const struct hash_ip4_elem *elem)
+{
+ return elem->ip == 0;
+}
+
+static inline void
+hash_ip4_data_copy(struct hash_ip4_elem *dst, const struct hash_ip4_elem *src)
+{
+ dst->ip = src->ip;
+}
+
+/* Zero valued IP addresses cannot be stored */
+static inline void
+hash_ip4_data_zero_out(struct hash_ip4_elem *elem)
+{
+ elem->ip = 0;
+}
+
+static inline bool
+hash_ip4_data_list(struct sk_buff *skb, const struct hash_ip4_elem *data)
+{
+ NLA_PUT_IPADDR4(skb, IPSET_ATTR_IP, data->ip);
+ return 0;
+
+nla_put_failure:
+ return 1;
+}
+
+static bool
+hash_ip4_data_tlist(struct sk_buff *skb, const struct hash_ip4_elem *data)
+{
+ const struct hash_ip4_telem *tdata =
+ (const struct hash_ip4_telem *)data;
+
+ NLA_PUT_IPADDR4(skb, IPSET_ATTR_IP, tdata->ip);
+ NLA_PUT_NET32(skb, IPSET_ATTR_TIMEOUT,
+ htonl(ip_set_timeout_get(tdata->timeout)));
+
+ return 0;
+
+nla_put_failure:
+ return 1;
+}
+
+#define IP_SET_HASH_WITH_NETMASK
+#define PF 4
+#define HOST_MASK 32
+#include <linux/netfilter/ipset/ip_set_ahash.h>
+
+static int
+hash_ip4_kadt(struct ip_set *set, const struct sk_buff *skb,
+ enum ipset_adt adt, u8 pf, u8 dim, u8 flags)
+{
+ const struct ip_set_hash *h = set->data;
+ ipset_adtfn adtfn = set->variant->adt[adt];
+ __be32 ip;
+
+ ip4addrptr(skb, flags & IPSET_DIM_ONE_SRC, &ip);
+ ip &= ip_set_netmask(h->netmask);
+ if (ip == 0)
+ return -EINVAL;
+
+ return adtfn(set, &ip, h->timeout);
+}
+
+static int
+hash_ip4_uadt(struct ip_set *set, struct nlattr *tb[],
+ enum ipset_adt adt, u32 *lineno, u32 flags)
+{
+ const struct ip_set_hash *h = set->data;
+ ipset_adtfn adtfn = set->variant->adt[adt];
+ u32 ip, ip_to, hosts, timeout = h->timeout;
+ __be32 nip;
+ int ret = 0;
+
+ if (unlikely(!tb[IPSET_ATTR_IP] ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT)))
+ return -IPSET_ERR_PROTOCOL;
+
+ if (tb[IPSET_ATTR_LINENO])
+ *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]);
+
+ ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &ip);
+ if (ret)
+ return ret;
+
+ ip &= ip_set_hostmask(h->netmask);
+
+ if (tb[IPSET_ATTR_TIMEOUT]) {
+ if (!with_timeout(h->timeout))
+ return -IPSET_ERR_TIMEOUT;
+ timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]);
+ }
+
+ if (adt == IPSET_TEST) {
+ nip = htonl(ip);
+ if (nip == 0)
+ return -IPSET_ERR_HASH_ELEM;
+ return adtfn(set, &nip, timeout);
+ }
+
+ if (tb[IPSET_ATTR_IP_TO]) {
+ ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP_TO], &ip_to);
+ if (ret)
+ return ret;
+ if (ip > ip_to)
+ swap(ip, ip_to);
+ } else if (tb[IPSET_ATTR_CIDR]) {
+ u8 cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]);
+
+ if (cidr > 32)
+ return -IPSET_ERR_INVALID_CIDR;
+ ip &= ip_set_hostmask(cidr);
+ ip_to = ip | ~ip_set_hostmask(cidr);
+ } else
+ ip_to = ip;
+
+ hosts = h->netmask == 32 ? 1 : 2 << (32 - h->netmask - 1);
+
+ for (; !before(ip_to, ip); ip += hosts) {
+ nip = htonl(ip);
+ if (nip == 0)
+ return -IPSET_ERR_HASH_ELEM;
+ ret = adtfn(set, &nip, timeout);
+
+ if (ret && !ip_set_eexist(ret, flags))
+ return ret;
+ else
+ ret = 0;
+ }
+ return ret;
+}
+
+static bool
+hash_ip_same_set(const struct ip_set *a, const struct ip_set *b)
+{
+ const struct ip_set_hash *x = a->data;
+ const struct ip_set_hash *y = b->data;
+
+ /* Resizing changes htable_bits, so we ignore it */
+ return x->maxelem == y->maxelem &&
+ x->timeout == y->timeout &&
+ x->netmask == y->netmask;
+}
+
+/* The type variant functions: IPv6 */
+
+struct hash_ip6_elem {
+ union nf_inet_addr ip;
+};
+
+struct hash_ip6_telem {
+ union nf_inet_addr ip;
+ unsigned long timeout;
+};
+
+static inline bool
+hash_ip6_data_equal(const struct hash_ip6_elem *ip1,
+ const struct hash_ip6_elem *ip2)
+{
+ return ipv6_addr_cmp(&ip1->ip.in6, &ip2->ip.in6) == 0;
+}
+
+static inline bool
+hash_ip6_data_isnull(const struct hash_ip6_elem *elem)
+{
+ return ipv6_addr_any(&elem->ip.in6);
+}
+
+static inline void
+hash_ip6_data_copy(struct hash_ip6_elem *dst, const struct hash_ip6_elem *src)
+{
+ ipv6_addr_copy(&dst->ip.in6, &src->ip.in6);
+}
+
+static inline void
+hash_ip6_data_zero_out(struct hash_ip6_elem *elem)
+{
+ ipv6_addr_set(&elem->ip.in6, 0, 0, 0, 0);
+}
+
+static inline void
+ip6_netmask(union nf_inet_addr *ip, u8 prefix)
+{
+ ip->ip6[0] &= ip_set_netmask6(prefix)[0];
+ ip->ip6[1] &= ip_set_netmask6(prefix)[1];
+ ip->ip6[2] &= ip_set_netmask6(prefix)[2];
+ ip->ip6[3] &= ip_set_netmask6(prefix)[3];
+}
+
+static bool
+hash_ip6_data_list(struct sk_buff *skb, const struct hash_ip6_elem *data)
+{
+ NLA_PUT_IPADDR6(skb, IPSET_ATTR_IP, &data->ip);
+ return 0;
+
+nla_put_failure:
+ return 1;
+}
+
+static bool
+hash_ip6_data_tlist(struct sk_buff *skb, const struct hash_ip6_elem *data)
+{
+ const struct hash_ip6_telem *e =
+ (const struct hash_ip6_telem *)data;
+
+ NLA_PUT_IPADDR6(skb, IPSET_ATTR_IP, &e->ip);
+ NLA_PUT_NET32(skb, IPSET_ATTR_TIMEOUT,
+ htonl(ip_set_timeout_get(e->timeout)));
+ return 0;
+
+nla_put_failure:
+ return 1;
+}
+
+#undef PF
+#undef HOST_MASK
+
+#define PF 6
+#define HOST_MASK 128
+#include <linux/netfilter/ipset/ip_set_ahash.h>
+
+static int
+hash_ip6_kadt(struct ip_set *set, const struct sk_buff *skb,
+ enum ipset_adt adt, u8 pf, u8 dim, u8 flags)
+{
+ const struct ip_set_hash *h = set->data;
+ ipset_adtfn adtfn = set->variant->adt[adt];
+ union nf_inet_addr ip;
+
+ ip6addrptr(skb, flags & IPSET_DIM_ONE_SRC, &ip.in6);
+ ip6_netmask(&ip, h->netmask);
+ if (ipv6_addr_any(&ip.in6))
+ return -EINVAL;
+
+ return adtfn(set, &ip, h->timeout);
+}
+
+static const struct nla_policy hash_ip6_adt_policy[IPSET_ATTR_ADT_MAX + 1] = {
+ [IPSET_ATTR_IP] = { .type = NLA_NESTED },
+ [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 },
+ [IPSET_ATTR_LINENO] = { .type = NLA_U32 },
+};
+
+static int
+hash_ip6_uadt(struct ip_set *set, struct nlattr *tb[],
+ enum ipset_adt adt, u32 *lineno, u32 flags)
+{
+ const struct ip_set_hash *h = set->data;
+ ipset_adtfn adtfn = set->variant->adt[adt];
+ union nf_inet_addr ip;
+ u32 timeout = h->timeout;
+ int ret;
+
+ if (unlikely(!tb[IPSET_ATTR_IP] ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) ||
+ tb[IPSET_ATTR_IP_TO] ||
+ tb[IPSET_ATTR_CIDR]))
+ return -IPSET_ERR_PROTOCOL;
+
+ if (tb[IPSET_ATTR_LINENO])
+ *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]);
+
+ ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &ip);
+ if (ret)
+ return ret;
+
+ ip6_netmask(&ip, h->netmask);
+ if (ipv6_addr_any(&ip.in6))
+ return -IPSET_ERR_HASH_ELEM;
+
+ if (tb[IPSET_ATTR_TIMEOUT]) {
+ if (!with_timeout(h->timeout))
+ return -IPSET_ERR_TIMEOUT;
+ timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]);
+ }
+
+ ret = adtfn(set, &ip, timeout);
+
+ return ip_set_eexist(ret, flags) ? 0 : ret;
+}
+
+/* Create hash:ip type of sets */
+
+static int
+hash_ip_create(struct ip_set *set, struct nlattr *tb[], u32 flags)
+{
+ u32 hashsize = IPSET_DEFAULT_HASHSIZE, maxelem = IPSET_DEFAULT_MAXELEM;
+ u8 netmask, hbits;
+ struct ip_set_hash *h;
+
+ if (!(set->family == AF_INET || set->family == AF_INET6))
+ return -IPSET_ERR_INVALID_FAMILY;
+ netmask = set->family == AF_INET ? 32 : 128;
+ pr_debug("Create set %s with family %s\n",
+ set->name, set->family == AF_INET ? "inet" : "inet6");
+
+ if (unlikely(!ip_set_optattr_netorder(tb, IPSET_ATTR_HASHSIZE) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_MAXELEM) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT)))
+ return -IPSET_ERR_PROTOCOL;
+
+ if (tb[IPSET_ATTR_HASHSIZE]) {
+ hashsize = ip_set_get_h32(tb[IPSET_ATTR_HASHSIZE]);
+ if (hashsize < IPSET_MIMINAL_HASHSIZE)
+ hashsize = IPSET_MIMINAL_HASHSIZE;
+ }
+
+ if (tb[IPSET_ATTR_MAXELEM])
+ maxelem = ip_set_get_h32(tb[IPSET_ATTR_MAXELEM]);
+
+ if (tb[IPSET_ATTR_NETMASK]) {
+ netmask = nla_get_u8(tb[IPSET_ATTR_NETMASK]);
+
+ if ((set->family == AF_INET && netmask > 32) ||
+ (set->family == AF_INET6 && netmask > 128) ||
+ netmask == 0)
+ return -IPSET_ERR_INVALID_NETMASK;
+ }
+
+ h = kzalloc(sizeof(*h), GFP_KERNEL);
+ if (!h)
+ return -ENOMEM;
+
+ h->maxelem = maxelem;
+ h->netmask = netmask;
+ get_random_bytes(&h->initval, sizeof(h->initval));
+ h->timeout = IPSET_NO_TIMEOUT;
+
+ hbits = htable_bits(hashsize);
+ h->table = ip_set_alloc(
+ sizeof(struct htable)
+ + jhash_size(hbits) * sizeof(struct hbucket));
+ if (!h->table) {
+ kfree(h);
+ return -ENOMEM;
+ }
+ h->table->htable_bits = hbits;
+
+ set->data = h;
+
+ if (tb[IPSET_ATTR_TIMEOUT]) {
+ h->timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]);
+
+ set->variant = set->family == AF_INET
+ ? &hash_ip4_tvariant : &hash_ip6_tvariant;
+
+ if (set->family == AF_INET)
+ hash_ip4_gc_init(set);
+ else
+ hash_ip6_gc_init(set);
+ } else {
+ set->variant = set->family == AF_INET
+ ? &hash_ip4_variant : &hash_ip6_variant;
+ }
+
+ pr_debug("create %s hashsize %u (%u) maxelem %u: %p(%p)\n",
+ set->name, jhash_size(h->table->htable_bits),
+ h->table->htable_bits, h->maxelem, set->data, h->table);
+
+ return 0;
+}
+
+static struct ip_set_type hash_ip_type __read_mostly = {
+ .name = "hash:ip",
+ .protocol = IPSET_PROTOCOL,
+ .features = IPSET_TYPE_IP,
+ .dimension = IPSET_DIM_ONE,
+ .family = AF_UNSPEC,
+ .revision = 0,
+ .create = hash_ip_create,
+ .create_policy = {
+ [IPSET_ATTR_HASHSIZE] = { .type = NLA_U32 },
+ [IPSET_ATTR_MAXELEM] = { .type = NLA_U32 },
+ [IPSET_ATTR_PROBES] = { .type = NLA_U8 },
+ [IPSET_ATTR_RESIZE] = { .type = NLA_U8 },
+ [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 },
+ [IPSET_ATTR_NETMASK] = { .type = NLA_U8 },
+ },
+ .adt_policy = {
+ [IPSET_ATTR_IP] = { .type = NLA_NESTED },
+ [IPSET_ATTR_IP_TO] = { .type = NLA_NESTED },
+ [IPSET_ATTR_CIDR] = { .type = NLA_U8 },
+ [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 },
+ [IPSET_ATTR_LINENO] = { .type = NLA_U32 },
+ },
+ .me = THIS_MODULE,
+};
+
+static int __init
+hash_ip_init(void)
+{
+ return ip_set_type_register(&hash_ip_type);
+}
+
+static void __exit
+hash_ip_fini(void)
+{
+ ip_set_type_unregister(&hash_ip_type);
+}
+
+module_init(hash_ip_init);
+module_exit(hash_ip_fini);
diff --git a/net/netfilter/ipset/ip_set_hash_ipport.c b/net/netfilter/ipset/ip_set_hash_ipport.c
new file mode 100644
index 00000000000..adbe787ea5d
--- /dev/null
+++ b/net/netfilter/ipset/ip_set_hash_ipport.c
@@ -0,0 +1,544 @@
+/* Copyright (C) 2003-2011 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+/* Kernel module implementing an IP set type: the hash:ip,port type */
+
+#include <linux/jhash.h>
+#include <linux/module.h>
+#include <linux/ip.h>
+#include <linux/skbuff.h>
+#include <linux/errno.h>
+#include <linux/random.h>
+#include <net/ip.h>
+#include <net/ipv6.h>
+#include <net/netlink.h>
+#include <net/tcp.h>
+
+#include <linux/netfilter.h>
+#include <linux/netfilter/ipset/pfxlen.h>
+#include <linux/netfilter/ipset/ip_set.h>
+#include <linux/netfilter/ipset/ip_set_timeout.h>
+#include <linux/netfilter/ipset/ip_set_getport.h>
+#include <linux/netfilter/ipset/ip_set_hash.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>");
+MODULE_DESCRIPTION("hash:ip,port type of IP sets");
+MODULE_ALIAS("ip_set_hash:ip,port");
+
+/* Type specific function prefix */
+#define TYPE hash_ipport
+
+static bool
+hash_ipport_same_set(const struct ip_set *a, const struct ip_set *b);
+
+#define hash_ipport4_same_set hash_ipport_same_set
+#define hash_ipport6_same_set hash_ipport_same_set
+
+/* The type variant functions: IPv4 */
+
+/* Member elements without timeout */
+struct hash_ipport4_elem {
+ __be32 ip;
+ __be16 port;
+ u8 proto;
+ u8 padding;
+};
+
+/* Member elements with timeout support */
+struct hash_ipport4_telem {
+ __be32 ip;
+ __be16 port;
+ u8 proto;
+ u8 padding;
+ unsigned long timeout;
+};
+
+static inline bool
+hash_ipport4_data_equal(const struct hash_ipport4_elem *ip1,
+ const struct hash_ipport4_elem *ip2)
+{
+ return ip1->ip == ip2->ip &&
+ ip1->port == ip2->port &&
+ ip1->proto == ip2->proto;
+}
+
+static inline bool
+hash_ipport4_data_isnull(const struct hash_ipport4_elem *elem)
+{
+ return elem->proto == 0;
+}
+
+static inline void
+hash_ipport4_data_copy(struct hash_ipport4_elem *dst,
+ const struct hash_ipport4_elem *src)
+{
+ dst->ip = src->ip;
+ dst->port = src->port;
+ dst->proto = src->proto;
+}
+
+static inline void
+hash_ipport4_data_zero_out(struct hash_ipport4_elem *elem)
+{
+ elem->proto = 0;
+}
+
+static bool
+hash_ipport4_data_list(struct sk_buff *skb,
+ const struct hash_ipport4_elem *data)
+{
+ NLA_PUT_IPADDR4(skb, IPSET_ATTR_IP, data->ip);
+ NLA_PUT_NET16(skb, IPSET_ATTR_PORT, data->port);
+ NLA_PUT_U8(skb, IPSET_ATTR_PROTO, data->proto);
+ return 0;
+
+nla_put_failure:
+ return 1;
+}
+
+static bool
+hash_ipport4_data_tlist(struct sk_buff *skb,
+ const struct hash_ipport4_elem *data)
+{
+ const struct hash_ipport4_telem *tdata =
+ (const struct hash_ipport4_telem *)data;
+
+ NLA_PUT_IPADDR4(skb, IPSET_ATTR_IP, tdata->ip);
+ NLA_PUT_NET16(skb, IPSET_ATTR_PORT, tdata->port);
+ NLA_PUT_U8(skb, IPSET_ATTR_PROTO, data->proto);
+ NLA_PUT_NET32(skb, IPSET_ATTR_TIMEOUT,
+ htonl(ip_set_timeout_get(tdata->timeout)));
+
+ return 0;
+
+nla_put_failure:
+ return 1;
+}
+
+#define PF 4
+#define HOST_MASK 32
+#include <linux/netfilter/ipset/ip_set_ahash.h>
+
+static int
+hash_ipport4_kadt(struct ip_set *set, const struct sk_buff *skb,
+ enum ipset_adt adt, u8 pf, u8 dim, u8 flags)
+{
+ const struct ip_set_hash *h = set->data;
+ ipset_adtfn adtfn = set->variant->adt[adt];
+ struct hash_ipport4_elem data = { };
+
+ if (!ip_set_get_ip4_port(skb, flags & IPSET_DIM_TWO_SRC,
+ &data.port, &data.proto))
+ return -EINVAL;
+
+ ip4addrptr(skb, flags & IPSET_DIM_ONE_SRC, &data.ip);
+
+ return adtfn(set, &data, h->timeout);
+}
+
+static int
+hash_ipport4_uadt(struct ip_set *set, struct nlattr *tb[],
+ enum ipset_adt adt, u32 *lineno, u32 flags)
+{
+ const struct ip_set_hash *h = set->data;
+ ipset_adtfn adtfn = set->variant->adt[adt];
+ struct hash_ipport4_elem data = { };
+ u32 ip, ip_to, p, port, port_to;
+ u32 timeout = h->timeout;
+ int ret;
+
+ if (unlikely(!tb[IPSET_ATTR_IP] ||
+ !ip_set_attr_netorder(tb, IPSET_ATTR_PORT) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT)))
+ return -IPSET_ERR_PROTOCOL;
+
+ if (tb[IPSET_ATTR_LINENO])
+ *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]);
+
+ ret = ip_set_get_ipaddr4(tb[IPSET_ATTR_IP], &data.ip);
+ if (ret)
+ return ret;
+
+ if (tb[IPSET_ATTR_PORT])
+ data.port = nla_get_be16(tb[IPSET_ATTR_PORT]);
+ else
+ return -IPSET_ERR_PROTOCOL;
+
+ if (tb[IPSET_ATTR_PROTO]) {
+ data.proto = nla_get_u8(tb[IPSET_ATTR_PROTO]);
+
+ if (data.proto == 0)
+ return -IPSET_ERR_INVALID_PROTO;
+ } else
+ return -IPSET_ERR_MISSING_PROTO;
+
+ switch (data.proto) {
+ case IPPROTO_UDP:
+ case IPPROTO_TCP:
+ case IPPROTO_ICMP:
+ break;
+ default:
+ data.port = 0;
+ break;
+ }
+
+ if (tb[IPSET_ATTR_TIMEOUT]) {
+ if (!with_timeout(h->timeout))
+ return -IPSET_ERR_TIMEOUT;
+ timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]);
+ }
+
+ if (adt == IPSET_TEST ||
+ !(data.proto == IPPROTO_TCP || data.proto == IPPROTO_UDP) ||
+ !(tb[IPSET_ATTR_IP_TO] || tb[IPSET_ATTR_CIDR] ||
+ tb[IPSET_ATTR_PORT_TO])) {
+ ret = adtfn(set, &data, timeout);
+ return ip_set_eexist(ret, flags) ? 0 : ret;
+ }
+
+ ip = ntohl(data.ip);
+ if (tb[IPSET_ATTR_IP_TO]) {
+ ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP_TO], &ip_to);
+ if (ret)
+ return ret;
+ if (ip > ip_to)
+ swap(ip, ip_to);
+ } else if (tb[IPSET_ATTR_CIDR]) {
+ u8 cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]);
+
+ if (cidr > 32)
+ return -IPSET_ERR_INVALID_CIDR;
+ ip &= ip_set_hostmask(cidr);
+ ip_to = ip | ~ip_set_hostmask(cidr);
+ } else
+ ip_to = ip;
+
+ port = ntohs(data.port);
+ if (tb[IPSET_ATTR_PORT_TO]) {
+ port_to = ip_set_get_h16(tb[IPSET_ATTR_PORT_TO]);
+ if (port > port_to)
+ swap(port, port_to);
+ } else
+ port_to = port;
+
+ for (; !before(ip_to, ip); ip++)
+ for (p = port; p <= port_to; p++) {
+ data.ip = htonl(ip);
+ data.port = htons(p);
+ ret = adtfn(set, &data, timeout);
+
+ if (ret && !ip_set_eexist(ret, flags))
+ return ret;
+ else
+ ret = 0;
+ }
+ return ret;
+}
+
+static bool
+hash_ipport_same_set(const struct ip_set *a, const struct ip_set *b)
+{
+ const struct ip_set_hash *x = a->data;
+ const struct ip_set_hash *y = b->data;
+
+ /* Resizing changes htable_bits, so we ignore it */
+ return x->maxelem == y->maxelem &&
+ x->timeout == y->timeout;
+}
+
+/* The type variant functions: IPv6 */
+
+struct hash_ipport6_elem {
+ union nf_inet_addr ip;
+ __be16 port;
+ u8 proto;
+ u8 padding;
+};
+
+struct hash_ipport6_telem {
+ union nf_inet_addr ip;
+ __be16 port;
+ u8 proto;
+ u8 padding;
+ unsigned long timeout;
+};
+
+static inline bool
+hash_ipport6_data_equal(const struct hash_ipport6_elem *ip1,
+ const struct hash_ipport6_elem *ip2)
+{
+ return ipv6_addr_cmp(&ip1->ip.in6, &ip2->ip.in6) == 0 &&
+ ip1->port == ip2->port &&
+ ip1->proto == ip2->proto;
+}
+
+static inline bool
+hash_ipport6_data_isnull(const struct hash_ipport6_elem *elem)
+{
+ return elem->proto == 0;
+}
+
+static inline void
+hash_ipport6_data_copy(struct hash_ipport6_elem *dst,
+ const struct hash_ipport6_elem *src)
+{
+ memcpy(dst, src, sizeof(*dst));
+}
+
+static inline void
+hash_ipport6_data_zero_out(struct hash_ipport6_elem *elem)
+{
+ elem->proto = 0;
+}
+
+static bool
+hash_ipport6_data_list(struct sk_buff *skb,
+ const struct hash_ipport6_elem *data)
+{
+ NLA_PUT_IPADDR6(skb, IPSET_ATTR_IP, &data->ip);
+ NLA_PUT_NET16(skb, IPSET_ATTR_PORT, data->port);
+ NLA_PUT_U8(skb, IPSET_ATTR_PROTO, data->proto);
+ return 0;
+
+nla_put_failure:
+ return 1;
+}
+
+static bool
+hash_ipport6_data_tlist(struct sk_buff *skb,
+ const struct hash_ipport6_elem *data)
+{
+ const struct hash_ipport6_telem *e =
+ (const struct hash_ipport6_telem *)data;
+
+ NLA_PUT_IPADDR6(skb, IPSET_ATTR_IP, &e->ip);
+ NLA_PUT_NET16(skb, IPSET_ATTR_PORT, data->port);
+ NLA_PUT_U8(skb, IPSET_ATTR_PROTO, data->proto);
+ NLA_PUT_NET32(skb, IPSET_ATTR_TIMEOUT,
+ htonl(ip_set_timeout_get(e->timeout)));
+ return 0;
+
+nla_put_failure:
+ return 1;
+}
+
+#undef PF
+#undef HOST_MASK
+
+#define PF 6
+#define HOST_MASK 128
+#include <linux/netfilter/ipset/ip_set_ahash.h>
+
+static int
+hash_ipport6_kadt(struct ip_set *set, const struct sk_buff *skb,
+ enum ipset_adt adt, u8 pf, u8 dim, u8 flags)
+{
+ const struct ip_set_hash *h = set->data;
+ ipset_adtfn adtfn = set->variant->adt[adt];
+ struct hash_ipport6_elem data = { };
+
+ if (!ip_set_get_ip6_port(skb, flags & IPSET_DIM_TWO_SRC,
+ &data.port, &data.proto))
+ return -EINVAL;
+
+ ip6addrptr(skb, flags & IPSET_DIM_ONE_SRC, &data.ip.in6);
+
+ return adtfn(set, &data, h->timeout);
+}
+
+static int
+hash_ipport6_uadt(struct ip_set *set, struct nlattr *tb[],
+ enum ipset_adt adt, u32 *lineno, u32 flags)
+{
+ const struct ip_set_hash *h = set->data;
+ ipset_adtfn adtfn = set->variant->adt[adt];
+ struct hash_ipport6_elem data = { };
+ u32 port, port_to;
+ u32 timeout = h->timeout;
+ int ret;
+
+ if (unlikely(!tb[IPSET_ATTR_IP] ||
+ !ip_set_attr_netorder(tb, IPSET_ATTR_PORT) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) ||
+ tb[IPSET_ATTR_IP_TO] ||
+ tb[IPSET_ATTR_CIDR]))
+ return -IPSET_ERR_PROTOCOL;
+
+ if (tb[IPSET_ATTR_LINENO])
+ *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]);
+
+ ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &data.ip);
+ if (ret)
+ return ret;
+
+ if (tb[IPSET_ATTR_PORT])
+ data.port = nla_get_be16(tb[IPSET_ATTR_PORT]);
+ else
+ return -IPSET_ERR_PROTOCOL;
+
+ if (tb[IPSET_ATTR_PROTO]) {
+ data.proto = nla_get_u8(tb[IPSET_ATTR_PROTO]);
+
+ if (data.proto == 0)
+ return -IPSET_ERR_INVALID_PROTO;
+ } else
+ return -IPSET_ERR_MISSING_PROTO;
+
+ switch (data.proto) {
+ case IPPROTO_UDP:
+ case IPPROTO_TCP:
+ case IPPROTO_ICMPV6:
+ break;
+ default:
+ data.port = 0;
+ break;
+ }
+
+ if (tb[IPSET_ATTR_TIMEOUT]) {
+ if (!with_timeout(h->timeout))
+ return -IPSET_ERR_TIMEOUT;
+ timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]);
+ }
+
+ if (adt == IPSET_TEST ||
+ !(data.proto == IPPROTO_TCP || data.proto == IPPROTO_UDP) ||
+ !tb[IPSET_ATTR_PORT_TO]) {
+ ret = adtfn(set, &data, timeout);
+ return ip_set_eexist(ret, flags) ? 0 : ret;
+ }
+
+ port = ntohs(data.port);
+ port_to = ip_set_get_h16(tb[IPSET_ATTR_PORT_TO]);
+ if (port > port_to)
+ swap(port, port_to);
+
+ for (; port <= port_to; port++) {
+ data.port = htons(port);
+ ret = adtfn(set, &data, timeout);
+
+ if (ret && !ip_set_eexist(ret, flags))
+ return ret;
+ else
+ ret = 0;
+ }
+ return ret;
+}
+
+/* Create hash:ip type of sets */
+
+static int
+hash_ipport_create(struct ip_set *set, struct nlattr *tb[], u32 flags)
+{
+ struct ip_set_hash *h;
+ u32 hashsize = IPSET_DEFAULT_HASHSIZE, maxelem = IPSET_DEFAULT_MAXELEM;
+ u8 hbits;
+
+ if (!(set->family == AF_INET || set->family == AF_INET6))
+ return -IPSET_ERR_INVALID_FAMILY;
+
+ if (unlikely(!ip_set_optattr_netorder(tb, IPSET_ATTR_HASHSIZE) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_MAXELEM) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT)))
+ return -IPSET_ERR_PROTOCOL;
+
+ if (tb[IPSET_ATTR_HASHSIZE]) {
+ hashsize = ip_set_get_h32(tb[IPSET_ATTR_HASHSIZE]);
+ if (hashsize < IPSET_MIMINAL_HASHSIZE)
+ hashsize = IPSET_MIMINAL_HASHSIZE;
+ }
+
+ if (tb[IPSET_ATTR_MAXELEM])
+ maxelem = ip_set_get_h32(tb[IPSET_ATTR_MAXELEM]);
+
+ h = kzalloc(sizeof(*h), GFP_KERNEL);
+ if (!h)
+ return -ENOMEM;
+
+ h->maxelem = maxelem;
+ get_random_bytes(&h->initval, sizeof(h->initval));
+ h->timeout = IPSET_NO_TIMEOUT;
+
+ hbits = htable_bits(hashsize);
+ h->table = ip_set_alloc(
+ sizeof(struct htable)
+ + jhash_size(hbits) * sizeof(struct hbucket));
+ if (!h->table) {
+ kfree(h);
+ return -ENOMEM;
+ }
+ h->table->htable_bits = hbits;
+
+ set->data = h;
+
+ if (tb[IPSET_ATTR_TIMEOUT]) {
+ h->timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]);
+
+ set->variant = set->family == AF_INET
+ ? &hash_ipport4_tvariant : &hash_ipport6_tvariant;
+
+ if (set->family == AF_INET)
+ hash_ipport4_gc_init(set);
+ else
+ hash_ipport6_gc_init(set);
+ } else {
+ set->variant = set->family == AF_INET
+ ? &hash_ipport4_variant : &hash_ipport6_variant;
+ }
+
+ pr_debug("create %s hashsize %u (%u) maxelem %u: %p(%p)\n",
+ set->name, jhash_size(h->table->htable_bits),
+ h->table->htable_bits, h->maxelem, set->data, h->table);
+
+ return 0;
+}
+
+static struct ip_set_type hash_ipport_type __read_mostly = {
+ .name = "hash:ip,port",
+ .protocol = IPSET_PROTOCOL,
+ .features = IPSET_TYPE_IP | IPSET_TYPE_PORT,
+ .dimension = IPSET_DIM_TWO,
+ .family = AF_UNSPEC,
+ .revision = 0,
+ .create = hash_ipport_create,
+ .create_policy = {
+ [IPSET_ATTR_HASHSIZE] = { .type = NLA_U32 },
+ [IPSET_ATTR_MAXELEM] = { .type = NLA_U32 },
+ [IPSET_ATTR_PROBES] = { .type = NLA_U8 },
+ [IPSET_ATTR_RESIZE] = { .type = NLA_U8 },
+ [IPSET_ATTR_PROTO] = { .type = NLA_U8 },
+ [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 },
+ },
+ .adt_policy = {
+ [IPSET_ATTR_IP] = { .type = NLA_NESTED },
+ [IPSET_ATTR_IP_TO] = { .type = NLA_NESTED },
+ [IPSET_ATTR_PORT] = { .type = NLA_U16 },
+ [IPSET_ATTR_PORT_TO] = { .type = NLA_U16 },
+ [IPSET_ATTR_CIDR] = { .type = NLA_U8 },
+ [IPSET_ATTR_PROTO] = { .type = NLA_U8 },
+ [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 },
+ [IPSET_ATTR_LINENO] = { .type = NLA_U32 },
+ },
+ .me = THIS_MODULE,
+};
+
+static int __init
+hash_ipport_init(void)
+{
+ return ip_set_type_register(&hash_ipport_type);
+}
+
+static void __exit
+hash_ipport_fini(void)
+{
+ ip_set_type_unregister(&hash_ipport_type);
+}
+
+module_init(hash_ipport_init);
+module_exit(hash_ipport_fini);
diff --git a/net/netfilter/ipset/ip_set_hash_ipportip.c b/net/netfilter/ipset/ip_set_hash_ipportip.c
new file mode 100644
index 00000000000..22e23abb86c
--- /dev/null
+++ b/net/netfilter/ipset/ip_set_hash_ipportip.c
@@ -0,0 +1,562 @@
+/* Copyright (C) 2003-2011 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+/* Kernel module implementing an IP set type: the hash:ip,port,ip type */
+
+#include <linux/jhash.h>
+#include <linux/module.h>
+#include <linux/ip.h>
+#include <linux/skbuff.h>
+#include <linux/errno.h>
+#include <linux/random.h>
+#include <net/ip.h>
+#include <net/ipv6.h>
+#include <net/netlink.h>
+#include <net/tcp.h>
+
+#include <linux/netfilter.h>
+#include <linux/netfilter/ipset/pfxlen.h>
+#include <linux/netfilter/ipset/ip_set.h>
+#include <linux/netfilter/ipset/ip_set_timeout.h>
+#include <linux/netfilter/ipset/ip_set_getport.h>
+#include <linux/netfilter/ipset/ip_set_hash.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>");
+MODULE_DESCRIPTION("hash:ip,port,ip type of IP sets");
+MODULE_ALIAS("ip_set_hash:ip,port,ip");
+
+/* Type specific function prefix */
+#define TYPE hash_ipportip
+
+static bool
+hash_ipportip_same_set(const struct ip_set *a, const struct ip_set *b);
+
+#define hash_ipportip4_same_set hash_ipportip_same_set
+#define hash_ipportip6_same_set hash_ipportip_same_set
+
+/* The type variant functions: IPv4 */
+
+/* Member elements without timeout */
+struct hash_ipportip4_elem {
+ __be32 ip;
+ __be32 ip2;
+ __be16 port;
+ u8 proto;
+ u8 padding;
+};
+
+/* Member elements with timeout support */
+struct hash_ipportip4_telem {
+ __be32 ip;
+ __be32 ip2;
+ __be16 port;
+ u8 proto;
+ u8 padding;
+ unsigned long timeout;
+};
+
+static inline bool
+hash_ipportip4_data_equal(const struct hash_ipportip4_elem *ip1,
+ const struct hash_ipportip4_elem *ip2)
+{
+ return ip1->ip == ip2->ip &&
+ ip1->ip2 == ip2->ip2 &&
+ ip1->port == ip2->port &&
+ ip1->proto == ip2->proto;
+}
+
+static inline bool
+hash_ipportip4_data_isnull(const struct hash_ipportip4_elem *elem)
+{
+ return elem->proto == 0;
+}
+
+static inline void
+hash_ipportip4_data_copy(struct hash_ipportip4_elem *dst,
+ const struct hash_ipportip4_elem *src)
+{
+ memcpy(dst, src, sizeof(*dst));
+}
+
+static inline void
+hash_ipportip4_data_zero_out(struct hash_ipportip4_elem *elem)
+{
+ elem->proto = 0;
+}
+
+static bool
+hash_ipportip4_data_list(struct sk_buff *skb,
+ const struct hash_ipportip4_elem *data)
+{
+ NLA_PUT_IPADDR4(skb, IPSET_ATTR_IP, data->ip);
+ NLA_PUT_IPADDR4(skb, IPSET_ATTR_IP2, data->ip2);
+ NLA_PUT_NET16(skb, IPSET_ATTR_PORT, data->port);
+ NLA_PUT_U8(skb, IPSET_ATTR_PROTO, data->proto);
+ return 0;
+
+nla_put_failure:
+ return 1;
+}
+
+static bool
+hash_ipportip4_data_tlist(struct sk_buff *skb,
+ const struct hash_ipportip4_elem *data)
+{
+ const struct hash_ipportip4_telem *tdata =
+ (const struct hash_ipportip4_telem *)data;
+
+ NLA_PUT_IPADDR4(skb, IPSET_ATTR_IP, tdata->ip);
+ NLA_PUT_IPADDR4(skb, IPSET_ATTR_IP2, tdata->ip2);
+ NLA_PUT_NET16(skb, IPSET_ATTR_PORT, tdata->port);
+ NLA_PUT_U8(skb, IPSET_ATTR_PROTO, data->proto);
+ NLA_PUT_NET32(skb, IPSET_ATTR_TIMEOUT,
+ htonl(ip_set_timeout_get(tdata->timeout)));
+
+ return 0;
+
+nla_put_failure:
+ return 1;
+}
+
+#define PF 4
+#define HOST_MASK 32
+#include <linux/netfilter/ipset/ip_set_ahash.h>
+
+static int
+hash_ipportip4_kadt(struct ip_set *set, const struct sk_buff *skb,
+ enum ipset_adt adt, u8 pf, u8 dim, u8 flags)
+{
+ const struct ip_set_hash *h = set->data;
+ ipset_adtfn adtfn = set->variant->adt[adt];
+ struct hash_ipportip4_elem data = { };
+
+ if (!ip_set_get_ip4_port(skb, flags & IPSET_DIM_TWO_SRC,
+ &data.port, &data.proto))
+ return -EINVAL;
+
+ ip4addrptr(skb, flags & IPSET_DIM_ONE_SRC, &data.ip);
+ ip4addrptr(skb, flags & IPSET_DIM_THREE_SRC, &data.ip2);
+
+ return adtfn(set, &data, h->timeout);
+}
+
+static int
+hash_ipportip4_uadt(struct ip_set *set, struct nlattr *tb[],
+ enum ipset_adt adt, u32 *lineno, u32 flags)
+{
+ const struct ip_set_hash *h = set->data;
+ ipset_adtfn adtfn = set->variant->adt[adt];
+ struct hash_ipportip4_elem data = { };
+ u32 ip, ip_to, p, port, port_to;
+ u32 timeout = h->timeout;
+ int ret;
+
+ if (unlikely(!tb[IPSET_ATTR_IP] || !tb[IPSET_ATTR_IP2] ||
+ !ip_set_attr_netorder(tb, IPSET_ATTR_PORT) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT)))
+ return -IPSET_ERR_PROTOCOL;
+
+ if (tb[IPSET_ATTR_LINENO])
+ *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]);
+
+ ret = ip_set_get_ipaddr4(tb[IPSET_ATTR_IP], &data.ip);
+ if (ret)
+ return ret;
+
+ ret = ip_set_get_ipaddr4(tb[IPSET_ATTR_IP2], &data.ip2);
+ if (ret)
+ return ret;
+
+ if (tb[IPSET_ATTR_PORT])
+ data.port = nla_get_be16(tb[IPSET_ATTR_PORT]);
+ else
+ return -IPSET_ERR_PROTOCOL;
+
+ if (tb[IPSET_ATTR_PROTO]) {
+ data.proto = nla_get_u8(tb[IPSET_ATTR_PROTO]);
+
+ if (data.proto == 0)
+ return -IPSET_ERR_INVALID_PROTO;
+ } else
+ return -IPSET_ERR_MISSING_PROTO;
+
+ switch (data.proto) {
+ case IPPROTO_UDP:
+ case IPPROTO_TCP:
+ case IPPROTO_ICMP:
+ break;
+ default:
+ data.port = 0;
+ break;
+ }
+
+ if (tb[IPSET_ATTR_TIMEOUT]) {
+ if (!with_timeout(h->timeout))
+ return -IPSET_ERR_TIMEOUT;
+ timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]);
+ }
+
+ if (adt == IPSET_TEST ||
+ !(data.proto == IPPROTO_TCP || data.proto == IPPROTO_UDP) ||
+ !(tb[IPSET_ATTR_IP_TO] || tb[IPSET_ATTR_CIDR] ||
+ tb[IPSET_ATTR_PORT_TO])) {
+ ret = adtfn(set, &data, timeout);
+ return ip_set_eexist(ret, flags) ? 0 : ret;
+ }
+
+ ip = ntohl(data.ip);
+ if (tb[IPSET_ATTR_IP_TO]) {
+ ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP_TO], &ip_to);
+ if (ret)
+ return ret;
+ if (ip > ip_to)
+ swap(ip, ip_to);
+ } else if (tb[IPSET_ATTR_CIDR]) {
+ u8 cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]);
+
+ if (cidr > 32)
+ return -IPSET_ERR_INVALID_CIDR;
+ ip &= ip_set_hostmask(cidr);
+ ip_to = ip | ~ip_set_hostmask(cidr);
+ } else
+ ip_to = ip;
+
+ port = ntohs(data.port);
+ if (tb[IPSET_ATTR_PORT_TO]) {
+ port_to = ip_set_get_h16(tb[IPSET_ATTR_PORT_TO]);
+ if (port > port_to)
+ swap(port, port_to);
+ } else
+ port_to = port;
+
+ for (; !before(ip_to, ip); ip++)
+ for (p = port; p <= port_to; p++) {
+ data.ip = htonl(ip);
+ data.port = htons(p);
+ ret = adtfn(set, &data, timeout);
+
+ if (ret && !ip_set_eexist(ret, flags))
+ return ret;
+ else
+ ret = 0;
+ }
+ return ret;
+}
+
+static bool
+hash_ipportip_same_set(const struct ip_set *a, const struct ip_set *b)
+{
+ const struct ip_set_hash *x = a->data;
+ const struct ip_set_hash *y = b->data;
+
+ /* Resizing changes htable_bits, so we ignore it */
+ return x->maxelem == y->maxelem &&
+ x->timeout == y->timeout;
+}
+
+/* The type variant functions: IPv6 */
+
+struct hash_ipportip6_elem {
+ union nf_inet_addr ip;
+ union nf_inet_addr ip2;
+ __be16 port;
+ u8 proto;
+ u8 padding;
+};
+
+struct hash_ipportip6_telem {
+ union nf_inet_addr ip;
+ union nf_inet_addr ip2;
+ __be16 port;
+ u8 proto;
+ u8 padding;
+ unsigned long timeout;
+};
+
+static inline bool
+hash_ipportip6_data_equal(const struct hash_ipportip6_elem *ip1,
+ const struct hash_ipportip6_elem *ip2)
+{
+ return ipv6_addr_cmp(&ip1->ip.in6, &ip2->ip.in6) == 0 &&
+ ipv6_addr_cmp(&ip1->ip2.in6, &ip2->ip2.in6) == 0 &&
+ ip1->port == ip2->port &&
+ ip1->proto == ip2->proto;
+}
+
+static inline bool
+hash_ipportip6_data_isnull(const struct hash_ipportip6_elem *elem)
+{
+ return elem->proto == 0;
+}
+
+static inline void
+hash_ipportip6_data_copy(struct hash_ipportip6_elem *dst,
+ const struct hash_ipportip6_elem *src)
+{
+ memcpy(dst, src, sizeof(*dst));
+}
+
+static inline void
+hash_ipportip6_data_zero_out(struct hash_ipportip6_elem *elem)
+{
+ elem->proto = 0;
+}
+
+static bool
+hash_ipportip6_data_list(struct sk_buff *skb,
+ const struct hash_ipportip6_elem *data)
+{
+ NLA_PUT_IPADDR6(skb, IPSET_ATTR_IP, &data->ip);
+ NLA_PUT_IPADDR6(skb, IPSET_ATTR_IP2, &data->ip2);
+ NLA_PUT_NET16(skb, IPSET_ATTR_PORT, data->port);
+ NLA_PUT_U8(skb, IPSET_ATTR_PROTO, data->proto);
+ return 0;
+
+nla_put_failure:
+ return 1;
+}
+
+static bool
+hash_ipportip6_data_tlist(struct sk_buff *skb,
+ const struct hash_ipportip6_elem *data)
+{
+ const struct hash_ipportip6_telem *e =
+ (const struct hash_ipportip6_telem *)data;
+
+ NLA_PUT_IPADDR6(skb, IPSET_ATTR_IP, &e->ip);
+ NLA_PUT_IPADDR6(skb, IPSET_ATTR_IP2, &data->ip2);
+ NLA_PUT_NET16(skb, IPSET_ATTR_PORT, data->port);
+ NLA_PUT_U8(skb, IPSET_ATTR_PROTO, data->proto);
+ NLA_PUT_NET32(skb, IPSET_ATTR_TIMEOUT,
+ htonl(ip_set_timeout_get(e->timeout)));
+ return 0;
+
+nla_put_failure:
+ return 1;
+}
+
+#undef PF
+#undef HOST_MASK
+
+#define PF 6
+#define HOST_MASK 128
+#include <linux/netfilter/ipset/ip_set_ahash.h>
+
+static int
+hash_ipportip6_kadt(struct ip_set *set, const struct sk_buff *skb,
+ enum ipset_adt adt, u8 pf, u8 dim, u8 flags)
+{
+ const struct ip_set_hash *h = set->data;
+ ipset_adtfn adtfn = set->variant->adt[adt];
+ struct hash_ipportip6_elem data = { };
+
+ if (!ip_set_get_ip6_port(skb, flags & IPSET_DIM_TWO_SRC,
+ &data.port, &data.proto))
+ return -EINVAL;
+
+ ip6addrptr(skb, flags & IPSET_DIM_ONE_SRC, &data.ip.in6);
+ ip6addrptr(skb, flags & IPSET_DIM_THREE_SRC, &data.ip2.in6);
+
+ return adtfn(set, &data, h->timeout);
+}
+
+static int
+hash_ipportip6_uadt(struct ip_set *set, struct nlattr *tb[],
+ enum ipset_adt adt, u32 *lineno, u32 flags)
+{
+ const struct ip_set_hash *h = set->data;
+ ipset_adtfn adtfn = set->variant->adt[adt];
+ struct hash_ipportip6_elem data = { };
+ u32 port, port_to;
+ u32 timeout = h->timeout;
+ int ret;
+
+ if (unlikely(!tb[IPSET_ATTR_IP] || !tb[IPSET_ATTR_IP2] ||
+ !ip_set_attr_netorder(tb, IPSET_ATTR_PORT) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) ||
+ tb[IPSET_ATTR_IP_TO] ||
+ tb[IPSET_ATTR_CIDR]))
+ return -IPSET_ERR_PROTOCOL;
+
+ if (tb[IPSET_ATTR_LINENO])
+ *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]);
+
+ ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &data.ip);
+ if (ret)
+ return ret;
+
+ ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP2], &data.ip2);
+ if (ret)
+ return ret;
+
+ if (tb[IPSET_ATTR_PORT])
+ data.port = nla_get_be16(tb[IPSET_ATTR_PORT]);
+ else
+ return -IPSET_ERR_PROTOCOL;
+
+ if (tb[IPSET_ATTR_PROTO]) {
+ data.proto = nla_get_u8(tb[IPSET_ATTR_PROTO]);
+
+ if (data.proto == 0)
+ return -IPSET_ERR_INVALID_PROTO;
+ } else
+ return -IPSET_ERR_MISSING_PROTO;
+
+ switch (data.proto) {
+ case IPPROTO_UDP:
+ case IPPROTO_TCP:
+ case IPPROTO_ICMPV6:
+ break;
+ default:
+ data.port = 0;
+ break;
+ }
+
+ if (tb[IPSET_ATTR_TIMEOUT]) {
+ if (!with_timeout(h->timeout))
+ return -IPSET_ERR_TIMEOUT;
+ timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]);
+ }
+
+ if (adt == IPSET_TEST ||
+ !(data.proto == IPPROTO_TCP || data.proto == IPPROTO_UDP) ||
+ !tb[IPSET_ATTR_PORT_TO]) {
+ ret = adtfn(set, &data, timeout);
+ return ip_set_eexist(ret, flags) ? 0 : ret;
+ }
+
+ port = ntohs(data.port);
+ port_to = ip_set_get_h16(tb[IPSET_ATTR_PORT_TO]);
+ if (port > port_to)
+ swap(port, port_to);
+
+ for (; port <= port_to; port++) {
+ data.port = htons(port);
+ ret = adtfn(set, &data, timeout);
+
+ if (ret && !ip_set_eexist(ret, flags))
+ return ret;
+ else
+ ret = 0;
+ }
+ return ret;
+}
+
+/* Create hash:ip type of sets */
+
+static int
+hash_ipportip_create(struct ip_set *set, struct nlattr *tb[], u32 flags)
+{
+ struct ip_set_hash *h;
+ u32 hashsize = IPSET_DEFAULT_HASHSIZE, maxelem = IPSET_DEFAULT_MAXELEM;
+ u8 hbits;
+
+ if (!(set->family == AF_INET || set->family == AF_INET6))
+ return -IPSET_ERR_INVALID_FAMILY;
+
+ if (unlikely(!ip_set_optattr_netorder(tb, IPSET_ATTR_HASHSIZE) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_MAXELEM) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT)))
+ return -IPSET_ERR_PROTOCOL;
+
+ if (tb[IPSET_ATTR_HASHSIZE]) {
+ hashsize = ip_set_get_h32(tb[IPSET_ATTR_HASHSIZE]);
+ if (hashsize < IPSET_MIMINAL_HASHSIZE)
+ hashsize = IPSET_MIMINAL_HASHSIZE;
+ }
+
+ if (tb[IPSET_ATTR_MAXELEM])
+ maxelem = ip_set_get_h32(tb[IPSET_ATTR_MAXELEM]);
+
+ h = kzalloc(sizeof(*h), GFP_KERNEL);
+ if (!h)
+ return -ENOMEM;
+
+ h->maxelem = maxelem;
+ get_random_bytes(&h->initval, sizeof(h->initval));
+ h->timeout = IPSET_NO_TIMEOUT;
+
+ hbits = htable_bits(hashsize);
+ h->table = ip_set_alloc(
+ sizeof(struct htable)
+ + jhash_size(hbits) * sizeof(struct hbucket));
+ if (!h->table) {
+ kfree(h);
+ return -ENOMEM;
+ }
+ h->table->htable_bits = hbits;
+
+ set->data = h;
+
+ if (tb[IPSET_ATTR_TIMEOUT]) {
+ h->timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]);
+
+ set->variant = set->family == AF_INET
+ ? &hash_ipportip4_tvariant : &hash_ipportip6_tvariant;
+
+ if (set->family == AF_INET)
+ hash_ipportip4_gc_init(set);
+ else
+ hash_ipportip6_gc_init(set);
+ } else {
+ set->variant = set->family == AF_INET
+ ? &hash_ipportip4_variant : &hash_ipportip6_variant;
+ }
+
+ pr_debug("create %s hashsize %u (%u) maxelem %u: %p(%p)\n",
+ set->name, jhash_size(h->table->htable_bits),
+ h->table->htable_bits, h->maxelem, set->data, h->table);
+
+ return 0;
+}
+
+static struct ip_set_type hash_ipportip_type __read_mostly = {
+ .name = "hash:ip,port,ip",
+ .protocol = IPSET_PROTOCOL,
+ .features = IPSET_TYPE_IP | IPSET_TYPE_PORT | IPSET_TYPE_IP2,
+ .dimension = IPSET_DIM_THREE,
+ .family = AF_UNSPEC,
+ .revision = 0,
+ .create = hash_ipportip_create,
+ .create_policy = {
+ [IPSET_ATTR_HASHSIZE] = { .type = NLA_U32 },
+ [IPSET_ATTR_MAXELEM] = { .type = NLA_U32 },
+ [IPSET_ATTR_PROBES] = { .type = NLA_U8 },
+ [IPSET_ATTR_RESIZE] = { .type = NLA_U8 },
+ [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 },
+ },
+ .adt_policy = {
+ [IPSET_ATTR_IP] = { .type = NLA_NESTED },
+ [IPSET_ATTR_IP_TO] = { .type = NLA_NESTED },
+ [IPSET_ATTR_IP2] = { .type = NLA_NESTED },
+ [IPSET_ATTR_PORT] = { .type = NLA_U16 },
+ [IPSET_ATTR_PORT_TO] = { .type = NLA_U16 },
+ [IPSET_ATTR_CIDR] = { .type = NLA_U8 },
+ [IPSET_ATTR_PROTO] = { .type = NLA_U8 },
+ [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 },
+ [IPSET_ATTR_LINENO] = { .type = NLA_U32 },
+ },
+ .me = THIS_MODULE,
+};
+
+static int __init
+hash_ipportip_init(void)
+{
+ return ip_set_type_register(&hash_ipportip_type);
+}
+
+static void __exit
+hash_ipportip_fini(void)
+{
+ ip_set_type_unregister(&hash_ipportip_type);
+}
+
+module_init(hash_ipportip_init);
+module_exit(hash_ipportip_fini);
diff --git a/net/netfilter/ipset/ip_set_hash_ipportnet.c b/net/netfilter/ipset/ip_set_hash_ipportnet.c
new file mode 100644
index 00000000000..6033e8b54bb
--- /dev/null
+++ b/net/netfilter/ipset/ip_set_hash_ipportnet.c
@@ -0,0 +1,628 @@
+/* Copyright (C) 2003-2011 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+/* Kernel module implementing an IP set type: the hash:ip,port,net type */
+
+#include <linux/jhash.h>
+#include <linux/module.h>
+#include <linux/ip.h>
+#include <linux/skbuff.h>
+#include <linux/errno.h>
+#include <linux/random.h>
+#include <net/ip.h>
+#include <net/ipv6.h>
+#include <net/netlink.h>
+#include <net/tcp.h>
+
+#include <linux/netfilter.h>
+#include <linux/netfilter/ipset/pfxlen.h>
+#include <linux/netfilter/ipset/ip_set.h>
+#include <linux/netfilter/ipset/ip_set_timeout.h>
+#include <linux/netfilter/ipset/ip_set_getport.h>
+#include <linux/netfilter/ipset/ip_set_hash.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>");
+MODULE_DESCRIPTION("hash:ip,port,net type of IP sets");
+MODULE_ALIAS("ip_set_hash:ip,port,net");
+
+/* Type specific function prefix */
+#define TYPE hash_ipportnet
+
+static bool
+hash_ipportnet_same_set(const struct ip_set *a, const struct ip_set *b);
+
+#define hash_ipportnet4_same_set hash_ipportnet_same_set
+#define hash_ipportnet6_same_set hash_ipportnet_same_set
+
+/* The type variant functions: IPv4 */
+
+/* Member elements without timeout */
+struct hash_ipportnet4_elem {
+ __be32 ip;
+ __be32 ip2;
+ __be16 port;
+ u8 cidr;
+ u8 proto;
+};
+
+/* Member elements with timeout support */
+struct hash_ipportnet4_telem {
+ __be32 ip;
+ __be32 ip2;
+ __be16 port;
+ u8 cidr;
+ u8 proto;
+ unsigned long timeout;
+};
+
+static inline bool
+hash_ipportnet4_data_equal(const struct hash_ipportnet4_elem *ip1,
+ const struct hash_ipportnet4_elem *ip2)
+{
+ return ip1->ip == ip2->ip &&
+ ip1->ip2 == ip2->ip2 &&
+ ip1->cidr == ip2->cidr &&
+ ip1->port == ip2->port &&
+ ip1->proto == ip2->proto;
+}
+
+static inline bool
+hash_ipportnet4_data_isnull(const struct hash_ipportnet4_elem *elem)
+{
+ return elem->proto == 0;
+}
+
+static inline void
+hash_ipportnet4_data_copy(struct hash_ipportnet4_elem *dst,
+ const struct hash_ipportnet4_elem *src)
+{
+ memcpy(dst, src, sizeof(*dst));
+}
+
+static inline void
+hash_ipportnet4_data_netmask(struct hash_ipportnet4_elem *elem, u8 cidr)
+{
+ elem->ip2 &= ip_set_netmask(cidr);
+ elem->cidr = cidr;
+}
+
+static inline void
+hash_ipportnet4_data_zero_out(struct hash_ipportnet4_elem *elem)
+{
+ elem->proto = 0;
+}
+
+static bool
+hash_ipportnet4_data_list(struct sk_buff *skb,
+ const struct hash_ipportnet4_elem *data)
+{
+ NLA_PUT_IPADDR4(skb, IPSET_ATTR_IP, data->ip);
+ NLA_PUT_IPADDR4(skb, IPSET_ATTR_IP2, data->ip2);
+ NLA_PUT_NET16(skb, IPSET_ATTR_PORT, data->port);
+ NLA_PUT_U8(skb, IPSET_ATTR_CIDR2, data->cidr);
+ NLA_PUT_U8(skb, IPSET_ATTR_PROTO, data->proto);
+ return 0;
+
+nla_put_failure:
+ return 1;
+}
+
+static bool
+hash_ipportnet4_data_tlist(struct sk_buff *skb,
+ const struct hash_ipportnet4_elem *data)
+{
+ const struct hash_ipportnet4_telem *tdata =
+ (const struct hash_ipportnet4_telem *)data;
+
+ NLA_PUT_IPADDR4(skb, IPSET_ATTR_IP, tdata->ip);
+ NLA_PUT_IPADDR4(skb, IPSET_ATTR_IP2, tdata->ip2);
+ NLA_PUT_NET16(skb, IPSET_ATTR_PORT, tdata->port);
+ NLA_PUT_U8(skb, IPSET_ATTR_CIDR2, data->cidr);
+ NLA_PUT_U8(skb, IPSET_ATTR_PROTO, data->proto);
+ NLA_PUT_NET32(skb, IPSET_ATTR_TIMEOUT,
+ htonl(ip_set_timeout_get(tdata->timeout)));
+
+ return 0;
+
+nla_put_failure:
+ return 1;
+}
+
+#define IP_SET_HASH_WITH_PROTO
+#define IP_SET_HASH_WITH_NETS
+
+#define PF 4
+#define HOST_MASK 32
+#include <linux/netfilter/ipset/ip_set_ahash.h>
+
+static int
+hash_ipportnet4_kadt(struct ip_set *set, const struct sk_buff *skb,
+ enum ipset_adt adt, u8 pf, u8 dim, u8 flags)
+{
+ const struct ip_set_hash *h = set->data;
+ ipset_adtfn adtfn = set->variant->adt[adt];
+ struct hash_ipportnet4_elem data =
+ { .cidr = h->nets[0].cidr || HOST_MASK };
+
+ if (data.cidr == 0)
+ return -EINVAL;
+ if (adt == IPSET_TEST)
+ data.cidr = HOST_MASK;
+
+ if (!ip_set_get_ip4_port(skb, flags & IPSET_DIM_TWO_SRC,
+ &data.port, &data.proto))
+ return -EINVAL;
+
+ ip4addrptr(skb, flags & IPSET_DIM_ONE_SRC, &data.ip);
+ ip4addrptr(skb, flags & IPSET_DIM_THREE_SRC, &data.ip2);
+ data.ip2 &= ip_set_netmask(data.cidr);
+
+ return adtfn(set, &data, h->timeout);
+}
+
+static int
+hash_ipportnet4_uadt(struct ip_set *set, struct nlattr *tb[],
+ enum ipset_adt adt, u32 *lineno, u32 flags)
+{
+ const struct ip_set_hash *h = set->data;
+ ipset_adtfn adtfn = set->variant->adt[adt];
+ struct hash_ipportnet4_elem data = { .cidr = HOST_MASK };
+ u32 ip, ip_to, p, port, port_to;
+ u32 timeout = h->timeout;
+ int ret;
+
+ if (unlikely(!tb[IPSET_ATTR_IP] || !tb[IPSET_ATTR_IP2] ||
+ !ip_set_attr_netorder(tb, IPSET_ATTR_PORT) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT)))
+ return -IPSET_ERR_PROTOCOL;
+
+ if (tb[IPSET_ATTR_LINENO])
+ *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]);
+
+ ret = ip_set_get_ipaddr4(tb[IPSET_ATTR_IP], &data.ip);
+ if (ret)
+ return ret;
+
+ ret = ip_set_get_ipaddr4(tb[IPSET_ATTR_IP2], &data.ip2);
+ if (ret)
+ return ret;
+
+ if (tb[IPSET_ATTR_CIDR2])
+ data.cidr = nla_get_u8(tb[IPSET_ATTR_CIDR2]);
+
+ if (!data.cidr)
+ return -IPSET_ERR_INVALID_CIDR;
+
+ data.ip2 &= ip_set_netmask(data.cidr);
+
+ if (tb[IPSET_ATTR_PORT])
+ data.port = nla_get_be16(tb[IPSET_ATTR_PORT]);
+ else
+ return -IPSET_ERR_PROTOCOL;
+
+ if (tb[IPSET_ATTR_PROTO]) {
+ data.proto = nla_get_u8(tb[IPSET_ATTR_PROTO]);
+
+ if (data.proto == 0)
+ return -IPSET_ERR_INVALID_PROTO;
+ } else
+ return -IPSET_ERR_MISSING_PROTO;
+
+ switch (data.proto) {
+ case IPPROTO_UDP:
+ case IPPROTO_TCP:
+ case IPPROTO_ICMP:
+ break;
+ default:
+ data.port = 0;
+ break;
+ }
+
+ if (tb[IPSET_ATTR_TIMEOUT]) {
+ if (!with_timeout(h->timeout))
+ return -IPSET_ERR_TIMEOUT;
+ timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]);
+ }
+
+ if (adt == IPSET_TEST ||
+ !(data.proto == IPPROTO_TCP || data.proto == IPPROTO_UDP) ||
+ !(tb[IPSET_ATTR_IP_TO] || tb[IPSET_ATTR_CIDR] ||
+ tb[IPSET_ATTR_PORT_TO])) {
+ ret = adtfn(set, &data, timeout);
+ return ip_set_eexist(ret, flags) ? 0 : ret;
+ }
+
+ ip = ntohl(data.ip);
+ if (tb[IPSET_ATTR_IP_TO]) {
+ ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP_TO], &ip_to);
+ if (ret)
+ return ret;
+ if (ip > ip_to)
+ swap(ip, ip_to);
+ } else if (tb[IPSET_ATTR_CIDR]) {
+ u8 cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]);
+
+ if (cidr > 32)
+ return -IPSET_ERR_INVALID_CIDR;
+ ip &= ip_set_hostmask(cidr);
+ ip_to = ip | ~ip_set_hostmask(cidr);
+ } else
+ ip_to = ip;
+
+ port = ntohs(data.port);
+ if (tb[IPSET_ATTR_PORT_TO]) {
+ port_to = ip_set_get_h16(tb[IPSET_ATTR_PORT_TO]);
+ if (port > port_to)
+ swap(port, port_to);
+ } else
+ port_to = port;
+
+ for (; !before(ip_to, ip); ip++)
+ for (p = port; p <= port_to; p++) {
+ data.ip = htonl(ip);
+ data.port = htons(p);
+ ret = adtfn(set, &data, timeout);
+
+ if (ret && !ip_set_eexist(ret, flags))
+ return ret;
+ else
+ ret = 0;
+ }
+ return ret;
+}
+
+static bool
+hash_ipportnet_same_set(const struct ip_set *a, const struct ip_set *b)
+{
+ const struct ip_set_hash *x = a->data;
+ const struct ip_set_hash *y = b->data;
+
+ /* Resizing changes htable_bits, so we ignore it */
+ return x->maxelem == y->maxelem &&
+ x->timeout == y->timeout;
+}
+
+/* The type variant functions: IPv6 */
+
+struct hash_ipportnet6_elem {
+ union nf_inet_addr ip;
+ union nf_inet_addr ip2;
+ __be16 port;
+ u8 cidr;
+ u8 proto;
+};
+
+struct hash_ipportnet6_telem {
+ union nf_inet_addr ip;
+ union nf_inet_addr ip2;
+ __be16 port;
+ u8 cidr;
+ u8 proto;
+ unsigned long timeout;
+};
+
+static inline bool
+hash_ipportnet6_data_equal(const struct hash_ipportnet6_elem *ip1,
+ const struct hash_ipportnet6_elem *ip2)
+{
+ return ipv6_addr_cmp(&ip1->ip.in6, &ip2->ip.in6) == 0 &&
+ ipv6_addr_cmp(&ip1->ip2.in6, &ip2->ip2.in6) == 0 &&
+ ip1->cidr == ip2->cidr &&
+ ip1->port == ip2->port &&
+ ip1->proto == ip2->proto;
+}
+
+static inline bool
+hash_ipportnet6_data_isnull(const struct hash_ipportnet6_elem *elem)
+{
+ return elem->proto == 0;
+}
+
+static inline void
+hash_ipportnet6_data_copy(struct hash_ipportnet6_elem *dst,
+ const struct hash_ipportnet6_elem *src)
+{
+ memcpy(dst, src, sizeof(*dst));
+}
+
+static inline void
+hash_ipportnet6_data_zero_out(struct hash_ipportnet6_elem *elem)
+{
+ elem->proto = 0;
+}
+
+static inline void
+ip6_netmask(union nf_inet_addr *ip, u8 prefix)
+{
+ ip->ip6[0] &= ip_set_netmask6(prefix)[0];
+ ip->ip6[1] &= ip_set_netmask6(prefix)[1];
+ ip->ip6[2] &= ip_set_netmask6(prefix)[2];
+ ip->ip6[3] &= ip_set_netmask6(prefix)[3];
+}
+
+static inline void
+hash_ipportnet6_data_netmask(struct hash_ipportnet6_elem *elem, u8 cidr)
+{
+ ip6_netmask(&elem->ip2, cidr);
+ elem->cidr = cidr;
+}
+
+static bool
+hash_ipportnet6_data_list(struct sk_buff *skb,
+ const struct hash_ipportnet6_elem *data)
+{
+ NLA_PUT_IPADDR6(skb, IPSET_ATTR_IP, &data->ip);
+ NLA_PUT_IPADDR6(skb, IPSET_ATTR_IP2, &data->ip2);
+ NLA_PUT_NET16(skb, IPSET_ATTR_PORT, data->port);
+ NLA_PUT_U8(skb, IPSET_ATTR_CIDR2, data->cidr);
+ NLA_PUT_U8(skb, IPSET_ATTR_PROTO, data->proto);
+ return 0;
+
+nla_put_failure:
+ return 1;
+}
+
+static bool
+hash_ipportnet6_data_tlist(struct sk_buff *skb,
+ const struct hash_ipportnet6_elem *data)
+{
+ const struct hash_ipportnet6_telem *e =
+ (const struct hash_ipportnet6_telem *)data;
+
+ NLA_PUT_IPADDR6(skb, IPSET_ATTR_IP, &e->ip);
+ NLA_PUT_IPADDR6(skb, IPSET_ATTR_IP2, &data->ip2);
+ NLA_PUT_NET16(skb, IPSET_ATTR_PORT, data->port);
+ NLA_PUT_U8(skb, IPSET_ATTR_CIDR2, data->cidr);
+ NLA_PUT_U8(skb, IPSET_ATTR_PROTO, data->proto);
+ NLA_PUT_NET32(skb, IPSET_ATTR_TIMEOUT,
+ htonl(ip_set_timeout_get(e->timeout)));
+ return 0;
+
+nla_put_failure:
+ return 1;
+}
+
+#undef PF
+#undef HOST_MASK
+
+#define PF 6
+#define HOST_MASK 128
+#include <linux/netfilter/ipset/ip_set_ahash.h>
+
+static int
+hash_ipportnet6_kadt(struct ip_set *set, const struct sk_buff *skb,
+ enum ipset_adt adt, u8 pf, u8 dim, u8 flags)
+{
+ const struct ip_set_hash *h = set->data;
+ ipset_adtfn adtfn = set->variant->adt[adt];
+ struct hash_ipportnet6_elem data =
+ { .cidr = h->nets[0].cidr || HOST_MASK };
+
+ if (data.cidr == 0)
+ return -EINVAL;
+ if (adt == IPSET_TEST)
+ data.cidr = HOST_MASK;
+
+ if (!ip_set_get_ip6_port(skb, flags & IPSET_DIM_TWO_SRC,
+ &data.port, &data.proto))
+ return -EINVAL;
+
+ ip6addrptr(skb, flags & IPSET_DIM_ONE_SRC, &data.ip.in6);
+ ip6addrptr(skb, flags & IPSET_DIM_THREE_SRC, &data.ip2.in6);
+ ip6_netmask(&data.ip2, data.cidr);
+
+ return adtfn(set, &data, h->timeout);
+}
+
+static int
+hash_ipportnet6_uadt(struct ip_set *set, struct nlattr *tb[],
+ enum ipset_adt adt, u32 *lineno, u32 flags)
+{
+ const struct ip_set_hash *h = set->data;
+ ipset_adtfn adtfn = set->variant->adt[adt];
+ struct hash_ipportnet6_elem data = { .cidr = HOST_MASK };
+ u32 port, port_to;
+ u32 timeout = h->timeout;
+ int ret;
+
+ if (unlikely(!tb[IPSET_ATTR_IP] || !tb[IPSET_ATTR_IP2] ||
+ !ip_set_attr_netorder(tb, IPSET_ATTR_PORT) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) ||
+ tb[IPSET_ATTR_IP_TO] ||
+ tb[IPSET_ATTR_CIDR]))
+ return -IPSET_ERR_PROTOCOL;
+
+ if (tb[IPSET_ATTR_LINENO])
+ *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]);
+
+ ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &data.ip);
+ if (ret)
+ return ret;
+
+ ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP2], &data.ip2);
+ if (ret)
+ return ret;
+
+ if (tb[IPSET_ATTR_CIDR2])
+ data.cidr = nla_get_u8(tb[IPSET_ATTR_CIDR2]);
+
+ if (!data.cidr)
+ return -IPSET_ERR_INVALID_CIDR;
+
+ ip6_netmask(&data.ip2, data.cidr);
+
+ if (tb[IPSET_ATTR_PORT])
+ data.port = nla_get_be16(tb[IPSET_ATTR_PORT]);
+ else
+ return -IPSET_ERR_PROTOCOL;
+
+ if (tb[IPSET_ATTR_PROTO]) {
+ data.proto = nla_get_u8(tb[IPSET_ATTR_PROTO]);
+
+ if (data.proto == 0)
+ return -IPSET_ERR_INVALID_PROTO;
+ } else
+ return -IPSET_ERR_MISSING_PROTO;
+
+ switch (data.proto) {
+ case IPPROTO_UDP:
+ case IPPROTO_TCP:
+ case IPPROTO_ICMPV6:
+ break;
+ default:
+ data.port = 0;
+ break;
+ }
+
+ if (tb[IPSET_ATTR_TIMEOUT]) {
+ if (!with_timeout(h->timeout))
+ return -IPSET_ERR_TIMEOUT;
+ timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]);
+ }
+
+ if (adt == IPSET_TEST ||
+ !(data.proto == IPPROTO_TCP || data.proto == IPPROTO_UDP) ||
+ !tb[IPSET_ATTR_PORT_TO]) {
+ ret = adtfn(set, &data, timeout);
+ return ip_set_eexist(ret, flags) ? 0 : ret;
+ }
+
+ port = ntohs(data.port);
+ port_to = ip_set_get_h16(tb[IPSET_ATTR_PORT_TO]);
+ if (port > port_to)
+ swap(port, port_to);
+
+ for (; port <= port_to; port++) {
+ data.port = htons(port);
+ ret = adtfn(set, &data, timeout);
+
+ if (ret && !ip_set_eexist(ret, flags))
+ return ret;
+ else
+ ret = 0;
+ }
+ return ret;
+}
+
+/* Create hash:ip type of sets */
+
+static int
+hash_ipportnet_create(struct ip_set *set, struct nlattr *tb[], u32 flags)
+{
+ struct ip_set_hash *h;
+ u32 hashsize = IPSET_DEFAULT_HASHSIZE, maxelem = IPSET_DEFAULT_MAXELEM;
+ u8 hbits;
+
+ if (!(set->family == AF_INET || set->family == AF_INET6))
+ return -IPSET_ERR_INVALID_FAMILY;
+
+ if (unlikely(!ip_set_optattr_netorder(tb, IPSET_ATTR_HASHSIZE) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_MAXELEM) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT)))
+ return -IPSET_ERR_PROTOCOL;
+
+ if (tb[IPSET_ATTR_HASHSIZE]) {
+ hashsize = ip_set_get_h32(tb[IPSET_ATTR_HASHSIZE]);
+ if (hashsize < IPSET_MIMINAL_HASHSIZE)
+ hashsize = IPSET_MIMINAL_HASHSIZE;
+ }
+
+ if (tb[IPSET_ATTR_MAXELEM])
+ maxelem = ip_set_get_h32(tb[IPSET_ATTR_MAXELEM]);
+
+ h = kzalloc(sizeof(*h)
+ + sizeof(struct ip_set_hash_nets)
+ * (set->family == AF_INET ? 32 : 128), GFP_KERNEL);
+ if (!h)
+ return -ENOMEM;
+
+ h->maxelem = maxelem;
+ get_random_bytes(&h->initval, sizeof(h->initval));
+ h->timeout = IPSET_NO_TIMEOUT;
+
+ hbits = htable_bits(hashsize);
+ h->table = ip_set_alloc(
+ sizeof(struct htable)
+ + jhash_size(hbits) * sizeof(struct hbucket));
+ if (!h->table) {
+ kfree(h);
+ return -ENOMEM;
+ }
+ h->table->htable_bits = hbits;
+
+ set->data = h;
+
+ if (tb[IPSET_ATTR_TIMEOUT]) {
+ h->timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]);
+
+ set->variant = set->family == AF_INET
+ ? &hash_ipportnet4_tvariant
+ : &hash_ipportnet6_tvariant;
+
+ if (set->family == AF_INET)
+ hash_ipportnet4_gc_init(set);
+ else
+ hash_ipportnet6_gc_init(set);
+ } else {
+ set->variant = set->family == AF_INET
+ ? &hash_ipportnet4_variant : &hash_ipportnet6_variant;
+ }
+
+ pr_debug("create %s hashsize %u (%u) maxelem %u: %p(%p)\n",
+ set->name, jhash_size(h->table->htable_bits),
+ h->table->htable_bits, h->maxelem, set->data, h->table);
+
+ return 0;
+}
+
+static struct ip_set_type hash_ipportnet_type __read_mostly = {
+ .name = "hash:ip,port,net",
+ .protocol = IPSET_PROTOCOL,
+ .features = IPSET_TYPE_IP | IPSET_TYPE_PORT | IPSET_TYPE_IP2,
+ .dimension = IPSET_DIM_THREE,
+ .family = AF_UNSPEC,
+ .revision = 0,
+ .create = hash_ipportnet_create,
+ .create_policy = {
+ [IPSET_ATTR_HASHSIZE] = { .type = NLA_U32 },
+ [IPSET_ATTR_MAXELEM] = { .type = NLA_U32 },
+ [IPSET_ATTR_PROBES] = { .type = NLA_U8 },
+ [IPSET_ATTR_RESIZE] = { .type = NLA_U8 },
+ [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 },
+ },
+ .adt_policy = {
+ [IPSET_ATTR_IP] = { .type = NLA_NESTED },
+ [IPSET_ATTR_IP_TO] = { .type = NLA_NESTED },
+ [IPSET_ATTR_IP2] = { .type = NLA_NESTED },
+ [IPSET_ATTR_PORT] = { .type = NLA_U16 },
+ [IPSET_ATTR_PORT_TO] = { .type = NLA_U16 },
+ [IPSET_ATTR_CIDR] = { .type = NLA_U8 },
+ [IPSET_ATTR_CIDR2] = { .type = NLA_U8 },
+ [IPSET_ATTR_PROTO] = { .type = NLA_U8 },
+ [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 },
+ [IPSET_ATTR_LINENO] = { .type = NLA_U32 },
+ },
+ .me = THIS_MODULE,
+};
+
+static int __init
+hash_ipportnet_init(void)
+{
+ return ip_set_type_register(&hash_ipportnet_type);
+}
+
+static void __exit
+hash_ipportnet_fini(void)
+{
+ ip_set_type_unregister(&hash_ipportnet_type);
+}
+
+module_init(hash_ipportnet_init);
+module_exit(hash_ipportnet_fini);
diff --git a/net/netfilter/ipset/ip_set_hash_net.c b/net/netfilter/ipset/ip_set_hash_net.c
new file mode 100644
index 00000000000..c4db202b7da
--- /dev/null
+++ b/net/netfilter/ipset/ip_set_hash_net.c
@@ -0,0 +1,458 @@
+/* Copyright (C) 2003-2011 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+/* Kernel module implementing an IP set type: the hash:net type */
+
+#include <linux/jhash.h>
+#include <linux/module.h>
+#include <linux/ip.h>
+#include <linux/skbuff.h>
+#include <linux/errno.h>
+#include <linux/random.h>
+#include <net/ip.h>
+#include <net/ipv6.h>
+#include <net/netlink.h>
+
+#include <linux/netfilter.h>
+#include <linux/netfilter/ipset/pfxlen.h>
+#include <linux/netfilter/ipset/ip_set.h>
+#include <linux/netfilter/ipset/ip_set_timeout.h>
+#include <linux/netfilter/ipset/ip_set_hash.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>");
+MODULE_DESCRIPTION("hash:net type of IP sets");
+MODULE_ALIAS("ip_set_hash:net");
+
+/* Type specific function prefix */
+#define TYPE hash_net
+
+static bool
+hash_net_same_set(const struct ip_set *a, const struct ip_set *b);
+
+#define hash_net4_same_set hash_net_same_set
+#define hash_net6_same_set hash_net_same_set
+
+/* The type variant functions: IPv4 */
+
+/* Member elements without timeout */
+struct hash_net4_elem {
+ __be32 ip;
+ u16 padding0;
+ u8 padding1;
+ u8 cidr;
+};
+
+/* Member elements with timeout support */
+struct hash_net4_telem {
+ __be32 ip;
+ u16 padding0;
+ u8 padding1;
+ u8 cidr;
+ unsigned long timeout;
+};
+
+static inline bool
+hash_net4_data_equal(const struct hash_net4_elem *ip1,
+ const struct hash_net4_elem *ip2)
+{
+ return ip1->ip == ip2->ip && ip1->cidr == ip2->cidr;
+}
+
+static inline bool
+hash_net4_data_isnull(const struct hash_net4_elem *elem)
+{
+ return elem->cidr == 0;
+}
+
+static inline void
+hash_net4_data_copy(struct hash_net4_elem *dst,
+ const struct hash_net4_elem *src)
+{
+ dst->ip = src->ip;
+ dst->cidr = src->cidr;
+}
+
+static inline void
+hash_net4_data_netmask(struct hash_net4_elem *elem, u8 cidr)
+{
+ elem->ip &= ip_set_netmask(cidr);
+ elem->cidr = cidr;
+}
+
+/* Zero CIDR values cannot be stored */
+static inline void
+hash_net4_data_zero_out(struct hash_net4_elem *elem)
+{
+ elem->cidr = 0;
+}
+
+static bool
+hash_net4_data_list(struct sk_buff *skb, const struct hash_net4_elem *data)
+{
+ NLA_PUT_IPADDR4(skb, IPSET_ATTR_IP, data->ip);
+ NLA_PUT_U8(skb, IPSET_ATTR_CIDR, data->cidr);
+ return 0;
+
+nla_put_failure:
+ return 1;
+}
+
+static bool
+hash_net4_data_tlist(struct sk_buff *skb, const struct hash_net4_elem *data)
+{
+ const struct hash_net4_telem *tdata =
+ (const struct hash_net4_telem *)data;
+
+ NLA_PUT_IPADDR4(skb, IPSET_ATTR_IP, tdata->ip);
+ NLA_PUT_U8(skb, IPSET_ATTR_CIDR, tdata->cidr);
+ NLA_PUT_NET32(skb, IPSET_ATTR_TIMEOUT,
+ htonl(ip_set_timeout_get(tdata->timeout)));
+
+ return 0;
+
+nla_put_failure:
+ return 1;
+}
+
+#define IP_SET_HASH_WITH_NETS
+
+#define PF 4
+#define HOST_MASK 32
+#include <linux/netfilter/ipset/ip_set_ahash.h>
+
+static int
+hash_net4_kadt(struct ip_set *set, const struct sk_buff *skb,
+ enum ipset_adt adt, u8 pf, u8 dim, u8 flags)
+{
+ const struct ip_set_hash *h = set->data;
+ ipset_adtfn adtfn = set->variant->adt[adt];
+ struct hash_net4_elem data = { .cidr = h->nets[0].cidr || HOST_MASK };
+
+ if (data.cidr == 0)
+ return -EINVAL;
+ if (adt == IPSET_TEST)
+ data.cidr = HOST_MASK;
+
+ ip4addrptr(skb, flags & IPSET_DIM_ONE_SRC, &data.ip);
+ data.ip &= ip_set_netmask(data.cidr);
+
+ return adtfn(set, &data, h->timeout);
+}
+
+static int
+hash_net4_uadt(struct ip_set *set, struct nlattr *tb[],
+ enum ipset_adt adt, u32 *lineno, u32 flags)
+{
+ const struct ip_set_hash *h = set->data;
+ ipset_adtfn adtfn = set->variant->adt[adt];
+ struct hash_net4_elem data = { .cidr = HOST_MASK };
+ u32 timeout = h->timeout;
+ int ret;
+
+ if (unlikely(!tb[IPSET_ATTR_IP] ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT)))
+ return -IPSET_ERR_PROTOCOL;
+
+ if (tb[IPSET_ATTR_LINENO])
+ *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]);
+
+ ret = ip_set_get_ipaddr4(tb[IPSET_ATTR_IP], &data.ip);
+ if (ret)
+ return ret;
+
+ if (tb[IPSET_ATTR_CIDR])
+ data.cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]);
+
+ if (!data.cidr)
+ return -IPSET_ERR_INVALID_CIDR;
+
+ data.ip &= ip_set_netmask(data.cidr);
+
+ if (tb[IPSET_ATTR_TIMEOUT]) {
+ if (!with_timeout(h->timeout))
+ return -IPSET_ERR_TIMEOUT;
+ timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]);
+ }
+
+ ret = adtfn(set, &data, timeout);
+
+ return ip_set_eexist(ret, flags) ? 0 : ret;
+}
+
+static bool
+hash_net_same_set(const struct ip_set *a, const struct ip_set *b)
+{
+ const struct ip_set_hash *x = a->data;
+ const struct ip_set_hash *y = b->data;
+
+ /* Resizing changes htable_bits, so we ignore it */
+ return x->maxelem == y->maxelem &&
+ x->timeout == y->timeout;
+}
+
+/* The type variant functions: IPv6 */
+
+struct hash_net6_elem {
+ union nf_inet_addr ip;
+ u16 padding0;
+ u8 padding1;
+ u8 cidr;
+};
+
+struct hash_net6_telem {
+ union nf_inet_addr ip;
+ u16 padding0;
+ u8 padding1;
+ u8 cidr;
+ unsigned long timeout;
+};
+
+static inline bool
+hash_net6_data_equal(const struct hash_net6_elem *ip1,
+ const struct hash_net6_elem *ip2)
+{
+ return ipv6_addr_cmp(&ip1->ip.in6, &ip2->ip.in6) == 0 &&
+ ip1->cidr == ip2->cidr;
+}
+
+static inline bool
+hash_net6_data_isnull(const struct hash_net6_elem *elem)
+{
+ return elem->cidr == 0;
+}
+
+static inline void
+hash_net6_data_copy(struct hash_net6_elem *dst,
+ const struct hash_net6_elem *src)
+{
+ ipv6_addr_copy(&dst->ip.in6, &src->ip.in6);
+ dst->cidr = src->cidr;
+}
+
+static inline void
+hash_net6_data_zero_out(struct hash_net6_elem *elem)
+{
+ elem->cidr = 0;
+}
+
+static inline void
+ip6_netmask(union nf_inet_addr *ip, u8 prefix)
+{
+ ip->ip6[0] &= ip_set_netmask6(prefix)[0];
+ ip->ip6[1] &= ip_set_netmask6(prefix)[1];
+ ip->ip6[2] &= ip_set_netmask6(prefix)[2];
+ ip->ip6[3] &= ip_set_netmask6(prefix)[3];
+}
+
+static inline void
+hash_net6_data_netmask(struct hash_net6_elem *elem, u8 cidr)
+{
+ ip6_netmask(&elem->ip, cidr);
+ elem->cidr = cidr;
+}
+
+static bool
+hash_net6_data_list(struct sk_buff *skb, const struct hash_net6_elem *data)
+{
+ NLA_PUT_IPADDR6(skb, IPSET_ATTR_IP, &data->ip);
+ NLA_PUT_U8(skb, IPSET_ATTR_CIDR, data->cidr);
+ return 0;
+
+nla_put_failure:
+ return 1;
+}
+
+static bool
+hash_net6_data_tlist(struct sk_buff *skb, const struct hash_net6_elem *data)
+{
+ const struct hash_net6_telem *e =
+ (const struct hash_net6_telem *)data;
+
+ NLA_PUT_IPADDR6(skb, IPSET_ATTR_IP, &e->ip);
+ NLA_PUT_U8(skb, IPSET_ATTR_CIDR, e->cidr);
+ NLA_PUT_NET32(skb, IPSET_ATTR_TIMEOUT,
+ htonl(ip_set_timeout_get(e->timeout)));
+ return 0;
+
+nla_put_failure:
+ return 1;
+}
+
+#undef PF
+#undef HOST_MASK
+
+#define PF 6
+#define HOST_MASK 128
+#include <linux/netfilter/ipset/ip_set_ahash.h>
+
+static int
+hash_net6_kadt(struct ip_set *set, const struct sk_buff *skb,
+ enum ipset_adt adt, u8 pf, u8 dim, u8 flags)
+{
+ const struct ip_set_hash *h = set->data;
+ ipset_adtfn adtfn = set->variant->adt[adt];
+ struct hash_net6_elem data = { .cidr = h->nets[0].cidr || HOST_MASK };
+
+ if (data.cidr == 0)
+ return -EINVAL;
+ if (adt == IPSET_TEST)
+ data.cidr = HOST_MASK;
+
+ ip6addrptr(skb, flags & IPSET_DIM_ONE_SRC, &data.ip.in6);
+ ip6_netmask(&data.ip, data.cidr);
+
+ return adtfn(set, &data, h->timeout);
+}
+
+static int
+hash_net6_uadt(struct ip_set *set, struct nlattr *tb[],
+ enum ipset_adt adt, u32 *lineno, u32 flags)
+{
+ const struct ip_set_hash *h = set->data;
+ ipset_adtfn adtfn = set->variant->adt[adt];
+ struct hash_net6_elem data = { .cidr = HOST_MASK };
+ u32 timeout = h->timeout;
+ int ret;
+
+ if (unlikely(!tb[IPSET_ATTR_IP] ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT)))
+ return -IPSET_ERR_PROTOCOL;
+
+ if (tb[IPSET_ATTR_LINENO])
+ *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]);
+
+ ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &data.ip);
+ if (ret)
+ return ret;
+
+ if (tb[IPSET_ATTR_CIDR])
+ data.cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]);
+
+ if (!data.cidr)
+ return -IPSET_ERR_INVALID_CIDR;
+
+ ip6_netmask(&data.ip, data.cidr);
+
+ if (tb[IPSET_ATTR_TIMEOUT]) {
+ if (!with_timeout(h->timeout))
+ return -IPSET_ERR_TIMEOUT;
+ timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]);
+ }
+
+ ret = adtfn(set, &data, timeout);
+
+ return ip_set_eexist(ret, flags) ? 0 : ret;
+}
+
+/* Create hash:ip type of sets */
+
+static int
+hash_net_create(struct ip_set *set, struct nlattr *tb[], u32 flags)
+{
+ u32 hashsize = IPSET_DEFAULT_HASHSIZE, maxelem = IPSET_DEFAULT_MAXELEM;
+ struct ip_set_hash *h;
+ u8 hbits;
+
+ if (!(set->family == AF_INET || set->family == AF_INET6))
+ return -IPSET_ERR_INVALID_FAMILY;
+
+ if (unlikely(!ip_set_optattr_netorder(tb, IPSET_ATTR_HASHSIZE) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_MAXELEM) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT)))
+ return -IPSET_ERR_PROTOCOL;
+
+ if (tb[IPSET_ATTR_HASHSIZE]) {
+ hashsize = ip_set_get_h32(tb[IPSET_ATTR_HASHSIZE]);
+ if (hashsize < IPSET_MIMINAL_HASHSIZE)
+ hashsize = IPSET_MIMINAL_HASHSIZE;
+ }
+
+ if (tb[IPSET_ATTR_MAXELEM])
+ maxelem = ip_set_get_h32(tb[IPSET_ATTR_MAXELEM]);
+
+ h = kzalloc(sizeof(*h)
+ + sizeof(struct ip_set_hash_nets)
+ * (set->family == AF_INET ? 32 : 128), GFP_KERNEL);
+ if (!h)
+ return -ENOMEM;
+
+ h->maxelem = maxelem;
+ get_random_bytes(&h->initval, sizeof(h->initval));
+ h->timeout = IPSET_NO_TIMEOUT;
+
+ hbits = htable_bits(hashsize);
+ h->table = ip_set_alloc(
+ sizeof(struct htable)
+ + jhash_size(hbits) * sizeof(struct hbucket));
+ if (!h->table) {
+ kfree(h);
+ return -ENOMEM;
+ }
+ h->table->htable_bits = hbits;
+
+ set->data = h;
+
+ if (tb[IPSET_ATTR_TIMEOUT]) {
+ h->timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]);
+
+ set->variant = set->family == AF_INET
+ ? &hash_net4_tvariant : &hash_net6_tvariant;
+
+ if (set->family == AF_INET)
+ hash_net4_gc_init(set);
+ else
+ hash_net6_gc_init(set);
+ } else {
+ set->variant = set->family == AF_INET
+ ? &hash_net4_variant : &hash_net6_variant;
+ }
+
+ pr_debug("create %s hashsize %u (%u) maxelem %u: %p(%p)\n",
+ set->name, jhash_size(h->table->htable_bits),
+ h->table->htable_bits, h->maxelem, set->data, h->table);
+
+ return 0;
+}
+
+static struct ip_set_type hash_net_type __read_mostly = {
+ .name = "hash:net",
+ .protocol = IPSET_PROTOCOL,
+ .features = IPSET_TYPE_IP,
+ .dimension = IPSET_DIM_ONE,
+ .family = AF_UNSPEC,
+ .revision = 0,
+ .create = hash_net_create,
+ .create_policy = {
+ [IPSET_ATTR_HASHSIZE] = { .type = NLA_U32 },
+ [IPSET_ATTR_MAXELEM] = { .type = NLA_U32 },
+ [IPSET_ATTR_PROBES] = { .type = NLA_U8 },
+ [IPSET_ATTR_RESIZE] = { .type = NLA_U8 },
+ [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 },
+ },
+ .adt_policy = {
+ [IPSET_ATTR_IP] = { .type = NLA_NESTED },
+ [IPSET_ATTR_CIDR] = { .type = NLA_U8 },
+ [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 },
+ },
+ .me = THIS_MODULE,
+};
+
+static int __init
+hash_net_init(void)
+{
+ return ip_set_type_register(&hash_net_type);
+}
+
+static void __exit
+hash_net_fini(void)
+{
+ ip_set_type_unregister(&hash_net_type);
+}
+
+module_init(hash_net_init);
+module_exit(hash_net_fini);
diff --git a/net/netfilter/ipset/ip_set_hash_netport.c b/net/netfilter/ipset/ip_set_hash_netport.c
new file mode 100644
index 00000000000..34a165626ee
--- /dev/null
+++ b/net/netfilter/ipset/ip_set_hash_netport.c
@@ -0,0 +1,578 @@
+/* Copyright (C) 2003-2011 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+/* Kernel module implementing an IP set type: the hash:net,port type */
+
+#include <linux/jhash.h>
+#include <linux/module.h>
+#include <linux/ip.h>
+#include <linux/skbuff.h>
+#include <linux/errno.h>
+#include <linux/random.h>
+#include <net/ip.h>
+#include <net/ipv6.h>
+#include <net/netlink.h>
+
+#include <linux/netfilter.h>
+#include <linux/netfilter/ipset/pfxlen.h>
+#include <linux/netfilter/ipset/ip_set.h>
+#include <linux/netfilter/ipset/ip_set_timeout.h>
+#include <linux/netfilter/ipset/ip_set_getport.h>
+#include <linux/netfilter/ipset/ip_set_hash.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>");
+MODULE_DESCRIPTION("hash:net,port type of IP sets");
+MODULE_ALIAS("ip_set_hash:net,port");
+
+/* Type specific function prefix */
+#define TYPE hash_netport
+
+static bool
+hash_netport_same_set(const struct ip_set *a, const struct ip_set *b);
+
+#define hash_netport4_same_set hash_netport_same_set
+#define hash_netport6_same_set hash_netport_same_set
+
+/* The type variant functions: IPv4 */
+
+/* Member elements without timeout */
+struct hash_netport4_elem {
+ __be32 ip;
+ __be16 port;
+ u8 proto;
+ u8 cidr;
+};
+
+/* Member elements with timeout support */
+struct hash_netport4_telem {
+ __be32 ip;
+ __be16 port;
+ u8 proto;
+ u8 cidr;
+ unsigned long timeout;
+};
+
+static inline bool
+hash_netport4_data_equal(const struct hash_netport4_elem *ip1,
+ const struct hash_netport4_elem *ip2)
+{
+ return ip1->ip == ip2->ip &&
+ ip1->port == ip2->port &&
+ ip1->proto == ip2->proto &&
+ ip1->cidr == ip2->cidr;
+}
+
+static inline bool
+hash_netport4_data_isnull(const struct hash_netport4_elem *elem)
+{
+ return elem->proto == 0;
+}
+
+static inline void
+hash_netport4_data_copy(struct hash_netport4_elem *dst,
+ const struct hash_netport4_elem *src)
+{
+ dst->ip = src->ip;
+ dst->port = src->port;
+ dst->proto = src->proto;
+ dst->cidr = src->cidr;
+}
+
+static inline void
+hash_netport4_data_netmask(struct hash_netport4_elem *elem, u8 cidr)
+{
+ elem->ip &= ip_set_netmask(cidr);
+ elem->cidr = cidr;
+}
+
+static inline void
+hash_netport4_data_zero_out(struct hash_netport4_elem *elem)
+{
+ elem->proto = 0;
+}
+
+static bool
+hash_netport4_data_list(struct sk_buff *skb,
+ const struct hash_netport4_elem *data)
+{
+ NLA_PUT_IPADDR4(skb, IPSET_ATTR_IP, data->ip);
+ NLA_PUT_NET16(skb, IPSET_ATTR_PORT, data->port);
+ NLA_PUT_U8(skb, IPSET_ATTR_CIDR, data->cidr);
+ NLA_PUT_U8(skb, IPSET_ATTR_PROTO, data->proto);
+ return 0;
+
+nla_put_failure:
+ return 1;
+}
+
+static bool
+hash_netport4_data_tlist(struct sk_buff *skb,
+ const struct hash_netport4_elem *data)
+{
+ const struct hash_netport4_telem *tdata =
+ (const struct hash_netport4_telem *)data;
+
+ NLA_PUT_IPADDR4(skb, IPSET_ATTR_IP, tdata->ip);
+ NLA_PUT_NET16(skb, IPSET_ATTR_PORT, tdata->port);
+ NLA_PUT_U8(skb, IPSET_ATTR_CIDR, data->cidr);
+ NLA_PUT_U8(skb, IPSET_ATTR_PROTO, data->proto);
+ NLA_PUT_NET32(skb, IPSET_ATTR_TIMEOUT,
+ htonl(ip_set_timeout_get(tdata->timeout)));
+
+ return 0;
+
+nla_put_failure:
+ return 1;
+}
+
+#define IP_SET_HASH_WITH_PROTO
+#define IP_SET_HASH_WITH_NETS
+
+#define PF 4
+#define HOST_MASK 32
+#include <linux/netfilter/ipset/ip_set_ahash.h>
+
+static int
+hash_netport4_kadt(struct ip_set *set, const struct sk_buff *skb,
+ enum ipset_adt adt, u8 pf, u8 dim, u8 flags)
+{
+ const struct ip_set_hash *h = set->data;
+ ipset_adtfn adtfn = set->variant->adt[adt];
+ struct hash_netport4_elem data = {
+ .cidr = h->nets[0].cidr || HOST_MASK };
+
+ if (data.cidr == 0)
+ return -EINVAL;
+ if (adt == IPSET_TEST)
+ data.cidr = HOST_MASK;
+
+ if (!ip_set_get_ip4_port(skb, flags & IPSET_DIM_TWO_SRC,
+ &data.port, &data.proto))
+ return -EINVAL;
+
+ ip4addrptr(skb, flags & IPSET_DIM_ONE_SRC, &data.ip);
+ data.ip &= ip_set_netmask(data.cidr);
+
+ return adtfn(set, &data, h->timeout);
+}
+
+static int
+hash_netport4_uadt(struct ip_set *set, struct nlattr *tb[],
+ enum ipset_adt adt, u32 *lineno, u32 flags)
+{
+ const struct ip_set_hash *h = set->data;
+ ipset_adtfn adtfn = set->variant->adt[adt];
+ struct hash_netport4_elem data = { .cidr = HOST_MASK };
+ u32 port, port_to;
+ u32 timeout = h->timeout;
+ int ret;
+
+ if (unlikely(!tb[IPSET_ATTR_IP] ||
+ !ip_set_attr_netorder(tb, IPSET_ATTR_PORT) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT)))
+ return -IPSET_ERR_PROTOCOL;
+
+ if (tb[IPSET_ATTR_LINENO])
+ *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]);
+
+ ret = ip_set_get_ipaddr4(tb[IPSET_ATTR_IP], &data.ip);
+ if (ret)
+ return ret;
+
+ if (tb[IPSET_ATTR_CIDR])
+ data.cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]);
+ if (!data.cidr)
+ return -IPSET_ERR_INVALID_CIDR;
+ data.ip &= ip_set_netmask(data.cidr);
+
+ if (tb[IPSET_ATTR_PORT])
+ data.port = nla_get_be16(tb[IPSET_ATTR_PORT]);
+ else
+ return -IPSET_ERR_PROTOCOL;
+
+ if (tb[IPSET_ATTR_PROTO]) {
+ data.proto = nla_get_u8(tb[IPSET_ATTR_PROTO]);
+
+ if (data.proto == 0)
+ return -IPSET_ERR_INVALID_PROTO;
+ } else
+ return -IPSET_ERR_MISSING_PROTO;
+
+ switch (data.proto) {
+ case IPPROTO_UDP:
+ case IPPROTO_TCP:
+ case IPPROTO_ICMP:
+ break;
+ default:
+ data.port = 0;
+ break;
+ }
+
+ if (tb[IPSET_ATTR_TIMEOUT]) {
+ if (!with_timeout(h->timeout))
+ return -IPSET_ERR_TIMEOUT;
+ timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]);
+ }
+
+ if (adt == IPSET_TEST ||
+ !(data.proto == IPPROTO_TCP || data.proto == IPPROTO_UDP) ||
+ !tb[IPSET_ATTR_PORT_TO]) {
+ ret = adtfn(set, &data, timeout);
+ return ip_set_eexist(ret, flags) ? 0 : ret;
+ }
+
+ port = ntohs(data.port);
+ port_to = ip_set_get_h16(tb[IPSET_ATTR_PORT_TO]);
+ if (port > port_to)
+ swap(port, port_to);
+
+ for (; port <= port_to; port++) {
+ data.port = htons(port);
+ ret = adtfn(set, &data, timeout);
+
+ if (ret && !ip_set_eexist(ret, flags))
+ return ret;
+ else
+ ret = 0;
+ }
+ return ret;
+}
+
+static bool
+hash_netport_same_set(const struct ip_set *a, const struct ip_set *b)
+{
+ const struct ip_set_hash *x = a->data;
+ const struct ip_set_hash *y = b->data;
+
+ /* Resizing changes htable_bits, so we ignore it */
+ return x->maxelem == y->maxelem &&
+ x->timeout == y->timeout;
+}
+
+/* The type variant functions: IPv6 */
+
+struct hash_netport6_elem {
+ union nf_inet_addr ip;
+ __be16 port;
+ u8 proto;
+ u8 cidr;
+};
+
+struct hash_netport6_telem {
+ union nf_inet_addr ip;
+ __be16 port;
+ u8 proto;
+ u8 cidr;
+ unsigned long timeout;
+};
+
+static inline bool
+hash_netport6_data_equal(const struct hash_netport6_elem *ip1,
+ const struct hash_netport6_elem *ip2)
+{
+ return ipv6_addr_cmp(&ip1->ip.in6, &ip2->ip.in6) == 0 &&
+ ip1->port == ip2->port &&
+ ip1->proto == ip2->proto &&
+ ip1->cidr == ip2->cidr;
+}
+
+static inline bool
+hash_netport6_data_isnull(const struct hash_netport6_elem *elem)
+{
+ return elem->proto == 0;
+}
+
+static inline void
+hash_netport6_data_copy(struct hash_netport6_elem *dst,
+ const struct hash_netport6_elem *src)
+{
+ memcpy(dst, src, sizeof(*dst));
+}
+
+static inline void
+hash_netport6_data_zero_out(struct hash_netport6_elem *elem)
+{
+ elem->proto = 0;
+}
+
+static inline void
+ip6_netmask(union nf_inet_addr *ip, u8 prefix)
+{
+ ip->ip6[0] &= ip_set_netmask6(prefix)[0];
+ ip->ip6[1] &= ip_set_netmask6(prefix)[1];
+ ip->ip6[2] &= ip_set_netmask6(prefix)[2];
+ ip->ip6[3] &= ip_set_netmask6(prefix)[3];
+}
+
+static inline void
+hash_netport6_data_netmask(struct hash_netport6_elem *elem, u8 cidr)
+{
+ ip6_netmask(&elem->ip, cidr);
+ elem->cidr = cidr;
+}
+
+static bool
+hash_netport6_data_list(struct sk_buff *skb,
+ const struct hash_netport6_elem *data)
+{
+ NLA_PUT_IPADDR6(skb, IPSET_ATTR_IP, &data->ip);
+ NLA_PUT_NET16(skb, IPSET_ATTR_PORT, data->port);
+ NLA_PUT_U8(skb, IPSET_ATTR_CIDR, data->cidr);
+ NLA_PUT_U8(skb, IPSET_ATTR_PROTO, data->proto);
+ return 0;
+
+nla_put_failure:
+ return 1;
+}
+
+static bool
+hash_netport6_data_tlist(struct sk_buff *skb,
+ const struct hash_netport6_elem *data)
+{
+ const struct hash_netport6_telem *e =
+ (const struct hash_netport6_telem *)data;
+
+ NLA_PUT_IPADDR6(skb, IPSET_ATTR_IP, &e->ip);
+ NLA_PUT_NET16(skb, IPSET_ATTR_PORT, data->port);
+ NLA_PUT_U8(skb, IPSET_ATTR_CIDR, data->cidr);
+ NLA_PUT_U8(skb, IPSET_ATTR_PROTO, data->proto);
+ NLA_PUT_NET32(skb, IPSET_ATTR_TIMEOUT,
+ htonl(ip_set_timeout_get(e->timeout)));
+ return 0;
+
+nla_put_failure:
+ return 1;
+}
+
+#undef PF
+#undef HOST_MASK
+
+#define PF 6
+#define HOST_MASK 128
+#include <linux/netfilter/ipset/ip_set_ahash.h>
+
+static int
+hash_netport6_kadt(struct ip_set *set, const struct sk_buff *skb,
+ enum ipset_adt adt, u8 pf, u8 dim, u8 flags)
+{
+ const struct ip_set_hash *h = set->data;
+ ipset_adtfn adtfn = set->variant->adt[adt];
+ struct hash_netport6_elem data = {
+ .cidr = h->nets[0].cidr || HOST_MASK };
+
+ if (data.cidr == 0)
+ return -EINVAL;
+ if (adt == IPSET_TEST)
+ data.cidr = HOST_MASK;
+
+ if (!ip_set_get_ip6_port(skb, flags & IPSET_DIM_TWO_SRC,
+ &data.port, &data.proto))
+ return -EINVAL;
+
+ ip6addrptr(skb, flags & IPSET_DIM_ONE_SRC, &data.ip.in6);
+ ip6_netmask(&data.ip, data.cidr);
+
+ return adtfn(set, &data, h->timeout);
+}
+
+static int
+hash_netport6_uadt(struct ip_set *set, struct nlattr *tb[],
+ enum ipset_adt adt, u32 *lineno, u32 flags)
+{
+ const struct ip_set_hash *h = set->data;
+ ipset_adtfn adtfn = set->variant->adt[adt];
+ struct hash_netport6_elem data = { .cidr = HOST_MASK };
+ u32 port, port_to;
+ u32 timeout = h->timeout;
+ int ret;
+
+ if (unlikely(!tb[IPSET_ATTR_IP] ||
+ !ip_set_attr_netorder(tb, IPSET_ATTR_PORT) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT)))
+ return -IPSET_ERR_PROTOCOL;
+
+ if (tb[IPSET_ATTR_LINENO])
+ *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]);
+
+ ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &data.ip);
+ if (ret)
+ return ret;
+
+ if (tb[IPSET_ATTR_CIDR])
+ data.cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]);
+ if (!data.cidr)
+ return -IPSET_ERR_INVALID_CIDR;
+ ip6_netmask(&data.ip, data.cidr);
+
+ if (tb[IPSET_ATTR_PORT])
+ data.port = nla_get_be16(tb[IPSET_ATTR_PORT]);
+ else
+ return -IPSET_ERR_PROTOCOL;
+
+ if (tb[IPSET_ATTR_PROTO]) {
+ data.proto = nla_get_u8(tb[IPSET_ATTR_PROTO]);
+
+ if (data.proto == 0)
+ return -IPSET_ERR_INVALID_PROTO;
+ } else
+ return -IPSET_ERR_MISSING_PROTO;
+
+ switch (data.proto) {
+ case IPPROTO_UDP:
+ case IPPROTO_TCP:
+ case IPPROTO_ICMPV6:
+ break;
+ default:
+ data.port = 0;
+ break;
+ }
+
+ if (tb[IPSET_ATTR_TIMEOUT]) {
+ if (!with_timeout(h->timeout))
+ return -IPSET_ERR_TIMEOUT;
+ timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]);
+ }
+
+ if (adt == IPSET_TEST ||
+ !(data.proto == IPPROTO_TCP || data.proto == IPPROTO_UDP) ||
+ !tb[IPSET_ATTR_PORT_TO]) {
+ ret = adtfn(set, &data, timeout);
+ return ip_set_eexist(ret, flags) ? 0 : ret;
+ }
+
+ port = ntohs(data.port);
+ port_to = ip_set_get_h16(tb[IPSET_ATTR_PORT_TO]);
+ if (port > port_to)
+ swap(port, port_to);
+
+ for (; port <= port_to; port++) {
+ data.port = htons(port);
+ ret = adtfn(set, &data, timeout);
+
+ if (ret && !ip_set_eexist(ret, flags))
+ return ret;
+ else
+ ret = 0;
+ }
+ return ret;
+}
+
+/* Create hash:ip type of sets */
+
+static int
+hash_netport_create(struct ip_set *set, struct nlattr *tb[], u32 flags)
+{
+ struct ip_set_hash *h;
+ u32 hashsize = IPSET_DEFAULT_HASHSIZE, maxelem = IPSET_DEFAULT_MAXELEM;
+ u8 hbits;
+
+ if (!(set->family == AF_INET || set->family == AF_INET6))
+ return -IPSET_ERR_INVALID_FAMILY;
+
+ if (unlikely(!ip_set_optattr_netorder(tb, IPSET_ATTR_HASHSIZE) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_MAXELEM) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT)))
+ return -IPSET_ERR_PROTOCOL;
+
+ if (tb[IPSET_ATTR_HASHSIZE]) {
+ hashsize = ip_set_get_h32(tb[IPSET_ATTR_HASHSIZE]);
+ if (hashsize < IPSET_MIMINAL_HASHSIZE)
+ hashsize = IPSET_MIMINAL_HASHSIZE;
+ }
+
+ if (tb[IPSET_ATTR_MAXELEM])
+ maxelem = ip_set_get_h32(tb[IPSET_ATTR_MAXELEM]);
+
+ h = kzalloc(sizeof(*h)
+ + sizeof(struct ip_set_hash_nets)
+ * (set->family == AF_INET ? 32 : 128), GFP_KERNEL);
+ if (!h)
+ return -ENOMEM;
+
+ h->maxelem = maxelem;
+ get_random_bytes(&h->initval, sizeof(h->initval));
+ h->timeout = IPSET_NO_TIMEOUT;
+
+ hbits = htable_bits(hashsize);
+ h->table = ip_set_alloc(
+ sizeof(struct htable)
+ + jhash_size(hbits) * sizeof(struct hbucket));
+ if (!h->table) {
+ kfree(h);
+ return -ENOMEM;
+ }
+ h->table->htable_bits = hbits;
+
+ set->data = h;
+
+ if (tb[IPSET_ATTR_TIMEOUT]) {
+ h->timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]);
+
+ set->variant = set->family == AF_INET
+ ? &hash_netport4_tvariant : &hash_netport6_tvariant;
+
+ if (set->family == AF_INET)
+ hash_netport4_gc_init(set);
+ else
+ hash_netport6_gc_init(set);
+ } else {
+ set->variant = set->family == AF_INET
+ ? &hash_netport4_variant : &hash_netport6_variant;
+ }
+
+ pr_debug("create %s hashsize %u (%u) maxelem %u: %p(%p)\n",
+ set->name, jhash_size(h->table->htable_bits),
+ h->table->htable_bits, h->maxelem, set->data, h->table);
+
+ return 0;
+}
+
+static struct ip_set_type hash_netport_type __read_mostly = {
+ .name = "hash:net,port",
+ .protocol = IPSET_PROTOCOL,
+ .features = IPSET_TYPE_IP | IPSET_TYPE_PORT,
+ .dimension = IPSET_DIM_TWO,
+ .family = AF_UNSPEC,
+ .revision = 0,
+ .create = hash_netport_create,
+ .create_policy = {
+ [IPSET_ATTR_HASHSIZE] = { .type = NLA_U32 },
+ [IPSET_ATTR_MAXELEM] = { .type = NLA_U32 },
+ [IPSET_ATTR_PROBES] = { .type = NLA_U8 },
+ [IPSET_ATTR_RESIZE] = { .type = NLA_U8 },
+ [IPSET_ATTR_PROTO] = { .type = NLA_U8 },
+ [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 },
+ },
+ .adt_policy = {
+ [IPSET_ATTR_IP] = { .type = NLA_NESTED },
+ [IPSET_ATTR_PORT] = { .type = NLA_U16 },
+ [IPSET_ATTR_PORT_TO] = { .type = NLA_U16 },
+ [IPSET_ATTR_PROTO] = { .type = NLA_U8 },
+ [IPSET_ATTR_CIDR] = { .type = NLA_U8 },
+ [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 },
+ [IPSET_ATTR_LINENO] = { .type = NLA_U32 },
+ },
+ .me = THIS_MODULE,
+};
+
+static int __init
+hash_netport_init(void)
+{
+ return ip_set_type_register(&hash_netport_type);
+}
+
+static void __exit
+hash_netport_fini(void)
+{
+ ip_set_type_unregister(&hash_netport_type);
+}
+
+module_init(hash_netport_init);
+module_exit(hash_netport_fini);
diff --git a/net/netfilter/ipset/ip_set_list_set.c b/net/netfilter/ipset/ip_set_list_set.c
new file mode 100644
index 00000000000..a47c32982f0
--- /dev/null
+++ b/net/netfilter/ipset/ip_set_list_set.c
@@ -0,0 +1,584 @@
+/* Copyright (C) 2008-2011 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+/* Kernel module implementing an IP set type: the list:set type */
+
+#include <linux/module.h>
+#include <linux/ip.h>
+#include <linux/skbuff.h>
+#include <linux/errno.h>
+
+#include <linux/netfilter/ipset/ip_set.h>
+#include <linux/netfilter/ipset/ip_set_timeout.h>
+#include <linux/netfilter/ipset/ip_set_list.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>");
+MODULE_DESCRIPTION("list:set type of IP sets");
+MODULE_ALIAS("ip_set_list:set");
+
+/* Member elements without and with timeout */
+struct set_elem {
+ ip_set_id_t id;
+};
+
+struct set_telem {
+ ip_set_id_t id;
+ unsigned long timeout;
+};
+
+/* Type structure */
+struct list_set {
+ size_t dsize; /* element size */
+ u32 size; /* size of set list array */
+ u32 timeout; /* timeout value */
+ struct timer_list gc; /* garbage collection */
+ struct set_elem members[0]; /* the set members */
+};
+
+static inline struct set_elem *
+list_set_elem(const struct list_set *map, u32 id)
+{
+ return (struct set_elem *)((char *)map->members + id * map->dsize);
+}
+
+static inline bool
+list_set_timeout(const struct list_set *map, u32 id)
+{
+ const struct set_telem *elem =
+ (const struct set_telem *) list_set_elem(map, id);
+
+ return ip_set_timeout_test(elem->timeout);
+}
+
+static inline bool
+list_set_expired(const struct list_set *map, u32 id)
+{
+ const struct set_telem *elem =
+ (const struct set_telem *) list_set_elem(map, id);
+
+ return ip_set_timeout_expired(elem->timeout);
+}
+
+static inline int
+list_set_exist(const struct set_telem *elem)
+{
+ return elem->id != IPSET_INVALID_ID &&
+ !ip_set_timeout_expired(elem->timeout);
+}
+
+/* Set list without and with timeout */
+
+static int
+list_set_kadt(struct ip_set *set, const struct sk_buff *skb,
+ enum ipset_adt adt, u8 pf, u8 dim, u8 flags)
+{
+ struct list_set *map = set->data;
+ struct set_elem *elem;
+ u32 i;
+ int ret;
+
+ for (i = 0; i < map->size; i++) {
+ elem = list_set_elem(map, i);
+ if (elem->id == IPSET_INVALID_ID)
+ return 0;
+ if (with_timeout(map->timeout) && list_set_expired(map, i))
+ continue;
+ switch (adt) {
+ case IPSET_TEST:
+ ret = ip_set_test(elem->id, skb, pf, dim, flags);
+ if (ret > 0)
+ return ret;
+ break;
+ case IPSET_ADD:
+ ret = ip_set_add(elem->id, skb, pf, dim, flags);
+ if (ret == 0)
+ return ret;
+ break;
+ case IPSET_DEL:
+ ret = ip_set_del(elem->id, skb, pf, dim, flags);
+ if (ret == 0)
+ return ret;
+ break;
+ default:
+ break;
+ }
+ }
+ return -EINVAL;
+}
+
+static bool
+next_id_eq(const struct list_set *map, u32 i, ip_set_id_t id)
+{
+ const struct set_elem *elem;
+
+ if (i + 1 < map->size) {
+ elem = list_set_elem(map, i + 1);
+ return !!(elem->id == id &&
+ !(with_timeout(map->timeout) &&
+ list_set_expired(map, i + 1)));
+ }
+
+ return 0;
+}
+
+static void
+list_elem_add(struct list_set *map, u32 i, ip_set_id_t id)
+{
+ struct set_elem *e;
+
+ for (; i < map->size; i++) {
+ e = list_set_elem(map, i);
+ swap(e->id, id);
+ if (e->id == IPSET_INVALID_ID)
+ break;
+ }
+}
+
+static void
+list_elem_tadd(struct list_set *map, u32 i, ip_set_id_t id,
+ unsigned long timeout)
+{
+ struct set_telem *e;
+
+ for (; i < map->size; i++) {
+ e = (struct set_telem *)list_set_elem(map, i);
+ swap(e->id, id);
+ if (e->id == IPSET_INVALID_ID)
+ break;
+ swap(e->timeout, timeout);
+ }
+}
+
+static int
+list_set_add(struct list_set *map, u32 i, ip_set_id_t id,
+ unsigned long timeout)
+{
+ const struct set_elem *e = list_set_elem(map, i);
+
+ if (i == map->size - 1 && e->id != IPSET_INVALID_ID)
+ /* Last element replaced: e.g. add new,before,last */
+ ip_set_put_byindex(e->id);
+ if (with_timeout(map->timeout))
+ list_elem_tadd(map, i, id, timeout);
+ else
+ list_elem_add(map, i, id);
+
+ return 0;
+}
+
+static int
+list_set_del(struct list_set *map, ip_set_id_t id, u32 i)
+{
+ struct set_elem *a = list_set_elem(map, i), *b;
+
+ ip_set_put_byindex(id);
+
+ for (; i < map->size - 1; i++) {
+ b = list_set_elem(map, i + 1);
+ a->id = b->id;
+ if (with_timeout(map->timeout))
+ ((struct set_telem *)a)->timeout =
+ ((struct set_telem *)b)->timeout;
+ a = b;
+ if (a->id == IPSET_INVALID_ID)
+ break;
+ }
+ /* Last element */
+ a->id = IPSET_INVALID_ID;
+ return 0;
+}
+
+static int
+list_set_uadt(struct ip_set *set, struct nlattr *tb[],
+ enum ipset_adt adt, u32 *lineno, u32 flags)
+{
+ struct list_set *map = set->data;
+ bool with_timeout = with_timeout(map->timeout);
+ int before = 0;
+ u32 timeout = map->timeout;
+ ip_set_id_t id, refid = IPSET_INVALID_ID;
+ const struct set_elem *elem;
+ struct ip_set *s;
+ u32 i;
+ int ret = 0;
+
+ if (unlikely(!tb[IPSET_ATTR_NAME] ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS)))
+ return -IPSET_ERR_PROTOCOL;
+
+ if (tb[IPSET_ATTR_LINENO])
+ *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]);
+
+ id = ip_set_get_byname(nla_data(tb[IPSET_ATTR_NAME]), &s);
+ if (id == IPSET_INVALID_ID)
+ return -IPSET_ERR_NAME;
+ /* "Loop detection" */
+ if (s->type->features & IPSET_TYPE_NAME) {
+ ret = -IPSET_ERR_LOOP;
+ goto finish;
+ }
+
+ if (tb[IPSET_ATTR_CADT_FLAGS]) {
+ u32 f = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]);
+ before = f & IPSET_FLAG_BEFORE;
+ }
+
+ if (before && !tb[IPSET_ATTR_NAMEREF]) {
+ ret = -IPSET_ERR_BEFORE;
+ goto finish;
+ }
+
+ if (tb[IPSET_ATTR_NAMEREF]) {
+ refid = ip_set_get_byname(nla_data(tb[IPSET_ATTR_NAMEREF]),
+ &s);
+ if (refid == IPSET_INVALID_ID) {
+ ret = -IPSET_ERR_NAMEREF;
+ goto finish;
+ }
+ if (!before)
+ before = -1;
+ }
+ if (tb[IPSET_ATTR_TIMEOUT]) {
+ if (!with_timeout) {
+ ret = -IPSET_ERR_TIMEOUT;
+ goto finish;
+ }
+ timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]);
+ }
+
+ switch (adt) {
+ case IPSET_TEST:
+ for (i = 0; i < map->size && !ret; i++) {
+ elem = list_set_elem(map, i);
+ if (elem->id == IPSET_INVALID_ID ||
+ (before != 0 && i + 1 >= map->size))
+ break;
+ else if (with_timeout && list_set_expired(map, i))
+ continue;
+ else if (before > 0 && elem->id == id)
+ ret = next_id_eq(map, i, refid);
+ else if (before < 0 && elem->id == refid)
+ ret = next_id_eq(map, i, id);
+ else if (before == 0 && elem->id == id)
+ ret = 1;
+ }
+ break;
+ case IPSET_ADD:
+ for (i = 0; i < map->size && !ret; i++) {
+ elem = list_set_elem(map, i);
+ if (elem->id == id &&
+ !(with_timeout && list_set_expired(map, i)))
+ ret = -IPSET_ERR_EXIST;
+ }
+ if (ret == -IPSET_ERR_EXIST)
+ break;
+ ret = -IPSET_ERR_LIST_FULL;
+ for (i = 0; i < map->size && ret == -IPSET_ERR_LIST_FULL; i++) {
+ elem = list_set_elem(map, i);
+ if (elem->id == IPSET_INVALID_ID)
+ ret = before != 0 ? -IPSET_ERR_REF_EXIST
+ : list_set_add(map, i, id, timeout);
+ else if (elem->id != refid)
+ continue;
+ else if (with_timeout && list_set_expired(map, i))
+ ret = -IPSET_ERR_REF_EXIST;
+ else if (before)
+ ret = list_set_add(map, i, id, timeout);
+ else if (i + 1 < map->size)
+ ret = list_set_add(map, i + 1, id, timeout);
+ }
+ break;
+ case IPSET_DEL:
+ ret = -IPSET_ERR_EXIST;
+ for (i = 0; i < map->size && ret == -IPSET_ERR_EXIST; i++) {
+ elem = list_set_elem(map, i);
+ if (elem->id == IPSET_INVALID_ID) {
+ ret = before != 0 ? -IPSET_ERR_REF_EXIST
+ : -IPSET_ERR_EXIST;
+ break;
+ } else if (with_timeout && list_set_expired(map, i))
+ continue;
+ else if (elem->id == id &&
+ (before == 0 ||
+ (before > 0 &&
+ next_id_eq(map, i, refid))))
+ ret = list_set_del(map, id, i);
+ else if (before < 0 &&
+ elem->id == refid &&
+ next_id_eq(map, i, id))
+ ret = list_set_del(map, id, i + 1);
+ }
+ break;
+ default:
+ break;
+ }
+
+finish:
+ if (refid != IPSET_INVALID_ID)
+ ip_set_put_byindex(refid);
+ if (adt != IPSET_ADD || ret)
+ ip_set_put_byindex(id);
+
+ return ip_set_eexist(ret, flags) ? 0 : ret;
+}
+
+static void
+list_set_flush(struct ip_set *set)
+{
+ struct list_set *map = set->data;
+ struct set_elem *elem;
+ u32 i;
+
+ for (i = 0; i < map->size; i++) {
+ elem = list_set_elem(map, i);
+ if (elem->id != IPSET_INVALID_ID) {
+ ip_set_put_byindex(elem->id);
+ elem->id = IPSET_INVALID_ID;
+ }
+ }
+}
+
+static void
+list_set_destroy(struct ip_set *set)
+{
+ struct list_set *map = set->data;
+
+ if (with_timeout(map->timeout))
+ del_timer_sync(&map->gc);
+ list_set_flush(set);
+ kfree(map);
+
+ set->data = NULL;
+}
+
+static int
+list_set_head(struct ip_set *set, struct sk_buff *skb)
+{
+ const struct list_set *map = set->data;
+ struct nlattr *nested;
+
+ nested = ipset_nest_start(skb, IPSET_ATTR_DATA);
+ if (!nested)
+ goto nla_put_failure;
+ NLA_PUT_NET32(skb, IPSET_ATTR_SIZE, htonl(map->size));
+ if (with_timeout(map->timeout))
+ NLA_PUT_NET32(skb, IPSET_ATTR_TIMEOUT, htonl(map->timeout));
+ NLA_PUT_NET32(skb, IPSET_ATTR_REFERENCES,
+ htonl(atomic_read(&set->ref) - 1));
+ NLA_PUT_NET32(skb, IPSET_ATTR_MEMSIZE,
+ htonl(sizeof(*map) + map->size * map->dsize));
+ ipset_nest_end(skb, nested);
+
+ return 0;
+nla_put_failure:
+ return -EMSGSIZE;
+}
+
+static int
+list_set_list(const struct ip_set *set,
+ struct sk_buff *skb, struct netlink_callback *cb)
+{
+ const struct list_set *map = set->data;
+ struct nlattr *atd, *nested;
+ u32 i, first = cb->args[2];
+ const struct set_elem *e;
+
+ atd = ipset_nest_start(skb, IPSET_ATTR_ADT);
+ if (!atd)
+ return -EMSGSIZE;
+ for (; cb->args[2] < map->size; cb->args[2]++) {
+ i = cb->args[2];
+ e = list_set_elem(map, i);
+ if (e->id == IPSET_INVALID_ID)
+ goto finish;
+ if (with_timeout(map->timeout) && list_set_expired(map, i))
+ continue;
+ nested = ipset_nest_start(skb, IPSET_ATTR_DATA);
+ if (!nested) {
+ if (i == first) {
+ nla_nest_cancel(skb, atd);
+ return -EMSGSIZE;
+ } else
+ goto nla_put_failure;
+ }
+ NLA_PUT_STRING(skb, IPSET_ATTR_NAME,
+ ip_set_name_byindex(e->id));
+ if (with_timeout(map->timeout)) {
+ const struct set_telem *te =
+ (const struct set_telem *) e;
+ NLA_PUT_NET32(skb, IPSET_ATTR_TIMEOUT,
+ htonl(ip_set_timeout_get(te->timeout)));
+ }
+ ipset_nest_end(skb, nested);
+ }
+finish:
+ ipset_nest_end(skb, atd);
+ /* Set listing finished */
+ cb->args[2] = 0;
+ return 0;
+
+nla_put_failure:
+ nla_nest_cancel(skb, nested);
+ ipset_nest_end(skb, atd);
+ if (unlikely(i == first)) {
+ cb->args[2] = 0;
+ return -EMSGSIZE;
+ }
+ return 0;
+}
+
+static bool
+list_set_same_set(const struct ip_set *a, const struct ip_set *b)
+{
+ const struct list_set *x = a->data;
+ const struct list_set *y = b->data;
+
+ return x->size == y->size &&
+ x->timeout == y->timeout;
+}
+
+static const struct ip_set_type_variant list_set = {
+ .kadt = list_set_kadt,
+ .uadt = list_set_uadt,
+ .destroy = list_set_destroy,
+ .flush = list_set_flush,
+ .head = list_set_head,
+ .list = list_set_list,
+ .same_set = list_set_same_set,
+};
+
+static void
+list_set_gc(unsigned long ul_set)
+{
+ struct ip_set *set = (struct ip_set *) ul_set;
+ struct list_set *map = set->data;
+ struct set_telem *e;
+ u32 i;
+
+ /* We run parallel with other readers (test element)
+ * but adding/deleting new entries is locked out */
+ read_lock_bh(&set->lock);
+ for (i = map->size - 1; i >= 0; i--) {
+ e = (struct set_telem *) list_set_elem(map, i);
+ if (e->id != IPSET_INVALID_ID &&
+ list_set_expired(map, i))
+ list_set_del(map, e->id, i);
+ }
+ read_unlock_bh(&set->lock);
+
+ map->gc.expires = jiffies + IPSET_GC_PERIOD(map->timeout) * HZ;
+ add_timer(&map->gc);
+}
+
+static void
+list_set_gc_init(struct ip_set *set)
+{
+ struct list_set *map = set->data;
+
+ init_timer(&map->gc);
+ map->gc.data = (unsigned long) set;
+ map->gc.function = list_set_gc;
+ map->gc.expires = jiffies + IPSET_GC_PERIOD(map->timeout) * HZ;
+ add_timer(&map->gc);
+}
+
+/* Create list:set type of sets */
+
+static bool
+init_list_set(struct ip_set *set, u32 size, size_t dsize,
+ unsigned long timeout)
+{
+ struct list_set *map;
+ struct set_elem *e;
+ u32 i;
+
+ map = kzalloc(sizeof(*map) + size * dsize, GFP_KERNEL);
+ if (!map)
+ return false;
+
+ map->size = size;
+ map->dsize = dsize;
+ map->timeout = timeout;
+ set->data = map;
+
+ for (i = 0; i < size; i++) {
+ e = list_set_elem(map, i);
+ e->id = IPSET_INVALID_ID;
+ }
+
+ return true;
+}
+
+static int
+list_set_create(struct ip_set *set, struct nlattr *tb[], u32 flags)
+{
+ u32 size = IP_SET_LIST_DEFAULT_SIZE;
+
+ if (unlikely(!ip_set_optattr_netorder(tb, IPSET_ATTR_SIZE) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT)))
+ return -IPSET_ERR_PROTOCOL;
+
+ if (tb[IPSET_ATTR_SIZE])
+ size = ip_set_get_h32(tb[IPSET_ATTR_SIZE]);
+ if (size < IP_SET_LIST_MIN_SIZE)
+ size = IP_SET_LIST_MIN_SIZE;
+
+ if (tb[IPSET_ATTR_TIMEOUT]) {
+ if (!init_list_set(set, size, sizeof(struct set_telem),
+ ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT])))
+ return -ENOMEM;
+
+ list_set_gc_init(set);
+ } else {
+ if (!init_list_set(set, size, sizeof(struct set_elem),
+ IPSET_NO_TIMEOUT))
+ return -ENOMEM;
+ }
+ set->variant = &list_set;
+ return 0;
+}
+
+static struct ip_set_type list_set_type __read_mostly = {
+ .name = "list:set",
+ .protocol = IPSET_PROTOCOL,
+ .features = IPSET_TYPE_NAME | IPSET_DUMP_LAST,
+ .dimension = IPSET_DIM_ONE,
+ .family = AF_UNSPEC,
+ .revision = 0,
+ .create = list_set_create,
+ .create_policy = {
+ [IPSET_ATTR_SIZE] = { .type = NLA_U32 },
+ [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 },
+ },
+ .adt_policy = {
+ [IPSET_ATTR_NAME] = { .type = NLA_STRING,
+ .len = IPSET_MAXNAMELEN },
+ [IPSET_ATTR_NAMEREF] = { .type = NLA_STRING,
+ .len = IPSET_MAXNAMELEN },
+ [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 },
+ [IPSET_ATTR_LINENO] = { .type = NLA_U32 },
+ [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 },
+ },
+ .me = THIS_MODULE,
+};
+
+static int __init
+list_set_init(void)
+{
+ return ip_set_type_register(&list_set_type);
+}
+
+static void __exit
+list_set_fini(void)
+{
+ ip_set_type_unregister(&list_set_type);
+}
+
+module_init(list_set_init);
+module_exit(list_set_fini);
diff --git a/net/netfilter/ipset/pfxlen.c b/net/netfilter/ipset/pfxlen.c
new file mode 100644
index 00000000000..23f8c816221
--- /dev/null
+++ b/net/netfilter/ipset/pfxlen.c
@@ -0,0 +1,291 @@
+#include <linux/netfilter/ipset/pfxlen.h>
+
+/*
+ * Prefixlen maps for fast conversions, by Jan Engelhardt.
+ */
+
+#define E(a, b, c, d) \
+ {.ip6 = { \
+ __constant_htonl(a), __constant_htonl(b), \
+ __constant_htonl(c), __constant_htonl(d), \
+ } }
+
+/*
+ * This table works for both IPv4 and IPv6;
+ * just use prefixlen_netmask_map[prefixlength].ip.
+ */
+const union nf_inet_addr ip_set_netmask_map[] = {
+ E(0x00000000, 0x00000000, 0x00000000, 0x00000000),
+ E(0x80000000, 0x00000000, 0x00000000, 0x00000000),
+ E(0xC0000000, 0x00000000, 0x00000000, 0x00000000),
+ E(0xE0000000, 0x00000000, 0x00000000, 0x00000000),
+ E(0xF0000000, 0x00000000, 0x00000000, 0x00000000),
+ E(0xF8000000, 0x00000000, 0x00000000, 0x00000000),
+ E(0xFC000000, 0x00000000, 0x00000000, 0x00000000),
+ E(0xFE000000, 0x00000000, 0x00000000, 0x00000000),
+ E(0xFF000000, 0x00000000, 0x00000000, 0x00000000),
+ E(0xFF800000, 0x00000000, 0x00000000, 0x00000000),
+ E(0xFFC00000, 0x00000000, 0x00000000, 0x00000000),
+ E(0xFFE00000, 0x00000000, 0x00000000, 0x00000000),
+ E(0xFFF00000, 0x00000000, 0x00000000, 0x00000000),
+ E(0xFFF80000, 0x00000000, 0x00000000, 0x00000000),
+ E(0xFFFC0000, 0x00000000, 0x00000000, 0x00000000),
+ E(0xFFFE0000, 0x00000000, 0x00000000, 0x00000000),
+ E(0xFFFF0000, 0x00000000, 0x00000000, 0x00000000),
+ E(0xFFFF8000, 0x00000000, 0x00000000, 0x00000000),
+ E(0xFFFFC000, 0x00000000, 0x00000000, 0x00000000),
+ E(0xFFFFE000, 0x00000000, 0x00000000, 0x00000000),
+ E(0xFFFFF000, 0x00000000, 0x00000000, 0x00000000),
+ E(0xFFFFF800, 0x00000000, 0x00000000, 0x00000000),
+ E(0xFFFFFC00, 0x00000000, 0x00000000, 0x00000000),
+ E(0xFFFFFE00, 0x00000000, 0x00000000, 0x00000000),
+ E(0xFFFFFF00, 0x00000000, 0x00000000, 0x00000000),
+ E(0xFFFFFF80, 0x00000000, 0x00000000, 0x00000000),
+ E(0xFFFFFFC0, 0x00000000, 0x00000000, 0x00000000),
+ E(0xFFFFFFE0, 0x00000000, 0x00000000, 0x00000000),
+ E(0xFFFFFFF0, 0x00000000, 0x00000000, 0x00000000),
+ E(0xFFFFFFF8, 0x00000000, 0x00000000, 0x00000000),
+ E(0xFFFFFFFC, 0x00000000, 0x00000000, 0x00000000),
+ E(0xFFFFFFFE, 0x00000000, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0x00000000, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0x80000000, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xC0000000, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xE0000000, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xF0000000, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xF8000000, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFC000000, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFE000000, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFF000000, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFF800000, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFC00000, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFE00000, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFF00000, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFF80000, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFC0000, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFE0000, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFF0000, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFF8000, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFC000, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFE000, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFF000, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFF800, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFC00, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFE00, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFF00, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFF80, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFC0, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFE0, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFF0, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFF8, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFC, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFE, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0x80000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xC0000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xE0000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xF0000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xF8000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFC000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFE000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFF000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFF800000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFC00000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFE00000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFF00000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFF80000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFC0000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFE0000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFF0000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFF8000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFC000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFE000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFF000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFF800, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFC00, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFE00, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFF00, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFF80, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFC0, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFE0, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFF0, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFF8, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFC, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFE, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x80000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xC0000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xE0000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xF0000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xF8000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFC000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFE000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFF000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFF800000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFC00000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFE00000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFF00000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFF80000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFC0000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFE0000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFF0000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFF8000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFC000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFE000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFF000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFF800),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFC00),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFE00),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFF00),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFF80),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFC0),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFE0),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFF0),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFF8),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFC),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFE),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF),
+};
+EXPORT_SYMBOL_GPL(ip_set_netmask_map);
+
+#undef E
+#define E(a, b, c, d) \
+ {.ip6 = { (__force __be32) a, (__force __be32) b, \
+ (__force __be32) c, (__force __be32) d, \
+ } }
+
+/*
+ * This table works for both IPv4 and IPv6;
+ * just use prefixlen_hostmask_map[prefixlength].ip.
+ */
+const union nf_inet_addr ip_set_hostmask_map[] = {
+ E(0x00000000, 0x00000000, 0x00000000, 0x00000000),
+ E(0x80000000, 0x00000000, 0x00000000, 0x00000000),
+ E(0xC0000000, 0x00000000, 0x00000000, 0x00000000),
+ E(0xE0000000, 0x00000000, 0x00000000, 0x00000000),
+ E(0xF0000000, 0x00000000, 0x00000000, 0x00000000),
+ E(0xF8000000, 0x00000000, 0x00000000, 0x00000000),
+ E(0xFC000000, 0x00000000, 0x00000000, 0x00000000),
+ E(0xFE000000, 0x00000000, 0x00000000, 0x00000000),
+ E(0xFF000000, 0x00000000, 0x00000000, 0x00000000),
+ E(0xFF800000, 0x00000000, 0x00000000, 0x00000000),
+ E(0xFFC00000, 0x00000000, 0x00000000, 0x00000000),
+ E(0xFFE00000, 0x00000000, 0x00000000, 0x00000000),
+ E(0xFFF00000, 0x00000000, 0x00000000, 0x00000000),
+ E(0xFFF80000, 0x00000000, 0x00000000, 0x00000000),
+ E(0xFFFC0000, 0x00000000, 0x00000000, 0x00000000),
+ E(0xFFFE0000, 0x00000000, 0x00000000, 0x00000000),
+ E(0xFFFF0000, 0x00000000, 0x00000000, 0x00000000),
+ E(0xFFFF8000, 0x00000000, 0x00000000, 0x00000000),
+ E(0xFFFFC000, 0x00000000, 0x00000000, 0x00000000),
+ E(0xFFFFE000, 0x00000000, 0x00000000, 0x00000000),
+ E(0xFFFFF000, 0x00000000, 0x00000000, 0x00000000),
+ E(0xFFFFF800, 0x00000000, 0x00000000, 0x00000000),
+ E(0xFFFFFC00, 0x00000000, 0x00000000, 0x00000000),
+ E(0xFFFFFE00, 0x00000000, 0x00000000, 0x00000000),
+ E(0xFFFFFF00, 0x00000000, 0x00000000, 0x00000000),
+ E(0xFFFFFF80, 0x00000000, 0x00000000, 0x00000000),
+ E(0xFFFFFFC0, 0x00000000, 0x00000000, 0x00000000),
+ E(0xFFFFFFE0, 0x00000000, 0x00000000, 0x00000000),
+ E(0xFFFFFFF0, 0x00000000, 0x00000000, 0x00000000),
+ E(0xFFFFFFF8, 0x00000000, 0x00000000, 0x00000000),
+ E(0xFFFFFFFC, 0x00000000, 0x00000000, 0x00000000),
+ E(0xFFFFFFFE, 0x00000000, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0x00000000, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0x80000000, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xC0000000, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xE0000000, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xF0000000, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xF8000000, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFC000000, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFE000000, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFF000000, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFF800000, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFC00000, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFE00000, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFF00000, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFF80000, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFC0000, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFE0000, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFF0000, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFF8000, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFC000, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFE000, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFF000, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFF800, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFC00, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFE00, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFF00, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFF80, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFC0, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFE0, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFF0, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFF8, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFC, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFE, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0x80000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xC0000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xE0000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xF0000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xF8000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFC000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFE000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFF000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFF800000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFC00000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFE00000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFF00000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFF80000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFC0000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFE0000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFF0000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFF8000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFC000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFE000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFF000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFF800, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFC00, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFE00, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFF00, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFF80, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFC0, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFE0, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFF0, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFF8, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFC, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFE, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x80000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xC0000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xE0000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xF0000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xF8000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFC000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFE000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFF000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFF800000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFC00000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFE00000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFF00000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFF80000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFC0000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFE0000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFF0000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFF8000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFC000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFE000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFF000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFF800),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFC00),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFE00),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFF00),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFF80),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFC0),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFE0),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFF0),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFF8),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFC),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFE),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF),
+};
+EXPORT_SYMBOL_GPL(ip_set_hostmask_map);
diff --git a/net/netfilter/ipvs/ip_vs_app.c b/net/netfilter/ipvs/ip_vs_app.c
index a475edee091..5c48ffb60c2 100644
--- a/net/netfilter/ipvs/ip_vs_app.c
+++ b/net/netfilter/ipvs/ip_vs_app.c
@@ -43,11 +43,6 @@ EXPORT_SYMBOL(register_ip_vs_app);
EXPORT_SYMBOL(unregister_ip_vs_app);
EXPORT_SYMBOL(register_ip_vs_app_inc);
-/* ipvs application list head */
-static LIST_HEAD(ip_vs_app_list);
-static DEFINE_MUTEX(__ip_vs_app_mutex);
-
-
/*
* Get an ip_vs_app object
*/
@@ -67,7 +62,8 @@ static inline void ip_vs_app_put(struct ip_vs_app *app)
* Allocate/initialize app incarnation and register it in proto apps.
*/
static int
-ip_vs_app_inc_new(struct ip_vs_app *app, __u16 proto, __u16 port)
+ip_vs_app_inc_new(struct net *net, struct ip_vs_app *app, __u16 proto,
+ __u16 port)
{
struct ip_vs_protocol *pp;
struct ip_vs_app *inc;
@@ -98,7 +94,7 @@ ip_vs_app_inc_new(struct ip_vs_app *app, __u16 proto, __u16 port)
}
}
- ret = pp->register_app(inc);
+ ret = pp->register_app(net, inc);
if (ret)
goto out;
@@ -119,7 +115,7 @@ ip_vs_app_inc_new(struct ip_vs_app *app, __u16 proto, __u16 port)
* Release app incarnation
*/
static void
-ip_vs_app_inc_release(struct ip_vs_app *inc)
+ip_vs_app_inc_release(struct net *net, struct ip_vs_app *inc)
{
struct ip_vs_protocol *pp;
@@ -127,7 +123,7 @@ ip_vs_app_inc_release(struct ip_vs_app *inc)
return;
if (pp->unregister_app)
- pp->unregister_app(inc);
+ pp->unregister_app(net, inc);
IP_VS_DBG(9, "%s App %s:%u unregistered\n",
pp->name, inc->name, ntohs(inc->port));
@@ -168,15 +164,17 @@ void ip_vs_app_inc_put(struct ip_vs_app *inc)
* Register an application incarnation in protocol applications
*/
int
-register_ip_vs_app_inc(struct ip_vs_app *app, __u16 proto, __u16 port)
+register_ip_vs_app_inc(struct net *net, struct ip_vs_app *app, __u16 proto,
+ __u16 port)
{
+ struct netns_ipvs *ipvs = net_ipvs(net);
int result;
- mutex_lock(&__ip_vs_app_mutex);
+ mutex_lock(&ipvs->app_mutex);
- result = ip_vs_app_inc_new(app, proto, port);
+ result = ip_vs_app_inc_new(net, app, proto, port);
- mutex_unlock(&__ip_vs_app_mutex);
+ mutex_unlock(&ipvs->app_mutex);
return result;
}
@@ -185,16 +183,17 @@ register_ip_vs_app_inc(struct ip_vs_app *app, __u16 proto, __u16 port)
/*
* ip_vs_app registration routine
*/
-int register_ip_vs_app(struct ip_vs_app *app)
+int register_ip_vs_app(struct net *net, struct ip_vs_app *app)
{
+ struct netns_ipvs *ipvs = net_ipvs(net);
/* increase the module use count */
ip_vs_use_count_inc();
- mutex_lock(&__ip_vs_app_mutex);
+ mutex_lock(&ipvs->app_mutex);
- list_add(&app->a_list, &ip_vs_app_list);
+ list_add(&app->a_list, &ipvs->app_list);
- mutex_unlock(&__ip_vs_app_mutex);
+ mutex_unlock(&ipvs->app_mutex);
return 0;
}
@@ -204,19 +203,20 @@ int register_ip_vs_app(struct ip_vs_app *app)
* ip_vs_app unregistration routine
* We are sure there are no app incarnations attached to services
*/
-void unregister_ip_vs_app(struct ip_vs_app *app)
+void unregister_ip_vs_app(struct net *net, struct ip_vs_app *app)
{
+ struct netns_ipvs *ipvs = net_ipvs(net);
struct ip_vs_app *inc, *nxt;
- mutex_lock(&__ip_vs_app_mutex);
+ mutex_lock(&ipvs->app_mutex);
list_for_each_entry_safe(inc, nxt, &app->incs_list, a_list) {
- ip_vs_app_inc_release(inc);
+ ip_vs_app_inc_release(net, inc);
}
list_del(&app->a_list);
- mutex_unlock(&__ip_vs_app_mutex);
+ mutex_unlock(&ipvs->app_mutex);
/* decrease the module use count */
ip_vs_use_count_dec();
@@ -226,7 +226,8 @@ void unregister_ip_vs_app(struct ip_vs_app *app)
/*
* Bind ip_vs_conn to its ip_vs_app (called by cp constructor)
*/
-int ip_vs_bind_app(struct ip_vs_conn *cp, struct ip_vs_protocol *pp)
+int ip_vs_bind_app(struct ip_vs_conn *cp,
+ struct ip_vs_protocol *pp)
{
return pp->app_conn_bind(cp);
}
@@ -481,11 +482,11 @@ int ip_vs_app_pkt_in(struct ip_vs_conn *cp, struct sk_buff *skb)
* /proc/net/ip_vs_app entry function
*/
-static struct ip_vs_app *ip_vs_app_idx(loff_t pos)
+static struct ip_vs_app *ip_vs_app_idx(struct netns_ipvs *ipvs, loff_t pos)
{
struct ip_vs_app *app, *inc;
- list_for_each_entry(app, &ip_vs_app_list, a_list) {
+ list_for_each_entry(app, &ipvs->app_list, a_list) {
list_for_each_entry(inc, &app->incs_list, a_list) {
if (pos-- == 0)
return inc;
@@ -497,19 +498,24 @@ static struct ip_vs_app *ip_vs_app_idx(loff_t pos)
static void *ip_vs_app_seq_start(struct seq_file *seq, loff_t *pos)
{
- mutex_lock(&__ip_vs_app_mutex);
+ struct net *net = seq_file_net(seq);
+ struct netns_ipvs *ipvs = net_ipvs(net);
- return *pos ? ip_vs_app_idx(*pos - 1) : SEQ_START_TOKEN;
+ mutex_lock(&ipvs->app_mutex);
+
+ return *pos ? ip_vs_app_idx(ipvs, *pos - 1) : SEQ_START_TOKEN;
}
static void *ip_vs_app_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
struct ip_vs_app *inc, *app;
struct list_head *e;
+ struct net *net = seq_file_net(seq);
+ struct netns_ipvs *ipvs = net_ipvs(net);
++*pos;
if (v == SEQ_START_TOKEN)
- return ip_vs_app_idx(0);
+ return ip_vs_app_idx(ipvs, 0);
inc = v;
app = inc->app;
@@ -518,7 +524,7 @@ static void *ip_vs_app_seq_next(struct seq_file *seq, void *v, loff_t *pos)
return list_entry(e, struct ip_vs_app, a_list);
/* go on to next application */
- for (e = app->a_list.next; e != &ip_vs_app_list; e = e->next) {
+ for (e = app->a_list.next; e != &ipvs->app_list; e = e->next) {
app = list_entry(e, struct ip_vs_app, a_list);
list_for_each_entry(inc, &app->incs_list, a_list) {
return inc;
@@ -529,7 +535,9 @@ static void *ip_vs_app_seq_next(struct seq_file *seq, void *v, loff_t *pos)
static void ip_vs_app_seq_stop(struct seq_file *seq, void *v)
{
- mutex_unlock(&__ip_vs_app_mutex);
+ struct netns_ipvs *ipvs = net_ipvs(seq_file_net(seq));
+
+ mutex_unlock(&ipvs->app_mutex);
}
static int ip_vs_app_seq_show(struct seq_file *seq, void *v)
@@ -557,7 +565,8 @@ static const struct seq_operations ip_vs_app_seq_ops = {
static int ip_vs_app_open(struct inode *inode, struct file *file)
{
- return seq_open(file, &ip_vs_app_seq_ops);
+ return seq_open_net(inode, file, &ip_vs_app_seq_ops,
+ sizeof(struct seq_net_private));
}
static const struct file_operations ip_vs_app_fops = {
@@ -569,15 +578,36 @@ static const struct file_operations ip_vs_app_fops = {
};
#endif
-int __init ip_vs_app_init(void)
+static int __net_init __ip_vs_app_init(struct net *net)
{
- /* we will replace it with proc_net_ipvs_create() soon */
- proc_net_fops_create(&init_net, "ip_vs_app", 0, &ip_vs_app_fops);
+ struct netns_ipvs *ipvs = net_ipvs(net);
+
+ INIT_LIST_HEAD(&ipvs->app_list);
+ __mutex_init(&ipvs->app_mutex, "ipvs->app_mutex", &ipvs->app_key);
+ proc_net_fops_create(net, "ip_vs_app", 0, &ip_vs_app_fops);
return 0;
}
+static void __net_exit __ip_vs_app_cleanup(struct net *net)
+{
+ proc_net_remove(net, "ip_vs_app");
+}
+
+static struct pernet_operations ip_vs_app_ops = {
+ .init = __ip_vs_app_init,
+ .exit = __ip_vs_app_cleanup,
+};
+
+int __init ip_vs_app_init(void)
+{
+ int rv;
+
+ rv = register_pernet_subsys(&ip_vs_app_ops);
+ return rv;
+}
+
void ip_vs_app_cleanup(void)
{
- proc_net_remove(&init_net, "ip_vs_app");
+ unregister_pernet_subsys(&ip_vs_app_ops);
}
diff --git a/net/netfilter/ipvs/ip_vs_conn.c b/net/netfilter/ipvs/ip_vs_conn.c
index e9adecdc8ca..83233fe24a0 100644
--- a/net/netfilter/ipvs/ip_vs_conn.c
+++ b/net/netfilter/ipvs/ip_vs_conn.c
@@ -48,35 +48,32 @@
/*
* Connection hash size. Default is what was selected at compile time.
*/
-int ip_vs_conn_tab_bits = CONFIG_IP_VS_TAB_BITS;
+static int ip_vs_conn_tab_bits = CONFIG_IP_VS_TAB_BITS;
module_param_named(conn_tab_bits, ip_vs_conn_tab_bits, int, 0444);
MODULE_PARM_DESC(conn_tab_bits, "Set connections' hash size");
/* size and mask values */
-int ip_vs_conn_tab_size;
-int ip_vs_conn_tab_mask;
+int ip_vs_conn_tab_size __read_mostly;
+static int ip_vs_conn_tab_mask __read_mostly;
/*
* Connection hash table: for input and output packets lookups of IPVS
*/
-static struct list_head *ip_vs_conn_tab;
+static struct list_head *ip_vs_conn_tab __read_mostly;
/* SLAB cache for IPVS connections */
static struct kmem_cache *ip_vs_conn_cachep __read_mostly;
-/* counter for current IPVS connections */
-static atomic_t ip_vs_conn_count = ATOMIC_INIT(0);
-
/* counter for no client port connections */
static atomic_t ip_vs_conn_no_cport_cnt = ATOMIC_INIT(0);
/* random value for IPVS connection hash */
-static unsigned int ip_vs_conn_rnd;
+static unsigned int ip_vs_conn_rnd __read_mostly;
/*
* Fine locking granularity for big connection hash table
*/
-#define CT_LOCKARRAY_BITS 4
+#define CT_LOCKARRAY_BITS 5
#define CT_LOCKARRAY_SIZE (1<<CT_LOCKARRAY_BITS)
#define CT_LOCKARRAY_MASK (CT_LOCKARRAY_SIZE-1)
@@ -133,19 +130,19 @@ static inline void ct_write_unlock_bh(unsigned key)
/*
* Returns hash value for IPVS connection entry
*/
-static unsigned int ip_vs_conn_hashkey(int af, unsigned proto,
+static unsigned int ip_vs_conn_hashkey(struct net *net, int af, unsigned proto,
const union nf_inet_addr *addr,
__be16 port)
{
#ifdef CONFIG_IP_VS_IPV6
if (af == AF_INET6)
- return jhash_3words(jhash(addr, 16, ip_vs_conn_rnd),
- (__force u32)port, proto, ip_vs_conn_rnd)
- & ip_vs_conn_tab_mask;
+ return (jhash_3words(jhash(addr, 16, ip_vs_conn_rnd),
+ (__force u32)port, proto, ip_vs_conn_rnd) ^
+ ((size_t)net>>8)) & ip_vs_conn_tab_mask;
#endif
- return jhash_3words((__force u32)addr->ip, (__force u32)port, proto,
- ip_vs_conn_rnd)
- & ip_vs_conn_tab_mask;
+ return (jhash_3words((__force u32)addr->ip, (__force u32)port, proto,
+ ip_vs_conn_rnd) ^
+ ((size_t)net>>8)) & ip_vs_conn_tab_mask;
}
static unsigned int ip_vs_conn_hashkey_param(const struct ip_vs_conn_param *p,
@@ -166,18 +163,18 @@ static unsigned int ip_vs_conn_hashkey_param(const struct ip_vs_conn_param *p,
port = p->vport;
}
- return ip_vs_conn_hashkey(p->af, p->protocol, addr, port);
+ return ip_vs_conn_hashkey(p->net, p->af, p->protocol, addr, port);
}
static unsigned int ip_vs_conn_hashkey_conn(const struct ip_vs_conn *cp)
{
struct ip_vs_conn_param p;
- ip_vs_conn_fill_param(cp->af, cp->protocol, &cp->caddr, cp->cport,
- NULL, 0, &p);
+ ip_vs_conn_fill_param(ip_vs_conn_net(cp), cp->af, cp->protocol,
+ &cp->caddr, cp->cport, NULL, 0, &p);
- if (cp->dest && cp->dest->svc->pe) {
- p.pe = cp->dest->svc->pe;
+ if (cp->pe) {
+ p.pe = cp->pe;
p.pe_data = cp->pe_data;
p.pe_data_len = cp->pe_data_len;
}
@@ -186,7 +183,7 @@ static unsigned int ip_vs_conn_hashkey_conn(const struct ip_vs_conn *cp)
}
/*
- * Hashes ip_vs_conn in ip_vs_conn_tab by proto,addr,port.
+ * Hashes ip_vs_conn in ip_vs_conn_tab by netns,proto,addr,port.
* returns bool success.
*/
static inline int ip_vs_conn_hash(struct ip_vs_conn *cp)
@@ -269,11 +266,12 @@ __ip_vs_conn_in_get(const struct ip_vs_conn_param *p)
list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) {
if (cp->af == p->af &&
+ p->cport == cp->cport && p->vport == cp->vport &&
ip_vs_addr_equal(p->af, p->caddr, &cp->caddr) &&
ip_vs_addr_equal(p->af, p->vaddr, &cp->vaddr) &&
- p->cport == cp->cport && p->vport == cp->vport &&
((!p->cport) ^ (!(cp->flags & IP_VS_CONN_F_NO_CPORT))) &&
- p->protocol == cp->protocol) {
+ p->protocol == cp->protocol &&
+ ip_vs_conn_net_eq(cp, p->net)) {
/* HIT */
atomic_inc(&cp->refcnt);
ct_read_unlock(hash);
@@ -313,23 +311,23 @@ ip_vs_conn_fill_param_proto(int af, const struct sk_buff *skb,
struct ip_vs_conn_param *p)
{
__be16 _ports[2], *pptr;
+ struct net *net = skb_net(skb);
pptr = skb_header_pointer(skb, proto_off, sizeof(_ports), _ports);
if (pptr == NULL)
return 1;
if (likely(!inverse))
- ip_vs_conn_fill_param(af, iph->protocol, &iph->saddr, pptr[0],
- &iph->daddr, pptr[1], p);
+ ip_vs_conn_fill_param(net, af, iph->protocol, &iph->saddr,
+ pptr[0], &iph->daddr, pptr[1], p);
else
- ip_vs_conn_fill_param(af, iph->protocol, &iph->daddr, pptr[1],
- &iph->saddr, pptr[0], p);
+ ip_vs_conn_fill_param(net, af, iph->protocol, &iph->daddr,
+ pptr[1], &iph->saddr, pptr[0], p);
return 0;
}
struct ip_vs_conn *
ip_vs_conn_in_get_proto(int af, const struct sk_buff *skb,
- struct ip_vs_protocol *pp,
const struct ip_vs_iphdr *iph,
unsigned int proto_off, int inverse)
{
@@ -353,8 +351,10 @@ struct ip_vs_conn *ip_vs_ct_in_get(const struct ip_vs_conn_param *p)
ct_read_lock(hash);
list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) {
+ if (!ip_vs_conn_net_eq(cp, p->net))
+ continue;
if (p->pe_data && p->pe->ct_match) {
- if (p->pe->ct_match(p, cp))
+ if (p->pe == cp->pe && p->pe->ct_match(p, cp))
goto out;
continue;
}
@@ -404,10 +404,11 @@ struct ip_vs_conn *ip_vs_conn_out_get(const struct ip_vs_conn_param *p)
list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) {
if (cp->af == p->af &&
+ p->vport == cp->cport && p->cport == cp->dport &&
ip_vs_addr_equal(p->af, p->vaddr, &cp->caddr) &&
ip_vs_addr_equal(p->af, p->caddr, &cp->daddr) &&
- p->vport == cp->cport && p->cport == cp->dport &&
- p->protocol == cp->protocol) {
+ p->protocol == cp->protocol &&
+ ip_vs_conn_net_eq(cp, p->net)) {
/* HIT */
atomic_inc(&cp->refcnt);
ret = cp;
@@ -428,7 +429,6 @@ struct ip_vs_conn *ip_vs_conn_out_get(const struct ip_vs_conn_param *p)
struct ip_vs_conn *
ip_vs_conn_out_get_proto(int af, const struct sk_buff *skb,
- struct ip_vs_protocol *pp,
const struct ip_vs_iphdr *iph,
unsigned int proto_off, int inverse)
{
@@ -611,9 +611,9 @@ struct ip_vs_dest *ip_vs_try_bind_dest(struct ip_vs_conn *cp)
struct ip_vs_dest *dest;
if ((cp) && (!cp->dest)) {
- dest = ip_vs_find_dest(cp->af, &cp->daddr, cp->dport,
- &cp->vaddr, cp->vport,
- cp->protocol);
+ dest = ip_vs_find_dest(ip_vs_conn_net(cp), cp->af, &cp->daddr,
+ cp->dport, &cp->vaddr, cp->vport,
+ cp->protocol, cp->fwmark);
ip_vs_bind_dest(cp, dest);
return dest;
} else
@@ -686,13 +686,14 @@ static inline void ip_vs_unbind_dest(struct ip_vs_conn *cp)
int ip_vs_check_template(struct ip_vs_conn *ct)
{
struct ip_vs_dest *dest = ct->dest;
+ struct netns_ipvs *ipvs = net_ipvs(ip_vs_conn_net(ct));
/*
* Checking the dest server status.
*/
if ((dest == NULL) ||
!(dest->flags & IP_VS_DEST_F_AVAILABLE) ||
- (sysctl_ip_vs_expire_quiescent_template &&
+ (ipvs->sysctl_expire_quiescent_template &&
(atomic_read(&dest->weight) == 0))) {
IP_VS_DBG_BUF(9, "check_template: dest not available for "
"protocol %s s:%s:%d v:%s:%d "
@@ -730,6 +731,7 @@ int ip_vs_check_template(struct ip_vs_conn *ct)
static void ip_vs_conn_expire(unsigned long data)
{
struct ip_vs_conn *cp = (struct ip_vs_conn *)data;
+ struct netns_ipvs *ipvs = net_ipvs(ip_vs_conn_net(cp));
cp->timeout = 60*HZ;
@@ -765,13 +767,14 @@ static void ip_vs_conn_expire(unsigned long data)
if (cp->flags & IP_VS_CONN_F_NFCT)
ip_vs_conn_drop_conntrack(cp);
+ ip_vs_pe_put(cp->pe);
kfree(cp->pe_data);
if (unlikely(cp->app != NULL))
ip_vs_unbind_app(cp);
ip_vs_unbind_dest(cp);
if (cp->flags & IP_VS_CONN_F_NO_CPORT)
atomic_dec(&ip_vs_conn_no_cport_cnt);
- atomic_dec(&ip_vs_conn_count);
+ atomic_dec(&ipvs->conn_count);
kmem_cache_free(ip_vs_conn_cachep, cp);
return;
@@ -802,10 +805,12 @@ void ip_vs_conn_expire_now(struct ip_vs_conn *cp)
struct ip_vs_conn *
ip_vs_conn_new(const struct ip_vs_conn_param *p,
const union nf_inet_addr *daddr, __be16 dport, unsigned flags,
- struct ip_vs_dest *dest)
+ struct ip_vs_dest *dest, __u32 fwmark)
{
struct ip_vs_conn *cp;
- struct ip_vs_protocol *pp = ip_vs_proto_get(p->protocol);
+ struct netns_ipvs *ipvs = net_ipvs(p->net);
+ struct ip_vs_proto_data *pd = ip_vs_proto_data_get(p->net,
+ p->protocol);
cp = kmem_cache_zalloc(ip_vs_conn_cachep, GFP_ATOMIC);
if (cp == NULL) {
@@ -815,6 +820,7 @@ ip_vs_conn_new(const struct ip_vs_conn_param *p,
INIT_LIST_HEAD(&cp->c_list);
setup_timer(&cp->timer, ip_vs_conn_expire, (unsigned long)cp);
+ ip_vs_conn_net_set(cp, p->net);
cp->af = p->af;
cp->protocol = p->protocol;
ip_vs_addr_copy(p->af, &cp->caddr, p->caddr);
@@ -826,7 +832,10 @@ ip_vs_conn_new(const struct ip_vs_conn_param *p,
&cp->daddr, daddr);
cp->dport = dport;
cp->flags = flags;
- if (flags & IP_VS_CONN_F_TEMPLATE && p->pe_data) {
+ cp->fwmark = fwmark;
+ if (flags & IP_VS_CONN_F_TEMPLATE && p->pe) {
+ ip_vs_pe_get(p->pe);
+ cp->pe = p->pe;
cp->pe_data = p->pe_data;
cp->pe_data_len = p->pe_data_len;
}
@@ -842,7 +851,7 @@ ip_vs_conn_new(const struct ip_vs_conn_param *p,
atomic_set(&cp->n_control, 0);
atomic_set(&cp->in_pkts, 0);
- atomic_inc(&ip_vs_conn_count);
+ atomic_inc(&ipvs->conn_count);
if (flags & IP_VS_CONN_F_NO_CPORT)
atomic_inc(&ip_vs_conn_no_cport_cnt);
@@ -861,8 +870,8 @@ ip_vs_conn_new(const struct ip_vs_conn_param *p,
#endif
ip_vs_bind_xmit(cp);
- if (unlikely(pp && atomic_read(&pp->appcnt)))
- ip_vs_bind_app(cp, pp);
+ if (unlikely(pd && atomic_read(&pd->appcnt)))
+ ip_vs_bind_app(cp, pd->pp);
/*
* Allow conntrack to be preserved. By default, conntrack
@@ -871,7 +880,7 @@ ip_vs_conn_new(const struct ip_vs_conn_param *p,
* IP_VS_CONN_F_ONE_PACKET too.
*/
- if (ip_vs_conntrack_enabled())
+ if (ip_vs_conntrack_enabled(ipvs))
cp->flags |= IP_VS_CONN_F_NFCT;
/* Hash it in the ip_vs_conn_tab finally */
@@ -884,17 +893,22 @@ ip_vs_conn_new(const struct ip_vs_conn_param *p,
* /proc/net/ip_vs_conn entries
*/
#ifdef CONFIG_PROC_FS
+struct ip_vs_iter_state {
+ struct seq_net_private p;
+ struct list_head *l;
+};
static void *ip_vs_conn_array(struct seq_file *seq, loff_t pos)
{
int idx;
struct ip_vs_conn *cp;
+ struct ip_vs_iter_state *iter = seq->private;
for (idx = 0; idx < ip_vs_conn_tab_size; idx++) {
ct_read_lock_bh(idx);
list_for_each_entry(cp, &ip_vs_conn_tab[idx], c_list) {
if (pos-- == 0) {
- seq->private = &ip_vs_conn_tab[idx];
+ iter->l = &ip_vs_conn_tab[idx];
return cp;
}
}
@@ -906,14 +920,17 @@ static void *ip_vs_conn_array(struct seq_file *seq, loff_t pos)
static void *ip_vs_conn_seq_start(struct seq_file *seq, loff_t *pos)
{
- seq->private = NULL;
+ struct ip_vs_iter_state *iter = seq->private;
+
+ iter->l = NULL;
return *pos ? ip_vs_conn_array(seq, *pos - 1) :SEQ_START_TOKEN;
}
static void *ip_vs_conn_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
struct ip_vs_conn *cp = v;
- struct list_head *e, *l = seq->private;
+ struct ip_vs_iter_state *iter = seq->private;
+ struct list_head *e, *l = iter->l;
int idx;
++*pos;
@@ -930,18 +947,19 @@ static void *ip_vs_conn_seq_next(struct seq_file *seq, void *v, loff_t *pos)
while (++idx < ip_vs_conn_tab_size) {
ct_read_lock_bh(idx);
list_for_each_entry(cp, &ip_vs_conn_tab[idx], c_list) {
- seq->private = &ip_vs_conn_tab[idx];
+ iter->l = &ip_vs_conn_tab[idx];
return cp;
}
ct_read_unlock_bh(idx);
}
- seq->private = NULL;
+ iter->l = NULL;
return NULL;
}
static void ip_vs_conn_seq_stop(struct seq_file *seq, void *v)
{
- struct list_head *l = seq->private;
+ struct ip_vs_iter_state *iter = seq->private;
+ struct list_head *l = iter->l;
if (l)
ct_read_unlock_bh(l - ip_vs_conn_tab);
@@ -955,18 +973,19 @@ static int ip_vs_conn_seq_show(struct seq_file *seq, void *v)
"Pro FromIP FPrt ToIP TPrt DestIP DPrt State Expires PEName PEData\n");
else {
const struct ip_vs_conn *cp = v;
+ struct net *net = seq_file_net(seq);
char pe_data[IP_VS_PENAME_MAXLEN + IP_VS_PEDATA_MAXLEN + 3];
size_t len = 0;
- if (cp->dest && cp->pe_data &&
- cp->dest->svc->pe->show_pe_data) {
+ if (!ip_vs_conn_net_eq(cp, net))
+ return 0;
+ if (cp->pe_data) {
pe_data[0] = ' ';
- len = strlen(cp->dest->svc->pe->name);
- memcpy(pe_data + 1, cp->dest->svc->pe->name, len);
+ len = strlen(cp->pe->name);
+ memcpy(pe_data + 1, cp->pe->name, len);
pe_data[len + 1] = ' ';
len += 2;
- len += cp->dest->svc->pe->show_pe_data(cp,
- pe_data + len);
+ len += cp->pe->show_pe_data(cp, pe_data + len);
}
pe_data[len] = '\0';
@@ -1004,7 +1023,8 @@ static const struct seq_operations ip_vs_conn_seq_ops = {
static int ip_vs_conn_open(struct inode *inode, struct file *file)
{
- return seq_open(file, &ip_vs_conn_seq_ops);
+ return seq_open_net(inode, file, &ip_vs_conn_seq_ops,
+ sizeof(struct ip_vs_iter_state));
}
static const struct file_operations ip_vs_conn_fops = {
@@ -1031,6 +1051,10 @@ static int ip_vs_conn_sync_seq_show(struct seq_file *seq, void *v)
"Pro FromIP FPrt ToIP TPrt DestIP DPrt State Origin Expires\n");
else {
const struct ip_vs_conn *cp = v;
+ struct net *net = seq_file_net(seq);
+
+ if (!ip_vs_conn_net_eq(cp, net))
+ return 0;
#ifdef CONFIG_IP_VS_IPV6
if (cp->af == AF_INET6)
@@ -1067,7 +1091,8 @@ static const struct seq_operations ip_vs_conn_sync_seq_ops = {
static int ip_vs_conn_sync_open(struct inode *inode, struct file *file)
{
- return seq_open(file, &ip_vs_conn_sync_seq_ops);
+ return seq_open_net(inode, file, &ip_vs_conn_sync_seq_ops,
+ sizeof(struct ip_vs_iter_state));
}
static const struct file_operations ip_vs_conn_sync_fops = {
@@ -1113,7 +1138,7 @@ static inline int todrop_entry(struct ip_vs_conn *cp)
}
/* Called from keventd and must protect itself from softirqs */
-void ip_vs_random_dropentry(void)
+void ip_vs_random_dropentry(struct net *net)
{
int idx;
struct ip_vs_conn *cp;
@@ -1133,7 +1158,8 @@ void ip_vs_random_dropentry(void)
if (cp->flags & IP_VS_CONN_F_TEMPLATE)
/* connection template */
continue;
-
+ if (!ip_vs_conn_net_eq(cp, net))
+ continue;
if (cp->protocol == IPPROTO_TCP) {
switch(cp->state) {
case IP_VS_TCP_S_SYN_RECV:
@@ -1168,12 +1194,13 @@ void ip_vs_random_dropentry(void)
/*
* Flush all the connection entries in the ip_vs_conn_tab
*/
-static void ip_vs_conn_flush(void)
+static void ip_vs_conn_flush(struct net *net)
{
int idx;
struct ip_vs_conn *cp;
+ struct netns_ipvs *ipvs = net_ipvs(net);
- flush_again:
+flush_again:
for (idx = 0; idx < ip_vs_conn_tab_size; idx++) {
/*
* Lock is actually needed in this loop.
@@ -1181,7 +1208,8 @@ static void ip_vs_conn_flush(void)
ct_write_lock_bh(idx);
list_for_each_entry(cp, &ip_vs_conn_tab[idx], c_list) {
-
+ if (!ip_vs_conn_net_eq(cp, net))
+ continue;
IP_VS_DBG(4, "del connection\n");
ip_vs_conn_expire_now(cp);
if (cp->control) {
@@ -1194,16 +1222,41 @@ static void ip_vs_conn_flush(void)
/* the counter may be not NULL, because maybe some conn entries
are run by slow timer handler or unhashed but still referred */
- if (atomic_read(&ip_vs_conn_count) != 0) {
+ if (atomic_read(&ipvs->conn_count) != 0) {
schedule();
goto flush_again;
}
}
+/*
+ * per netns init and exit
+ */
+int __net_init __ip_vs_conn_init(struct net *net)
+{
+ struct netns_ipvs *ipvs = net_ipvs(net);
+
+ atomic_set(&ipvs->conn_count, 0);
+
+ proc_net_fops_create(net, "ip_vs_conn", 0, &ip_vs_conn_fops);
+ proc_net_fops_create(net, "ip_vs_conn_sync", 0, &ip_vs_conn_sync_fops);
+ return 0;
+}
+static void __net_exit __ip_vs_conn_cleanup(struct net *net)
+{
+ /* flush all the connection entries first */
+ ip_vs_conn_flush(net);
+ proc_net_remove(net, "ip_vs_conn");
+ proc_net_remove(net, "ip_vs_conn_sync");
+}
+static struct pernet_operations ipvs_conn_ops = {
+ .init = __ip_vs_conn_init,
+ .exit = __ip_vs_conn_cleanup,
+};
int __init ip_vs_conn_init(void)
{
int idx;
+ int retc;
/* Compute size and mask */
ip_vs_conn_tab_size = 1 << ip_vs_conn_tab_bits;
@@ -1241,24 +1294,18 @@ int __init ip_vs_conn_init(void)
rwlock_init(&__ip_vs_conntbl_lock_array[idx].l);
}
- proc_net_fops_create(&init_net, "ip_vs_conn", 0, &ip_vs_conn_fops);
- proc_net_fops_create(&init_net, "ip_vs_conn_sync", 0, &ip_vs_conn_sync_fops);
+ retc = register_pernet_subsys(&ipvs_conn_ops);
/* calculate the random value for connection hash */
get_random_bytes(&ip_vs_conn_rnd, sizeof(ip_vs_conn_rnd));
- return 0;
+ return retc;
}
-
void ip_vs_conn_cleanup(void)
{
- /* flush all the connection entries first */
- ip_vs_conn_flush();
-
+ unregister_pernet_subsys(&ipvs_conn_ops);
/* Release the empty cache */
kmem_cache_destroy(ip_vs_conn_cachep);
- proc_net_remove(&init_net, "ip_vs_conn");
- proc_net_remove(&init_net, "ip_vs_conn_sync");
vfree(ip_vs_conn_tab);
}
diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c
index b4e51e9c5a0..4d06617fab6 100644
--- a/net/netfilter/ipvs/ip_vs_core.c
+++ b/net/netfilter/ipvs/ip_vs_core.c
@@ -41,6 +41,7 @@
#include <net/icmp.h> /* for icmp_send */
#include <net/route.h>
#include <net/ip6_checksum.h>
+#include <net/netns/generic.h> /* net_generic() */
#include <linux/netfilter.h>
#include <linux/netfilter_ipv4.h>
@@ -68,6 +69,12 @@ EXPORT_SYMBOL(ip_vs_conn_put);
EXPORT_SYMBOL(ip_vs_get_debug_level);
#endif
+int ip_vs_net_id __read_mostly;
+#ifdef IP_VS_GENERIC_NETNS
+EXPORT_SYMBOL(ip_vs_net_id);
+#endif
+/* netns cnt used for uniqueness */
+static atomic_t ipvs_netns_cnt = ATOMIC_INIT(0);
/* ID used in ICMP lookups */
#define icmp_id(icmph) (((icmph)->un).echo.id)
@@ -108,21 +115,28 @@ static inline void
ip_vs_in_stats(struct ip_vs_conn *cp, struct sk_buff *skb)
{
struct ip_vs_dest *dest = cp->dest;
+ struct netns_ipvs *ipvs = net_ipvs(skb_net(skb));
+
if (dest && (dest->flags & IP_VS_DEST_F_AVAILABLE)) {
- spin_lock(&dest->stats.lock);
- dest->stats.ustats.inpkts++;
- dest->stats.ustats.inbytes += skb->len;
- spin_unlock(&dest->stats.lock);
-
- spin_lock(&dest->svc->stats.lock);
- dest->svc->stats.ustats.inpkts++;
- dest->svc->stats.ustats.inbytes += skb->len;
- spin_unlock(&dest->svc->stats.lock);
-
- spin_lock(&ip_vs_stats.lock);
- ip_vs_stats.ustats.inpkts++;
- ip_vs_stats.ustats.inbytes += skb->len;
- spin_unlock(&ip_vs_stats.lock);
+ struct ip_vs_cpu_stats *s;
+
+ s = this_cpu_ptr(dest->stats.cpustats);
+ s->ustats.inpkts++;
+ u64_stats_update_begin(&s->syncp);
+ s->ustats.inbytes += skb->len;
+ u64_stats_update_end(&s->syncp);
+
+ s = this_cpu_ptr(dest->svc->stats.cpustats);
+ s->ustats.inpkts++;
+ u64_stats_update_begin(&s->syncp);
+ s->ustats.inbytes += skb->len;
+ u64_stats_update_end(&s->syncp);
+
+ s = this_cpu_ptr(ipvs->cpustats);
+ s->ustats.inpkts++;
+ u64_stats_update_begin(&s->syncp);
+ s->ustats.inbytes += skb->len;
+ u64_stats_update_end(&s->syncp);
}
}
@@ -131,21 +145,28 @@ static inline void
ip_vs_out_stats(struct ip_vs_conn *cp, struct sk_buff *skb)
{
struct ip_vs_dest *dest = cp->dest;
+ struct netns_ipvs *ipvs = net_ipvs(skb_net(skb));
+
if (dest && (dest->flags & IP_VS_DEST_F_AVAILABLE)) {
- spin_lock(&dest->stats.lock);
- dest->stats.ustats.outpkts++;
- dest->stats.ustats.outbytes += skb->len;
- spin_unlock(&dest->stats.lock);
-
- spin_lock(&dest->svc->stats.lock);
- dest->svc->stats.ustats.outpkts++;
- dest->svc->stats.ustats.outbytes += skb->len;
- spin_unlock(&dest->svc->stats.lock);
-
- spin_lock(&ip_vs_stats.lock);
- ip_vs_stats.ustats.outpkts++;
- ip_vs_stats.ustats.outbytes += skb->len;
- spin_unlock(&ip_vs_stats.lock);
+ struct ip_vs_cpu_stats *s;
+
+ s = this_cpu_ptr(dest->stats.cpustats);
+ s->ustats.outpkts++;
+ u64_stats_update_begin(&s->syncp);
+ s->ustats.outbytes += skb->len;
+ u64_stats_update_end(&s->syncp);
+
+ s = this_cpu_ptr(dest->svc->stats.cpustats);
+ s->ustats.outpkts++;
+ u64_stats_update_begin(&s->syncp);
+ s->ustats.outbytes += skb->len;
+ u64_stats_update_end(&s->syncp);
+
+ s = this_cpu_ptr(ipvs->cpustats);
+ s->ustats.outpkts++;
+ u64_stats_update_begin(&s->syncp);
+ s->ustats.outbytes += skb->len;
+ u64_stats_update_end(&s->syncp);
}
}
@@ -153,41 +174,44 @@ ip_vs_out_stats(struct ip_vs_conn *cp, struct sk_buff *skb)
static inline void
ip_vs_conn_stats(struct ip_vs_conn *cp, struct ip_vs_service *svc)
{
- spin_lock(&cp->dest->stats.lock);
- cp->dest->stats.ustats.conns++;
- spin_unlock(&cp->dest->stats.lock);
+ struct netns_ipvs *ipvs = net_ipvs(svc->net);
+ struct ip_vs_cpu_stats *s;
+
+ s = this_cpu_ptr(cp->dest->stats.cpustats);
+ s->ustats.conns++;
- spin_lock(&svc->stats.lock);
- svc->stats.ustats.conns++;
- spin_unlock(&svc->stats.lock);
+ s = this_cpu_ptr(svc->stats.cpustats);
+ s->ustats.conns++;
- spin_lock(&ip_vs_stats.lock);
- ip_vs_stats.ustats.conns++;
- spin_unlock(&ip_vs_stats.lock);
+ s = this_cpu_ptr(ipvs->cpustats);
+ s->ustats.conns++;
}
static inline int
ip_vs_set_state(struct ip_vs_conn *cp, int direction,
const struct sk_buff *skb,
- struct ip_vs_protocol *pp)
+ struct ip_vs_proto_data *pd)
{
- if (unlikely(!pp->state_transition))
+ if (unlikely(!pd->pp->state_transition))
return 0;
- return pp->state_transition(cp, direction, skb, pp);
+ return pd->pp->state_transition(cp, direction, skb, pd);
}
-static inline void
+static inline int
ip_vs_conn_fill_param_persist(const struct ip_vs_service *svc,
struct sk_buff *skb, int protocol,
const union nf_inet_addr *caddr, __be16 cport,
const union nf_inet_addr *vaddr, __be16 vport,
struct ip_vs_conn_param *p)
{
- ip_vs_conn_fill_param(svc->af, protocol, caddr, cport, vaddr, vport, p);
+ ip_vs_conn_fill_param(svc->net, svc->af, protocol, caddr, cport, vaddr,
+ vport, p);
p->pe = svc->pe;
if (p->pe && p->pe->fill_param)
- p->pe->fill_param(p, skb);
+ return p->pe->fill_param(p, skb);
+
+ return 0;
}
/*
@@ -200,7 +224,7 @@ ip_vs_conn_fill_param_persist(const struct ip_vs_service *svc,
static struct ip_vs_conn *
ip_vs_sched_persist(struct ip_vs_service *svc,
struct sk_buff *skb,
- __be16 ports[2])
+ __be16 src_port, __be16 dst_port, int *ignored)
{
struct ip_vs_conn *cp = NULL;
struct ip_vs_iphdr iph;
@@ -224,8 +248,8 @@ ip_vs_sched_persist(struct ip_vs_service *svc,
IP_VS_DBG_BUF(6, "p-schedule: src %s:%u dest %s:%u "
"mnet %s\n",
- IP_VS_DBG_ADDR(svc->af, &iph.saddr), ntohs(ports[0]),
- IP_VS_DBG_ADDR(svc->af, &iph.daddr), ntohs(ports[1]),
+ IP_VS_DBG_ADDR(svc->af, &iph.saddr), ntohs(src_port),
+ IP_VS_DBG_ADDR(svc->af, &iph.daddr), ntohs(dst_port),
IP_VS_DBG_ADDR(svc->af, &snet));
/*
@@ -247,14 +271,14 @@ ip_vs_sched_persist(struct ip_vs_service *svc,
const union nf_inet_addr fwmark = { .ip = htonl(svc->fwmark) };
__be16 vport = 0;
- if (ports[1] == svc->port) {
+ if (dst_port == svc->port) {
/* non-FTP template:
* <protocol, caddr, 0, vaddr, vport, daddr, dport>
* FTP template:
* <protocol, caddr, 0, vaddr, 0, daddr, 0>
*/
if (svc->port != FTPPORT)
- vport = ports[1];
+ vport = dst_port;
} else {
/* Note: persistent fwmark-based services and
* persistent port zero service are handled here.
@@ -268,24 +292,31 @@ ip_vs_sched_persist(struct ip_vs_service *svc,
vaddr = &fwmark;
}
}
- ip_vs_conn_fill_param_persist(svc, skb, protocol, &snet, 0,
- vaddr, vport, &param);
+ /* return *ignored = -1 so NF_DROP can be used */
+ if (ip_vs_conn_fill_param_persist(svc, skb, protocol, &snet, 0,
+ vaddr, vport, &param) < 0) {
+ *ignored = -1;
+ return NULL;
+ }
}
/* Check if a template already exists */
ct = ip_vs_ct_in_get(&param);
if (!ct || !ip_vs_check_template(ct)) {
- /* No template found or the dest of the connection
+ /*
+ * No template found or the dest of the connection
* template is not available.
+ * return *ignored=0 i.e. ICMP and NF_DROP
*/
dest = svc->scheduler->schedule(svc, skb);
if (!dest) {
IP_VS_DBG(1, "p-schedule: no dest found.\n");
kfree(param.pe_data);
+ *ignored = 0;
return NULL;
}
- if (ports[1] == svc->port && svc->port != FTPPORT)
+ if (dst_port == svc->port && svc->port != FTPPORT)
dport = dest->port;
/* Create a template
@@ -293,9 +324,10 @@ ip_vs_sched_persist(struct ip_vs_service *svc,
* and thus param.pe_data will be destroyed
* when the template expires */
ct = ip_vs_conn_new(&param, &dest->addr, dport,
- IP_VS_CONN_F_TEMPLATE, dest);
+ IP_VS_CONN_F_TEMPLATE, dest, skb->mark);
if (ct == NULL) {
kfree(param.pe_data);
+ *ignored = -1;
return NULL;
}
@@ -306,7 +338,7 @@ ip_vs_sched_persist(struct ip_vs_service *svc,
kfree(param.pe_data);
}
- dport = ports[1];
+ dport = dst_port;
if (dport == svc->port && dest->port)
dport = dest->port;
@@ -317,11 +349,13 @@ ip_vs_sched_persist(struct ip_vs_service *svc,
/*
* Create a new connection according to the template
*/
- ip_vs_conn_fill_param(svc->af, iph.protocol, &iph.saddr, ports[0],
- &iph.daddr, ports[1], &param);
- cp = ip_vs_conn_new(&param, &dest->addr, dport, flags, dest);
+ ip_vs_conn_fill_param(svc->net, svc->af, iph.protocol, &iph.saddr,
+ src_port, &iph.daddr, dst_port, &param);
+
+ cp = ip_vs_conn_new(&param, &dest->addr, dport, flags, dest, skb->mark);
if (cp == NULL) {
ip_vs_conn_put(ct);
+ *ignored = -1;
return NULL;
}
@@ -341,11 +375,27 @@ ip_vs_sched_persist(struct ip_vs_service *svc,
* It selects a server according to the virtual service, and
* creates a connection entry.
* Protocols supported: TCP, UDP
+ *
+ * Usage of *ignored
+ *
+ * 1 : protocol tried to schedule (eg. on SYN), found svc but the
+ * svc/scheduler decides that this packet should be accepted with
+ * NF_ACCEPT because it must not be scheduled.
+ *
+ * 0 : scheduler can not find destination, so try bypass or
+ * return ICMP and then NF_DROP (ip_vs_leave).
+ *
+ * -1 : scheduler tried to schedule but fatal error occurred, eg.
+ * ip_vs_conn_new failure (ENOMEM) or ip_vs_sip_fill_param
+ * failure such as missing Call-ID, ENOMEM on skb_linearize
+ * or pe_data. In this case we should return NF_DROP without
+ * any attempts to send ICMP with ip_vs_leave.
*/
struct ip_vs_conn *
ip_vs_schedule(struct ip_vs_service *svc, struct sk_buff *skb,
- struct ip_vs_protocol *pp, int *ignored)
+ struct ip_vs_proto_data *pd, int *ignored)
{
+ struct ip_vs_protocol *pp = pd->pp;
struct ip_vs_conn *cp = NULL;
struct ip_vs_iphdr iph;
struct ip_vs_dest *dest;
@@ -371,12 +421,10 @@ ip_vs_schedule(struct ip_vs_service *svc, struct sk_buff *skb,
}
/*
- * Do not schedule replies from local real server. It is risky
- * for fwmark services but mostly for persistent services.
+ * Do not schedule replies from local real server.
*/
if ((!skb->dev || skb->dev->flags & IFF_LOOPBACK) &&
- (svc->flags & IP_VS_SVC_F_PERSISTENT || svc->fwmark) &&
- (cp = pp->conn_in_get(svc->af, skb, pp, &iph, iph.len, 1))) {
+ (cp = pp->conn_in_get(svc->af, skb, &iph, iph.len, 1))) {
IP_VS_DBG_PKT(12, svc->af, pp, skb, 0,
"Not scheduling reply for existing connection");
__ip_vs_conn_put(cp);
@@ -386,10 +434,10 @@ ip_vs_schedule(struct ip_vs_service *svc, struct sk_buff *skb,
/*
* Persistent service
*/
- if (svc->flags & IP_VS_SVC_F_PERSISTENT) {
- *ignored = 0;
- return ip_vs_sched_persist(svc, skb, pptr);
- }
+ if (svc->flags & IP_VS_SVC_F_PERSISTENT)
+ return ip_vs_sched_persist(svc, skb, pptr[0], pptr[1], ignored);
+
+ *ignored = 0;
/*
* Non-persistent service
@@ -402,8 +450,6 @@ ip_vs_schedule(struct ip_vs_service *svc, struct sk_buff *skb,
return NULL;
}
- *ignored = 0;
-
dest = svc->scheduler->schedule(svc, skb);
if (dest == NULL) {
IP_VS_DBG(1, "Schedule: no dest found.\n");
@@ -419,13 +465,17 @@ ip_vs_schedule(struct ip_vs_service *svc, struct sk_buff *skb,
*/
{
struct ip_vs_conn_param p;
- ip_vs_conn_fill_param(svc->af, iph.protocol, &iph.saddr,
- pptr[0], &iph.daddr, pptr[1], &p);
+
+ ip_vs_conn_fill_param(svc->net, svc->af, iph.protocol,
+ &iph.saddr, pptr[0], &iph.daddr, pptr[1],
+ &p);
cp = ip_vs_conn_new(&p, &dest->addr,
dest->port ? dest->port : pptr[1],
- flags, dest);
- if (!cp)
+ flags, dest, skb->mark);
+ if (!cp) {
+ *ignored = -1;
return NULL;
+ }
}
IP_VS_DBG_BUF(6, "Schedule fwd:%c c:%s:%u v:%s:%u "
@@ -447,11 +497,14 @@ ip_vs_schedule(struct ip_vs_service *svc, struct sk_buff *skb,
* no destination is available for a new connection.
*/
int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb,
- struct ip_vs_protocol *pp)
+ struct ip_vs_proto_data *pd)
{
+ struct net *net;
+ struct netns_ipvs *ipvs;
__be16 _ports[2], *pptr;
struct ip_vs_iphdr iph;
int unicast;
+
ip_vs_fill_iphdr(svc->af, skb_network_header(skb), &iph);
pptr = skb_header_pointer(skb, iph.len, sizeof(_ports), _ports);
@@ -459,18 +512,20 @@ int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb,
ip_vs_service_put(svc);
return NF_DROP;
}
+ net = skb_net(skb);
#ifdef CONFIG_IP_VS_IPV6
if (svc->af == AF_INET6)
unicast = ipv6_addr_type(&iph.daddr.in6) & IPV6_ADDR_UNICAST;
else
#endif
- unicast = (inet_addr_type(&init_net, iph.daddr.ip) == RTN_UNICAST);
+ unicast = (inet_addr_type(net, iph.daddr.ip) == RTN_UNICAST);
/* if it is fwmark-based service, the cache_bypass sysctl is up
and the destination is a non-local unicast, then create
a cache_bypass connection entry */
- if (sysctl_ip_vs_cache_bypass && svc->fwmark && unicast) {
+ ipvs = net_ipvs(net);
+ if (ipvs->sysctl_cache_bypass && svc->fwmark && unicast) {
int ret, cs;
struct ip_vs_conn *cp;
unsigned int flags = (svc->flags & IP_VS_SVC_F_ONEPACKET &&
@@ -484,12 +539,12 @@ int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb,
IP_VS_DBG(6, "%s(): create a cache_bypass entry\n", __func__);
{
struct ip_vs_conn_param p;
- ip_vs_conn_fill_param(svc->af, iph.protocol,
+ ip_vs_conn_fill_param(svc->net, svc->af, iph.protocol,
&iph.saddr, pptr[0],
&iph.daddr, pptr[1], &p);
cp = ip_vs_conn_new(&p, &daddr, 0,
IP_VS_CONN_F_BYPASS | flags,
- NULL);
+ NULL, skb->mark);
if (!cp)
return NF_DROP;
}
@@ -498,10 +553,10 @@ int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb,
ip_vs_in_stats(cp, skb);
/* set state */
- cs = ip_vs_set_state(cp, IP_VS_DIR_INPUT, skb, pp);
+ cs = ip_vs_set_state(cp, IP_VS_DIR_INPUT, skb, pd);
/* transmit the first SYN packet */
- ret = cp->packet_xmit(skb, cp, pp);
+ ret = cp->packet_xmit(skb, cp, pd->pp);
/* do not touch skb anymore */
atomic_inc(&cp->in_pkts);
@@ -682,6 +737,7 @@ static int handle_response_icmp(int af, struct sk_buff *skb,
struct ip_vs_protocol *pp,
unsigned int offset, unsigned int ihl)
{
+ struct netns_ipvs *ipvs;
unsigned int verdict = NF_DROP;
if (IP_VS_FWD_METHOD(cp) != 0) {
@@ -703,6 +759,8 @@ static int handle_response_icmp(int af, struct sk_buff *skb,
if (!skb_make_writable(skb, offset))
goto out;
+ ipvs = net_ipvs(skb_net(skb));
+
#ifdef CONFIG_IP_VS_IPV6
if (af == AF_INET6)
ip_vs_nat_icmp_v6(skb, pp, cp, 1);
@@ -712,11 +770,11 @@ static int handle_response_icmp(int af, struct sk_buff *skb,
#ifdef CONFIG_IP_VS_IPV6
if (af == AF_INET6) {
- if (sysctl_ip_vs_snat_reroute && ip6_route_me_harder(skb) != 0)
+ if (ipvs->sysctl_snat_reroute && ip6_route_me_harder(skb) != 0)
goto out;
} else
#endif
- if ((sysctl_ip_vs_snat_reroute ||
+ if ((ipvs->sysctl_snat_reroute ||
skb_rtable(skb)->rt_flags & RTCF_LOCAL) &&
ip_route_me_harder(skb, RTN_LOCAL) != 0)
goto out;
@@ -808,7 +866,7 @@ static int ip_vs_out_icmp(struct sk_buff *skb, int *related,
ip_vs_fill_iphdr(AF_INET, cih, &ciph);
/* The embedded headers contain source and dest in reverse order */
- cp = pp->conn_out_get(AF_INET, skb, pp, &ciph, offset, 1);
+ cp = pp->conn_out_get(AF_INET, skb, &ciph, offset, 1);
if (!cp)
return NF_ACCEPT;
@@ -885,7 +943,7 @@ static int ip_vs_out_icmp_v6(struct sk_buff *skb, int *related,
ip_vs_fill_iphdr(AF_INET6, cih, &ciph);
/* The embedded headers contain source and dest in reverse order */
- cp = pp->conn_out_get(AF_INET6, skb, pp, &ciph, offset, 1);
+ cp = pp->conn_out_get(AF_INET6, skb, &ciph, offset, 1);
if (!cp)
return NF_ACCEPT;
@@ -924,9 +982,12 @@ static inline int is_tcp_reset(const struct sk_buff *skb, int nh_len)
* Used for NAT and local client.
*/
static unsigned int
-handle_response(int af, struct sk_buff *skb, struct ip_vs_protocol *pp,
+handle_response(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd,
struct ip_vs_conn *cp, int ihl)
{
+ struct ip_vs_protocol *pp = pd->pp;
+ struct netns_ipvs *ipvs;
+
IP_VS_DBG_PKT(11, af, pp, skb, 0, "Outgoing packet");
if (!skb_make_writable(skb, ihl))
@@ -961,13 +1022,15 @@ handle_response(int af, struct sk_buff *skb, struct ip_vs_protocol *pp,
* if it came from this machine itself. So re-compute
* the routing information.
*/
+ ipvs = net_ipvs(skb_net(skb));
+
#ifdef CONFIG_IP_VS_IPV6
if (af == AF_INET6) {
- if (sysctl_ip_vs_snat_reroute && ip6_route_me_harder(skb) != 0)
+ if (ipvs->sysctl_snat_reroute && ip6_route_me_harder(skb) != 0)
goto drop;
} else
#endif
- if ((sysctl_ip_vs_snat_reroute ||
+ if ((ipvs->sysctl_snat_reroute ||
skb_rtable(skb)->rt_flags & RTCF_LOCAL) &&
ip_route_me_harder(skb, RTN_LOCAL) != 0)
goto drop;
@@ -975,7 +1038,7 @@ handle_response(int af, struct sk_buff *skb, struct ip_vs_protocol *pp,
IP_VS_DBG_PKT(10, af, pp, skb, 0, "After SNAT");
ip_vs_out_stats(cp, skb);
- ip_vs_set_state(cp, IP_VS_DIR_OUTPUT, skb, pp);
+ ip_vs_set_state(cp, IP_VS_DIR_OUTPUT, skb, pd);
skb->ipvs_property = 1;
if (!(cp->flags & IP_VS_CONN_F_NFCT))
ip_vs_notrack(skb);
@@ -999,9 +1062,12 @@ drop:
static unsigned int
ip_vs_out(unsigned int hooknum, struct sk_buff *skb, int af)
{
+ struct net *net = NULL;
struct ip_vs_iphdr iph;
struct ip_vs_protocol *pp;
+ struct ip_vs_proto_data *pd;
struct ip_vs_conn *cp;
+ struct netns_ipvs *ipvs;
EnterFunction(11);
@@ -1022,6 +1088,7 @@ ip_vs_out(unsigned int hooknum, struct sk_buff *skb, int af)
if (unlikely(!skb_dst(skb)))
return NF_ACCEPT;
+ net = skb_net(skb);
ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
#ifdef CONFIG_IP_VS_IPV6
if (af == AF_INET6) {
@@ -1045,9 +1112,10 @@ ip_vs_out(unsigned int hooknum, struct sk_buff *skb, int af)
ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
}
- pp = ip_vs_proto_get(iph.protocol);
- if (unlikely(!pp))
+ pd = ip_vs_proto_data_get(net, iph.protocol);
+ if (unlikely(!pd))
return NF_ACCEPT;
+ pp = pd->pp;
/* reassemble IP fragments */
#ifdef CONFIG_IP_VS_IPV6
@@ -1073,11 +1141,12 @@ ip_vs_out(unsigned int hooknum, struct sk_buff *skb, int af)
/*
* Check if the packet belongs to an existing entry
*/
- cp = pp->conn_out_get(af, skb, pp, &iph, iph.len, 0);
+ cp = pp->conn_out_get(af, skb, &iph, iph.len, 0);
+ ipvs = net_ipvs(net);
if (likely(cp))
- return handle_response(af, skb, pp, cp, iph.len);
- if (sysctl_ip_vs_nat_icmp_send &&
+ return handle_response(af, skb, pd, cp, iph.len);
+ if (ipvs->sysctl_nat_icmp_send &&
(pp->protocol == IPPROTO_TCP ||
pp->protocol == IPPROTO_UDP ||
pp->protocol == IPPROTO_SCTP)) {
@@ -1087,7 +1156,7 @@ ip_vs_out(unsigned int hooknum, struct sk_buff *skb, int af)
sizeof(_ports), _ports);
if (pptr == NULL)
return NF_ACCEPT; /* Not for me */
- if (ip_vs_lookup_real_service(af, iph.protocol,
+ if (ip_vs_lookup_real_service(net, af, iph.protocol,
&iph.saddr,
pptr[0])) {
/*
@@ -1202,12 +1271,14 @@ ip_vs_local_reply6(unsigned int hooknum, struct sk_buff *skb,
static int
ip_vs_in_icmp(struct sk_buff *skb, int *related, unsigned int hooknum)
{
+ struct net *net = NULL;
struct iphdr *iph;
struct icmphdr _icmph, *ic;
struct iphdr _ciph, *cih; /* The ip header contained within the ICMP */
struct ip_vs_iphdr ciph;
struct ip_vs_conn *cp;
struct ip_vs_protocol *pp;
+ struct ip_vs_proto_data *pd;
unsigned int offset, ihl, verdict;
union nf_inet_addr snet;
@@ -1249,9 +1320,11 @@ ip_vs_in_icmp(struct sk_buff *skb, int *related, unsigned int hooknum)
if (cih == NULL)
return NF_ACCEPT; /* The packet looks wrong, ignore */
- pp = ip_vs_proto_get(cih->protocol);
- if (!pp)
+ net = skb_net(skb);
+ pd = ip_vs_proto_data_get(net, cih->protocol);
+ if (!pd)
return NF_ACCEPT;
+ pp = pd->pp;
/* Is the embedded protocol header present? */
if (unlikely(cih->frag_off & htons(IP_OFFSET) &&
@@ -1265,10 +1338,10 @@ ip_vs_in_icmp(struct sk_buff *skb, int *related, unsigned int hooknum)
ip_vs_fill_iphdr(AF_INET, cih, &ciph);
/* The embedded headers contain source and dest in reverse order */
- cp = pp->conn_in_get(AF_INET, skb, pp, &ciph, offset, 1);
+ cp = pp->conn_in_get(AF_INET, skb, &ciph, offset, 1);
if (!cp) {
/* The packet could also belong to a local client */
- cp = pp->conn_out_get(AF_INET, skb, pp, &ciph, offset, 1);
+ cp = pp->conn_out_get(AF_INET, skb, &ciph, offset, 1);
if (cp) {
snet.ip = iph->saddr;
return handle_response_icmp(AF_INET, skb, &snet,
@@ -1312,6 +1385,7 @@ ip_vs_in_icmp(struct sk_buff *skb, int *related, unsigned int hooknum)
static int
ip_vs_in_icmp_v6(struct sk_buff *skb, int *related, unsigned int hooknum)
{
+ struct net *net = NULL;
struct ipv6hdr *iph;
struct icmp6hdr _icmph, *ic;
struct ipv6hdr _ciph, *cih; /* The ip header contained
@@ -1319,6 +1393,7 @@ ip_vs_in_icmp_v6(struct sk_buff *skb, int *related, unsigned int hooknum)
struct ip_vs_iphdr ciph;
struct ip_vs_conn *cp;
struct ip_vs_protocol *pp;
+ struct ip_vs_proto_data *pd;
unsigned int offset, verdict;
union nf_inet_addr snet;
struct rt6_info *rt;
@@ -1361,9 +1436,11 @@ ip_vs_in_icmp_v6(struct sk_buff *skb, int *related, unsigned int hooknum)
if (cih == NULL)
return NF_ACCEPT; /* The packet looks wrong, ignore */
- pp = ip_vs_proto_get(cih->nexthdr);
- if (!pp)
+ net = skb_net(skb);
+ pd = ip_vs_proto_data_get(net, cih->nexthdr);
+ if (!pd)
return NF_ACCEPT;
+ pp = pd->pp;
/* Is the embedded protocol header present? */
/* TODO: we don't support fragmentation at the moment anyways */
@@ -1377,10 +1454,10 @@ ip_vs_in_icmp_v6(struct sk_buff *skb, int *related, unsigned int hooknum)
ip_vs_fill_iphdr(AF_INET6, cih, &ciph);
/* The embedded headers contain source and dest in reverse order */
- cp = pp->conn_in_get(AF_INET6, skb, pp, &ciph, offset, 1);
+ cp = pp->conn_in_get(AF_INET6, skb, &ciph, offset, 1);
if (!cp) {
/* The packet could also belong to a local client */
- cp = pp->conn_out_get(AF_INET6, skb, pp, &ciph, offset, 1);
+ cp = pp->conn_out_get(AF_INET6, skb, &ciph, offset, 1);
if (cp) {
ipv6_addr_copy(&snet.in6, &iph->saddr);
return handle_response_icmp(AF_INET6, skb, &snet,
@@ -1423,10 +1500,13 @@ ip_vs_in_icmp_v6(struct sk_buff *skb, int *related, unsigned int hooknum)
static unsigned int
ip_vs_in(unsigned int hooknum, struct sk_buff *skb, int af)
{
+ struct net *net;
struct ip_vs_iphdr iph;
struct ip_vs_protocol *pp;
+ struct ip_vs_proto_data *pd;
struct ip_vs_conn *cp;
int ret, restart, pkts;
+ struct netns_ipvs *ipvs;
/* Already marked as IPVS request or reply? */
if (skb->ipvs_property)
@@ -1480,20 +1560,21 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb, int af)
ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
}
+ net = skb_net(skb);
/* Protocol supported? */
- pp = ip_vs_proto_get(iph.protocol);
- if (unlikely(!pp))
+ pd = ip_vs_proto_data_get(net, iph.protocol);
+ if (unlikely(!pd))
return NF_ACCEPT;
-
+ pp = pd->pp;
/*
* Check if the packet belongs to an existing connection entry
*/
- cp = pp->conn_in_get(af, skb, pp, &iph, iph.len, 0);
+ cp = pp->conn_in_get(af, skb, &iph, iph.len, 0);
if (unlikely(!cp)) {
int v;
- if (!pp->conn_schedule(af, skb, pp, &v, &cp))
+ if (!pp->conn_schedule(af, skb, pd, &v, &cp))
return v;
}
@@ -1505,12 +1586,13 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb, int af)
}
IP_VS_DBG_PKT(11, af, pp, skb, 0, "Incoming packet");
-
+ net = skb_net(skb);
+ ipvs = net_ipvs(net);
/* Check the server status */
if (cp->dest && !(cp->dest->flags & IP_VS_DEST_F_AVAILABLE)) {
/* the destination server is not available */
- if (sysctl_ip_vs_expire_nodest_conn) {
+ if (ipvs->sysctl_expire_nodest_conn) {
/* try to expire the connection immediately */
ip_vs_conn_expire_now(cp);
}
@@ -1521,7 +1603,7 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb, int af)
}
ip_vs_in_stats(cp, skb);
- restart = ip_vs_set_state(cp, IP_VS_DIR_INPUT, skb, pp);
+ restart = ip_vs_set_state(cp, IP_VS_DIR_INPUT, skb, pd);
if (cp->packet_xmit)
ret = cp->packet_xmit(skb, cp, pp);
/* do not touch skb anymore */
@@ -1535,35 +1617,41 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb, int af)
*
* Sync connection if it is about to close to
* encorage the standby servers to update the connections timeout
+ *
+ * For ONE_PKT let ip_vs_sync_conn() do the filter work.
*/
- pkts = atomic_add_return(1, &cp->in_pkts);
- if (af == AF_INET && (ip_vs_sync_state & IP_VS_STATE_MASTER) &&
+
+ if (cp->flags & IP_VS_CONN_F_ONE_PACKET)
+ pkts = ipvs->sysctl_sync_threshold[0];
+ else
+ pkts = atomic_add_return(1, &cp->in_pkts);
+
+ if ((ipvs->sync_state & IP_VS_STATE_MASTER) &&
cp->protocol == IPPROTO_SCTP) {
if ((cp->state == IP_VS_SCTP_S_ESTABLISHED &&
- (pkts % sysctl_ip_vs_sync_threshold[1]
- == sysctl_ip_vs_sync_threshold[0])) ||
+ (pkts % ipvs->sysctl_sync_threshold[1]
+ == ipvs->sysctl_sync_threshold[0])) ||
(cp->old_state != cp->state &&
((cp->state == IP_VS_SCTP_S_CLOSED) ||
(cp->state == IP_VS_SCTP_S_SHUT_ACK_CLI) ||
(cp->state == IP_VS_SCTP_S_SHUT_ACK_SER)))) {
- ip_vs_sync_conn(cp);
+ ip_vs_sync_conn(net, cp);
goto out;
}
}
/* Keep this block last: TCP and others with pp->num_states <= 1 */
- else if (af == AF_INET &&
- (ip_vs_sync_state & IP_VS_STATE_MASTER) &&
+ else if ((ipvs->sync_state & IP_VS_STATE_MASTER) &&
(((cp->protocol != IPPROTO_TCP ||
cp->state == IP_VS_TCP_S_ESTABLISHED) &&
- (pkts % sysctl_ip_vs_sync_threshold[1]
- == sysctl_ip_vs_sync_threshold[0])) ||
+ (pkts % ipvs->sysctl_sync_threshold[1]
+ == ipvs->sysctl_sync_threshold[0])) ||
((cp->protocol == IPPROTO_TCP) && (cp->old_state != cp->state) &&
((cp->state == IP_VS_TCP_S_FIN_WAIT) ||
(cp->state == IP_VS_TCP_S_CLOSE) ||
(cp->state == IP_VS_TCP_S_CLOSE_WAIT) ||
(cp->state == IP_VS_TCP_S_TIME_WAIT)))))
- ip_vs_sync_conn(cp);
+ ip_vs_sync_conn(net, cp);
out:
cp->old_state = cp->state;
@@ -1782,7 +1870,39 @@ static struct nf_hook_ops ip_vs_ops[] __read_mostly = {
},
#endif
};
+/*
+ * Initialize IP Virtual Server netns mem.
+ */
+static int __net_init __ip_vs_init(struct net *net)
+{
+ struct netns_ipvs *ipvs;
+
+ ipvs = net_generic(net, ip_vs_net_id);
+ if (ipvs == NULL) {
+ pr_err("%s(): no memory.\n", __func__);
+ return -ENOMEM;
+ }
+ ipvs->net = net;
+ /* Counters used for creating unique names */
+ ipvs->gen = atomic_read(&ipvs_netns_cnt);
+ atomic_inc(&ipvs_netns_cnt);
+ net->ipvs = ipvs;
+ printk(KERN_INFO "IPVS: Creating netns size=%zu id=%d\n",
+ sizeof(struct netns_ipvs), ipvs->gen);
+ return 0;
+}
+static void __net_exit __ip_vs_cleanup(struct net *net)
+{
+ IP_VS_DBG(10, "ipvs netns %d released\n", net_ipvs(net)->gen);
+}
+
+static struct pernet_operations ipvs_core_ops = {
+ .init = __ip_vs_init,
+ .exit = __ip_vs_cleanup,
+ .id = &ip_vs_net_id,
+ .size = sizeof(struct netns_ipvs),
+};
/*
* Initialize IP Virtual Server
@@ -1791,8 +1911,11 @@ static int __init ip_vs_init(void)
{
int ret;
- ip_vs_estimator_init();
+ ret = register_pernet_subsys(&ipvs_core_ops); /* Alloc ip_vs struct */
+ if (ret < 0)
+ return ret;
+ ip_vs_estimator_init();
ret = ip_vs_control_init();
if (ret < 0) {
pr_err("can't setup control.\n");
@@ -1813,15 +1936,23 @@ static int __init ip_vs_init(void)
goto cleanup_app;
}
+ ret = ip_vs_sync_init();
+ if (ret < 0) {
+ pr_err("can't setup sync data.\n");
+ goto cleanup_conn;
+ }
+
ret = nf_register_hooks(ip_vs_ops, ARRAY_SIZE(ip_vs_ops));
if (ret < 0) {
pr_err("can't register hooks.\n");
- goto cleanup_conn;
+ goto cleanup_sync;
}
pr_info("ipvs loaded.\n");
return ret;
+cleanup_sync:
+ ip_vs_sync_cleanup();
cleanup_conn:
ip_vs_conn_cleanup();
cleanup_app:
@@ -1831,17 +1962,20 @@ static int __init ip_vs_init(void)
ip_vs_control_cleanup();
cleanup_estimator:
ip_vs_estimator_cleanup();
+ unregister_pernet_subsys(&ipvs_core_ops); /* free ip_vs struct */
return ret;
}
static void __exit ip_vs_cleanup(void)
{
nf_unregister_hooks(ip_vs_ops, ARRAY_SIZE(ip_vs_ops));
+ ip_vs_sync_cleanup();
ip_vs_conn_cleanup();
ip_vs_app_cleanup();
ip_vs_protocol_cleanup();
ip_vs_control_cleanup();
ip_vs_estimator_cleanup();
+ unregister_pernet_subsys(&ipvs_core_ops); /* free ip_vs struct */
pr_info("ipvs unloaded.\n");
}
diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c
index 22f7ad5101a..c73b0c831a2 100644
--- a/net/netfilter/ipvs/ip_vs_ctl.c
+++ b/net/netfilter/ipvs/ip_vs_ctl.c
@@ -38,6 +38,7 @@
#include <linux/mutex.h>
#include <net/net_namespace.h>
+#include <linux/nsproxy.h>
#include <net/ip.h>
#ifdef CONFIG_IP_VS_IPV6
#include <net/ipv6.h>
@@ -57,42 +58,7 @@ static DEFINE_MUTEX(__ip_vs_mutex);
/* lock for service table */
static DEFINE_RWLOCK(__ip_vs_svc_lock);
-/* lock for table with the real services */
-static DEFINE_RWLOCK(__ip_vs_rs_lock);
-
-/* lock for state and timeout tables */
-static DEFINE_SPINLOCK(ip_vs_securetcp_lock);
-
-/* lock for drop entry handling */
-static DEFINE_SPINLOCK(__ip_vs_dropentry_lock);
-
-/* lock for drop packet handling */
-static DEFINE_SPINLOCK(__ip_vs_droppacket_lock);
-
-/* 1/rate drop and drop-entry variables */
-int ip_vs_drop_rate = 0;
-int ip_vs_drop_counter = 0;
-static atomic_t ip_vs_dropentry = ATOMIC_INIT(0);
-
-/* number of virtual services */
-static int ip_vs_num_services = 0;
-
/* sysctl variables */
-static int sysctl_ip_vs_drop_entry = 0;
-static int sysctl_ip_vs_drop_packet = 0;
-static int sysctl_ip_vs_secure_tcp = 0;
-static int sysctl_ip_vs_amemthresh = 1024;
-static int sysctl_ip_vs_am_droprate = 10;
-int sysctl_ip_vs_cache_bypass = 0;
-int sysctl_ip_vs_expire_nodest_conn = 0;
-int sysctl_ip_vs_expire_quiescent_template = 0;
-int sysctl_ip_vs_sync_threshold[2] = { 3, 50 };
-int sysctl_ip_vs_nat_icmp_send = 0;
-#ifdef CONFIG_IP_VS_NFCT
-int sysctl_ip_vs_conntrack;
-#endif
-int sysctl_ip_vs_snat_reroute = 1;
-
#ifdef CONFIG_IP_VS_DEBUG
static int sysctl_ip_vs_debug_level = 0;
@@ -105,7 +71,8 @@ int ip_vs_get_debug_level(void)
#ifdef CONFIG_IP_VS_IPV6
/* Taken from rt6_fill_node() in net/ipv6/route.c, is there a better way? */
-static int __ip_vs_addr_is_local_v6(const struct in6_addr *addr)
+static int __ip_vs_addr_is_local_v6(struct net *net,
+ const struct in6_addr *addr)
{
struct rt6_info *rt;
struct flowi fl = {
@@ -114,7 +81,7 @@ static int __ip_vs_addr_is_local_v6(const struct in6_addr *addr)
.fl6_src = { .s6_addr32 = {0, 0, 0, 0} },
};
- rt = (struct rt6_info *)ip6_route_output(&init_net, NULL, &fl);
+ rt = (struct rt6_info *)ip6_route_output(net, NULL, &fl);
if (rt && rt->rt6i_dev && (rt->rt6i_dev->flags & IFF_LOOPBACK))
return 1;
@@ -125,7 +92,7 @@ static int __ip_vs_addr_is_local_v6(const struct in6_addr *addr)
* update_defense_level is called from keventd and from sysctl,
* so it needs to protect itself from softirqs
*/
-static void update_defense_level(void)
+static void update_defense_level(struct netns_ipvs *ipvs)
{
struct sysinfo i;
static int old_secure_tcp = 0;
@@ -141,73 +108,73 @@ static void update_defense_level(void)
/* si_swapinfo(&i); */
/* availmem = availmem - (i.totalswap - i.freeswap); */
- nomem = (availmem < sysctl_ip_vs_amemthresh);
+ nomem = (availmem < ipvs->sysctl_amemthresh);
local_bh_disable();
/* drop_entry */
- spin_lock(&__ip_vs_dropentry_lock);
- switch (sysctl_ip_vs_drop_entry) {
+ spin_lock(&ipvs->dropentry_lock);
+ switch (ipvs->sysctl_drop_entry) {
case 0:
- atomic_set(&ip_vs_dropentry, 0);
+ atomic_set(&ipvs->dropentry, 0);
break;
case 1:
if (nomem) {
- atomic_set(&ip_vs_dropentry, 1);
- sysctl_ip_vs_drop_entry = 2;
+ atomic_set(&ipvs->dropentry, 1);
+ ipvs->sysctl_drop_entry = 2;
} else {
- atomic_set(&ip_vs_dropentry, 0);
+ atomic_set(&ipvs->dropentry, 0);
}
break;
case 2:
if (nomem) {
- atomic_set(&ip_vs_dropentry, 1);
+ atomic_set(&ipvs->dropentry, 1);
} else {
- atomic_set(&ip_vs_dropentry, 0);
- sysctl_ip_vs_drop_entry = 1;
+ atomic_set(&ipvs->dropentry, 0);
+ ipvs->sysctl_drop_entry = 1;
};
break;
case 3:
- atomic_set(&ip_vs_dropentry, 1);
+ atomic_set(&ipvs->dropentry, 1);
break;
}
- spin_unlock(&__ip_vs_dropentry_lock);
+ spin_unlock(&ipvs->dropentry_lock);
/* drop_packet */
- spin_lock(&__ip_vs_droppacket_lock);
- switch (sysctl_ip_vs_drop_packet) {
+ spin_lock(&ipvs->droppacket_lock);
+ switch (ipvs->sysctl_drop_packet) {
case 0:
- ip_vs_drop_rate = 0;
+ ipvs->drop_rate = 0;
break;
case 1:
if (nomem) {
- ip_vs_drop_rate = ip_vs_drop_counter
- = sysctl_ip_vs_amemthresh /
- (sysctl_ip_vs_amemthresh-availmem);
- sysctl_ip_vs_drop_packet = 2;
+ ipvs->drop_rate = ipvs->drop_counter
+ = ipvs->sysctl_amemthresh /
+ (ipvs->sysctl_amemthresh-availmem);
+ ipvs->sysctl_drop_packet = 2;
} else {
- ip_vs_drop_rate = 0;
+ ipvs->drop_rate = 0;
}
break;
case 2:
if (nomem) {
- ip_vs_drop_rate = ip_vs_drop_counter
- = sysctl_ip_vs_amemthresh /
- (sysctl_ip_vs_amemthresh-availmem);
+ ipvs->drop_rate = ipvs->drop_counter
+ = ipvs->sysctl_amemthresh /
+ (ipvs->sysctl_amemthresh-availmem);
} else {
- ip_vs_drop_rate = 0;
- sysctl_ip_vs_drop_packet = 1;
+ ipvs->drop_rate = 0;
+ ipvs->sysctl_drop_packet = 1;
}
break;
case 3:
- ip_vs_drop_rate = sysctl_ip_vs_am_droprate;
+ ipvs->drop_rate = ipvs->sysctl_am_droprate;
break;
}
- spin_unlock(&__ip_vs_droppacket_lock);
+ spin_unlock(&ipvs->droppacket_lock);
/* secure_tcp */
- spin_lock(&ip_vs_securetcp_lock);
- switch (sysctl_ip_vs_secure_tcp) {
+ spin_lock(&ipvs->securetcp_lock);
+ switch (ipvs->sysctl_secure_tcp) {
case 0:
if (old_secure_tcp >= 2)
to_change = 0;
@@ -216,7 +183,7 @@ static void update_defense_level(void)
if (nomem) {
if (old_secure_tcp < 2)
to_change = 1;
- sysctl_ip_vs_secure_tcp = 2;
+ ipvs->sysctl_secure_tcp = 2;
} else {
if (old_secure_tcp >= 2)
to_change = 0;
@@ -229,7 +196,7 @@ static void update_defense_level(void)
} else {
if (old_secure_tcp >= 2)
to_change = 0;
- sysctl_ip_vs_secure_tcp = 1;
+ ipvs->sysctl_secure_tcp = 1;
}
break;
case 3:
@@ -237,10 +204,11 @@ static void update_defense_level(void)
to_change = 1;
break;
}
- old_secure_tcp = sysctl_ip_vs_secure_tcp;
+ old_secure_tcp = ipvs->sysctl_secure_tcp;
if (to_change >= 0)
- ip_vs_protocol_timeout_change(sysctl_ip_vs_secure_tcp>1);
- spin_unlock(&ip_vs_securetcp_lock);
+ ip_vs_protocol_timeout_change(ipvs,
+ ipvs->sysctl_secure_tcp > 1);
+ spin_unlock(&ipvs->securetcp_lock);
local_bh_enable();
}
@@ -250,16 +218,16 @@ static void update_defense_level(void)
* Timer for checking the defense
*/
#define DEFENSE_TIMER_PERIOD 1*HZ
-static void defense_work_handler(struct work_struct *work);
-static DECLARE_DELAYED_WORK(defense_work, defense_work_handler);
static void defense_work_handler(struct work_struct *work)
{
- update_defense_level();
- if (atomic_read(&ip_vs_dropentry))
- ip_vs_random_dropentry();
+ struct netns_ipvs *ipvs =
+ container_of(work, struct netns_ipvs, defense_work.work);
- schedule_delayed_work(&defense_work, DEFENSE_TIMER_PERIOD);
+ update_defense_level(ipvs);
+ if (atomic_read(&ipvs->dropentry))
+ ip_vs_random_dropentry(ipvs->net);
+ schedule_delayed_work(&ipvs->defense_work, DEFENSE_TIMER_PERIOD);
}
int
@@ -287,33 +255,13 @@ static struct list_head ip_vs_svc_table[IP_VS_SVC_TAB_SIZE];
/* the service table hashed by fwmark */
static struct list_head ip_vs_svc_fwm_table[IP_VS_SVC_TAB_SIZE];
-/*
- * Hash table: for real service lookups
- */
-#define IP_VS_RTAB_BITS 4
-#define IP_VS_RTAB_SIZE (1 << IP_VS_RTAB_BITS)
-#define IP_VS_RTAB_MASK (IP_VS_RTAB_SIZE - 1)
-
-static struct list_head ip_vs_rtable[IP_VS_RTAB_SIZE];
-
-/*
- * Trash for destinations
- */
-static LIST_HEAD(ip_vs_dest_trash);
-
-/*
- * FTP & NULL virtual service counters
- */
-static atomic_t ip_vs_ftpsvc_counter = ATOMIC_INIT(0);
-static atomic_t ip_vs_nullsvc_counter = ATOMIC_INIT(0);
-
/*
* Returns hash value for virtual service
*/
-static __inline__ unsigned
-ip_vs_svc_hashkey(int af, unsigned proto, const union nf_inet_addr *addr,
- __be16 port)
+static inline unsigned
+ip_vs_svc_hashkey(struct net *net, int af, unsigned proto,
+ const union nf_inet_addr *addr, __be16 port)
{
register unsigned porth = ntohs(port);
__be32 addr_fold = addr->ip;
@@ -323,6 +271,7 @@ ip_vs_svc_hashkey(int af, unsigned proto, const union nf_inet_addr *addr,
addr_fold = addr->ip6[0]^addr->ip6[1]^
addr->ip6[2]^addr->ip6[3];
#endif
+ addr_fold ^= ((size_t)net>>8);
return (proto^ntohl(addr_fold)^(porth>>IP_VS_SVC_TAB_BITS)^porth)
& IP_VS_SVC_TAB_MASK;
@@ -331,13 +280,13 @@ ip_vs_svc_hashkey(int af, unsigned proto, const union nf_inet_addr *addr,
/*
* Returns hash value of fwmark for virtual service lookup
*/
-static __inline__ unsigned ip_vs_svc_fwm_hashkey(__u32 fwmark)
+static inline unsigned ip_vs_svc_fwm_hashkey(struct net *net, __u32 fwmark)
{
- return fwmark & IP_VS_SVC_TAB_MASK;
+ return (((size_t)net>>8) ^ fwmark) & IP_VS_SVC_TAB_MASK;
}
/*
- * Hashes a service in the ip_vs_svc_table by <proto,addr,port>
+ * Hashes a service in the ip_vs_svc_table by <netns,proto,addr,port>
* or in the ip_vs_svc_fwm_table by fwmark.
* Should be called with locked tables.
*/
@@ -353,16 +302,16 @@ static int ip_vs_svc_hash(struct ip_vs_service *svc)
if (svc->fwmark == 0) {
/*
- * Hash it by <protocol,addr,port> in ip_vs_svc_table
+ * Hash it by <netns,protocol,addr,port> in ip_vs_svc_table
*/
- hash = ip_vs_svc_hashkey(svc->af, svc->protocol, &svc->addr,
- svc->port);
+ hash = ip_vs_svc_hashkey(svc->net, svc->af, svc->protocol,
+ &svc->addr, svc->port);
list_add(&svc->s_list, &ip_vs_svc_table[hash]);
} else {
/*
- * Hash it by fwmark in ip_vs_svc_fwm_table
+ * Hash it by fwmark in svc_fwm_table
*/
- hash = ip_vs_svc_fwm_hashkey(svc->fwmark);
+ hash = ip_vs_svc_fwm_hashkey(svc->net, svc->fwmark);
list_add(&svc->f_list, &ip_vs_svc_fwm_table[hash]);
}
@@ -374,7 +323,7 @@ static int ip_vs_svc_hash(struct ip_vs_service *svc)
/*
- * Unhashes a service from ip_vs_svc_table/ip_vs_svc_fwm_table.
+ * Unhashes a service from svc_table / svc_fwm_table.
* Should be called with locked tables.
*/
static int ip_vs_svc_unhash(struct ip_vs_service *svc)
@@ -386,10 +335,10 @@ static int ip_vs_svc_unhash(struct ip_vs_service *svc)
}
if (svc->fwmark == 0) {
- /* Remove it from the ip_vs_svc_table table */
+ /* Remove it from the svc_table table */
list_del(&svc->s_list);
} else {
- /* Remove it from the ip_vs_svc_fwm_table table */
+ /* Remove it from the svc_fwm_table table */
list_del(&svc->f_list);
}
@@ -400,23 +349,24 @@ static int ip_vs_svc_unhash(struct ip_vs_service *svc)
/*
- * Get service by {proto,addr,port} in the service table.
+ * Get service by {netns, proto,addr,port} in the service table.
*/
static inline struct ip_vs_service *
-__ip_vs_service_find(int af, __u16 protocol, const union nf_inet_addr *vaddr,
- __be16 vport)
+__ip_vs_service_find(struct net *net, int af, __u16 protocol,
+ const union nf_inet_addr *vaddr, __be16 vport)
{
unsigned hash;
struct ip_vs_service *svc;
/* Check for "full" addressed entries */
- hash = ip_vs_svc_hashkey(af, protocol, vaddr, vport);
+ hash = ip_vs_svc_hashkey(net, af, protocol, vaddr, vport);
list_for_each_entry(svc, &ip_vs_svc_table[hash], s_list){
if ((svc->af == af)
&& ip_vs_addr_equal(af, &svc->addr, vaddr)
&& (svc->port == vport)
- && (svc->protocol == protocol)) {
+ && (svc->protocol == protocol)
+ && net_eq(svc->net, net)) {
/* HIT */
return svc;
}
@@ -430,16 +380,17 @@ __ip_vs_service_find(int af, __u16 protocol, const union nf_inet_addr *vaddr,
* Get service by {fwmark} in the service table.
*/
static inline struct ip_vs_service *
-__ip_vs_svc_fwm_find(int af, __u32 fwmark)
+__ip_vs_svc_fwm_find(struct net *net, int af, __u32 fwmark)
{
unsigned hash;
struct ip_vs_service *svc;
/* Check for fwmark addressed entries */
- hash = ip_vs_svc_fwm_hashkey(fwmark);
+ hash = ip_vs_svc_fwm_hashkey(net, fwmark);
list_for_each_entry(svc, &ip_vs_svc_fwm_table[hash], f_list) {
- if (svc->fwmark == fwmark && svc->af == af) {
+ if (svc->fwmark == fwmark && svc->af == af
+ && net_eq(svc->net, net)) {
/* HIT */
return svc;
}
@@ -449,42 +400,44 @@ __ip_vs_svc_fwm_find(int af, __u32 fwmark)
}
struct ip_vs_service *
-ip_vs_service_get(int af, __u32 fwmark, __u16 protocol,
+ip_vs_service_get(struct net *net, int af, __u32 fwmark, __u16 protocol,
const union nf_inet_addr *vaddr, __be16 vport)
{
struct ip_vs_service *svc;
+ struct netns_ipvs *ipvs = net_ipvs(net);
read_lock(&__ip_vs_svc_lock);
/*
* Check the table hashed by fwmark first
*/
- if (fwmark && (svc = __ip_vs_svc_fwm_find(af, fwmark)))
+ svc = __ip_vs_svc_fwm_find(net, af, fwmark);
+ if (fwmark && svc)
goto out;
/*
* Check the table hashed by <protocol,addr,port>
* for "full" addressed entries
*/
- svc = __ip_vs_service_find(af, protocol, vaddr, vport);
+ svc = __ip_vs_service_find(net, af, protocol, vaddr, vport);
if (svc == NULL
&& protocol == IPPROTO_TCP
- && atomic_read(&ip_vs_ftpsvc_counter)
+ && atomic_read(&ipvs->ftpsvc_counter)
&& (vport == FTPDATA || ntohs(vport) >= PROT_SOCK)) {
/*
* Check if ftp service entry exists, the packet
* might belong to FTP data connections.
*/
- svc = __ip_vs_service_find(af, protocol, vaddr, FTPPORT);
+ svc = __ip_vs_service_find(net, af, protocol, vaddr, FTPPORT);
}
if (svc == NULL
- && atomic_read(&ip_vs_nullsvc_counter)) {
+ && atomic_read(&ipvs->nullsvc_counter)) {
/*
* Check if the catch-all port (port zero) exists
*/
- svc = __ip_vs_service_find(af, protocol, vaddr, 0);
+ svc = __ip_vs_service_find(net, af, protocol, vaddr, 0);
}
out:
@@ -519,6 +472,7 @@ __ip_vs_unbind_svc(struct ip_vs_dest *dest)
svc->fwmark,
IP_VS_DBG_ADDR(svc->af, &svc->addr),
ntohs(svc->port), atomic_read(&svc->usecnt));
+ free_percpu(svc->stats.cpustats);
kfree(svc);
}
}
@@ -545,10 +499,10 @@ static inline unsigned ip_vs_rs_hashkey(int af,
}
/*
- * Hashes ip_vs_dest in ip_vs_rtable by <proto,addr,port>.
+ * Hashes ip_vs_dest in rs_table by <proto,addr,port>.
* should be called with locked tables.
*/
-static int ip_vs_rs_hash(struct ip_vs_dest *dest)
+static int ip_vs_rs_hash(struct netns_ipvs *ipvs, struct ip_vs_dest *dest)
{
unsigned hash;
@@ -562,19 +516,19 @@ static int ip_vs_rs_hash(struct ip_vs_dest *dest)
*/
hash = ip_vs_rs_hashkey(dest->af, &dest->addr, dest->port);
- list_add(&dest->d_list, &ip_vs_rtable[hash]);
+ list_add(&dest->d_list, &ipvs->rs_table[hash]);
return 1;
}
/*
- * UNhashes ip_vs_dest from ip_vs_rtable.
+ * UNhashes ip_vs_dest from rs_table.
* should be called with locked tables.
*/
static int ip_vs_rs_unhash(struct ip_vs_dest *dest)
{
/*
- * Remove it from the ip_vs_rtable table.
+ * Remove it from the rs_table table.
*/
if (!list_empty(&dest->d_list)) {
list_del(&dest->d_list);
@@ -588,10 +542,11 @@ static int ip_vs_rs_unhash(struct ip_vs_dest *dest)
* Lookup real service by <proto,addr,port> in the real service table.
*/
struct ip_vs_dest *
-ip_vs_lookup_real_service(int af, __u16 protocol,
+ip_vs_lookup_real_service(struct net *net, int af, __u16 protocol,
const union nf_inet_addr *daddr,
__be16 dport)
{
+ struct netns_ipvs *ipvs = net_ipvs(net);
unsigned hash;
struct ip_vs_dest *dest;
@@ -601,19 +556,19 @@ ip_vs_lookup_real_service(int af, __u16 protocol,
*/
hash = ip_vs_rs_hashkey(af, daddr, dport);
- read_lock(&__ip_vs_rs_lock);
- list_for_each_entry(dest, &ip_vs_rtable[hash], d_list) {
+ read_lock(&ipvs->rs_lock);
+ list_for_each_entry(dest, &ipvs->rs_table[hash], d_list) {
if ((dest->af == af)
&& ip_vs_addr_equal(af, &dest->addr, daddr)
&& (dest->port == dport)
&& ((dest->protocol == protocol) ||
dest->vfwmark)) {
/* HIT */
- read_unlock(&__ip_vs_rs_lock);
+ read_unlock(&ipvs->rs_lock);
return dest;
}
}
- read_unlock(&__ip_vs_rs_lock);
+ read_unlock(&ipvs->rs_lock);
return NULL;
}
@@ -652,15 +607,16 @@ ip_vs_lookup_dest(struct ip_vs_service *svc, const union nf_inet_addr *daddr,
* ip_vs_lookup_real_service() looked promissing, but
* seems not working as expected.
*/
-struct ip_vs_dest *ip_vs_find_dest(int af, const union nf_inet_addr *daddr,
+struct ip_vs_dest *ip_vs_find_dest(struct net *net, int af,
+ const union nf_inet_addr *daddr,
__be16 dport,
const union nf_inet_addr *vaddr,
- __be16 vport, __u16 protocol)
+ __be16 vport, __u16 protocol, __u32 fwmark)
{
struct ip_vs_dest *dest;
struct ip_vs_service *svc;
- svc = ip_vs_service_get(af, 0, protocol, vaddr, vport);
+ svc = ip_vs_service_get(net, af, fwmark, protocol, vaddr, vport);
if (!svc)
return NULL;
dest = ip_vs_lookup_dest(svc, daddr, dport);
@@ -685,11 +641,12 @@ ip_vs_trash_get_dest(struct ip_vs_service *svc, const union nf_inet_addr *daddr,
__be16 dport)
{
struct ip_vs_dest *dest, *nxt;
+ struct netns_ipvs *ipvs = net_ipvs(svc->net);
/*
* Find the destination in trash
*/
- list_for_each_entry_safe(dest, nxt, &ip_vs_dest_trash, n_list) {
+ list_for_each_entry_safe(dest, nxt, &ipvs->dest_trash, n_list) {
IP_VS_DBG_BUF(3, "Destination %u/%s:%u still in trash, "
"dest->refcnt=%d\n",
dest->vfwmark,
@@ -720,6 +677,7 @@ ip_vs_trash_get_dest(struct ip_vs_service *svc, const union nf_inet_addr *daddr,
list_del(&dest->n_list);
ip_vs_dst_reset(dest);
__ip_vs_unbind_svc(dest);
+ free_percpu(dest->stats.cpustats);
kfree(dest);
}
}
@@ -737,14 +695,16 @@ ip_vs_trash_get_dest(struct ip_vs_service *svc, const union nf_inet_addr *daddr,
* are expired, and the refcnt of each destination in the trash must
* be 1, so we simply release them here.
*/
-static void ip_vs_trash_cleanup(void)
+static void ip_vs_trash_cleanup(struct net *net)
{
struct ip_vs_dest *dest, *nxt;
+ struct netns_ipvs *ipvs = net_ipvs(net);
- list_for_each_entry_safe(dest, nxt, &ip_vs_dest_trash, n_list) {
+ list_for_each_entry_safe(dest, nxt, &ipvs->dest_trash, n_list) {
list_del(&dest->n_list);
ip_vs_dst_reset(dest);
__ip_vs_unbind_svc(dest);
+ free_percpu(dest->stats.cpustats);
kfree(dest);
}
}
@@ -768,6 +728,7 @@ static void
__ip_vs_update_dest(struct ip_vs_service *svc, struct ip_vs_dest *dest,
struct ip_vs_dest_user_kern *udest, int add)
{
+ struct netns_ipvs *ipvs = net_ipvs(svc->net);
int conn_flags;
/* set the weight and the flags */
@@ -780,12 +741,12 @@ __ip_vs_update_dest(struct ip_vs_service *svc, struct ip_vs_dest *dest,
conn_flags |= IP_VS_CONN_F_NOOUTPUT;
} else {
/*
- * Put the real service in ip_vs_rtable if not present.
+ * Put the real service in rs_table if not present.
* For now only for NAT!
*/
- write_lock_bh(&__ip_vs_rs_lock);
- ip_vs_rs_hash(dest);
- write_unlock_bh(&__ip_vs_rs_lock);
+ write_lock_bh(&ipvs->rs_lock);
+ ip_vs_rs_hash(ipvs, dest);
+ write_unlock_bh(&ipvs->rs_lock);
}
atomic_set(&dest->conn_flags, conn_flags);
@@ -813,7 +774,7 @@ __ip_vs_update_dest(struct ip_vs_service *svc, struct ip_vs_dest *dest,
spin_unlock(&dest->dst_lock);
if (add)
- ip_vs_new_estimator(&dest->stats);
+ ip_vs_new_estimator(svc->net, &dest->stats);
write_lock_bh(&__ip_vs_svc_lock);
@@ -850,12 +811,12 @@ ip_vs_new_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest,
atype = ipv6_addr_type(&udest->addr.in6);
if ((!(atype & IPV6_ADDR_UNICAST) ||
atype & IPV6_ADDR_LINKLOCAL) &&
- !__ip_vs_addr_is_local_v6(&udest->addr.in6))
+ !__ip_vs_addr_is_local_v6(svc->net, &udest->addr.in6))
return -EINVAL;
} else
#endif
{
- atype = inet_addr_type(&init_net, udest->addr.ip);
+ atype = inet_addr_type(svc->net, udest->addr.ip);
if (atype != RTN_LOCAL && atype != RTN_UNICAST)
return -EINVAL;
}
@@ -865,6 +826,11 @@ ip_vs_new_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest,
pr_err("%s(): no memory.\n", __func__);
return -ENOMEM;
}
+ dest->stats.cpustats = alloc_percpu(struct ip_vs_cpu_stats);
+ if (!dest->stats.cpustats) {
+ pr_err("%s() alloc_percpu failed\n", __func__);
+ goto err_alloc;
+ }
dest->af = svc->af;
dest->protocol = svc->protocol;
@@ -888,6 +854,10 @@ ip_vs_new_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest,
LeaveFunction(2);
return 0;
+
+err_alloc:
+ kfree(dest);
+ return -ENOMEM;
}
@@ -1006,16 +976,18 @@ ip_vs_edit_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest)
/*
* Delete a destination (must be already unlinked from the service)
*/
-static void __ip_vs_del_dest(struct ip_vs_dest *dest)
+static void __ip_vs_del_dest(struct net *net, struct ip_vs_dest *dest)
{
- ip_vs_kill_estimator(&dest->stats);
+ struct netns_ipvs *ipvs = net_ipvs(net);
+
+ ip_vs_kill_estimator(net, &dest->stats);
/*
* Remove it from the d-linked list with the real services.
*/
- write_lock_bh(&__ip_vs_rs_lock);
+ write_lock_bh(&ipvs->rs_lock);
ip_vs_rs_unhash(dest);
- write_unlock_bh(&__ip_vs_rs_lock);
+ write_unlock_bh(&ipvs->rs_lock);
/*
* Decrease the refcnt of the dest, and free the dest
@@ -1034,6 +1006,7 @@ static void __ip_vs_del_dest(struct ip_vs_dest *dest)
and only one user context can update virtual service at a
time, so the operation here is OK */
atomic_dec(&dest->svc->refcnt);
+ free_percpu(dest->stats.cpustats);
kfree(dest);
} else {
IP_VS_DBG_BUF(3, "Moving dest %s:%u into trash, "
@@ -1041,7 +1014,7 @@ static void __ip_vs_del_dest(struct ip_vs_dest *dest)
IP_VS_DBG_ADDR(dest->af, &dest->addr),
ntohs(dest->port),
atomic_read(&dest->refcnt));
- list_add(&dest->n_list, &ip_vs_dest_trash);
+ list_add(&dest->n_list, &ipvs->dest_trash);
atomic_inc(&dest->refcnt);
}
}
@@ -1105,7 +1078,7 @@ ip_vs_del_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest)
/*
* Delete the destination
*/
- __ip_vs_del_dest(dest);
+ __ip_vs_del_dest(svc->net, dest);
LeaveFunction(2);
@@ -1117,13 +1090,14 @@ ip_vs_del_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest)
* Add a service into the service hash table
*/
static int
-ip_vs_add_service(struct ip_vs_service_user_kern *u,
+ip_vs_add_service(struct net *net, struct ip_vs_service_user_kern *u,
struct ip_vs_service **svc_p)
{
int ret = 0;
struct ip_vs_scheduler *sched = NULL;
struct ip_vs_pe *pe = NULL;
struct ip_vs_service *svc = NULL;
+ struct netns_ipvs *ipvs = net_ipvs(net);
/* increase the module use count */
ip_vs_use_count_inc();
@@ -1137,7 +1111,7 @@ ip_vs_add_service(struct ip_vs_service_user_kern *u,
}
if (u->pe_name && *u->pe_name) {
- pe = ip_vs_pe_get(u->pe_name);
+ pe = ip_vs_pe_getbyname(u->pe_name);
if (pe == NULL) {
pr_info("persistence engine module ip_vs_pe_%s "
"not found\n", u->pe_name);
@@ -1159,6 +1133,11 @@ ip_vs_add_service(struct ip_vs_service_user_kern *u,
ret = -ENOMEM;
goto out_err;
}
+ svc->stats.cpustats = alloc_percpu(struct ip_vs_cpu_stats);
+ if (!svc->stats.cpustats) {
+ pr_err("%s() alloc_percpu failed\n", __func__);
+ goto out_err;
+ }
/* I'm the first user of the service */
atomic_set(&svc->usecnt, 0);
@@ -1172,6 +1151,7 @@ ip_vs_add_service(struct ip_vs_service_user_kern *u,
svc->flags = u->flags;
svc->timeout = u->timeout * HZ;
svc->netmask = u->netmask;
+ svc->net = net;
INIT_LIST_HEAD(&svc->destinations);
rwlock_init(&svc->sched_lock);
@@ -1189,15 +1169,15 @@ ip_vs_add_service(struct ip_vs_service_user_kern *u,
/* Update the virtual service counters */
if (svc->port == FTPPORT)
- atomic_inc(&ip_vs_ftpsvc_counter);
+ atomic_inc(&ipvs->ftpsvc_counter);
else if (svc->port == 0)
- atomic_inc(&ip_vs_nullsvc_counter);
+ atomic_inc(&ipvs->nullsvc_counter);
- ip_vs_new_estimator(&svc->stats);
+ ip_vs_new_estimator(net, &svc->stats);
/* Count only IPv4 services for old get/setsockopt interface */
if (svc->af == AF_INET)
- ip_vs_num_services++;
+ ipvs->num_services++;
/* Hash the service into the service table */
write_lock_bh(&__ip_vs_svc_lock);
@@ -1207,6 +1187,7 @@ ip_vs_add_service(struct ip_vs_service_user_kern *u,
*svc_p = svc;
return 0;
+
out_err:
if (svc != NULL) {
ip_vs_unbind_scheduler(svc);
@@ -1215,6 +1196,8 @@ ip_vs_add_service(struct ip_vs_service_user_kern *u,
ip_vs_app_inc_put(svc->inc);
local_bh_enable();
}
+ if (svc->stats.cpustats)
+ free_percpu(svc->stats.cpustats);
kfree(svc);
}
ip_vs_scheduler_put(sched);
@@ -1248,7 +1231,7 @@ ip_vs_edit_service(struct ip_vs_service *svc, struct ip_vs_service_user_kern *u)
old_sched = sched;
if (u->pe_name && *u->pe_name) {
- pe = ip_vs_pe_get(u->pe_name);
+ pe = ip_vs_pe_getbyname(u->pe_name);
if (pe == NULL) {
pr_info("persistence engine module ip_vs_pe_%s "
"not found\n", u->pe_name);
@@ -1334,14 +1317,15 @@ static void __ip_vs_del_service(struct ip_vs_service *svc)
struct ip_vs_dest *dest, *nxt;
struct ip_vs_scheduler *old_sched;
struct ip_vs_pe *old_pe;
+ struct netns_ipvs *ipvs = net_ipvs(svc->net);
pr_info("%s: enter\n", __func__);
/* Count only IPv4 services for old get/setsockopt interface */
if (svc->af == AF_INET)
- ip_vs_num_services--;
+ ipvs->num_services--;
- ip_vs_kill_estimator(&svc->stats);
+ ip_vs_kill_estimator(svc->net, &svc->stats);
/* Unbind scheduler */
old_sched = svc->scheduler;
@@ -1364,16 +1348,16 @@ static void __ip_vs_del_service(struct ip_vs_service *svc)
*/
list_for_each_entry_safe(dest, nxt, &svc->destinations, n_list) {
__ip_vs_unlink_dest(svc, dest, 0);
- __ip_vs_del_dest(dest);
+ __ip_vs_del_dest(svc->net, dest);
}
/*
* Update the virtual service counters
*/
if (svc->port == FTPPORT)
- atomic_dec(&ip_vs_ftpsvc_counter);
+ atomic_dec(&ipvs->ftpsvc_counter);
else if (svc->port == 0)
- atomic_dec(&ip_vs_nullsvc_counter);
+ atomic_dec(&ipvs->nullsvc_counter);
/*
* Free the service if nobody refers to it
@@ -1383,6 +1367,7 @@ static void __ip_vs_del_service(struct ip_vs_service *svc)
svc->fwmark,
IP_VS_DBG_ADDR(svc->af, &svc->addr),
ntohs(svc->port), atomic_read(&svc->usecnt));
+ free_percpu(svc->stats.cpustats);
kfree(svc);
}
@@ -1428,17 +1413,19 @@ static int ip_vs_del_service(struct ip_vs_service *svc)
/*
* Flush all the virtual services
*/
-static int ip_vs_flush(void)
+static int ip_vs_flush(struct net *net)
{
int idx;
struct ip_vs_service *svc, *nxt;
/*
- * Flush the service table hashed by <protocol,addr,port>
+ * Flush the service table hashed by <netns,protocol,addr,port>
*/
for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
- list_for_each_entry_safe(svc, nxt, &ip_vs_svc_table[idx], s_list) {
- ip_vs_unlink_service(svc);
+ list_for_each_entry_safe(svc, nxt, &ip_vs_svc_table[idx],
+ s_list) {
+ if (net_eq(svc->net, net))
+ ip_vs_unlink_service(svc);
}
}
@@ -1448,7 +1435,8 @@ static int ip_vs_flush(void)
for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
list_for_each_entry_safe(svc, nxt,
&ip_vs_svc_fwm_table[idx], f_list) {
- ip_vs_unlink_service(svc);
+ if (net_eq(svc->net, net))
+ ip_vs_unlink_service(svc);
}
}
@@ -1472,24 +1460,26 @@ static int ip_vs_zero_service(struct ip_vs_service *svc)
return 0;
}
-static int ip_vs_zero_all(void)
+static int ip_vs_zero_all(struct net *net)
{
int idx;
struct ip_vs_service *svc;
for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) {
- ip_vs_zero_service(svc);
+ if (net_eq(svc->net, net))
+ ip_vs_zero_service(svc);
}
}
for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) {
- ip_vs_zero_service(svc);
+ if (net_eq(svc->net, net))
+ ip_vs_zero_service(svc);
}
}
- ip_vs_zero_stats(&ip_vs_stats);
+ ip_vs_zero_stats(net_ipvs(net)->tot_stats);
return 0;
}
@@ -1498,6 +1488,7 @@ static int
proc_do_defense_mode(ctl_table *table, int write,
void __user *buffer, size_t *lenp, loff_t *ppos)
{
+ struct net *net = current->nsproxy->net_ns;
int *valp = table->data;
int val = *valp;
int rc;
@@ -1508,7 +1499,7 @@ proc_do_defense_mode(ctl_table *table, int write,
/* Restore the correct value */
*valp = val;
} else {
- update_defense_level();
+ update_defense_level(net_ipvs(net));
}
}
return rc;
@@ -1534,45 +1525,54 @@ proc_do_sync_threshold(ctl_table *table, int write,
return rc;
}
+static int
+proc_do_sync_mode(ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ int *valp = table->data;
+ int val = *valp;
+ int rc;
+
+ rc = proc_dointvec(table, write, buffer, lenp, ppos);
+ if (write && (*valp != val)) {
+ if ((*valp < 0) || (*valp > 1)) {
+ /* Restore the correct value */
+ *valp = val;
+ } else {
+ struct net *net = current->nsproxy->net_ns;
+ ip_vs_sync_switch_mode(net, val);
+ }
+ }
+ return rc;
+}
/*
* IPVS sysctl table (under the /proc/sys/net/ipv4/vs/)
+ * Do not change order or insert new entries without
+ * align with netns init in __ip_vs_control_init()
*/
static struct ctl_table vs_vars[] = {
{
.procname = "amemthresh",
- .data = &sysctl_ip_vs_amemthresh,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec,
},
-#ifdef CONFIG_IP_VS_DEBUG
- {
- .procname = "debug_level",
- .data = &sysctl_ip_vs_debug_level,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec,
- },
-#endif
{
.procname = "am_droprate",
- .data = &sysctl_ip_vs_am_droprate,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec,
},
{
.procname = "drop_entry",
- .data = &sysctl_ip_vs_drop_entry,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_do_defense_mode,
},
{
.procname = "drop_packet",
- .data = &sysctl_ip_vs_drop_packet,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_do_defense_mode,
@@ -1580,7 +1580,6 @@ static struct ctl_table vs_vars[] = {
#ifdef CONFIG_IP_VS_NFCT
{
.procname = "conntrack",
- .data = &sysctl_ip_vs_conntrack,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = &proc_dointvec,
@@ -1588,18 +1587,62 @@ static struct ctl_table vs_vars[] = {
#endif
{
.procname = "secure_tcp",
- .data = &sysctl_ip_vs_secure_tcp,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_do_defense_mode,
},
{
.procname = "snat_reroute",
- .data = &sysctl_ip_vs_snat_reroute,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = &proc_dointvec,
},
+ {
+ .procname = "sync_version",
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_do_sync_mode,
+ },
+ {
+ .procname = "cache_bypass",
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "expire_nodest_conn",
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "expire_quiescent_template",
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "sync_threshold",
+ .maxlen =
+ sizeof(((struct netns_ipvs *)0)->sysctl_sync_threshold),
+ .mode = 0644,
+ .proc_handler = proc_do_sync_threshold,
+ },
+ {
+ .procname = "nat_icmp_send",
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+#ifdef CONFIG_IP_VS_DEBUG
+ {
+ .procname = "debug_level",
+ .data = &sysctl_ip_vs_debug_level,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+#endif
#if 0
{
.procname = "timeout_established",
@@ -1686,41 +1729,6 @@ static struct ctl_table vs_vars[] = {
.proc_handler = proc_dointvec_jiffies,
},
#endif
- {
- .procname = "cache_bypass",
- .data = &sysctl_ip_vs_cache_bypass,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec,
- },
- {
- .procname = "expire_nodest_conn",
- .data = &sysctl_ip_vs_expire_nodest_conn,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec,
- },
- {
- .procname = "expire_quiescent_template",
- .data = &sysctl_ip_vs_expire_quiescent_template,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec,
- },
- {
- .procname = "sync_threshold",
- .data = &sysctl_ip_vs_sync_threshold,
- .maxlen = sizeof(sysctl_ip_vs_sync_threshold),
- .mode = 0644,
- .proc_handler = proc_do_sync_threshold,
- },
- {
- .procname = "nat_icmp_send",
- .data = &sysctl_ip_vs_nat_icmp_send,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec,
- },
{ }
};
@@ -1732,11 +1740,10 @@ const struct ctl_path net_vs_ctl_path[] = {
};
EXPORT_SYMBOL_GPL(net_vs_ctl_path);
-static struct ctl_table_header * sysctl_header;
-
#ifdef CONFIG_PROC_FS
struct ip_vs_iter {
+ struct seq_net_private p; /* Do not move this, netns depends upon it*/
struct list_head *table;
int bucket;
};
@@ -1763,6 +1770,7 @@ static inline const char *ip_vs_fwd_name(unsigned flags)
/* Get the Nth entry in the two lists */
static struct ip_vs_service *ip_vs_info_array(struct seq_file *seq, loff_t pos)
{
+ struct net *net = seq_file_net(seq);
struct ip_vs_iter *iter = seq->private;
int idx;
struct ip_vs_service *svc;
@@ -1770,7 +1778,7 @@ static struct ip_vs_service *ip_vs_info_array(struct seq_file *seq, loff_t pos)
/* look in hash by protocol */
for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) {
- if (pos-- == 0){
+ if (net_eq(svc->net, net) && pos-- == 0) {
iter->table = ip_vs_svc_table;
iter->bucket = idx;
return svc;
@@ -1781,7 +1789,7 @@ static struct ip_vs_service *ip_vs_info_array(struct seq_file *seq, loff_t pos)
/* keep looking in fwmark */
for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) {
- if (pos-- == 0) {
+ if (net_eq(svc->net, net) && pos-- == 0) {
iter->table = ip_vs_svc_fwm_table;
iter->bucket = idx;
return svc;
@@ -1935,7 +1943,7 @@ static const struct seq_operations ip_vs_info_seq_ops = {
static int ip_vs_info_open(struct inode *inode, struct file *file)
{
- return seq_open_private(file, &ip_vs_info_seq_ops,
+ return seq_open_net(inode, file, &ip_vs_info_seq_ops,
sizeof(struct ip_vs_iter));
}
@@ -1949,13 +1957,11 @@ static const struct file_operations ip_vs_info_fops = {
#endif
-struct ip_vs_stats ip_vs_stats = {
- .lock = __SPIN_LOCK_UNLOCKED(ip_vs_stats.lock),
-};
-
#ifdef CONFIG_PROC_FS
static int ip_vs_stats_show(struct seq_file *seq, void *v)
{
+ struct net *net = seq_file_single_net(seq);
+ struct ip_vs_stats *tot_stats = net_ipvs(net)->tot_stats;
/* 01234567 01234567 01234567 0123456701234567 0123456701234567 */
seq_puts(seq,
@@ -1963,29 +1969,29 @@ static int ip_vs_stats_show(struct seq_file *seq, void *v)
seq_printf(seq,
" Conns Packets Packets Bytes Bytes\n");
- spin_lock_bh(&ip_vs_stats.lock);
- seq_printf(seq, "%8X %8X %8X %16LX %16LX\n\n", ip_vs_stats.ustats.conns,
- ip_vs_stats.ustats.inpkts, ip_vs_stats.ustats.outpkts,
- (unsigned long long) ip_vs_stats.ustats.inbytes,
- (unsigned long long) ip_vs_stats.ustats.outbytes);
+ spin_lock_bh(&tot_stats->lock);
+ seq_printf(seq, "%8X %8X %8X %16LX %16LX\n\n", tot_stats->ustats.conns,
+ tot_stats->ustats.inpkts, tot_stats->ustats.outpkts,
+ (unsigned long long) tot_stats->ustats.inbytes,
+ (unsigned long long) tot_stats->ustats.outbytes);
/* 01234567 01234567 01234567 0123456701234567 0123456701234567 */
seq_puts(seq,
" Conns/s Pkts/s Pkts/s Bytes/s Bytes/s\n");
seq_printf(seq,"%8X %8X %8X %16X %16X\n",
- ip_vs_stats.ustats.cps,
- ip_vs_stats.ustats.inpps,
- ip_vs_stats.ustats.outpps,
- ip_vs_stats.ustats.inbps,
- ip_vs_stats.ustats.outbps);
- spin_unlock_bh(&ip_vs_stats.lock);
+ tot_stats->ustats.cps,
+ tot_stats->ustats.inpps,
+ tot_stats->ustats.outpps,
+ tot_stats->ustats.inbps,
+ tot_stats->ustats.outbps);
+ spin_unlock_bh(&tot_stats->lock);
return 0;
}
static int ip_vs_stats_seq_open(struct inode *inode, struct file *file)
{
- return single_open(file, ip_vs_stats_show, NULL);
+ return single_open_net(inode, file, ip_vs_stats_show);
}
static const struct file_operations ip_vs_stats_fops = {
@@ -1996,13 +2002,70 @@ static const struct file_operations ip_vs_stats_fops = {
.release = single_release,
};
+static int ip_vs_stats_percpu_show(struct seq_file *seq, void *v)
+{
+ struct net *net = seq_file_single_net(seq);
+ struct ip_vs_stats *tot_stats = net_ipvs(net)->tot_stats;
+ int i;
+
+/* 01234567 01234567 01234567 0123456701234567 0123456701234567 */
+ seq_puts(seq,
+ " Total Incoming Outgoing Incoming Outgoing\n");
+ seq_printf(seq,
+ "CPU Conns Packets Packets Bytes Bytes\n");
+
+ for_each_possible_cpu(i) {
+ struct ip_vs_cpu_stats *u = per_cpu_ptr(net->ipvs->cpustats, i);
+ seq_printf(seq, "%3X %8X %8X %8X %16LX %16LX\n",
+ i, u->ustats.conns, u->ustats.inpkts,
+ u->ustats.outpkts, (__u64)u->ustats.inbytes,
+ (__u64)u->ustats.outbytes);
+ }
+
+ spin_lock_bh(&tot_stats->lock);
+ seq_printf(seq, " ~ %8X %8X %8X %16LX %16LX\n\n",
+ tot_stats->ustats.conns, tot_stats->ustats.inpkts,
+ tot_stats->ustats.outpkts,
+ (unsigned long long) tot_stats->ustats.inbytes,
+ (unsigned long long) tot_stats->ustats.outbytes);
+
+/* 01234567 01234567 01234567 0123456701234567 0123456701234567 */
+ seq_puts(seq,
+ " Conns/s Pkts/s Pkts/s Bytes/s Bytes/s\n");
+ seq_printf(seq, " %8X %8X %8X %16X %16X\n",
+ tot_stats->ustats.cps,
+ tot_stats->ustats.inpps,
+ tot_stats->ustats.outpps,
+ tot_stats->ustats.inbps,
+ tot_stats->ustats.outbps);
+ spin_unlock_bh(&tot_stats->lock);
+
+ return 0;
+}
+
+static int ip_vs_stats_percpu_seq_open(struct inode *inode, struct file *file)
+{
+ return single_open_net(inode, file, ip_vs_stats_percpu_show);
+}
+
+static const struct file_operations ip_vs_stats_percpu_fops = {
+ .owner = THIS_MODULE,
+ .open = ip_vs_stats_percpu_seq_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
#endif
/*
* Set timeout values for tcp tcpfin udp in the timeout_table.
*/
-static int ip_vs_set_timeout(struct ip_vs_timeout_user *u)
+static int ip_vs_set_timeout(struct net *net, struct ip_vs_timeout_user *u)
{
+#if defined(CONFIG_IP_VS_PROTO_TCP) || defined(CONFIG_IP_VS_PROTO_UDP)
+ struct ip_vs_proto_data *pd;
+#endif
+
IP_VS_DBG(2, "Setting timeout tcp:%d tcpfin:%d udp:%d\n",
u->tcp_timeout,
u->tcp_fin_timeout,
@@ -2010,19 +2073,22 @@ static int ip_vs_set_timeout(struct ip_vs_timeout_user *u)
#ifdef CONFIG_IP_VS_PROTO_TCP
if (u->tcp_timeout) {
- ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_ESTABLISHED]
+ pd = ip_vs_proto_data_get(net, IPPROTO_TCP);
+ pd->timeout_table[IP_VS_TCP_S_ESTABLISHED]
= u->tcp_timeout * HZ;
}
if (u->tcp_fin_timeout) {
- ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_FIN_WAIT]
+ pd = ip_vs_proto_data_get(net, IPPROTO_TCP);
+ pd->timeout_table[IP_VS_TCP_S_FIN_WAIT]
= u->tcp_fin_timeout * HZ;
}
#endif
#ifdef CONFIG_IP_VS_PROTO_UDP
if (u->udp_timeout) {
- ip_vs_protocol_udp.timeout_table[IP_VS_UDP_S_NORMAL]
+ pd = ip_vs_proto_data_get(net, IPPROTO_UDP);
+ pd->timeout_table[IP_VS_UDP_S_NORMAL]
= u->udp_timeout * HZ;
}
#endif
@@ -2087,6 +2153,7 @@ static void ip_vs_copy_udest_compat(struct ip_vs_dest_user_kern *udest,
static int
do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len)
{
+ struct net *net = sock_net(sk);
int ret;
unsigned char arg[MAX_ARG_LEN];
struct ip_vs_service_user *usvc_compat;
@@ -2121,19 +2188,20 @@ do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len)
if (cmd == IP_VS_SO_SET_FLUSH) {
/* Flush the virtual service */
- ret = ip_vs_flush();
+ ret = ip_vs_flush(net);
goto out_unlock;
} else if (cmd == IP_VS_SO_SET_TIMEOUT) {
/* Set timeout values for (tcp tcpfin udp) */
- ret = ip_vs_set_timeout((struct ip_vs_timeout_user *)arg);
+ ret = ip_vs_set_timeout(net, (struct ip_vs_timeout_user *)arg);
goto out_unlock;
} else if (cmd == IP_VS_SO_SET_STARTDAEMON) {
struct ip_vs_daemon_user *dm = (struct ip_vs_daemon_user *)arg;
- ret = start_sync_thread(dm->state, dm->mcast_ifn, dm->syncid);
+ ret = start_sync_thread(net, dm->state, dm->mcast_ifn,
+ dm->syncid);
goto out_unlock;
} else if (cmd == IP_VS_SO_SET_STOPDAEMON) {
struct ip_vs_daemon_user *dm = (struct ip_vs_daemon_user *)arg;
- ret = stop_sync_thread(dm->state);
+ ret = stop_sync_thread(net, dm->state);
goto out_unlock;
}
@@ -2148,7 +2216,7 @@ do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len)
if (cmd == IP_VS_SO_SET_ZERO) {
/* if no service address is set, zero counters in all */
if (!usvc.fwmark && !usvc.addr.ip && !usvc.port) {
- ret = ip_vs_zero_all();
+ ret = ip_vs_zero_all(net);
goto out_unlock;
}
}
@@ -2165,10 +2233,10 @@ do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len)
/* Lookup the exact service by <protocol, addr, port> or fwmark */
if (usvc.fwmark == 0)
- svc = __ip_vs_service_find(usvc.af, usvc.protocol,
+ svc = __ip_vs_service_find(net, usvc.af, usvc.protocol,
&usvc.addr, usvc.port);
else
- svc = __ip_vs_svc_fwm_find(usvc.af, usvc.fwmark);
+ svc = __ip_vs_svc_fwm_find(net, usvc.af, usvc.fwmark);
if (cmd != IP_VS_SO_SET_ADD
&& (svc == NULL || svc->protocol != usvc.protocol)) {
@@ -2181,7 +2249,7 @@ do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len)
if (svc != NULL)
ret = -EEXIST;
else
- ret = ip_vs_add_service(&usvc, &svc);
+ ret = ip_vs_add_service(net, &usvc, &svc);
break;
case IP_VS_SO_SET_EDIT:
ret = ip_vs_edit_service(svc, &usvc);
@@ -2241,7 +2309,8 @@ ip_vs_copy_service(struct ip_vs_service_entry *dst, struct ip_vs_service *src)
}
static inline int
-__ip_vs_get_service_entries(const struct ip_vs_get_services *get,
+__ip_vs_get_service_entries(struct net *net,
+ const struct ip_vs_get_services *get,
struct ip_vs_get_services __user *uptr)
{
int idx, count=0;
@@ -2252,7 +2321,7 @@ __ip_vs_get_service_entries(const struct ip_vs_get_services *get,
for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) {
/* Only expose IPv4 entries to old interface */
- if (svc->af != AF_INET)
+ if (svc->af != AF_INET || !net_eq(svc->net, net))
continue;
if (count >= get->num_services)
@@ -2271,7 +2340,7 @@ __ip_vs_get_service_entries(const struct ip_vs_get_services *get,
for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) {
/* Only expose IPv4 entries to old interface */
- if (svc->af != AF_INET)
+ if (svc->af != AF_INET || !net_eq(svc->net, net))
continue;
if (count >= get->num_services)
@@ -2291,7 +2360,7 @@ __ip_vs_get_service_entries(const struct ip_vs_get_services *get,
}
static inline int
-__ip_vs_get_dest_entries(const struct ip_vs_get_dests *get,
+__ip_vs_get_dest_entries(struct net *net, const struct ip_vs_get_dests *get,
struct ip_vs_get_dests __user *uptr)
{
struct ip_vs_service *svc;
@@ -2299,9 +2368,9 @@ __ip_vs_get_dest_entries(const struct ip_vs_get_dests *get,
int ret = 0;
if (get->fwmark)
- svc = __ip_vs_svc_fwm_find(AF_INET, get->fwmark);
+ svc = __ip_vs_svc_fwm_find(net, AF_INET, get->fwmark);
else
- svc = __ip_vs_service_find(AF_INET, get->protocol, &addr,
+ svc = __ip_vs_service_find(net, AF_INET, get->protocol, &addr,
get->port);
if (svc) {
@@ -2336,17 +2405,21 @@ __ip_vs_get_dest_entries(const struct ip_vs_get_dests *get,
}
static inline void
-__ip_vs_get_timeouts(struct ip_vs_timeout_user *u)
+__ip_vs_get_timeouts(struct net *net, struct ip_vs_timeout_user *u)
{
+#if defined(CONFIG_IP_VS_PROTO_TCP) || defined(CONFIG_IP_VS_PROTO_UDP)
+ struct ip_vs_proto_data *pd;
+#endif
+
#ifdef CONFIG_IP_VS_PROTO_TCP
- u->tcp_timeout =
- ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_ESTABLISHED] / HZ;
- u->tcp_fin_timeout =
- ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_FIN_WAIT] / HZ;
+ pd = ip_vs_proto_data_get(net, IPPROTO_TCP);
+ u->tcp_timeout = pd->timeout_table[IP_VS_TCP_S_ESTABLISHED] / HZ;
+ u->tcp_fin_timeout = pd->timeout_table[IP_VS_TCP_S_FIN_WAIT] / HZ;
#endif
#ifdef CONFIG_IP_VS_PROTO_UDP
+ pd = ip_vs_proto_data_get(net, IPPROTO_UDP);
u->udp_timeout =
- ip_vs_protocol_udp.timeout_table[IP_VS_UDP_S_NORMAL] / HZ;
+ pd->timeout_table[IP_VS_UDP_S_NORMAL] / HZ;
#endif
}
@@ -2375,7 +2448,10 @@ do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
unsigned char arg[128];
int ret = 0;
unsigned int copylen;
+ struct net *net = sock_net(sk);
+ struct netns_ipvs *ipvs = net_ipvs(net);
+ BUG_ON(!net);
if (!capable(CAP_NET_ADMIN))
return -EPERM;
@@ -2418,7 +2494,7 @@ do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
struct ip_vs_getinfo info;
info.version = IP_VS_VERSION_CODE;
info.size = ip_vs_conn_tab_size;
- info.num_services = ip_vs_num_services;
+ info.num_services = ipvs->num_services;
if (copy_to_user(user, &info, sizeof(info)) != 0)
ret = -EFAULT;
}
@@ -2437,7 +2513,7 @@ do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
ret = -EINVAL;
goto out;
}
- ret = __ip_vs_get_service_entries(get, user);
+ ret = __ip_vs_get_service_entries(net, get, user);
}
break;
@@ -2450,10 +2526,11 @@ do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
entry = (struct ip_vs_service_entry *)arg;
addr.ip = entry->addr;
if (entry->fwmark)
- svc = __ip_vs_svc_fwm_find(AF_INET, entry->fwmark);
+ svc = __ip_vs_svc_fwm_find(net, AF_INET, entry->fwmark);
else
- svc = __ip_vs_service_find(AF_INET, entry->protocol,
- &addr, entry->port);
+ svc = __ip_vs_service_find(net, AF_INET,
+ entry->protocol, &addr,
+ entry->port);
if (svc) {
ip_vs_copy_service(entry, svc);
if (copy_to_user(user, entry, sizeof(*entry)) != 0)
@@ -2476,7 +2553,7 @@ do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
ret = -EINVAL;
goto out;
}
- ret = __ip_vs_get_dest_entries(get, user);
+ ret = __ip_vs_get_dest_entries(net, get, user);
}
break;
@@ -2484,7 +2561,7 @@ do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
{
struct ip_vs_timeout_user t;
- __ip_vs_get_timeouts(&t);
+ __ip_vs_get_timeouts(net, &t);
if (copy_to_user(user, &t, sizeof(t)) != 0)
ret = -EFAULT;
}
@@ -2495,15 +2572,17 @@ do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
struct ip_vs_daemon_user d[2];
memset(&d, 0, sizeof(d));
- if (ip_vs_sync_state & IP_VS_STATE_MASTER) {
+ if (ipvs->sync_state & IP_VS_STATE_MASTER) {
d[0].state = IP_VS_STATE_MASTER;
- strlcpy(d[0].mcast_ifn, ip_vs_master_mcast_ifn, sizeof(d[0].mcast_ifn));
- d[0].syncid = ip_vs_master_syncid;
+ strlcpy(d[0].mcast_ifn, ipvs->master_mcast_ifn,
+ sizeof(d[0].mcast_ifn));
+ d[0].syncid = ipvs->master_syncid;
}
- if (ip_vs_sync_state & IP_VS_STATE_BACKUP) {
+ if (ipvs->sync_state & IP_VS_STATE_BACKUP) {
d[1].state = IP_VS_STATE_BACKUP;
- strlcpy(d[1].mcast_ifn, ip_vs_backup_mcast_ifn, sizeof(d[1].mcast_ifn));
- d[1].syncid = ip_vs_backup_syncid;
+ strlcpy(d[1].mcast_ifn, ipvs->backup_mcast_ifn,
+ sizeof(d[1].mcast_ifn));
+ d[1].syncid = ipvs->backup_syncid;
}
if (copy_to_user(user, &d, sizeof(d)) != 0)
ret = -EFAULT;
@@ -2542,6 +2621,7 @@ static struct genl_family ip_vs_genl_family = {
.name = IPVS_GENL_NAME,
.version = IPVS_GENL_VERSION,
.maxattr = IPVS_CMD_MAX,
+ .netnsok = true, /* Make ipvsadm to work on netns */
};
/* Policy used for first-level command attributes */
@@ -2696,11 +2776,12 @@ static int ip_vs_genl_dump_services(struct sk_buff *skb,
int idx = 0, i;
int start = cb->args[0];
struct ip_vs_service *svc;
+ struct net *net = skb_sknet(skb);
mutex_lock(&__ip_vs_mutex);
for (i = 0; i < IP_VS_SVC_TAB_SIZE; i++) {
list_for_each_entry(svc, &ip_vs_svc_table[i], s_list) {
- if (++idx <= start)
+ if (++idx <= start || !net_eq(svc->net, net))
continue;
if (ip_vs_genl_dump_service(skb, svc, cb) < 0) {
idx--;
@@ -2711,7 +2792,7 @@ static int ip_vs_genl_dump_services(struct sk_buff *skb,
for (i = 0; i < IP_VS_SVC_TAB_SIZE; i++) {
list_for_each_entry(svc, &ip_vs_svc_fwm_table[i], f_list) {
- if (++idx <= start)
+ if (++idx <= start || !net_eq(svc->net, net))
continue;
if (ip_vs_genl_dump_service(skb, svc, cb) < 0) {
idx--;
@@ -2727,7 +2808,8 @@ nla_put_failure:
return skb->len;
}
-static int ip_vs_genl_parse_service(struct ip_vs_service_user_kern *usvc,
+static int ip_vs_genl_parse_service(struct net *net,
+ struct ip_vs_service_user_kern *usvc,
struct nlattr *nla, int full_entry,
struct ip_vs_service **ret_svc)
{
@@ -2770,9 +2852,9 @@ static int ip_vs_genl_parse_service(struct ip_vs_service_user_kern *usvc,
}
if (usvc->fwmark)
- svc = __ip_vs_svc_fwm_find(usvc->af, usvc->fwmark);
+ svc = __ip_vs_svc_fwm_find(net, usvc->af, usvc->fwmark);
else
- svc = __ip_vs_service_find(usvc->af, usvc->protocol,
+ svc = __ip_vs_service_find(net, usvc->af, usvc->protocol,
&usvc->addr, usvc->port);
*ret_svc = svc;
@@ -2809,13 +2891,14 @@ static int ip_vs_genl_parse_service(struct ip_vs_service_user_kern *usvc,
return 0;
}
-static struct ip_vs_service *ip_vs_genl_find_service(struct nlattr *nla)
+static struct ip_vs_service *ip_vs_genl_find_service(struct net *net,
+ struct nlattr *nla)
{
struct ip_vs_service_user_kern usvc;
struct ip_vs_service *svc;
int ret;
- ret = ip_vs_genl_parse_service(&usvc, nla, 0, &svc);
+ ret = ip_vs_genl_parse_service(net, &usvc, nla, 0, &svc);
return ret ? ERR_PTR(ret) : svc;
}
@@ -2883,6 +2966,7 @@ static int ip_vs_genl_dump_dests(struct sk_buff *skb,
struct ip_vs_service *svc;
struct ip_vs_dest *dest;
struct nlattr *attrs[IPVS_CMD_ATTR_MAX + 1];
+ struct net *net = skb_sknet(skb);
mutex_lock(&__ip_vs_mutex);
@@ -2891,7 +2975,8 @@ static int ip_vs_genl_dump_dests(struct sk_buff *skb,
IPVS_CMD_ATTR_MAX, ip_vs_cmd_policy))
goto out_err;
- svc = ip_vs_genl_find_service(attrs[IPVS_CMD_ATTR_SERVICE]);
+
+ svc = ip_vs_genl_find_service(net, attrs[IPVS_CMD_ATTR_SERVICE]);
if (IS_ERR(svc) || svc == NULL)
goto out_err;
@@ -3005,20 +3090,23 @@ nla_put_failure:
static int ip_vs_genl_dump_daemons(struct sk_buff *skb,
struct netlink_callback *cb)
{
+ struct net *net = skb_net(skb);
+ struct netns_ipvs *ipvs = net_ipvs(net);
+
mutex_lock(&__ip_vs_mutex);
- if ((ip_vs_sync_state & IP_VS_STATE_MASTER) && !cb->args[0]) {
+ if ((ipvs->sync_state & IP_VS_STATE_MASTER) && !cb->args[0]) {
if (ip_vs_genl_dump_daemon(skb, IP_VS_STATE_MASTER,
- ip_vs_master_mcast_ifn,
- ip_vs_master_syncid, cb) < 0)
+ ipvs->master_mcast_ifn,
+ ipvs->master_syncid, cb) < 0)
goto nla_put_failure;
cb->args[0] = 1;
}
- if ((ip_vs_sync_state & IP_VS_STATE_BACKUP) && !cb->args[1]) {
+ if ((ipvs->sync_state & IP_VS_STATE_BACKUP) && !cb->args[1]) {
if (ip_vs_genl_dump_daemon(skb, IP_VS_STATE_BACKUP,
- ip_vs_backup_mcast_ifn,
- ip_vs_backup_syncid, cb) < 0)
+ ipvs->backup_mcast_ifn,
+ ipvs->backup_syncid, cb) < 0)
goto nla_put_failure;
cb->args[1] = 1;
@@ -3030,31 +3118,33 @@ nla_put_failure:
return skb->len;
}
-static int ip_vs_genl_new_daemon(struct nlattr **attrs)
+static int ip_vs_genl_new_daemon(struct net *net, struct nlattr **attrs)
{
if (!(attrs[IPVS_DAEMON_ATTR_STATE] &&
attrs[IPVS_DAEMON_ATTR_MCAST_IFN] &&
attrs[IPVS_DAEMON_ATTR_SYNC_ID]))
return -EINVAL;
- return start_sync_thread(nla_get_u32(attrs[IPVS_DAEMON_ATTR_STATE]),
+ return start_sync_thread(net,
+ nla_get_u32(attrs[IPVS_DAEMON_ATTR_STATE]),
nla_data(attrs[IPVS_DAEMON_ATTR_MCAST_IFN]),
nla_get_u32(attrs[IPVS_DAEMON_ATTR_SYNC_ID]));
}
-static int ip_vs_genl_del_daemon(struct nlattr **attrs)
+static int ip_vs_genl_del_daemon(struct net *net, struct nlattr **attrs)
{
if (!attrs[IPVS_DAEMON_ATTR_STATE])
return -EINVAL;
- return stop_sync_thread(nla_get_u32(attrs[IPVS_DAEMON_ATTR_STATE]));
+ return stop_sync_thread(net,
+ nla_get_u32(attrs[IPVS_DAEMON_ATTR_STATE]));
}
-static int ip_vs_genl_set_config(struct nlattr **attrs)
+static int ip_vs_genl_set_config(struct net *net, struct nlattr **attrs)
{
struct ip_vs_timeout_user t;
- __ip_vs_get_timeouts(&t);
+ __ip_vs_get_timeouts(net, &t);
if (attrs[IPVS_CMD_ATTR_TIMEOUT_TCP])
t.tcp_timeout = nla_get_u32(attrs[IPVS_CMD_ATTR_TIMEOUT_TCP]);
@@ -3066,7 +3156,7 @@ static int ip_vs_genl_set_config(struct nlattr **attrs)
if (attrs[IPVS_CMD_ATTR_TIMEOUT_UDP])
t.udp_timeout = nla_get_u32(attrs[IPVS_CMD_ATTR_TIMEOUT_UDP]);
- return ip_vs_set_timeout(&t);
+ return ip_vs_set_timeout(net, &t);
}
static int ip_vs_genl_set_cmd(struct sk_buff *skb, struct genl_info *info)
@@ -3076,16 +3166,20 @@ static int ip_vs_genl_set_cmd(struct sk_buff *skb, struct genl_info *info)
struct ip_vs_dest_user_kern udest;
int ret = 0, cmd;
int need_full_svc = 0, need_full_dest = 0;
+ struct net *net;
+ struct netns_ipvs *ipvs;
+ net = skb_sknet(skb);
+ ipvs = net_ipvs(net);
cmd = info->genlhdr->cmd;
mutex_lock(&__ip_vs_mutex);
if (cmd == IPVS_CMD_FLUSH) {
- ret = ip_vs_flush();
+ ret = ip_vs_flush(net);
goto out;
} else if (cmd == IPVS_CMD_SET_CONFIG) {
- ret = ip_vs_genl_set_config(info->attrs);
+ ret = ip_vs_genl_set_config(net, info->attrs);
goto out;
} else if (cmd == IPVS_CMD_NEW_DAEMON ||
cmd == IPVS_CMD_DEL_DAEMON) {
@@ -3101,13 +3195,13 @@ static int ip_vs_genl_set_cmd(struct sk_buff *skb, struct genl_info *info)
}
if (cmd == IPVS_CMD_NEW_DAEMON)
- ret = ip_vs_genl_new_daemon(daemon_attrs);
+ ret = ip_vs_genl_new_daemon(net, daemon_attrs);
else
- ret = ip_vs_genl_del_daemon(daemon_attrs);
+ ret = ip_vs_genl_del_daemon(net, daemon_attrs);
goto out;
} else if (cmd == IPVS_CMD_ZERO &&
!info->attrs[IPVS_CMD_ATTR_SERVICE]) {
- ret = ip_vs_zero_all();
+ ret = ip_vs_zero_all(net);
goto out;
}
@@ -3117,7 +3211,7 @@ static int ip_vs_genl_set_cmd(struct sk_buff *skb, struct genl_info *info)
if (cmd == IPVS_CMD_NEW_SERVICE || cmd == IPVS_CMD_SET_SERVICE)
need_full_svc = 1;
- ret = ip_vs_genl_parse_service(&usvc,
+ ret = ip_vs_genl_parse_service(net, &usvc,
info->attrs[IPVS_CMD_ATTR_SERVICE],
need_full_svc, &svc);
if (ret)
@@ -3147,7 +3241,7 @@ static int ip_vs_genl_set_cmd(struct sk_buff *skb, struct genl_info *info)
switch (cmd) {
case IPVS_CMD_NEW_SERVICE:
if (svc == NULL)
- ret = ip_vs_add_service(&usvc, &svc);
+ ret = ip_vs_add_service(net, &usvc, &svc);
else
ret = -EEXIST;
break;
@@ -3185,7 +3279,11 @@ static int ip_vs_genl_get_cmd(struct sk_buff *skb, struct genl_info *info)
struct sk_buff *msg;
void *reply;
int ret, cmd, reply_cmd;
+ struct net *net;
+ struct netns_ipvs *ipvs;
+ net = skb_sknet(skb);
+ ipvs = net_ipvs(net);
cmd = info->genlhdr->cmd;
if (cmd == IPVS_CMD_GET_SERVICE)
@@ -3214,7 +3312,8 @@ static int ip_vs_genl_get_cmd(struct sk_buff *skb, struct genl_info *info)
{
struct ip_vs_service *svc;
- svc = ip_vs_genl_find_service(info->attrs[IPVS_CMD_ATTR_SERVICE]);
+ svc = ip_vs_genl_find_service(net,
+ info->attrs[IPVS_CMD_ATTR_SERVICE]);
if (IS_ERR(svc)) {
ret = PTR_ERR(svc);
goto out_err;
@@ -3234,7 +3333,7 @@ static int ip_vs_genl_get_cmd(struct sk_buff *skb, struct genl_info *info)
{
struct ip_vs_timeout_user t;
- __ip_vs_get_timeouts(&t);
+ __ip_vs_get_timeouts(net, &t);
#ifdef CONFIG_IP_VS_PROTO_TCP
NLA_PUT_U32(msg, IPVS_CMD_ATTR_TIMEOUT_TCP, t.tcp_timeout);
NLA_PUT_U32(msg, IPVS_CMD_ATTR_TIMEOUT_TCP_FIN,
@@ -3380,62 +3479,173 @@ static void ip_vs_genl_unregister(void)
/* End of Generic Netlink interface definitions */
+/*
+ * per netns intit/exit func.
+ */
+int __net_init __ip_vs_control_init(struct net *net)
+{
+ int idx;
+ struct netns_ipvs *ipvs = net_ipvs(net);
+ struct ctl_table *tbl;
+
+ atomic_set(&ipvs->dropentry, 0);
+ spin_lock_init(&ipvs->dropentry_lock);
+ spin_lock_init(&ipvs->droppacket_lock);
+ spin_lock_init(&ipvs->securetcp_lock);
+ ipvs->rs_lock = __RW_LOCK_UNLOCKED(ipvs->rs_lock);
+
+ /* Initialize rs_table */
+ for (idx = 0; idx < IP_VS_RTAB_SIZE; idx++)
+ INIT_LIST_HEAD(&ipvs->rs_table[idx]);
+
+ INIT_LIST_HEAD(&ipvs->dest_trash);
+ atomic_set(&ipvs->ftpsvc_counter, 0);
+ atomic_set(&ipvs->nullsvc_counter, 0);
+
+ /* procfs stats */
+ ipvs->tot_stats = kzalloc(sizeof(struct ip_vs_stats), GFP_KERNEL);
+ if (ipvs->tot_stats == NULL) {
+ pr_err("%s(): no memory.\n", __func__);
+ return -ENOMEM;
+ }
+ ipvs->cpustats = alloc_percpu(struct ip_vs_cpu_stats);
+ if (!ipvs->cpustats) {
+ pr_err("%s() alloc_percpu failed\n", __func__);
+ goto err_alloc;
+ }
+ spin_lock_init(&ipvs->tot_stats->lock);
+
+ proc_net_fops_create(net, "ip_vs", 0, &ip_vs_info_fops);
+ proc_net_fops_create(net, "ip_vs_stats", 0, &ip_vs_stats_fops);
+ proc_net_fops_create(net, "ip_vs_stats_percpu", 0,
+ &ip_vs_stats_percpu_fops);
+
+ if (!net_eq(net, &init_net)) {
+ tbl = kmemdup(vs_vars, sizeof(vs_vars), GFP_KERNEL);
+ if (tbl == NULL)
+ goto err_dup;
+ } else
+ tbl = vs_vars;
+ /* Initialize sysctl defaults */
+ idx = 0;
+ ipvs->sysctl_amemthresh = 1024;
+ tbl[idx++].data = &ipvs->sysctl_amemthresh;
+ ipvs->sysctl_am_droprate = 10;
+ tbl[idx++].data = &ipvs->sysctl_am_droprate;
+ tbl[idx++].data = &ipvs->sysctl_drop_entry;
+ tbl[idx++].data = &ipvs->sysctl_drop_packet;
+#ifdef CONFIG_IP_VS_NFCT
+ tbl[idx++].data = &ipvs->sysctl_conntrack;
+#endif
+ tbl[idx++].data = &ipvs->sysctl_secure_tcp;
+ ipvs->sysctl_snat_reroute = 1;
+ tbl[idx++].data = &ipvs->sysctl_snat_reroute;
+ ipvs->sysctl_sync_ver = 1;
+ tbl[idx++].data = &ipvs->sysctl_sync_ver;
+ tbl[idx++].data = &ipvs->sysctl_cache_bypass;
+ tbl[idx++].data = &ipvs->sysctl_expire_nodest_conn;
+ tbl[idx++].data = &ipvs->sysctl_expire_quiescent_template;
+ ipvs->sysctl_sync_threshold[0] = 3;
+ ipvs->sysctl_sync_threshold[1] = 50;
+ tbl[idx].data = &ipvs->sysctl_sync_threshold;
+ tbl[idx++].maxlen = sizeof(ipvs->sysctl_sync_threshold);
+ tbl[idx++].data = &ipvs->sysctl_nat_icmp_send;
+
+
+#ifdef CONFIG_SYSCTL
+ ipvs->sysctl_hdr = register_net_sysctl_table(net, net_vs_ctl_path,
+ tbl);
+ if (ipvs->sysctl_hdr == NULL) {
+ if (!net_eq(net, &init_net))
+ kfree(tbl);
+ goto err_dup;
+ }
+#endif
+ ip_vs_new_estimator(net, ipvs->tot_stats);
+ ipvs->sysctl_tbl = tbl;
+ /* Schedule defense work */
+ INIT_DELAYED_WORK(&ipvs->defense_work, defense_work_handler);
+ schedule_delayed_work(&ipvs->defense_work, DEFENSE_TIMER_PERIOD);
+ return 0;
+
+err_dup:
+ free_percpu(ipvs->cpustats);
+err_alloc:
+ kfree(ipvs->tot_stats);
+ return -ENOMEM;
+}
+
+static void __net_exit __ip_vs_control_cleanup(struct net *net)
+{
+ struct netns_ipvs *ipvs = net_ipvs(net);
+
+ ip_vs_trash_cleanup(net);
+ ip_vs_kill_estimator(net, ipvs->tot_stats);
+ cancel_delayed_work_sync(&ipvs->defense_work);
+ cancel_work_sync(&ipvs->defense_work.work);
+#ifdef CONFIG_SYSCTL
+ unregister_net_sysctl_table(ipvs->sysctl_hdr);
+#endif
+ proc_net_remove(net, "ip_vs_stats_percpu");
+ proc_net_remove(net, "ip_vs_stats");
+ proc_net_remove(net, "ip_vs");
+ free_percpu(ipvs->cpustats);
+ kfree(ipvs->tot_stats);
+}
+
+static struct pernet_operations ipvs_control_ops = {
+ .init = __ip_vs_control_init,
+ .exit = __ip_vs_control_cleanup,
+};
int __init ip_vs_control_init(void)
{
- int ret;
int idx;
+ int ret;
EnterFunction(2);
- /* Initialize ip_vs_svc_table, ip_vs_svc_fwm_table, ip_vs_rtable */
+ /* Initialize svc_table, ip_vs_svc_fwm_table, rs_table */
for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
INIT_LIST_HEAD(&ip_vs_svc_table[idx]);
INIT_LIST_HEAD(&ip_vs_svc_fwm_table[idx]);
}
- for(idx = 0; idx < IP_VS_RTAB_SIZE; idx++) {
- INIT_LIST_HEAD(&ip_vs_rtable[idx]);
+
+ ret = register_pernet_subsys(&ipvs_control_ops);
+ if (ret) {
+ pr_err("cannot register namespace.\n");
+ goto err;
}
- smp_wmb();
+
+ smp_wmb(); /* Do we really need it now ? */
ret = nf_register_sockopt(&ip_vs_sockopts);
if (ret) {
pr_err("cannot register sockopt.\n");
- return ret;
+ goto err_net;
}
ret = ip_vs_genl_register();
if (ret) {
pr_err("cannot register Generic Netlink interface.\n");
nf_unregister_sockopt(&ip_vs_sockopts);
- return ret;
+ goto err_net;
}
- proc_net_fops_create(&init_net, "ip_vs", 0, &ip_vs_info_fops);
- proc_net_fops_create(&init_net, "ip_vs_stats",0, &ip_vs_stats_fops);
-
- sysctl_header = register_sysctl_paths(net_vs_ctl_path, vs_vars);
-
- ip_vs_new_estimator(&ip_vs_stats);
-
- /* Hook the defense timer */
- schedule_delayed_work(&defense_work, DEFENSE_TIMER_PERIOD);
-
LeaveFunction(2);
return 0;
+
+err_net:
+ unregister_pernet_subsys(&ipvs_control_ops);
+err:
+ return ret;
}
void ip_vs_control_cleanup(void)
{
EnterFunction(2);
- ip_vs_trash_cleanup();
- cancel_delayed_work_sync(&defense_work);
- cancel_work_sync(&defense_work.work);
- ip_vs_kill_estimator(&ip_vs_stats);
- unregister_sysctl_table(sysctl_header);
- proc_net_remove(&init_net, "ip_vs_stats");
- proc_net_remove(&init_net, "ip_vs");
+ unregister_pernet_subsys(&ipvs_control_ops);
ip_vs_genl_unregister();
nf_unregister_sockopt(&ip_vs_sockopts);
LeaveFunction(2);
diff --git a/net/netfilter/ipvs/ip_vs_est.c b/net/netfilter/ipvs/ip_vs_est.c
index ff28801962e..f560a05c965 100644
--- a/net/netfilter/ipvs/ip_vs_est.c
+++ b/net/netfilter/ipvs/ip_vs_est.c
@@ -8,8 +8,12 @@
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*
- * Changes:
- *
+ * Changes: Hans Schillstrom <hans.schillstrom@ericsson.com>
+ * Network name space (netns) aware.
+ * Global data moved to netns i.e struct netns_ipvs
+ * Affected data: est_list and est_lock.
+ * estimation_timer() runs with timer per netns.
+ * get_stats()) do the per cpu summing.
*/
#define KMSG_COMPONENT "IPVS"
@@ -48,11 +52,42 @@
*/
-static void estimation_timer(unsigned long arg);
+/*
+ * Make a summary from each cpu
+ */
+static void ip_vs_read_cpu_stats(struct ip_vs_stats_user *sum,
+ struct ip_vs_cpu_stats *stats)
+{
+ int i;
+
+ for_each_possible_cpu(i) {
+ struct ip_vs_cpu_stats *s = per_cpu_ptr(stats, i);
+ unsigned int start;
+ __u64 inbytes, outbytes;
+ if (i) {
+ sum->conns += s->ustats.conns;
+ sum->inpkts += s->ustats.inpkts;
+ sum->outpkts += s->ustats.outpkts;
+ do {
+ start = u64_stats_fetch_begin_bh(&s->syncp);
+ inbytes = s->ustats.inbytes;
+ outbytes = s->ustats.outbytes;
+ } while (u64_stats_fetch_retry_bh(&s->syncp, start));
+ sum->inbytes += inbytes;
+ sum->outbytes += outbytes;
+ } else {
+ sum->conns = s->ustats.conns;
+ sum->inpkts = s->ustats.inpkts;
+ sum->outpkts = s->ustats.outpkts;
+ do {
+ start = u64_stats_fetch_begin_bh(&s->syncp);
+ sum->inbytes = s->ustats.inbytes;
+ sum->outbytes = s->ustats.outbytes;
+ } while (u64_stats_fetch_retry_bh(&s->syncp, start));
+ }
+ }
+}
-static LIST_HEAD(est_list);
-static DEFINE_SPINLOCK(est_lock);
-static DEFINE_TIMER(est_timer, estimation_timer, 0, 0);
static void estimation_timer(unsigned long arg)
{
@@ -62,11 +97,16 @@ static void estimation_timer(unsigned long arg)
u32 n_inpkts, n_outpkts;
u64 n_inbytes, n_outbytes;
u32 rate;
+ struct net *net = (struct net *)arg;
+ struct netns_ipvs *ipvs;
- spin_lock(&est_lock);
- list_for_each_entry(e, &est_list, list) {
+ ipvs = net_ipvs(net);
+ ip_vs_read_cpu_stats(&ipvs->tot_stats->ustats, ipvs->cpustats);
+ spin_lock(&ipvs->est_lock);
+ list_for_each_entry(e, &ipvs->est_list, list) {
s = container_of(e, struct ip_vs_stats, est);
+ ip_vs_read_cpu_stats(&s->ustats, s->cpustats);
spin_lock(&s->lock);
n_conns = s->ustats.conns;
n_inpkts = s->ustats.inpkts;
@@ -75,38 +115,39 @@ static void estimation_timer(unsigned long arg)
n_outbytes = s->ustats.outbytes;
/* scaled by 2^10, but divided 2 seconds */
- rate = (n_conns - e->last_conns)<<9;
+ rate = (n_conns - e->last_conns) << 9;
e->last_conns = n_conns;
- e->cps += ((long)rate - (long)e->cps)>>2;
- s->ustats.cps = (e->cps+0x1FF)>>10;
+ e->cps += ((long)rate - (long)e->cps) >> 2;
+ s->ustats.cps = (e->cps + 0x1FF) >> 10;
- rate = (n_inpkts - e->last_inpkts)<<9;
+ rate = (n_inpkts - e->last_inpkts) << 9;
e->last_inpkts = n_inpkts;
- e->inpps += ((long)rate - (long)e->inpps)>>2;
- s->ustats.inpps = (e->inpps+0x1FF)>>10;
+ e->inpps += ((long)rate - (long)e->inpps) >> 2;
+ s->ustats.inpps = (e->inpps + 0x1FF) >> 10;
- rate = (n_outpkts - e->last_outpkts)<<9;
+ rate = (n_outpkts - e->last_outpkts) << 9;
e->last_outpkts = n_outpkts;
- e->outpps += ((long)rate - (long)e->outpps)>>2;
- s->ustats.outpps = (e->outpps+0x1FF)>>10;
+ e->outpps += ((long)rate - (long)e->outpps) >> 2;
+ s->ustats.outpps = (e->outpps + 0x1FF) >> 10;
- rate = (n_inbytes - e->last_inbytes)<<4;
+ rate = (n_inbytes - e->last_inbytes) << 4;
e->last_inbytes = n_inbytes;
- e->inbps += ((long)rate - (long)e->inbps)>>2;
- s->ustats.inbps = (e->inbps+0xF)>>5;
+ e->inbps += ((long)rate - (long)e->inbps) >> 2;
+ s->ustats.inbps = (e->inbps + 0xF) >> 5;
- rate = (n_outbytes - e->last_outbytes)<<4;
+ rate = (n_outbytes - e->last_outbytes) << 4;
e->last_outbytes = n_outbytes;
- e->outbps += ((long)rate - (long)e->outbps)>>2;
- s->ustats.outbps = (e->outbps+0xF)>>5;
+ e->outbps += ((long)rate - (long)e->outbps) >> 2;
+ s->ustats.outbps = (e->outbps + 0xF) >> 5;
spin_unlock(&s->lock);
}
- spin_unlock(&est_lock);
- mod_timer(&est_timer, jiffies + 2*HZ);
+ spin_unlock(&ipvs->est_lock);
+ mod_timer(&ipvs->est_timer, jiffies + 2*HZ);
}
-void ip_vs_new_estimator(struct ip_vs_stats *stats)
+void ip_vs_new_estimator(struct net *net, struct ip_vs_stats *stats)
{
+ struct netns_ipvs *ipvs = net_ipvs(net);
struct ip_vs_estimator *est = &stats->est;
INIT_LIST_HEAD(&est->list);
@@ -126,18 +167,19 @@ void ip_vs_new_estimator(struct ip_vs_stats *stats)
est->last_outbytes = stats->ustats.outbytes;
est->outbps = stats->ustats.outbps<<5;
- spin_lock_bh(&est_lock);
- list_add(&est->list, &est_list);
- spin_unlock_bh(&est_lock);
+ spin_lock_bh(&ipvs->est_lock);
+ list_add(&est->list, &ipvs->est_list);
+ spin_unlock_bh(&ipvs->est_lock);
}
-void ip_vs_kill_estimator(struct ip_vs_stats *stats)
+void ip_vs_kill_estimator(struct net *net, struct ip_vs_stats *stats)
{
+ struct netns_ipvs *ipvs = net_ipvs(net);
struct ip_vs_estimator *est = &stats->est;
- spin_lock_bh(&est_lock);
+ spin_lock_bh(&ipvs->est_lock);
list_del(&est->list);
- spin_unlock_bh(&est_lock);
+ spin_unlock_bh(&ipvs->est_lock);
}
void ip_vs_zero_estimator(struct ip_vs_stats *stats)
@@ -157,13 +199,35 @@ void ip_vs_zero_estimator(struct ip_vs_stats *stats)
est->outbps = 0;
}
-int __init ip_vs_estimator_init(void)
+static int __net_init __ip_vs_estimator_init(struct net *net)
{
- mod_timer(&est_timer, jiffies + 2 * HZ);
+ struct netns_ipvs *ipvs = net_ipvs(net);
+
+ INIT_LIST_HEAD(&ipvs->est_list);
+ spin_lock_init(&ipvs->est_lock);
+ setup_timer(&ipvs->est_timer, estimation_timer, (unsigned long)net);
+ mod_timer(&ipvs->est_timer, jiffies + 2 * HZ);
return 0;
}
+static void __net_exit __ip_vs_estimator_exit(struct net *net)
+{
+ del_timer_sync(&net_ipvs(net)->est_timer);
+}
+static struct pernet_operations ip_vs_app_ops = {
+ .init = __ip_vs_estimator_init,
+ .exit = __ip_vs_estimator_exit,
+};
+
+int __init ip_vs_estimator_init(void)
+{
+ int rv;
+
+ rv = register_pernet_subsys(&ip_vs_app_ops);
+ return rv;
+}
+
void ip_vs_estimator_cleanup(void)
{
- del_timer_sync(&est_timer);
+ unregister_pernet_subsys(&ip_vs_app_ops);
}
diff --git a/net/netfilter/ipvs/ip_vs_ftp.c b/net/netfilter/ipvs/ip_vs_ftp.c
index 75455000ad1..6b5dd6ddaae 100644
--- a/net/netfilter/ipvs/ip_vs_ftp.c
+++ b/net/netfilter/ipvs/ip_vs_ftp.c
@@ -157,6 +157,7 @@ static int ip_vs_ftp_out(struct ip_vs_app *app, struct ip_vs_conn *cp,
int ret = 0;
enum ip_conntrack_info ctinfo;
struct nf_conn *ct;
+ struct net *net;
#ifdef CONFIG_IP_VS_IPV6
/* This application helper doesn't work with IPv6 yet,
@@ -197,18 +198,20 @@ static int ip_vs_ftp_out(struct ip_vs_app *app, struct ip_vs_conn *cp,
*/
{
struct ip_vs_conn_param p;
- ip_vs_conn_fill_param(AF_INET, iph->protocol,
- &from, port, &cp->caddr, 0, &p);
+ ip_vs_conn_fill_param(ip_vs_conn_net(cp), AF_INET,
+ iph->protocol, &from, port,
+ &cp->caddr, 0, &p);
n_cp = ip_vs_conn_out_get(&p);
}
if (!n_cp) {
struct ip_vs_conn_param p;
- ip_vs_conn_fill_param(AF_INET, IPPROTO_TCP, &cp->caddr,
+ ip_vs_conn_fill_param(ip_vs_conn_net(cp),
+ AF_INET, IPPROTO_TCP, &cp->caddr,
0, &cp->vaddr, port, &p);
n_cp = ip_vs_conn_new(&p, &from, port,
IP_VS_CONN_F_NO_CPORT |
IP_VS_CONN_F_NFCT,
- cp->dest);
+ cp->dest, skb->mark);
if (!n_cp)
return 0;
@@ -257,8 +260,9 @@ static int ip_vs_ftp_out(struct ip_vs_app *app, struct ip_vs_conn *cp,
* would be adjusted twice.
*/
+ net = skb_net(skb);
cp->app_data = NULL;
- ip_vs_tcp_conn_listen(n_cp);
+ ip_vs_tcp_conn_listen(net, n_cp);
ip_vs_conn_put(n_cp);
return ret;
}
@@ -287,6 +291,7 @@ static int ip_vs_ftp_in(struct ip_vs_app *app, struct ip_vs_conn *cp,
union nf_inet_addr to;
__be16 port;
struct ip_vs_conn *n_cp;
+ struct net *net;
#ifdef CONFIG_IP_VS_IPV6
/* This application helper doesn't work with IPv6 yet,
@@ -358,14 +363,15 @@ static int ip_vs_ftp_in(struct ip_vs_app *app, struct ip_vs_conn *cp,
{
struct ip_vs_conn_param p;
- ip_vs_conn_fill_param(AF_INET, iph->protocol, &to, port,
- &cp->vaddr, htons(ntohs(cp->vport)-1),
- &p);
+ ip_vs_conn_fill_param(ip_vs_conn_net(cp), AF_INET,
+ iph->protocol, &to, port, &cp->vaddr,
+ htons(ntohs(cp->vport)-1), &p);
n_cp = ip_vs_conn_in_get(&p);
if (!n_cp) {
n_cp = ip_vs_conn_new(&p, &cp->daddr,
htons(ntohs(cp->dport)-1),
- IP_VS_CONN_F_NFCT, cp->dest);
+ IP_VS_CONN_F_NFCT, cp->dest,
+ skb->mark);
if (!n_cp)
return 0;
@@ -377,7 +383,8 @@ static int ip_vs_ftp_in(struct ip_vs_app *app, struct ip_vs_conn *cp,
/*
* Move tunnel to listen state
*/
- ip_vs_tcp_conn_listen(n_cp);
+ net = skb_net(skb);
+ ip_vs_tcp_conn_listen(net, n_cp);
ip_vs_conn_put(n_cp);
return 1;
@@ -398,23 +405,22 @@ static struct ip_vs_app ip_vs_ftp = {
.pkt_in = ip_vs_ftp_in,
};
-
/*
- * ip_vs_ftp initialization
+ * per netns ip_vs_ftp initialization
*/
-static int __init ip_vs_ftp_init(void)
+static int __net_init __ip_vs_ftp_init(struct net *net)
{
int i, ret;
struct ip_vs_app *app = &ip_vs_ftp;
- ret = register_ip_vs_app(app);
+ ret = register_ip_vs_app(net, app);
if (ret)
return ret;
for (i=0; i<IP_VS_APP_MAX_PORTS; i++) {
if (!ports[i])
continue;
- ret = register_ip_vs_app_inc(app, app->protocol, ports[i]);
+ ret = register_ip_vs_app_inc(net, app, app->protocol, ports[i]);
if (ret)
break;
pr_info("%s: loaded support on port[%d] = %d\n",
@@ -422,18 +428,39 @@ static int __init ip_vs_ftp_init(void)
}
if (ret)
- unregister_ip_vs_app(app);
+ unregister_ip_vs_app(net, app);
return ret;
}
+/*
+ * netns exit
+ */
+static void __ip_vs_ftp_exit(struct net *net)
+{
+ struct ip_vs_app *app = &ip_vs_ftp;
+
+ unregister_ip_vs_app(net, app);
+}
+
+static struct pernet_operations ip_vs_ftp_ops = {
+ .init = __ip_vs_ftp_init,
+ .exit = __ip_vs_ftp_exit,
+};
+int __init ip_vs_ftp_init(void)
+{
+ int rv;
+
+ rv = register_pernet_subsys(&ip_vs_ftp_ops);
+ return rv;
+}
/*
* ip_vs_ftp finish.
*/
static void __exit ip_vs_ftp_exit(void)
{
- unregister_ip_vs_app(&ip_vs_ftp);
+ unregister_pernet_subsys(&ip_vs_ftp_ops);
}
diff --git a/net/netfilter/ipvs/ip_vs_lblc.c b/net/netfilter/ipvs/ip_vs_lblc.c
index 9323f894419..00b5ffab376 100644
--- a/net/netfilter/ipvs/ip_vs_lblc.c
+++ b/net/netfilter/ipvs/ip_vs_lblc.c
@@ -70,7 +70,6 @@
* entries that haven't been touched for a day.
*/
#define COUNT_FOR_FULL_EXPIRATION 30
-static int sysctl_ip_vs_lblc_expiration = 24*60*60*HZ;
/*
@@ -117,7 +116,7 @@ struct ip_vs_lblc_table {
static ctl_table vs_vars_table[] = {
{
.procname = "lblc_expiration",
- .data = &sysctl_ip_vs_lblc_expiration,
+ .data = NULL,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec_jiffies,
@@ -125,8 +124,6 @@ static ctl_table vs_vars_table[] = {
{ }
};
-static struct ctl_table_header * sysctl_header;
-
static inline void ip_vs_lblc_free(struct ip_vs_lblc_entry *en)
{
list_del(&en->list);
@@ -248,6 +245,7 @@ static inline void ip_vs_lblc_full_check(struct ip_vs_service *svc)
struct ip_vs_lblc_entry *en, *nxt;
unsigned long now = jiffies;
int i, j;
+ struct netns_ipvs *ipvs = net_ipvs(svc->net);
for (i=0, j=tbl->rover; i<IP_VS_LBLC_TAB_SIZE; i++) {
j = (j + 1) & IP_VS_LBLC_TAB_MASK;
@@ -255,7 +253,8 @@ static inline void ip_vs_lblc_full_check(struct ip_vs_service *svc)
write_lock(&svc->sched_lock);
list_for_each_entry_safe(en, nxt, &tbl->bucket[j], list) {
if (time_before(now,
- en->lastuse + sysctl_ip_vs_lblc_expiration))
+ en->lastuse +
+ ipvs->sysctl_lblc_expiration))
continue;
ip_vs_lblc_free(en);
@@ -543,23 +542,73 @@ static struct ip_vs_scheduler ip_vs_lblc_scheduler =
.schedule = ip_vs_lblc_schedule,
};
+/*
+ * per netns init.
+ */
+static int __net_init __ip_vs_lblc_init(struct net *net)
+{
+ struct netns_ipvs *ipvs = net_ipvs(net);
+
+ if (!net_eq(net, &init_net)) {
+ ipvs->lblc_ctl_table = kmemdup(vs_vars_table,
+ sizeof(vs_vars_table),
+ GFP_KERNEL);
+ if (ipvs->lblc_ctl_table == NULL)
+ return -ENOMEM;
+ } else
+ ipvs->lblc_ctl_table = vs_vars_table;
+ ipvs->sysctl_lblc_expiration = 24*60*60*HZ;
+ ipvs->lblc_ctl_table[0].data = &ipvs->sysctl_lblc_expiration;
+
+#ifdef CONFIG_SYSCTL
+ ipvs->lblc_ctl_header =
+ register_net_sysctl_table(net, net_vs_ctl_path,
+ ipvs->lblc_ctl_table);
+ if (!ipvs->lblc_ctl_header) {
+ if (!net_eq(net, &init_net))
+ kfree(ipvs->lblc_ctl_table);
+ return -ENOMEM;
+ }
+#endif
+
+ return 0;
+}
+
+static void __net_exit __ip_vs_lblc_exit(struct net *net)
+{
+ struct netns_ipvs *ipvs = net_ipvs(net);
+
+#ifdef CONFIG_SYSCTL
+ unregister_net_sysctl_table(ipvs->lblc_ctl_header);
+#endif
+
+ if (!net_eq(net, &init_net))
+ kfree(ipvs->lblc_ctl_table);
+}
+
+static struct pernet_operations ip_vs_lblc_ops = {
+ .init = __ip_vs_lblc_init,
+ .exit = __ip_vs_lblc_exit,
+};
static int __init ip_vs_lblc_init(void)
{
int ret;
- sysctl_header = register_sysctl_paths(net_vs_ctl_path, vs_vars_table);
+ ret = register_pernet_subsys(&ip_vs_lblc_ops);
+ if (ret)
+ return ret;
+
ret = register_ip_vs_scheduler(&ip_vs_lblc_scheduler);
if (ret)
- unregister_sysctl_table(sysctl_header);
+ unregister_pernet_subsys(&ip_vs_lblc_ops);
return ret;
}
-
static void __exit ip_vs_lblc_cleanup(void)
{
- unregister_sysctl_table(sysctl_header);
unregister_ip_vs_scheduler(&ip_vs_lblc_scheduler);
+ unregister_pernet_subsys(&ip_vs_lblc_ops);
}
diff --git a/net/netfilter/ipvs/ip_vs_lblcr.c b/net/netfilter/ipvs/ip_vs_lblcr.c
index dbeed8ea421..bfa25f1ea9e 100644
--- a/net/netfilter/ipvs/ip_vs_lblcr.c
+++ b/net/netfilter/ipvs/ip_vs_lblcr.c
@@ -70,8 +70,6 @@
* entries that haven't been touched for a day.
*/
#define COUNT_FOR_FULL_EXPIRATION 30
-static int sysctl_ip_vs_lblcr_expiration = 24*60*60*HZ;
-
/*
* for IPVS lblcr entry hash table
@@ -296,7 +294,7 @@ struct ip_vs_lblcr_table {
static ctl_table vs_vars_table[] = {
{
.procname = "lblcr_expiration",
- .data = &sysctl_ip_vs_lblcr_expiration,
+ .data = NULL,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec_jiffies,
@@ -304,8 +302,6 @@ static ctl_table vs_vars_table[] = {
{ }
};
-static struct ctl_table_header * sysctl_header;
-
static inline void ip_vs_lblcr_free(struct ip_vs_lblcr_entry *en)
{
list_del(&en->list);
@@ -425,14 +421,15 @@ static inline void ip_vs_lblcr_full_check(struct ip_vs_service *svc)
unsigned long now = jiffies;
int i, j;
struct ip_vs_lblcr_entry *en, *nxt;
+ struct netns_ipvs *ipvs = net_ipvs(svc->net);
for (i=0, j=tbl->rover; i<IP_VS_LBLCR_TAB_SIZE; i++) {
j = (j + 1) & IP_VS_LBLCR_TAB_MASK;
write_lock(&svc->sched_lock);
list_for_each_entry_safe(en, nxt, &tbl->bucket[j], list) {
- if (time_after(en->lastuse+sysctl_ip_vs_lblcr_expiration,
- now))
+ if (time_after(en->lastuse
+ + ipvs->sysctl_lblcr_expiration, now))
continue;
ip_vs_lblcr_free(en);
@@ -664,6 +661,7 @@ ip_vs_lblcr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
read_lock(&svc->sched_lock);
en = ip_vs_lblcr_get(svc->af, tbl, &iph.daddr);
if (en) {
+ struct netns_ipvs *ipvs = net_ipvs(svc->net);
/* We only hold a read lock, but this is atomic */
en->lastuse = jiffies;
@@ -675,7 +673,7 @@ ip_vs_lblcr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
/* More than one destination + enough time passed by, cleanup */
if (atomic_read(&en->set.size) > 1 &&
time_after(jiffies, en->set.lastmod +
- sysctl_ip_vs_lblcr_expiration)) {
+ ipvs->sysctl_lblcr_expiration)) {
struct ip_vs_dest *m;
write_lock(&en->set.lock);
@@ -744,23 +742,73 @@ static struct ip_vs_scheduler ip_vs_lblcr_scheduler =
.schedule = ip_vs_lblcr_schedule,
};
+/*
+ * per netns init.
+ */
+static int __net_init __ip_vs_lblcr_init(struct net *net)
+{
+ struct netns_ipvs *ipvs = net_ipvs(net);
+
+ if (!net_eq(net, &init_net)) {
+ ipvs->lblcr_ctl_table = kmemdup(vs_vars_table,
+ sizeof(vs_vars_table),
+ GFP_KERNEL);
+ if (ipvs->lblcr_ctl_table == NULL)
+ return -ENOMEM;
+ } else
+ ipvs->lblcr_ctl_table = vs_vars_table;
+ ipvs->sysctl_lblcr_expiration = 24*60*60*HZ;
+ ipvs->lblcr_ctl_table[0].data = &ipvs->sysctl_lblcr_expiration;
+
+#ifdef CONFIG_SYSCTL
+ ipvs->lblcr_ctl_header =
+ register_net_sysctl_table(net, net_vs_ctl_path,
+ ipvs->lblcr_ctl_table);
+ if (!ipvs->lblcr_ctl_header) {
+ if (!net_eq(net, &init_net))
+ kfree(ipvs->lblcr_ctl_table);
+ return -ENOMEM;
+ }
+#endif
+
+ return 0;
+}
+
+static void __net_exit __ip_vs_lblcr_exit(struct net *net)
+{
+ struct netns_ipvs *ipvs = net_ipvs(net);
+
+#ifdef CONFIG_SYSCTL
+ unregister_net_sysctl_table(ipvs->lblcr_ctl_header);
+#endif
+
+ if (!net_eq(net, &init_net))
+ kfree(ipvs->lblcr_ctl_table);
+}
+
+static struct pernet_operations ip_vs_lblcr_ops = {
+ .init = __ip_vs_lblcr_init,
+ .exit = __ip_vs_lblcr_exit,
+};
static int __init ip_vs_lblcr_init(void)
{
int ret;
- sysctl_header = register_sysctl_paths(net_vs_ctl_path, vs_vars_table);
+ ret = register_pernet_subsys(&ip_vs_lblcr_ops);
+ if (ret)
+ return ret;
+
ret = register_ip_vs_scheduler(&ip_vs_lblcr_scheduler);
if (ret)
- unregister_sysctl_table(sysctl_header);
+ unregister_pernet_subsys(&ip_vs_lblcr_ops);
return ret;
}
-
static void __exit ip_vs_lblcr_cleanup(void)
{
- unregister_sysctl_table(sysctl_header);
unregister_ip_vs_scheduler(&ip_vs_lblcr_scheduler);
+ unregister_pernet_subsys(&ip_vs_lblcr_ops);
}
diff --git a/net/netfilter/ipvs/ip_vs_nfct.c b/net/netfilter/ipvs/ip_vs_nfct.c
index 4680647cd45..f454c80df0a 100644
--- a/net/netfilter/ipvs/ip_vs_nfct.c
+++ b/net/netfilter/ipvs/ip_vs_nfct.c
@@ -141,6 +141,7 @@ static void ip_vs_nfct_expect_callback(struct nf_conn *ct,
struct nf_conntrack_tuple *orig, new_reply;
struct ip_vs_conn *cp;
struct ip_vs_conn_param p;
+ struct net *net = nf_ct_net(ct);
if (exp->tuple.src.l3num != PF_INET)
return;
@@ -155,7 +156,7 @@ static void ip_vs_nfct_expect_callback(struct nf_conn *ct,
/* RS->CLIENT */
orig = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple;
- ip_vs_conn_fill_param(exp->tuple.src.l3num, orig->dst.protonum,
+ ip_vs_conn_fill_param(net, exp->tuple.src.l3num, orig->dst.protonum,
&orig->src.u3, orig->src.u.tcp.port,
&orig->dst.u3, orig->dst.u.tcp.port, &p);
cp = ip_vs_conn_out_get(&p);
@@ -268,7 +269,8 @@ void ip_vs_conn_drop_conntrack(struct ip_vs_conn *cp)
" for conn " FMT_CONN "\n",
__func__, ARG_TUPLE(&tuple), ARG_CONN(cp));
- h = nf_conntrack_find_get(&init_net, NF_CT_DEFAULT_ZONE, &tuple);
+ h = nf_conntrack_find_get(ip_vs_conn_net(cp), NF_CT_DEFAULT_ZONE,
+ &tuple);
if (h) {
ct = nf_ct_tuplehash_to_ctrack(h);
/* Show what happens instead of calling nf_ct_kill() */
diff --git a/net/netfilter/ipvs/ip_vs_pe.c b/net/netfilter/ipvs/ip_vs_pe.c
index 3414af70ee1..5cf859ccb31 100644
--- a/net/netfilter/ipvs/ip_vs_pe.c
+++ b/net/netfilter/ipvs/ip_vs_pe.c
@@ -29,12 +29,11 @@ void ip_vs_unbind_pe(struct ip_vs_service *svc)
}
/* Get pe in the pe list by name */
-static struct ip_vs_pe *
-ip_vs_pe_getbyname(const char *pe_name)
+struct ip_vs_pe *__ip_vs_pe_getbyname(const char *pe_name)
{
struct ip_vs_pe *pe;
- IP_VS_DBG(2, "%s(): pe_name \"%s\"\n", __func__,
+ IP_VS_DBG(10, "%s(): pe_name \"%s\"\n", __func__,
pe_name);
spin_lock_bh(&ip_vs_pe_lock);
@@ -60,28 +59,22 @@ ip_vs_pe_getbyname(const char *pe_name)
}
/* Lookup pe and try to load it if it doesn't exist */
-struct ip_vs_pe *ip_vs_pe_get(const char *name)
+struct ip_vs_pe *ip_vs_pe_getbyname(const char *name)
{
struct ip_vs_pe *pe;
/* Search for the pe by name */
- pe = ip_vs_pe_getbyname(name);
+ pe = __ip_vs_pe_getbyname(name);
/* If pe not found, load the module and search again */
if (!pe) {
request_module("ip_vs_pe_%s", name);
- pe = ip_vs_pe_getbyname(name);
+ pe = __ip_vs_pe_getbyname(name);
}
return pe;
}
-void ip_vs_pe_put(struct ip_vs_pe *pe)
-{
- if (pe && pe->module)
- module_put(pe->module);
-}
-
/* Register a pe in the pe list */
int register_ip_vs_pe(struct ip_vs_pe *pe)
{
diff --git a/net/netfilter/ipvs/ip_vs_pe_sip.c b/net/netfilter/ipvs/ip_vs_pe_sip.c
index b8b4e9620f3..0d83bc01fed 100644
--- a/net/netfilter/ipvs/ip_vs_pe_sip.c
+++ b/net/netfilter/ipvs/ip_vs_pe_sip.c
@@ -71,6 +71,7 @@ ip_vs_sip_fill_param(struct ip_vs_conn_param *p, struct sk_buff *skb)
struct ip_vs_iphdr iph;
unsigned int dataoff, datalen, matchoff, matchlen;
const char *dptr;
+ int retc;
ip_vs_fill_iphdr(p->af, skb_network_header(skb), &iph);
@@ -83,6 +84,8 @@ ip_vs_sip_fill_param(struct ip_vs_conn_param *p, struct sk_buff *skb)
if (dataoff >= skb->len)
return -EINVAL;
+ if ((retc=skb_linearize(skb)) < 0)
+ return retc;
dptr = skb->data + dataoff;
datalen = skb->len - dataoff;
diff --git a/net/netfilter/ipvs/ip_vs_proto.c b/net/netfilter/ipvs/ip_vs_proto.c
index c5399839087..17484a4416e 100644
--- a/net/netfilter/ipvs/ip_vs_proto.c
+++ b/net/netfilter/ipvs/ip_vs_proto.c
@@ -60,6 +60,35 @@ static int __used __init register_ip_vs_protocol(struct ip_vs_protocol *pp)
return 0;
}
+#if defined(CONFIG_IP_VS_PROTO_TCP) || defined(CONFIG_IP_VS_PROTO_UDP) || \
+ defined(CONFIG_IP_VS_PROTO_SCTP) || defined(CONFIG_IP_VS_PROTO_AH) || \
+ defined(CONFIG_IP_VS_PROTO_ESP)
+/*
+ * register an ipvs protocols netns related data
+ */
+static int
+register_ip_vs_proto_netns(struct net *net, struct ip_vs_protocol *pp)
+{
+ struct netns_ipvs *ipvs = net_ipvs(net);
+ unsigned hash = IP_VS_PROTO_HASH(pp->protocol);
+ struct ip_vs_proto_data *pd =
+ kzalloc(sizeof(struct ip_vs_proto_data), GFP_ATOMIC);
+
+ if (!pd) {
+ pr_err("%s(): no memory.\n", __func__);
+ return -ENOMEM;
+ }
+ pd->pp = pp; /* For speed issues */
+ pd->next = ipvs->proto_data_table[hash];
+ ipvs->proto_data_table[hash] = pd;
+ atomic_set(&pd->appcnt, 0); /* Init app counter */
+
+ if (pp->init_netns != NULL)
+ pp->init_netns(net, pd);
+
+ return 0;
+}
+#endif
/*
* unregister an ipvs protocol
@@ -82,6 +111,29 @@ static int unregister_ip_vs_protocol(struct ip_vs_protocol *pp)
return -ESRCH;
}
+/*
+ * unregister an ipvs protocols netns data
+ */
+static int
+unregister_ip_vs_proto_netns(struct net *net, struct ip_vs_proto_data *pd)
+{
+ struct netns_ipvs *ipvs = net_ipvs(net);
+ struct ip_vs_proto_data **pd_p;
+ unsigned hash = IP_VS_PROTO_HASH(pd->pp->protocol);
+
+ pd_p = &ipvs->proto_data_table[hash];
+ for (; *pd_p; pd_p = &(*pd_p)->next) {
+ if (*pd_p == pd) {
+ *pd_p = pd->next;
+ if (pd->pp->exit_netns != NULL)
+ pd->pp->exit_netns(net, pd);
+ kfree(pd);
+ return 0;
+ }
+ }
+
+ return -ESRCH;
+}
/*
* get ip_vs_protocol object by its proto.
@@ -100,19 +152,44 @@ struct ip_vs_protocol * ip_vs_proto_get(unsigned short proto)
}
EXPORT_SYMBOL(ip_vs_proto_get);
+/*
+ * get ip_vs_protocol object data by netns and proto
+ */
+struct ip_vs_proto_data *
+__ipvs_proto_data_get(struct netns_ipvs *ipvs, unsigned short proto)
+{
+ struct ip_vs_proto_data *pd;
+ unsigned hash = IP_VS_PROTO_HASH(proto);
+
+ for (pd = ipvs->proto_data_table[hash]; pd; pd = pd->next) {
+ if (pd->pp->protocol == proto)
+ return pd;
+ }
+
+ return NULL;
+}
+
+struct ip_vs_proto_data *
+ip_vs_proto_data_get(struct net *net, unsigned short proto)
+{
+ struct netns_ipvs *ipvs = net_ipvs(net);
+
+ return __ipvs_proto_data_get(ipvs, proto);
+}
+EXPORT_SYMBOL(ip_vs_proto_data_get);
/*
* Propagate event for state change to all protocols
*/
-void ip_vs_protocol_timeout_change(int flags)
+void ip_vs_protocol_timeout_change(struct netns_ipvs *ipvs, int flags)
{
- struct ip_vs_protocol *pp;
+ struct ip_vs_proto_data *pd;
int i;
for (i = 0; i < IP_VS_PROTO_TAB_SIZE; i++) {
- for (pp = ip_vs_proto_table[i]; pp; pp = pp->next) {
- if (pp->timeout_change)
- pp->timeout_change(pp, flags);
+ for (pd = ipvs->proto_data_table[i]; pd; pd = pd->next) {
+ if (pd->pp->timeout_change)
+ pd->pp->timeout_change(pd, flags);
}
}
}
@@ -236,6 +313,46 @@ ip_vs_tcpudp_debug_packet(int af, struct ip_vs_protocol *pp,
ip_vs_tcpudp_debug_packet_v4(pp, skb, offset, msg);
}
+/*
+ * per network name-space init
+ */
+static int __net_init __ip_vs_protocol_init(struct net *net)
+{
+#ifdef CONFIG_IP_VS_PROTO_TCP
+ register_ip_vs_proto_netns(net, &ip_vs_protocol_tcp);
+#endif
+#ifdef CONFIG_IP_VS_PROTO_UDP
+ register_ip_vs_proto_netns(net, &ip_vs_protocol_udp);
+#endif
+#ifdef CONFIG_IP_VS_PROTO_SCTP
+ register_ip_vs_proto_netns(net, &ip_vs_protocol_sctp);
+#endif
+#ifdef CONFIG_IP_VS_PROTO_AH
+ register_ip_vs_proto_netns(net, &ip_vs_protocol_ah);
+#endif
+#ifdef CONFIG_IP_VS_PROTO_ESP
+ register_ip_vs_proto_netns(net, &ip_vs_protocol_esp);
+#endif
+ return 0;
+}
+
+static void __net_exit __ip_vs_protocol_cleanup(struct net *net)
+{
+ struct netns_ipvs *ipvs = net_ipvs(net);
+ struct ip_vs_proto_data *pd;
+ int i;
+
+ /* unregister all the ipvs proto data for this netns */
+ for (i = 0; i < IP_VS_PROTO_TAB_SIZE; i++) {
+ while ((pd = ipvs->proto_data_table[i]) != NULL)
+ unregister_ip_vs_proto_netns(net, pd);
+ }
+}
+
+static struct pernet_operations ipvs_proto_ops = {
+ .init = __ip_vs_protocol_init,
+ .exit = __ip_vs_protocol_cleanup,
+};
int __init ip_vs_protocol_init(void)
{
@@ -265,6 +382,7 @@ int __init ip_vs_protocol_init(void)
REGISTER_PROTOCOL(&ip_vs_protocol_esp);
#endif
pr_info("Registered protocols (%s)\n", &protocols[2]);
+ return register_pernet_subsys(&ipvs_proto_ops);
return 0;
}
@@ -275,6 +393,7 @@ void ip_vs_protocol_cleanup(void)
struct ip_vs_protocol *pp;
int i;
+ unregister_pernet_subsys(&ipvs_proto_ops);
/* unregister all the ipvs protocols */
for (i = 0; i < IP_VS_PROTO_TAB_SIZE; i++) {
while ((pp = ip_vs_proto_table[i]) != NULL)
diff --git a/net/netfilter/ipvs/ip_vs_proto_ah_esp.c b/net/netfilter/ipvs/ip_vs_proto_ah_esp.c
index 3a0461117d3..5b8eb8b12c3 100644
--- a/net/netfilter/ipvs/ip_vs_proto_ah_esp.c
+++ b/net/netfilter/ipvs/ip_vs_proto_ah_esp.c
@@ -41,28 +41,30 @@ struct isakmp_hdr {
#define PORT_ISAKMP 500
static void
-ah_esp_conn_fill_param_proto(int af, const struct ip_vs_iphdr *iph,
- int inverse, struct ip_vs_conn_param *p)
+ah_esp_conn_fill_param_proto(struct net *net, int af,
+ const struct ip_vs_iphdr *iph, int inverse,
+ struct ip_vs_conn_param *p)
{
if (likely(!inverse))
- ip_vs_conn_fill_param(af, IPPROTO_UDP,
+ ip_vs_conn_fill_param(net, af, IPPROTO_UDP,
&iph->saddr, htons(PORT_ISAKMP),
&iph->daddr, htons(PORT_ISAKMP), p);
else
- ip_vs_conn_fill_param(af, IPPROTO_UDP,
+ ip_vs_conn_fill_param(net, af, IPPROTO_UDP,
&iph->daddr, htons(PORT_ISAKMP),
&iph->saddr, htons(PORT_ISAKMP), p);
}
static struct ip_vs_conn *
-ah_esp_conn_in_get(int af, const struct sk_buff *skb, struct ip_vs_protocol *pp,
+ah_esp_conn_in_get(int af, const struct sk_buff *skb,
const struct ip_vs_iphdr *iph, unsigned int proto_off,
int inverse)
{
struct ip_vs_conn *cp;
struct ip_vs_conn_param p;
+ struct net *net = skb_net(skb);
- ah_esp_conn_fill_param_proto(af, iph, inverse, &p);
+ ah_esp_conn_fill_param_proto(net, af, iph, inverse, &p);
cp = ip_vs_conn_in_get(&p);
if (!cp) {
/*
@@ -72,7 +74,7 @@ ah_esp_conn_in_get(int af, const struct sk_buff *skb, struct ip_vs_protocol *pp,
IP_VS_DBG_BUF(12, "Unknown ISAKMP entry for outin packet "
"%s%s %s->%s\n",
inverse ? "ICMP+" : "",
- pp->name,
+ ip_vs_proto_get(iph->protocol)->name,
IP_VS_DBG_ADDR(af, &iph->saddr),
IP_VS_DBG_ADDR(af, &iph->daddr));
}
@@ -83,21 +85,21 @@ ah_esp_conn_in_get(int af, const struct sk_buff *skb, struct ip_vs_protocol *pp,
static struct ip_vs_conn *
ah_esp_conn_out_get(int af, const struct sk_buff *skb,
- struct ip_vs_protocol *pp,
const struct ip_vs_iphdr *iph,
unsigned int proto_off,
int inverse)
{
struct ip_vs_conn *cp;
struct ip_vs_conn_param p;
+ struct net *net = skb_net(skb);
- ah_esp_conn_fill_param_proto(af, iph, inverse, &p);
+ ah_esp_conn_fill_param_proto(net, af, iph, inverse, &p);
cp = ip_vs_conn_out_get(&p);
if (!cp) {
IP_VS_DBG_BUF(12, "Unknown ISAKMP entry for inout packet "
"%s%s %s->%s\n",
inverse ? "ICMP+" : "",
- pp->name,
+ ip_vs_proto_get(iph->protocol)->name,
IP_VS_DBG_ADDR(af, &iph->saddr),
IP_VS_DBG_ADDR(af, &iph->daddr));
}
@@ -107,7 +109,7 @@ ah_esp_conn_out_get(int af, const struct sk_buff *skb,
static int
-ah_esp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp,
+ah_esp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd,
int *verdict, struct ip_vs_conn **cpp)
{
/*
@@ -117,26 +119,14 @@ ah_esp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp,
return 0;
}
-static void ah_esp_init(struct ip_vs_protocol *pp)
-{
- /* nothing to do now */
-}
-
-
-static void ah_esp_exit(struct ip_vs_protocol *pp)
-{
- /* nothing to do now */
-}
-
-
#ifdef CONFIG_IP_VS_PROTO_AH
struct ip_vs_protocol ip_vs_protocol_ah = {
.name = "AH",
.protocol = IPPROTO_AH,
.num_states = 1,
.dont_defrag = 1,
- .init = ah_esp_init,
- .exit = ah_esp_exit,
+ .init = NULL,
+ .exit = NULL,
.conn_schedule = ah_esp_conn_schedule,
.conn_in_get = ah_esp_conn_in_get,
.conn_out_get = ah_esp_conn_out_get,
@@ -149,7 +139,6 @@ struct ip_vs_protocol ip_vs_protocol_ah = {
.app_conn_bind = NULL,
.debug_packet = ip_vs_tcpudp_debug_packet,
.timeout_change = NULL, /* ISAKMP */
- .set_state_timeout = NULL,
};
#endif
@@ -159,8 +148,8 @@ struct ip_vs_protocol ip_vs_protocol_esp = {
.protocol = IPPROTO_ESP,
.num_states = 1,
.dont_defrag = 1,
- .init = ah_esp_init,
- .exit = ah_esp_exit,
+ .init = NULL,
+ .exit = NULL,
.conn_schedule = ah_esp_conn_schedule,
.conn_in_get = ah_esp_conn_in_get,
.conn_out_get = ah_esp_conn_out_get,
diff --git a/net/netfilter/ipvs/ip_vs_proto_sctp.c b/net/netfilter/ipvs/ip_vs_proto_sctp.c
index 1ea96bcd342..b027ccc49f4 100644
--- a/net/netfilter/ipvs/ip_vs_proto_sctp.c
+++ b/net/netfilter/ipvs/ip_vs_proto_sctp.c
@@ -9,9 +9,10 @@
#include <net/ip_vs.h>
static int
-sctp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp,
+sctp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd,
int *verdict, struct ip_vs_conn **cpp)
{
+ struct net *net;
struct ip_vs_service *svc;
sctp_chunkhdr_t _schunkh, *sch;
sctp_sctphdr_t *sh, _sctph;
@@ -27,13 +28,13 @@ sctp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp,
sizeof(_schunkh), &_schunkh);
if (sch == NULL)
return 0;
-
+ net = skb_net(skb);
if ((sch->type == SCTP_CID_INIT) &&
- (svc = ip_vs_service_get(af, skb->mark, iph.protocol,
+ (svc = ip_vs_service_get(net, af, skb->mark, iph.protocol,
&iph.daddr, sh->dest))) {
int ignored;
- if (ip_vs_todrop()) {
+ if (ip_vs_todrop(net_ipvs(net))) {
/*
* It seems that we are very loaded.
* We have to drop this packet :(
@@ -46,14 +47,19 @@ sctp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp,
* Let the virtual server select a real server for the
* incoming connection, and create a connection entry.
*/
- *cpp = ip_vs_schedule(svc, skb, pp, &ignored);
- if (!*cpp && !ignored) {
- *verdict = ip_vs_leave(svc, skb, pp);
+ *cpp = ip_vs_schedule(svc, skb, pd, &ignored);
+ if (!*cpp && ignored <= 0) {
+ if (!ignored)
+ *verdict = ip_vs_leave(svc, skb, pd);
+ else {
+ ip_vs_service_put(svc);
+ *verdict = NF_DROP;
+ }
return 0;
}
ip_vs_service_put(svc);
}
-
+ /* NF_ACCEPT */
return 1;
}
@@ -856,7 +862,7 @@ static struct ipvs_sctp_nextstate
/*
* Timeout table[state]
*/
-static int sctp_timeouts[IP_VS_SCTP_S_LAST + 1] = {
+static const int sctp_timeouts[IP_VS_SCTP_S_LAST + 1] = {
[IP_VS_SCTP_S_NONE] = 2 * HZ,
[IP_VS_SCTP_S_INIT_CLI] = 1 * 60 * HZ,
[IP_VS_SCTP_S_INIT_SER] = 1 * 60 * HZ,
@@ -900,20 +906,8 @@ static const char *sctp_state_name(int state)
return "?";
}
-static void sctp_timeout_change(struct ip_vs_protocol *pp, int flags)
-{
-}
-
-static int
-sctp_set_state_timeout(struct ip_vs_protocol *pp, char *sname, int to)
-{
-
-return ip_vs_set_state_timeout(pp->timeout_table, IP_VS_SCTP_S_LAST,
- sctp_state_name_table, sname, to);
-}
-
static inline int
-set_sctp_state(struct ip_vs_protocol *pp, struct ip_vs_conn *cp,
+set_sctp_state(struct ip_vs_proto_data *pd, struct ip_vs_conn *cp,
int direction, const struct sk_buff *skb)
{
sctp_chunkhdr_t _sctpch, *sch;
@@ -971,7 +965,7 @@ set_sctp_state(struct ip_vs_protocol *pp, struct ip_vs_conn *cp,
IP_VS_DBG_BUF(8, "%s %s %s:%d->"
"%s:%d state: %s->%s conn->refcnt:%d\n",
- pp->name,
+ pd->pp->name,
((direction == IP_VS_DIR_OUTPUT) ?
"output " : "input "),
IP_VS_DBG_ADDR(cp->af, &cp->daddr),
@@ -995,75 +989,73 @@ set_sctp_state(struct ip_vs_protocol *pp, struct ip_vs_conn *cp,
}
}
}
+ if (likely(pd))
+ cp->timeout = pd->timeout_table[cp->state = next_state];
+ else /* What to do ? */
+ cp->timeout = sctp_timeouts[cp->state = next_state];
- cp->timeout = pp->timeout_table[cp->state = next_state];
-
- return 1;
+ return 1;
}
static int
sctp_state_transition(struct ip_vs_conn *cp, int direction,
- const struct sk_buff *skb, struct ip_vs_protocol *pp)
+ const struct sk_buff *skb, struct ip_vs_proto_data *pd)
{
int ret = 0;
spin_lock(&cp->lock);
- ret = set_sctp_state(pp, cp, direction, skb);
+ ret = set_sctp_state(pd, cp, direction, skb);
spin_unlock(&cp->lock);
return ret;
}
-/*
- * Hash table for SCTP application incarnations
- */
-#define SCTP_APP_TAB_BITS 4
-#define SCTP_APP_TAB_SIZE (1 << SCTP_APP_TAB_BITS)
-#define SCTP_APP_TAB_MASK (SCTP_APP_TAB_SIZE - 1)
-
-static struct list_head sctp_apps[SCTP_APP_TAB_SIZE];
-static DEFINE_SPINLOCK(sctp_app_lock);
-
static inline __u16 sctp_app_hashkey(__be16 port)
{
return (((__force u16)port >> SCTP_APP_TAB_BITS) ^ (__force u16)port)
& SCTP_APP_TAB_MASK;
}
-static int sctp_register_app(struct ip_vs_app *inc)
+static int sctp_register_app(struct net *net, struct ip_vs_app *inc)
{
struct ip_vs_app *i;
__u16 hash;
__be16 port = inc->port;
int ret = 0;
+ struct netns_ipvs *ipvs = net_ipvs(net);
+ struct ip_vs_proto_data *pd = ip_vs_proto_data_get(net, IPPROTO_SCTP);
hash = sctp_app_hashkey(port);
- spin_lock_bh(&sctp_app_lock);
- list_for_each_entry(i, &sctp_apps[hash], p_list) {
+ spin_lock_bh(&ipvs->sctp_app_lock);
+ list_for_each_entry(i, &ipvs->sctp_apps[hash], p_list) {
if (i->port == port) {
ret = -EEXIST;
goto out;
}
}
- list_add(&inc->p_list, &sctp_apps[hash]);
- atomic_inc(&ip_vs_protocol_sctp.appcnt);
+ list_add(&inc->p_list, &ipvs->sctp_apps[hash]);
+ atomic_inc(&pd->appcnt);
out:
- spin_unlock_bh(&sctp_app_lock);
+ spin_unlock_bh(&ipvs->sctp_app_lock);
return ret;
}
-static void sctp_unregister_app(struct ip_vs_app *inc)
+static void sctp_unregister_app(struct net *net, struct ip_vs_app *inc)
{
- spin_lock_bh(&sctp_app_lock);
- atomic_dec(&ip_vs_protocol_sctp.appcnt);
+ struct netns_ipvs *ipvs = net_ipvs(net);
+ struct ip_vs_proto_data *pd = ip_vs_proto_data_get(net, IPPROTO_SCTP);
+
+ spin_lock_bh(&ipvs->sctp_app_lock);
+ atomic_dec(&pd->appcnt);
list_del(&inc->p_list);
- spin_unlock_bh(&sctp_app_lock);
+ spin_unlock_bh(&ipvs->sctp_app_lock);
}
static int sctp_app_conn_bind(struct ip_vs_conn *cp)
{
+ struct netns_ipvs *ipvs = net_ipvs(ip_vs_conn_net(cp));
int hash;
struct ip_vs_app *inc;
int result = 0;
@@ -1074,12 +1066,12 @@ static int sctp_app_conn_bind(struct ip_vs_conn *cp)
/* Lookup application incarnations and bind the right one */
hash = sctp_app_hashkey(cp->vport);
- spin_lock(&sctp_app_lock);
- list_for_each_entry(inc, &sctp_apps[hash], p_list) {
+ spin_lock(&ipvs->sctp_app_lock);
+ list_for_each_entry(inc, &ipvs->sctp_apps[hash], p_list) {
if (inc->port == cp->vport) {
if (unlikely(!ip_vs_app_inc_get(inc)))
break;
- spin_unlock(&sctp_app_lock);
+ spin_unlock(&ipvs->sctp_app_lock);
IP_VS_DBG_BUF(9, "%s: Binding conn %s:%u->"
"%s:%u to app %s on port %u\n",
@@ -1095,43 +1087,50 @@ static int sctp_app_conn_bind(struct ip_vs_conn *cp)
goto out;
}
}
- spin_unlock(&sctp_app_lock);
+ spin_unlock(&ipvs->sctp_app_lock);
out:
return result;
}
-static void ip_vs_sctp_init(struct ip_vs_protocol *pp)
+/* ---------------------------------------------
+ * timeouts is netns related now.
+ * ---------------------------------------------
+ */
+static void __ip_vs_sctp_init(struct net *net, struct ip_vs_proto_data *pd)
{
- IP_VS_INIT_HASH_TABLE(sctp_apps);
- pp->timeout_table = sctp_timeouts;
-}
+ struct netns_ipvs *ipvs = net_ipvs(net);
+ ip_vs_init_hash_table(ipvs->sctp_apps, SCTP_APP_TAB_SIZE);
+ spin_lock_init(&ipvs->sctp_app_lock);
+ pd->timeout_table = ip_vs_create_timeout_table((int *)sctp_timeouts,
+ sizeof(sctp_timeouts));
+}
-static void ip_vs_sctp_exit(struct ip_vs_protocol *pp)
+static void __ip_vs_sctp_exit(struct net *net, struct ip_vs_proto_data *pd)
{
-
+ kfree(pd->timeout_table);
}
struct ip_vs_protocol ip_vs_protocol_sctp = {
- .name = "SCTP",
- .protocol = IPPROTO_SCTP,
- .num_states = IP_VS_SCTP_S_LAST,
- .dont_defrag = 0,
- .appcnt = ATOMIC_INIT(0),
- .init = ip_vs_sctp_init,
- .exit = ip_vs_sctp_exit,
- .register_app = sctp_register_app,
+ .name = "SCTP",
+ .protocol = IPPROTO_SCTP,
+ .num_states = IP_VS_SCTP_S_LAST,
+ .dont_defrag = 0,
+ .init = NULL,
+ .exit = NULL,
+ .init_netns = __ip_vs_sctp_init,
+ .exit_netns = __ip_vs_sctp_exit,
+ .register_app = sctp_register_app,
.unregister_app = sctp_unregister_app,
- .conn_schedule = sctp_conn_schedule,
- .conn_in_get = ip_vs_conn_in_get_proto,
- .conn_out_get = ip_vs_conn_out_get_proto,
- .snat_handler = sctp_snat_handler,
- .dnat_handler = sctp_dnat_handler,
- .csum_check = sctp_csum_check,
- .state_name = sctp_state_name,
+ .conn_schedule = sctp_conn_schedule,
+ .conn_in_get = ip_vs_conn_in_get_proto,
+ .conn_out_get = ip_vs_conn_out_get_proto,
+ .snat_handler = sctp_snat_handler,
+ .dnat_handler = sctp_dnat_handler,
+ .csum_check = sctp_csum_check,
+ .state_name = sctp_state_name,
.state_transition = sctp_state_transition,
- .app_conn_bind = sctp_app_conn_bind,
- .debug_packet = ip_vs_tcpudp_debug_packet,
- .timeout_change = sctp_timeout_change,
- .set_state_timeout = sctp_set_state_timeout,
+ .app_conn_bind = sctp_app_conn_bind,
+ .debug_packet = ip_vs_tcpudp_debug_packet,
+ .timeout_change = NULL,
};
diff --git a/net/netfilter/ipvs/ip_vs_proto_tcp.c b/net/netfilter/ipvs/ip_vs_proto_tcp.c
index f6c5200e214..c0cc341b840 100644
--- a/net/netfilter/ipvs/ip_vs_proto_tcp.c
+++ b/net/netfilter/ipvs/ip_vs_proto_tcp.c
@@ -9,8 +9,12 @@
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*
- * Changes:
+ * Changes: Hans Schillstrom <hans.schillstrom@ericsson.com>
*
+ * Network name space (netns) aware.
+ * Global data moved to netns i.e struct netns_ipvs
+ * tcp_timeouts table has copy per netns in a hash table per
+ * protocol ip_vs_proto_data and is handled by netns
*/
#define KMSG_COMPONENT "IPVS"
@@ -28,9 +32,10 @@
#include <net/ip_vs.h>
static int
-tcp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp,
+tcp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd,
int *verdict, struct ip_vs_conn **cpp)
{
+ struct net *net;
struct ip_vs_service *svc;
struct tcphdr _tcph, *th;
struct ip_vs_iphdr iph;
@@ -42,14 +47,14 @@ tcp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp,
*verdict = NF_DROP;
return 0;
}
-
+ net = skb_net(skb);
/* No !th->ack check to allow scheduling on SYN+ACK for Active FTP */
if (th->syn &&
- (svc = ip_vs_service_get(af, skb->mark, iph.protocol, &iph.daddr,
- th->dest))) {
+ (svc = ip_vs_service_get(net, af, skb->mark, iph.protocol,
+ &iph.daddr, th->dest))) {
int ignored;
- if (ip_vs_todrop()) {
+ if (ip_vs_todrop(net_ipvs(net))) {
/*
* It seems that we are very loaded.
* We have to drop this packet :(
@@ -63,13 +68,19 @@ tcp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp,
* Let the virtual server select a real server for the
* incoming connection, and create a connection entry.
*/
- *cpp = ip_vs_schedule(svc, skb, pp, &ignored);
- if (!*cpp && !ignored) {
- *verdict = ip_vs_leave(svc, skb, pp);
+ *cpp = ip_vs_schedule(svc, skb, pd, &ignored);
+ if (!*cpp && ignored <= 0) {
+ if (!ignored)
+ *verdict = ip_vs_leave(svc, skb, pd);
+ else {
+ ip_vs_service_put(svc);
+ *verdict = NF_DROP;
+ }
return 0;
}
ip_vs_service_put(svc);
}
+ /* NF_ACCEPT */
return 1;
}
@@ -338,7 +349,7 @@ static const int tcp_state_off[IP_VS_DIR_LAST] = {
/*
* Timeout table[state]
*/
-static int tcp_timeouts[IP_VS_TCP_S_LAST+1] = {
+static const int tcp_timeouts[IP_VS_TCP_S_LAST+1] = {
[IP_VS_TCP_S_NONE] = 2*HZ,
[IP_VS_TCP_S_ESTABLISHED] = 15*60*HZ,
[IP_VS_TCP_S_SYN_SENT] = 2*60*HZ,
@@ -437,10 +448,7 @@ static struct tcp_states_t tcp_states_dos [] = {
/*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sCL }},
};
-static struct tcp_states_t *tcp_state_table = tcp_states;
-
-
-static void tcp_timeout_change(struct ip_vs_protocol *pp, int flags)
+static void tcp_timeout_change(struct ip_vs_proto_data *pd, int flags)
{
int on = (flags & 1); /* secure_tcp */
@@ -450,14 +458,7 @@ static void tcp_timeout_change(struct ip_vs_protocol *pp, int flags)
** for most if not for all of the applications. Something
** like "capabilities" (flags) for each object.
*/
- tcp_state_table = (on? tcp_states_dos : tcp_states);
-}
-
-static int
-tcp_set_state_timeout(struct ip_vs_protocol *pp, char *sname, int to)
-{
- return ip_vs_set_state_timeout(pp->timeout_table, IP_VS_TCP_S_LAST,
- tcp_state_name_table, sname, to);
+ pd->tcp_state_table = (on ? tcp_states_dos : tcp_states);
}
static inline int tcp_state_idx(struct tcphdr *th)
@@ -474,7 +475,7 @@ static inline int tcp_state_idx(struct tcphdr *th)
}
static inline void
-set_tcp_state(struct ip_vs_protocol *pp, struct ip_vs_conn *cp,
+set_tcp_state(struct ip_vs_proto_data *pd, struct ip_vs_conn *cp,
int direction, struct tcphdr *th)
{
int state_idx;
@@ -497,7 +498,8 @@ set_tcp_state(struct ip_vs_protocol *pp, struct ip_vs_conn *cp,
goto tcp_state_out;
}
- new_state = tcp_state_table[state_off+state_idx].next_state[cp->state];
+ new_state =
+ pd->tcp_state_table[state_off+state_idx].next_state[cp->state];
tcp_state_out:
if (new_state != cp->state) {
@@ -505,7 +507,7 @@ set_tcp_state(struct ip_vs_protocol *pp, struct ip_vs_conn *cp,
IP_VS_DBG_BUF(8, "%s %s [%c%c%c%c] %s:%d->"
"%s:%d state: %s->%s conn->refcnt:%d\n",
- pp->name,
+ pd->pp->name,
((state_off == TCP_DIR_OUTPUT) ?
"output " : "input "),
th->syn ? 'S' : '.',
@@ -535,17 +537,19 @@ set_tcp_state(struct ip_vs_protocol *pp, struct ip_vs_conn *cp,
}
}
- cp->timeout = pp->timeout_table[cp->state = new_state];
+ if (likely(pd))
+ cp->timeout = pd->timeout_table[cp->state = new_state];
+ else /* What to do ? */
+ cp->timeout = tcp_timeouts[cp->state = new_state];
}
-
/*
* Handle state transitions
*/
static int
tcp_state_transition(struct ip_vs_conn *cp, int direction,
const struct sk_buff *skb,
- struct ip_vs_protocol *pp)
+ struct ip_vs_proto_data *pd)
{
struct tcphdr _tcph, *th;
@@ -560,23 +564,12 @@ tcp_state_transition(struct ip_vs_conn *cp, int direction,
return 0;
spin_lock(&cp->lock);
- set_tcp_state(pp, cp, direction, th);
+ set_tcp_state(pd, cp, direction, th);
spin_unlock(&cp->lock);
return 1;
}
-
-/*
- * Hash table for TCP application incarnations
- */
-#define TCP_APP_TAB_BITS 4
-#define TCP_APP_TAB_SIZE (1 << TCP_APP_TAB_BITS)
-#define TCP_APP_TAB_MASK (TCP_APP_TAB_SIZE - 1)
-
-static struct list_head tcp_apps[TCP_APP_TAB_SIZE];
-static DEFINE_SPINLOCK(tcp_app_lock);
-
static inline __u16 tcp_app_hashkey(__be16 port)
{
return (((__force u16)port >> TCP_APP_TAB_BITS) ^ (__force u16)port)
@@ -584,44 +577,50 @@ static inline __u16 tcp_app_hashkey(__be16 port)
}
-static int tcp_register_app(struct ip_vs_app *inc)
+static int tcp_register_app(struct net *net, struct ip_vs_app *inc)
{
struct ip_vs_app *i;
__u16 hash;
__be16 port = inc->port;
int ret = 0;
+ struct netns_ipvs *ipvs = net_ipvs(net);
+ struct ip_vs_proto_data *pd = ip_vs_proto_data_get(net, IPPROTO_TCP);
hash = tcp_app_hashkey(port);
- spin_lock_bh(&tcp_app_lock);
- list_for_each_entry(i, &tcp_apps[hash], p_list) {
+ spin_lock_bh(&ipvs->tcp_app_lock);
+ list_for_each_entry(i, &ipvs->tcp_apps[hash], p_list) {
if (i->port == port) {
ret = -EEXIST;
goto out;
}
}
- list_add(&inc->p_list, &tcp_apps[hash]);
- atomic_inc(&ip_vs_protocol_tcp.appcnt);
+ list_add(&inc->p_list, &ipvs->tcp_apps[hash]);
+ atomic_inc(&pd->appcnt);
out:
- spin_unlock_bh(&tcp_app_lock);
+ spin_unlock_bh(&ipvs->tcp_app_lock);
return ret;
}
static void
-tcp_unregister_app(struct ip_vs_app *inc)
+tcp_unregister_app(struct net *net, struct ip_vs_app *inc)
{
- spin_lock_bh(&tcp_app_lock);
- atomic_dec(&ip_vs_protocol_tcp.appcnt);
+ struct netns_ipvs *ipvs = net_ipvs(net);
+ struct ip_vs_proto_data *pd = ip_vs_proto_data_get(net, IPPROTO_TCP);
+
+ spin_lock_bh(&ipvs->tcp_app_lock);
+ atomic_dec(&pd->appcnt);
list_del(&inc->p_list);
- spin_unlock_bh(&tcp_app_lock);
+ spin_unlock_bh(&ipvs->tcp_app_lock);
}
static int
tcp_app_conn_bind(struct ip_vs_conn *cp)
{
+ struct netns_ipvs *ipvs = net_ipvs(ip_vs_conn_net(cp));
int hash;
struct ip_vs_app *inc;
int result = 0;
@@ -633,12 +632,12 @@ tcp_app_conn_bind(struct ip_vs_conn *cp)
/* Lookup application incarnations and bind the right one */
hash = tcp_app_hashkey(cp->vport);
- spin_lock(&tcp_app_lock);
- list_for_each_entry(inc, &tcp_apps[hash], p_list) {
+ spin_lock(&ipvs->tcp_app_lock);
+ list_for_each_entry(inc, &ipvs->tcp_apps[hash], p_list) {
if (inc->port == cp->vport) {
if (unlikely(!ip_vs_app_inc_get(inc)))
break;
- spin_unlock(&tcp_app_lock);
+ spin_unlock(&ipvs->tcp_app_lock);
IP_VS_DBG_BUF(9, "%s(): Binding conn %s:%u->"
"%s:%u to app %s on port %u\n",
@@ -655,7 +654,7 @@ tcp_app_conn_bind(struct ip_vs_conn *cp)
goto out;
}
}
- spin_unlock(&tcp_app_lock);
+ spin_unlock(&ipvs->tcp_app_lock);
out:
return result;
@@ -665,24 +664,35 @@ tcp_app_conn_bind(struct ip_vs_conn *cp)
/*
* Set LISTEN timeout. (ip_vs_conn_put will setup timer)
*/
-void ip_vs_tcp_conn_listen(struct ip_vs_conn *cp)
+void ip_vs_tcp_conn_listen(struct net *net, struct ip_vs_conn *cp)
{
+ struct ip_vs_proto_data *pd = ip_vs_proto_data_get(net, IPPROTO_TCP);
+
spin_lock(&cp->lock);
cp->state = IP_VS_TCP_S_LISTEN;
- cp->timeout = ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_LISTEN];
+ cp->timeout = (pd ? pd->timeout_table[IP_VS_TCP_S_LISTEN]
+ : tcp_timeouts[IP_VS_TCP_S_LISTEN]);
spin_unlock(&cp->lock);
}
-
-static void ip_vs_tcp_init(struct ip_vs_protocol *pp)
+/* ---------------------------------------------
+ * timeouts is netns related now.
+ * ---------------------------------------------
+ */
+static void __ip_vs_tcp_init(struct net *net, struct ip_vs_proto_data *pd)
{
- IP_VS_INIT_HASH_TABLE(tcp_apps);
- pp->timeout_table = tcp_timeouts;
-}
+ struct netns_ipvs *ipvs = net_ipvs(net);
+ ip_vs_init_hash_table(ipvs->tcp_apps, TCP_APP_TAB_SIZE);
+ spin_lock_init(&ipvs->tcp_app_lock);
+ pd->timeout_table = ip_vs_create_timeout_table((int *)tcp_timeouts,
+ sizeof(tcp_timeouts));
+ pd->tcp_state_table = tcp_states;
+}
-static void ip_vs_tcp_exit(struct ip_vs_protocol *pp)
+static void __ip_vs_tcp_exit(struct net *net, struct ip_vs_proto_data *pd)
{
+ kfree(pd->timeout_table);
}
@@ -691,9 +701,10 @@ struct ip_vs_protocol ip_vs_protocol_tcp = {
.protocol = IPPROTO_TCP,
.num_states = IP_VS_TCP_S_LAST,
.dont_defrag = 0,
- .appcnt = ATOMIC_INIT(0),
- .init = ip_vs_tcp_init,
- .exit = ip_vs_tcp_exit,
+ .init = NULL,
+ .exit = NULL,
+ .init_netns = __ip_vs_tcp_init,
+ .exit_netns = __ip_vs_tcp_exit,
.register_app = tcp_register_app,
.unregister_app = tcp_unregister_app,
.conn_schedule = tcp_conn_schedule,
@@ -707,5 +718,4 @@ struct ip_vs_protocol ip_vs_protocol_tcp = {
.app_conn_bind = tcp_app_conn_bind,
.debug_packet = ip_vs_tcpudp_debug_packet,
.timeout_change = tcp_timeout_change,
- .set_state_timeout = tcp_set_state_timeout,
};
diff --git a/net/netfilter/ipvs/ip_vs_proto_udp.c b/net/netfilter/ipvs/ip_vs_proto_udp.c
index 9d106a06bb0..f1282cbe6fe 100644
--- a/net/netfilter/ipvs/ip_vs_proto_udp.c
+++ b/net/netfilter/ipvs/ip_vs_proto_udp.c
@@ -9,7 +9,8 @@
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*
- * Changes:
+ * Changes: Hans Schillstrom <hans.schillstrom@ericsson.com>
+ * Network name space (netns) aware.
*
*/
@@ -28,9 +29,10 @@
#include <net/ip6_checksum.h>
static int
-udp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp,
+udp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd,
int *verdict, struct ip_vs_conn **cpp)
{
+ struct net *net;
struct ip_vs_service *svc;
struct udphdr _udph, *uh;
struct ip_vs_iphdr iph;
@@ -42,13 +44,13 @@ udp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp,
*verdict = NF_DROP;
return 0;
}
-
- svc = ip_vs_service_get(af, skb->mark, iph.protocol,
+ net = skb_net(skb);
+ svc = ip_vs_service_get(net, af, skb->mark, iph.protocol,
&iph.daddr, uh->dest);
if (svc) {
int ignored;
- if (ip_vs_todrop()) {
+ if (ip_vs_todrop(net_ipvs(net))) {
/*
* It seems that we are very loaded.
* We have to drop this packet :(
@@ -62,13 +64,19 @@ udp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp,
* Let the virtual server select a real server for the
* incoming connection, and create a connection entry.
*/
- *cpp = ip_vs_schedule(svc, skb, pp, &ignored);
- if (!*cpp && !ignored) {
- *verdict = ip_vs_leave(svc, skb, pp);
+ *cpp = ip_vs_schedule(svc, skb, pd, &ignored);
+ if (!*cpp && ignored <= 0) {
+ if (!ignored)
+ *verdict = ip_vs_leave(svc, skb, pd);
+ else {
+ ip_vs_service_put(svc);
+ *verdict = NF_DROP;
+ }
return 0;
}
ip_vs_service_put(svc);
}
+ /* NF_ACCEPT */
return 1;
}
@@ -338,19 +346,6 @@ udp_csum_check(int af, struct sk_buff *skb, struct ip_vs_protocol *pp)
return 1;
}
-
-/*
- * Note: the caller guarantees that only one of register_app,
- * unregister_app or app_conn_bind is called each time.
- */
-
-#define UDP_APP_TAB_BITS 4
-#define UDP_APP_TAB_SIZE (1 << UDP_APP_TAB_BITS)
-#define UDP_APP_TAB_MASK (UDP_APP_TAB_SIZE - 1)
-
-static struct list_head udp_apps[UDP_APP_TAB_SIZE];
-static DEFINE_SPINLOCK(udp_app_lock);
-
static inline __u16 udp_app_hashkey(__be16 port)
{
return (((__force u16)port >> UDP_APP_TAB_BITS) ^ (__force u16)port)
@@ -358,44 +353,50 @@ static inline __u16 udp_app_hashkey(__be16 port)
}
-static int udp_register_app(struct ip_vs_app *inc)
+static int udp_register_app(struct net *net, struct ip_vs_app *inc)
{
struct ip_vs_app *i;
__u16 hash;
__be16 port = inc->port;
int ret = 0;
+ struct netns_ipvs *ipvs = net_ipvs(net);
+ struct ip_vs_proto_data *pd = ip_vs_proto_data_get(net, IPPROTO_UDP);
hash = udp_app_hashkey(port);
- spin_lock_bh(&udp_app_lock);
- list_for_each_entry(i, &udp_apps[hash], p_list) {
+ spin_lock_bh(&ipvs->udp_app_lock);
+ list_for_each_entry(i, &ipvs->udp_apps[hash], p_list) {
if (i->port == port) {
ret = -EEXIST;
goto out;
}
}
- list_add(&inc->p_list, &udp_apps[hash]);
- atomic_inc(&ip_vs_protocol_udp.appcnt);
+ list_add(&inc->p_list, &ipvs->udp_apps[hash]);
+ atomic_inc(&pd->appcnt);
out:
- spin_unlock_bh(&udp_app_lock);
+ spin_unlock_bh(&ipvs->udp_app_lock);
return ret;
}
static void
-udp_unregister_app(struct ip_vs_app *inc)
+udp_unregister_app(struct net *net, struct ip_vs_app *inc)
{
- spin_lock_bh(&udp_app_lock);
- atomic_dec(&ip_vs_protocol_udp.appcnt);
+ struct ip_vs_proto_data *pd = ip_vs_proto_data_get(net, IPPROTO_UDP);
+ struct netns_ipvs *ipvs = net_ipvs(net);
+
+ spin_lock_bh(&ipvs->udp_app_lock);
+ atomic_dec(&pd->appcnt);
list_del(&inc->p_list);
- spin_unlock_bh(&udp_app_lock);
+ spin_unlock_bh(&ipvs->udp_app_lock);
}
static int udp_app_conn_bind(struct ip_vs_conn *cp)
{
+ struct netns_ipvs *ipvs = net_ipvs(ip_vs_conn_net(cp));
int hash;
struct ip_vs_app *inc;
int result = 0;
@@ -407,12 +408,12 @@ static int udp_app_conn_bind(struct ip_vs_conn *cp)
/* Lookup application incarnations and bind the right one */
hash = udp_app_hashkey(cp->vport);
- spin_lock(&udp_app_lock);
- list_for_each_entry(inc, &udp_apps[hash], p_list) {
+ spin_lock(&ipvs->udp_app_lock);
+ list_for_each_entry(inc, &ipvs->udp_apps[hash], p_list) {
if (inc->port == cp->vport) {
if (unlikely(!ip_vs_app_inc_get(inc)))
break;
- spin_unlock(&udp_app_lock);
+ spin_unlock(&ipvs->udp_app_lock);
IP_VS_DBG_BUF(9, "%s(): Binding conn %s:%u->"
"%s:%u to app %s on port %u\n",
@@ -429,14 +430,14 @@ static int udp_app_conn_bind(struct ip_vs_conn *cp)
goto out;
}
}
- spin_unlock(&udp_app_lock);
+ spin_unlock(&ipvs->udp_app_lock);
out:
return result;
}
-static int udp_timeouts[IP_VS_UDP_S_LAST+1] = {
+static const int udp_timeouts[IP_VS_UDP_S_LAST+1] = {
[IP_VS_UDP_S_NORMAL] = 5*60*HZ,
[IP_VS_UDP_S_LAST] = 2*HZ,
};
@@ -446,14 +447,6 @@ static const char *const udp_state_name_table[IP_VS_UDP_S_LAST+1] = {
[IP_VS_UDP_S_LAST] = "BUG!",
};
-
-static int
-udp_set_state_timeout(struct ip_vs_protocol *pp, char *sname, int to)
-{
- return ip_vs_set_state_timeout(pp->timeout_table, IP_VS_UDP_S_LAST,
- udp_state_name_table, sname, to);
-}
-
static const char * udp_state_name(int state)
{
if (state >= IP_VS_UDP_S_LAST)
@@ -464,20 +457,30 @@ static const char * udp_state_name(int state)
static int
udp_state_transition(struct ip_vs_conn *cp, int direction,
const struct sk_buff *skb,
- struct ip_vs_protocol *pp)
+ struct ip_vs_proto_data *pd)
{
- cp->timeout = pp->timeout_table[IP_VS_UDP_S_NORMAL];
+ if (unlikely(!pd)) {
+ pr_err("UDP no ns data\n");
+ return 0;
+ }
+
+ cp->timeout = pd->timeout_table[IP_VS_UDP_S_NORMAL];
return 1;
}
-static void udp_init(struct ip_vs_protocol *pp)
+static void __udp_init(struct net *net, struct ip_vs_proto_data *pd)
{
- IP_VS_INIT_HASH_TABLE(udp_apps);
- pp->timeout_table = udp_timeouts;
+ struct netns_ipvs *ipvs = net_ipvs(net);
+
+ ip_vs_init_hash_table(ipvs->udp_apps, UDP_APP_TAB_SIZE);
+ spin_lock_init(&ipvs->udp_app_lock);
+ pd->timeout_table = ip_vs_create_timeout_table((int *)udp_timeouts,
+ sizeof(udp_timeouts));
}
-static void udp_exit(struct ip_vs_protocol *pp)
+static void __udp_exit(struct net *net, struct ip_vs_proto_data *pd)
{
+ kfree(pd->timeout_table);
}
@@ -486,8 +489,10 @@ struct ip_vs_protocol ip_vs_protocol_udp = {
.protocol = IPPROTO_UDP,
.num_states = IP_VS_UDP_S_LAST,
.dont_defrag = 0,
- .init = udp_init,
- .exit = udp_exit,
+ .init = NULL,
+ .exit = NULL,
+ .init_netns = __udp_init,
+ .exit_netns = __udp_exit,
.conn_schedule = udp_conn_schedule,
.conn_in_get = ip_vs_conn_in_get_proto,
.conn_out_get = ip_vs_conn_out_get_proto,
@@ -501,5 +506,4 @@ struct ip_vs_protocol ip_vs_protocol_udp = {
.app_conn_bind = udp_app_conn_bind,
.debug_packet = ip_vs_tcpudp_debug_packet,
.timeout_change = NULL,
- .set_state_timeout = udp_set_state_timeout,
};
diff --git a/net/netfilter/ipvs/ip_vs_sync.c b/net/netfilter/ipvs/ip_vs_sync.c
index ab85aedea17..d1b7298e589 100644
--- a/net/netfilter/ipvs/ip_vs_sync.c
+++ b/net/netfilter/ipvs/ip_vs_sync.c
@@ -5,6 +5,18 @@
* high-performance and highly available server based on a
* cluster of servers.
*
+ * Version 1, is capable of handling both version 0 and 1 messages.
+ * Version 0 is the plain old format.
+ * Note Version 0 receivers will just drop Ver 1 messages.
+ * Version 1 is capable of handle IPv6, Persistence data,
+ * time-outs, and firewall marks.
+ * In ver.1 "ip_vs_sync_conn_options" will be sent in netw. order.
+ * Ver. 0 can be turned on by sysctl -w net.ipv4.vs.sync_version=0
+ *
+ * Definitions Message: is a complete datagram
+ * Sync_conn: is a part of a Message
+ * Param Data is an option to a Sync_conn.
+ *
* Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
*
* ip_vs_sync: sync connection info from master load balancer to backups
@@ -15,6 +27,8 @@
* Alexandre Cassen : Added SyncID support for incoming sync
* messages filtering.
* Justin Ossevoort : Fix endian problem on sync message size.
+ * Hans Schillstrom : Added Version 1: i.e. IPv6,
+ * Persistence support, fwmark and time-out.
*/
#define KMSG_COMPONENT "IPVS"
@@ -35,6 +49,8 @@
#include <linux/wait.h>
#include <linux/kernel.h>
+#include <asm/unaligned.h> /* Used for ntoh_seq and hton_seq */
+
#include <net/ip.h>
#include <net/sock.h>
@@ -43,11 +59,13 @@
#define IP_VS_SYNC_GROUP 0xe0000051 /* multicast addr - 224.0.0.81 */
#define IP_VS_SYNC_PORT 8848 /* multicast port */
+#define SYNC_PROTO_VER 1 /* Protocol version in header */
/*
* IPVS sync connection entry
+ * Version 0, i.e. original version.
*/
-struct ip_vs_sync_conn {
+struct ip_vs_sync_conn_v0 {
__u8 reserved;
/* Protocol, addresses and port numbers */
@@ -71,41 +89,159 @@ struct ip_vs_sync_conn_options {
struct ip_vs_seq out_seq; /* outgoing seq. struct */
};
+/*
+ Sync Connection format (sync_conn)
+
+ 0 1 2 3
+ 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ | Type | Protocol | Ver. | Size |
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ | Flags |
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ | State | cport |
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ | vport | dport |
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ | fwmark |
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ | timeout (in sec.) |
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ | ... |
+ | IP-Addresses (v4 or v6) |
+ | ... |
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ Optional Parameters.
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ | Param. Type | Param. Length | Param. data |
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ |
+ | ... |
+ | +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ | | Param Type | Param. Length |
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ | Param data |
+ | Last Param data should be padded for 32 bit alignment |
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+*/
+
+/*
+ * Type 0, IPv4 sync connection format
+ */
+struct ip_vs_sync_v4 {
+ __u8 type;
+ __u8 protocol; /* Which protocol (TCP/UDP) */
+ __be16 ver_size; /* Version msb 4 bits */
+ /* Flags and state transition */
+ __be32 flags; /* status flags */
+ __be16 state; /* state info */
+ /* Protocol, addresses and port numbers */
+ __be16 cport;
+ __be16 vport;
+ __be16 dport;
+ __be32 fwmark; /* Firewall mark from skb */
+ __be32 timeout; /* cp timeout */
+ __be32 caddr; /* client address */
+ __be32 vaddr; /* virtual address */
+ __be32 daddr; /* destination address */
+ /* The sequence options start here */
+ /* PE data padded to 32bit alignment after seq. options */
+};
+/*
+ * Type 2 messages IPv6
+ */
+struct ip_vs_sync_v6 {
+ __u8 type;
+ __u8 protocol; /* Which protocol (TCP/UDP) */
+ __be16 ver_size; /* Version msb 4 bits */
+ /* Flags and state transition */
+ __be32 flags; /* status flags */
+ __be16 state; /* state info */
+ /* Protocol, addresses and port numbers */
+ __be16 cport;
+ __be16 vport;
+ __be16 dport;
+ __be32 fwmark; /* Firewall mark from skb */
+ __be32 timeout; /* cp timeout */
+ struct in6_addr caddr; /* client address */
+ struct in6_addr vaddr; /* virtual address */
+ struct in6_addr daddr; /* destination address */
+ /* The sequence options start here */
+ /* PE data padded to 32bit alignment after seq. options */
+};
+
+union ip_vs_sync_conn {
+ struct ip_vs_sync_v4 v4;
+ struct ip_vs_sync_v6 v6;
+};
+
+/* Bits in Type field in above */
+#define STYPE_INET6 0
+#define STYPE_F_INET6 (1 << STYPE_INET6)
+
+#define SVER_SHIFT 12 /* Shift to get version */
+#define SVER_MASK 0x0fff /* Mask to strip version */
+
+#define IPVS_OPT_SEQ_DATA 1
+#define IPVS_OPT_PE_DATA 2
+#define IPVS_OPT_PE_NAME 3
+#define IPVS_OPT_PARAM 7
+
+#define IPVS_OPT_F_SEQ_DATA (1 << (IPVS_OPT_SEQ_DATA-1))
+#define IPVS_OPT_F_PE_DATA (1 << (IPVS_OPT_PE_DATA-1))
+#define IPVS_OPT_F_PE_NAME (1 << (IPVS_OPT_PE_NAME-1))
+#define IPVS_OPT_F_PARAM (1 << (IPVS_OPT_PARAM-1))
+
struct ip_vs_sync_thread_data {
+ struct net *net;
struct socket *sock;
char *buf;
};
-#define SIMPLE_CONN_SIZE (sizeof(struct ip_vs_sync_conn))
+/* Version 0 definition of packet sizes */
+#define SIMPLE_CONN_SIZE (sizeof(struct ip_vs_sync_conn_v0))
#define FULL_CONN_SIZE \
-(sizeof(struct ip_vs_sync_conn) + sizeof(struct ip_vs_sync_conn_options))
+(sizeof(struct ip_vs_sync_conn_v0) + sizeof(struct ip_vs_sync_conn_options))
/*
- The master mulitcasts messages to the backup load balancers in the
- following format.
+ The master mulitcasts messages (Datagrams) to the backup load balancers
+ in the following format.
+
+ Version 1:
+ Note, first byte should be Zero, so ver 0 receivers will drop the packet.
0 1 2 3
0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
- | Count Conns | SyncID | Size |
+ | 0 | SyncID | Size |
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ | Count Conns | Version | Reserved, set to Zero |
+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
| |
| IPVS Sync Connection (1) |
+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
| . |
- | . |
+ ~ . ~
| . |
+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
| |
| IPVS Sync Connection (n) |
+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+
+ Version 0 Header
+ 0 1 2 3
+ 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ | Count Conns | SyncID | Size |
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ | IPVS Sync Connection (1) |
*/
#define SYNC_MESG_HEADER_LEN 4
#define MAX_CONNS_PER_SYNCBUFF 255 /* nr_conns in ip_vs_sync_mesg is 8 bit */
-struct ip_vs_sync_mesg {
+/* Version 0 header */
+struct ip_vs_sync_mesg_v0 {
__u8 nr_conns;
__u8 syncid;
__u16 size;
@@ -113,9 +249,16 @@ struct ip_vs_sync_mesg {
/* ip_vs_sync_conn entries start here */
};
-/* the maximum length of sync (sending/receiving) message */
-static int sync_send_mesg_maxlen;
-static int sync_recv_mesg_maxlen;
+/* Version 1 header */
+struct ip_vs_sync_mesg {
+ __u8 reserved; /* must be zero */
+ __u8 syncid;
+ __u16 size;
+ __u8 nr_conns;
+ __s8 version; /* SYNC_PROTO_VER */
+ __u16 spare;
+ /* ip_vs_sync_conn entries start here */
+};
struct ip_vs_sync_buff {
struct list_head list;
@@ -127,28 +270,6 @@ struct ip_vs_sync_buff {
unsigned char *end;
};
-
-/* the sync_buff list head and the lock */
-static LIST_HEAD(ip_vs_sync_queue);
-static DEFINE_SPINLOCK(ip_vs_sync_lock);
-
-/* current sync_buff for accepting new conn entries */
-static struct ip_vs_sync_buff *curr_sb = NULL;
-static DEFINE_SPINLOCK(curr_sb_lock);
-
-/* ipvs sync daemon state */
-volatile int ip_vs_sync_state = IP_VS_STATE_NONE;
-volatile int ip_vs_master_syncid = 0;
-volatile int ip_vs_backup_syncid = 0;
-
-/* multicast interface name */
-char ip_vs_master_mcast_ifn[IP_VS_IFNAME_MAXLEN];
-char ip_vs_backup_mcast_ifn[IP_VS_IFNAME_MAXLEN];
-
-/* sync daemon tasks */
-static struct task_struct *sync_master_thread;
-static struct task_struct *sync_backup_thread;
-
/* multicast addr */
static struct sockaddr_in mcast_addr = {
.sin_family = AF_INET,
@@ -156,41 +277,71 @@ static struct sockaddr_in mcast_addr = {
.sin_addr.s_addr = cpu_to_be32(IP_VS_SYNC_GROUP),
};
+/*
+ * Copy of struct ip_vs_seq
+ * From unaligned network order to aligned host order
+ */
+static void ntoh_seq(struct ip_vs_seq *no, struct ip_vs_seq *ho)
+{
+ ho->init_seq = get_unaligned_be32(&no->init_seq);
+ ho->delta = get_unaligned_be32(&no->delta);
+ ho->previous_delta = get_unaligned_be32(&no->previous_delta);
+}
+
+/*
+ * Copy of struct ip_vs_seq
+ * From Aligned host order to unaligned network order
+ */
+static void hton_seq(struct ip_vs_seq *ho, struct ip_vs_seq *no)
+{
+ put_unaligned_be32(ho->init_seq, &no->init_seq);
+ put_unaligned_be32(ho->delta, &no->delta);
+ put_unaligned_be32(ho->previous_delta, &no->previous_delta);
+}
-static inline struct ip_vs_sync_buff *sb_dequeue(void)
+static inline struct ip_vs_sync_buff *sb_dequeue(struct netns_ipvs *ipvs)
{
struct ip_vs_sync_buff *sb;
- spin_lock_bh(&ip_vs_sync_lock);
- if (list_empty(&ip_vs_sync_queue)) {
+ spin_lock_bh(&ipvs->sync_lock);
+ if (list_empty(&ipvs->sync_queue)) {
sb = NULL;
} else {
- sb = list_entry(ip_vs_sync_queue.next,
+ sb = list_entry(ipvs->sync_queue.next,
struct ip_vs_sync_buff,
list);
list_del(&sb->list);
}
- spin_unlock_bh(&ip_vs_sync_lock);
+ spin_unlock_bh(&ipvs->sync_lock);
return sb;
}
-static inline struct ip_vs_sync_buff * ip_vs_sync_buff_create(void)
+/*
+ * Create a new sync buffer for Version 1 proto.
+ */
+static inline struct ip_vs_sync_buff *
+ip_vs_sync_buff_create(struct netns_ipvs *ipvs)
{
struct ip_vs_sync_buff *sb;
if (!(sb=kmalloc(sizeof(struct ip_vs_sync_buff), GFP_ATOMIC)))
return NULL;
- if (!(sb->mesg=kmalloc(sync_send_mesg_maxlen, GFP_ATOMIC))) {
+ sb->mesg = kmalloc(ipvs->send_mesg_maxlen, GFP_ATOMIC);
+ if (!sb->mesg) {
kfree(sb);
return NULL;
}
+ sb->mesg->reserved = 0; /* old nr_conns i.e. must be zeo now */
+ sb->mesg->version = SYNC_PROTO_VER;
+ sb->mesg->syncid = ipvs->master_syncid;
+ sb->mesg->size = sizeof(struct ip_vs_sync_mesg);
sb->mesg->nr_conns = 0;
- sb->mesg->syncid = ip_vs_master_syncid;
- sb->mesg->size = 4;
- sb->head = (unsigned char *)sb->mesg + 4;
- sb->end = (unsigned char *)sb->mesg + sync_send_mesg_maxlen;
+ sb->mesg->spare = 0;
+ sb->head = (unsigned char *)sb->mesg + sizeof(struct ip_vs_sync_mesg);
+ sb->end = (unsigned char *)sb->mesg + ipvs->send_mesg_maxlen;
+
sb->firstuse = jiffies;
return sb;
}
@@ -201,14 +352,16 @@ static inline void ip_vs_sync_buff_release(struct ip_vs_sync_buff *sb)
kfree(sb);
}
-static inline void sb_queue_tail(struct ip_vs_sync_buff *sb)
+static inline void sb_queue_tail(struct netns_ipvs *ipvs)
{
- spin_lock(&ip_vs_sync_lock);
- if (ip_vs_sync_state & IP_VS_STATE_MASTER)
- list_add_tail(&sb->list, &ip_vs_sync_queue);
+ struct ip_vs_sync_buff *sb = ipvs->sync_buff;
+
+ spin_lock(&ipvs->sync_lock);
+ if (ipvs->sync_state & IP_VS_STATE_MASTER)
+ list_add_tail(&sb->list, &ipvs->sync_queue);
else
ip_vs_sync_buff_release(sb);
- spin_unlock(&ip_vs_sync_lock);
+ spin_unlock(&ipvs->sync_lock);
}
/*
@@ -216,36 +369,101 @@ static inline void sb_queue_tail(struct ip_vs_sync_buff *sb)
* than the specified time or the specified time is zero.
*/
static inline struct ip_vs_sync_buff *
-get_curr_sync_buff(unsigned long time)
+get_curr_sync_buff(struct netns_ipvs *ipvs, unsigned long time)
{
struct ip_vs_sync_buff *sb;
- spin_lock_bh(&curr_sb_lock);
- if (curr_sb && (time == 0 ||
- time_before(jiffies - curr_sb->firstuse, time))) {
- sb = curr_sb;
- curr_sb = NULL;
+ spin_lock_bh(&ipvs->sync_buff_lock);
+ if (ipvs->sync_buff && (time == 0 ||
+ time_before(jiffies - ipvs->sync_buff->firstuse, time))) {
+ sb = ipvs->sync_buff;
+ ipvs->sync_buff = NULL;
} else
sb = NULL;
- spin_unlock_bh(&curr_sb_lock);
+ spin_unlock_bh(&ipvs->sync_buff_lock);
return sb;
}
+/*
+ * Switch mode from sending version 0 or 1
+ * - must handle sync_buf
+ */
+void ip_vs_sync_switch_mode(struct net *net, int mode)
+{
+ struct netns_ipvs *ipvs = net_ipvs(net);
+
+ if (!(ipvs->sync_state & IP_VS_STATE_MASTER))
+ return;
+ if (mode == ipvs->sysctl_sync_ver || !ipvs->sync_buff)
+ return;
+
+ spin_lock_bh(&ipvs->sync_buff_lock);
+ /* Buffer empty ? then let buf_create do the job */
+ if (ipvs->sync_buff->mesg->size <= sizeof(struct ip_vs_sync_mesg)) {
+ kfree(ipvs->sync_buff);
+ ipvs->sync_buff = NULL;
+ } else {
+ spin_lock_bh(&ipvs->sync_lock);
+ if (ipvs->sync_state & IP_VS_STATE_MASTER)
+ list_add_tail(&ipvs->sync_buff->list,
+ &ipvs->sync_queue);
+ else
+ ip_vs_sync_buff_release(ipvs->sync_buff);
+ spin_unlock_bh(&ipvs->sync_lock);
+ }
+ spin_unlock_bh(&ipvs->sync_buff_lock);
+}
/*
+ * Create a new sync buffer for Version 0 proto.
+ */
+static inline struct ip_vs_sync_buff *
+ip_vs_sync_buff_create_v0(struct netns_ipvs *ipvs)
+{
+ struct ip_vs_sync_buff *sb;
+ struct ip_vs_sync_mesg_v0 *mesg;
+
+ if (!(sb=kmalloc(sizeof(struct ip_vs_sync_buff), GFP_ATOMIC)))
+ return NULL;
+
+ sb->mesg = kmalloc(ipvs->send_mesg_maxlen, GFP_ATOMIC);
+ if (!sb->mesg) {
+ kfree(sb);
+ return NULL;
+ }
+ mesg = (struct ip_vs_sync_mesg_v0 *)sb->mesg;
+ mesg->nr_conns = 0;
+ mesg->syncid = ipvs->master_syncid;
+ mesg->size = sizeof(struct ip_vs_sync_mesg_v0);
+ sb->head = (unsigned char *)mesg + sizeof(struct ip_vs_sync_mesg_v0);
+ sb->end = (unsigned char *)mesg + ipvs->send_mesg_maxlen;
+ sb->firstuse = jiffies;
+ return sb;
+}
+
+/*
+ * Version 0 , could be switched in by sys_ctl.
* Add an ip_vs_conn information into the current sync_buff.
- * Called by ip_vs_in.
*/
-void ip_vs_sync_conn(struct ip_vs_conn *cp)
+void ip_vs_sync_conn_v0(struct net *net, struct ip_vs_conn *cp)
{
- struct ip_vs_sync_mesg *m;
- struct ip_vs_sync_conn *s;
+ struct netns_ipvs *ipvs = net_ipvs(net);
+ struct ip_vs_sync_mesg_v0 *m;
+ struct ip_vs_sync_conn_v0 *s;
int len;
- spin_lock(&curr_sb_lock);
- if (!curr_sb) {
- if (!(curr_sb=ip_vs_sync_buff_create())) {
- spin_unlock(&curr_sb_lock);
+ if (unlikely(cp->af != AF_INET))
+ return;
+ /* Do not sync ONE PACKET */
+ if (cp->flags & IP_VS_CONN_F_ONE_PACKET)
+ return;
+
+ spin_lock(&ipvs->sync_buff_lock);
+ if (!ipvs->sync_buff) {
+ ipvs->sync_buff =
+ ip_vs_sync_buff_create_v0(ipvs);
+ if (!ipvs->sync_buff) {
+ spin_unlock(&ipvs->sync_buff_lock);
pr_err("ip_vs_sync_buff_create failed.\n");
return;
}
@@ -253,10 +471,11 @@ void ip_vs_sync_conn(struct ip_vs_conn *cp)
len = (cp->flags & IP_VS_CONN_F_SEQ_MASK) ? FULL_CONN_SIZE :
SIMPLE_CONN_SIZE;
- m = curr_sb->mesg;
- s = (struct ip_vs_sync_conn *)curr_sb->head;
+ m = (struct ip_vs_sync_mesg_v0 *)ipvs->sync_buff->mesg;
+ s = (struct ip_vs_sync_conn_v0 *)ipvs->sync_buff->head;
/* copy members */
+ s->reserved = 0;
s->protocol = cp->protocol;
s->cport = cp->cport;
s->vport = cp->vport;
@@ -274,83 +493,366 @@ void ip_vs_sync_conn(struct ip_vs_conn *cp)
m->nr_conns++;
m->size += len;
- curr_sb->head += len;
+ ipvs->sync_buff->head += len;
/* check if there is a space for next one */
- if (curr_sb->head+FULL_CONN_SIZE > curr_sb->end) {
- sb_queue_tail(curr_sb);
- curr_sb = NULL;
+ if (ipvs->sync_buff->head + FULL_CONN_SIZE > ipvs->sync_buff->end) {
+ sb_queue_tail(ipvs);
+ ipvs->sync_buff = NULL;
}
- spin_unlock(&curr_sb_lock);
+ spin_unlock(&ipvs->sync_buff_lock);
/* synchronize its controller if it has */
if (cp->control)
- ip_vs_sync_conn(cp->control);
+ ip_vs_sync_conn(net, cp->control);
+}
+
+/*
+ * Add an ip_vs_conn information into the current sync_buff.
+ * Called by ip_vs_in.
+ * Sending Version 1 messages
+ */
+void ip_vs_sync_conn(struct net *net, struct ip_vs_conn *cp)
+{
+ struct netns_ipvs *ipvs = net_ipvs(net);
+ struct ip_vs_sync_mesg *m;
+ union ip_vs_sync_conn *s;
+ __u8 *p;
+ unsigned int len, pe_name_len, pad;
+
+ /* Handle old version of the protocol */
+ if (ipvs->sysctl_sync_ver == 0) {
+ ip_vs_sync_conn_v0(net, cp);
+ return;
+ }
+ /* Do not sync ONE PACKET */
+ if (cp->flags & IP_VS_CONN_F_ONE_PACKET)
+ goto control;
+sloop:
+ /* Sanity checks */
+ pe_name_len = 0;
+ if (cp->pe_data_len) {
+ if (!cp->pe_data || !cp->dest) {
+ IP_VS_ERR_RL("SYNC, connection pe_data invalid\n");
+ return;
+ }
+ pe_name_len = strnlen(cp->pe->name, IP_VS_PENAME_MAXLEN);
+ }
+
+ spin_lock(&ipvs->sync_buff_lock);
+
+#ifdef CONFIG_IP_VS_IPV6
+ if (cp->af == AF_INET6)
+ len = sizeof(struct ip_vs_sync_v6);
+ else
+#endif
+ len = sizeof(struct ip_vs_sync_v4);
+
+ if (cp->flags & IP_VS_CONN_F_SEQ_MASK)
+ len += sizeof(struct ip_vs_sync_conn_options) + 2;
+
+ if (cp->pe_data_len)
+ len += cp->pe_data_len + 2; /* + Param hdr field */
+ if (pe_name_len)
+ len += pe_name_len + 2;
+
+ /* check if there is a space for this one */
+ pad = 0;
+ if (ipvs->sync_buff) {
+ pad = (4 - (size_t)ipvs->sync_buff->head) & 3;
+ if (ipvs->sync_buff->head + len + pad > ipvs->sync_buff->end) {
+ sb_queue_tail(ipvs);
+ ipvs->sync_buff = NULL;
+ pad = 0;
+ }
+ }
+
+ if (!ipvs->sync_buff) {
+ ipvs->sync_buff = ip_vs_sync_buff_create(ipvs);
+ if (!ipvs->sync_buff) {
+ spin_unlock(&ipvs->sync_buff_lock);
+ pr_err("ip_vs_sync_buff_create failed.\n");
+ return;
+ }
+ }
+
+ m = ipvs->sync_buff->mesg;
+ p = ipvs->sync_buff->head;
+ ipvs->sync_buff->head += pad + len;
+ m->size += pad + len;
+ /* Add ev. padding from prev. sync_conn */
+ while (pad--)
+ *(p++) = 0;
+
+ s = (union ip_vs_sync_conn *)p;
+
+ /* Set message type & copy members */
+ s->v4.type = (cp->af == AF_INET6 ? STYPE_F_INET6 : 0);
+ s->v4.ver_size = htons(len & SVER_MASK); /* Version 0 */
+ s->v4.flags = htonl(cp->flags & ~IP_VS_CONN_F_HASHED);
+ s->v4.state = htons(cp->state);
+ s->v4.protocol = cp->protocol;
+ s->v4.cport = cp->cport;
+ s->v4.vport = cp->vport;
+ s->v4.dport = cp->dport;
+ s->v4.fwmark = htonl(cp->fwmark);
+ s->v4.timeout = htonl(cp->timeout / HZ);
+ m->nr_conns++;
+
+#ifdef CONFIG_IP_VS_IPV6
+ if (cp->af == AF_INET6) {
+ p += sizeof(struct ip_vs_sync_v6);
+ ipv6_addr_copy(&s->v6.caddr, &cp->caddr.in6);
+ ipv6_addr_copy(&s->v6.vaddr, &cp->vaddr.in6);
+ ipv6_addr_copy(&s->v6.daddr, &cp->daddr.in6);
+ } else
+#endif
+ {
+ p += sizeof(struct ip_vs_sync_v4); /* options ptr */
+ s->v4.caddr = cp->caddr.ip;
+ s->v4.vaddr = cp->vaddr.ip;
+ s->v4.daddr = cp->daddr.ip;
+ }
+ if (cp->flags & IP_VS_CONN_F_SEQ_MASK) {
+ *(p++) = IPVS_OPT_SEQ_DATA;
+ *(p++) = sizeof(struct ip_vs_sync_conn_options);
+ hton_seq((struct ip_vs_seq *)p, &cp->in_seq);
+ p += sizeof(struct ip_vs_seq);
+ hton_seq((struct ip_vs_seq *)p, &cp->out_seq);
+ p += sizeof(struct ip_vs_seq);
+ }
+ /* Handle pe data */
+ if (cp->pe_data_len && cp->pe_data) {
+ *(p++) = IPVS_OPT_PE_DATA;
+ *(p++) = cp->pe_data_len;
+ memcpy(p, cp->pe_data, cp->pe_data_len);
+ p += cp->pe_data_len;
+ if (pe_name_len) {
+ /* Add PE_NAME */
+ *(p++) = IPVS_OPT_PE_NAME;
+ *(p++) = pe_name_len;
+ memcpy(p, cp->pe->name, pe_name_len);
+ p += pe_name_len;
+ }
+ }
+
+ spin_unlock(&ipvs->sync_buff_lock);
+
+control:
+ /* synchronize its controller if it has */
+ cp = cp->control;
+ if (!cp)
+ return;
+ /*
+ * Reduce sync rate for templates
+ * i.e only increment in_pkts for Templates.
+ */
+ if (cp->flags & IP_VS_CONN_F_TEMPLATE) {
+ int pkts = atomic_add_return(1, &cp->in_pkts);
+
+ if (pkts % ipvs->sysctl_sync_threshold[1] != 1)
+ return;
+ }
+ goto sloop;
}
+/*
+ * fill_param used by version 1
+ */
static inline int
-ip_vs_conn_fill_param_sync(int af, int protocol,
- const union nf_inet_addr *caddr, __be16 cport,
- const union nf_inet_addr *vaddr, __be16 vport,
- struct ip_vs_conn_param *p)
+ip_vs_conn_fill_param_sync(struct net *net, int af, union ip_vs_sync_conn *sc,
+ struct ip_vs_conn_param *p,
+ __u8 *pe_data, unsigned int pe_data_len,
+ __u8 *pe_name, unsigned int pe_name_len)
{
- /* XXX: Need to take into account persistence engine */
- ip_vs_conn_fill_param(af, protocol, caddr, cport, vaddr, vport, p);
+#ifdef CONFIG_IP_VS_IPV6
+ if (af == AF_INET6)
+ ip_vs_conn_fill_param(net, af, sc->v6.protocol,
+ (const union nf_inet_addr *)&sc->v6.caddr,
+ sc->v6.cport,
+ (const union nf_inet_addr *)&sc->v6.vaddr,
+ sc->v6.vport, p);
+ else
+#endif
+ ip_vs_conn_fill_param(net, af, sc->v4.protocol,
+ (const union nf_inet_addr *)&sc->v4.caddr,
+ sc->v4.cport,
+ (const union nf_inet_addr *)&sc->v4.vaddr,
+ sc->v4.vport, p);
+ /* Handle pe data */
+ if (pe_data_len) {
+ if (pe_name_len) {
+ char buff[IP_VS_PENAME_MAXLEN+1];
+
+ memcpy(buff, pe_name, pe_name_len);
+ buff[pe_name_len]=0;
+ p->pe = __ip_vs_pe_getbyname(buff);
+ if (!p->pe) {
+ IP_VS_DBG(3, "BACKUP, no %s engine found/loaded\n",
+ buff);
+ return 1;
+ }
+ } else {
+ IP_VS_ERR_RL("BACKUP, Invalid PE parameters\n");
+ return 1;
+ }
+
+ p->pe_data = kmalloc(pe_data_len, GFP_ATOMIC);
+ if (!p->pe_data) {
+ if (p->pe->module)
+ module_put(p->pe->module);
+ return -ENOMEM;
+ }
+ memcpy(p->pe_data, pe_data, pe_data_len);
+ p->pe_data_len = pe_data_len;
+ }
return 0;
}
/*
- * Process received multicast message and create the corresponding
- * ip_vs_conn entries.
+ * Connection Add / Update.
+ * Common for version 0 and 1 reception of backup sync_conns.
+ * Param: ...
+ * timeout is in sec.
*/
-static void ip_vs_process_message(const char *buffer, const size_t buflen)
+static void ip_vs_proc_conn(struct net *net, struct ip_vs_conn_param *param,
+ unsigned int flags, unsigned int state,
+ unsigned int protocol, unsigned int type,
+ const union nf_inet_addr *daddr, __be16 dport,
+ unsigned long timeout, __u32 fwmark,
+ struct ip_vs_sync_conn_options *opt)
{
- struct ip_vs_sync_mesg *m = (struct ip_vs_sync_mesg *)buffer;
- struct ip_vs_sync_conn *s;
- struct ip_vs_sync_conn_options *opt;
- struct ip_vs_conn *cp;
- struct ip_vs_protocol *pp;
struct ip_vs_dest *dest;
- struct ip_vs_conn_param param;
- char *p;
- int i;
+ struct ip_vs_conn *cp;
+ struct netns_ipvs *ipvs = net_ipvs(net);
- if (buflen < sizeof(struct ip_vs_sync_mesg)) {
- IP_VS_ERR_RL("sync message header too short\n");
- return;
- }
+ if (!(flags & IP_VS_CONN_F_TEMPLATE))
+ cp = ip_vs_conn_in_get(param);
+ else
+ cp = ip_vs_ct_in_get(param);
- /* Convert size back to host byte order */
- m->size = ntohs(m->size);
+ if (cp && param->pe_data) /* Free pe_data */
+ kfree(param->pe_data);
+ if (!cp) {
+ /*
+ * Find the appropriate destination for the connection.
+ * If it is not found the connection will remain unbound
+ * but still handled.
+ */
+ dest = ip_vs_find_dest(net, type, daddr, dport, param->vaddr,
+ param->vport, protocol, fwmark);
- if (buflen != m->size) {
- IP_VS_ERR_RL("bogus sync message size\n");
- return;
+ /* Set the approprite ativity flag */
+ if (protocol == IPPROTO_TCP) {
+ if (state != IP_VS_TCP_S_ESTABLISHED)
+ flags |= IP_VS_CONN_F_INACTIVE;
+ else
+ flags &= ~IP_VS_CONN_F_INACTIVE;
+ } else if (protocol == IPPROTO_SCTP) {
+ if (state != IP_VS_SCTP_S_ESTABLISHED)
+ flags |= IP_VS_CONN_F_INACTIVE;
+ else
+ flags &= ~IP_VS_CONN_F_INACTIVE;
+ }
+ cp = ip_vs_conn_new(param, daddr, dport, flags, dest, fwmark);
+ if (dest)
+ atomic_dec(&dest->refcnt);
+ if (!cp) {
+ if (param->pe_data)
+ kfree(param->pe_data);
+ IP_VS_DBG(2, "BACKUP, add new conn. failed\n");
+ return;
+ }
+ } else if (!cp->dest) {
+ dest = ip_vs_try_bind_dest(cp);
+ if (dest)
+ atomic_dec(&dest->refcnt);
+ } else if ((cp->dest) && (cp->protocol == IPPROTO_TCP) &&
+ (cp->state != state)) {
+ /* update active/inactive flag for the connection */
+ dest = cp->dest;
+ if (!(cp->flags & IP_VS_CONN_F_INACTIVE) &&
+ (state != IP_VS_TCP_S_ESTABLISHED)) {
+ atomic_dec(&dest->activeconns);
+ atomic_inc(&dest->inactconns);
+ cp->flags |= IP_VS_CONN_F_INACTIVE;
+ } else if ((cp->flags & IP_VS_CONN_F_INACTIVE) &&
+ (state == IP_VS_TCP_S_ESTABLISHED)) {
+ atomic_inc(&dest->activeconns);
+ atomic_dec(&dest->inactconns);
+ cp->flags &= ~IP_VS_CONN_F_INACTIVE;
+ }
+ } else if ((cp->dest) && (cp->protocol == IPPROTO_SCTP) &&
+ (cp->state != state)) {
+ dest = cp->dest;
+ if (!(cp->flags & IP_VS_CONN_F_INACTIVE) &&
+ (state != IP_VS_SCTP_S_ESTABLISHED)) {
+ atomic_dec(&dest->activeconns);
+ atomic_inc(&dest->inactconns);
+ cp->flags &= ~IP_VS_CONN_F_INACTIVE;
+ }
}
- /* SyncID sanity check */
- if (ip_vs_backup_syncid != 0 && m->syncid != ip_vs_backup_syncid) {
- IP_VS_DBG(7, "Ignoring incoming msg with syncid = %d\n",
- m->syncid);
- return;
+ if (opt)
+ memcpy(&cp->in_seq, opt, sizeof(*opt));
+ atomic_set(&cp->in_pkts, ipvs->sysctl_sync_threshold[0]);
+ cp->state = state;
+ cp->old_state = cp->state;
+ /*
+ * For Ver 0 messages style
+ * - Not possible to recover the right timeout for templates
+ * - can not find the right fwmark
+ * virtual service. If needed, we can do it for
+ * non-fwmark persistent services.
+ * Ver 1 messages style.
+ * - No problem.
+ */
+ if (timeout) {
+ if (timeout > MAX_SCHEDULE_TIMEOUT / HZ)
+ timeout = MAX_SCHEDULE_TIMEOUT / HZ;
+ cp->timeout = timeout*HZ;
+ } else {
+ struct ip_vs_proto_data *pd;
+
+ pd = ip_vs_proto_data_get(net, protocol);
+ if (!(flags & IP_VS_CONN_F_TEMPLATE) && pd && pd->timeout_table)
+ cp->timeout = pd->timeout_table[state];
+ else
+ cp->timeout = (3*60*HZ);
}
+ ip_vs_conn_put(cp);
+}
- p = (char *)buffer + sizeof(struct ip_vs_sync_mesg);
+/*
+ * Process received multicast message for Version 0
+ */
+static void ip_vs_process_message_v0(struct net *net, const char *buffer,
+ const size_t buflen)
+{
+ struct ip_vs_sync_mesg_v0 *m = (struct ip_vs_sync_mesg_v0 *)buffer;
+ struct ip_vs_sync_conn_v0 *s;
+ struct ip_vs_sync_conn_options *opt;
+ struct ip_vs_protocol *pp;
+ struct ip_vs_conn_param param;
+ char *p;
+ int i;
+
+ p = (char *)buffer + sizeof(struct ip_vs_sync_mesg_v0);
for (i=0; i<m->nr_conns; i++) {
unsigned flags, state;
if (p + SIMPLE_CONN_SIZE > buffer+buflen) {
- IP_VS_ERR_RL("bogus conn in sync message\n");
+ IP_VS_ERR_RL("BACKUP v0, bogus conn\n");
return;
}
- s = (struct ip_vs_sync_conn *) p;
+ s = (struct ip_vs_sync_conn_v0 *) p;
flags = ntohs(s->flags) | IP_VS_CONN_F_SYNC;
flags &= ~IP_VS_CONN_F_HASHED;
if (flags & IP_VS_CONN_F_SEQ_MASK) {
opt = (struct ip_vs_sync_conn_options *)&s[1];
p += FULL_CONN_SIZE;
if (p > buffer+buflen) {
- IP_VS_ERR_RL("bogus conn options in sync message\n");
+ IP_VS_ERR_RL("BACKUP v0, Dropping buffer bogus conn options\n");
return;
}
} else {
@@ -362,118 +864,286 @@ static void ip_vs_process_message(const char *buffer, const size_t buflen)
if (!(flags & IP_VS_CONN_F_TEMPLATE)) {
pp = ip_vs_proto_get(s->protocol);
if (!pp) {
- IP_VS_ERR_RL("Unsupported protocol %u in sync msg\n",
+ IP_VS_DBG(2, "BACKUP v0, Unsupported protocol %u\n",
s->protocol);
continue;
}
if (state >= pp->num_states) {
- IP_VS_DBG(2, "Invalid %s state %u in sync msg\n",
+ IP_VS_DBG(2, "BACKUP v0, Invalid %s state %u\n",
pp->name, state);
continue;
}
} else {
/* protocol in templates is not used for state/timeout */
- pp = NULL;
if (state > 0) {
- IP_VS_DBG(2, "Invalid template state %u in sync msg\n",
+ IP_VS_DBG(2, "BACKUP v0, Invalid template state %u\n",
state);
state = 0;
}
}
- {
- if (ip_vs_conn_fill_param_sync(AF_INET, s->protocol,
- (union nf_inet_addr *)&s->caddr,
- s->cport,
- (union nf_inet_addr *)&s->vaddr,
- s->vport, &param)) {
- pr_err("ip_vs_conn_fill_param_sync failed");
- return;
+ ip_vs_conn_fill_param(net, AF_INET, s->protocol,
+ (const union nf_inet_addr *)&s->caddr,
+ s->cport,
+ (const union nf_inet_addr *)&s->vaddr,
+ s->vport, &param);
+
+ /* Send timeout as Zero */
+ ip_vs_proc_conn(net, &param, flags, state, s->protocol, AF_INET,
+ (union nf_inet_addr *)&s->daddr, s->dport,
+ 0, 0, opt);
+ }
+}
+
+/*
+ * Handle options
+ */
+static inline int ip_vs_proc_seqopt(__u8 *p, unsigned int plen,
+ __u32 *opt_flags,
+ struct ip_vs_sync_conn_options *opt)
+{
+ struct ip_vs_sync_conn_options *topt;
+
+ topt = (struct ip_vs_sync_conn_options *)p;
+
+ if (plen != sizeof(struct ip_vs_sync_conn_options)) {
+ IP_VS_DBG(2, "BACKUP, bogus conn options length\n");
+ return -EINVAL;
+ }
+ if (*opt_flags & IPVS_OPT_F_SEQ_DATA) {
+ IP_VS_DBG(2, "BACKUP, conn options found twice\n");
+ return -EINVAL;
+ }
+ ntoh_seq(&topt->in_seq, &opt->in_seq);
+ ntoh_seq(&topt->out_seq, &opt->out_seq);
+ *opt_flags |= IPVS_OPT_F_SEQ_DATA;
+ return 0;
+}
+
+static int ip_vs_proc_str(__u8 *p, unsigned int plen, unsigned int *data_len,
+ __u8 **data, unsigned int maxlen,
+ __u32 *opt_flags, __u32 flag)
+{
+ if (plen > maxlen) {
+ IP_VS_DBG(2, "BACKUP, bogus par.data len > %d\n", maxlen);
+ return -EINVAL;
+ }
+ if (*opt_flags & flag) {
+ IP_VS_DBG(2, "BACKUP, Par.data found twice 0x%x\n", flag);
+ return -EINVAL;
+ }
+ *data_len = plen;
+ *data = p;
+ *opt_flags |= flag;
+ return 0;
+}
+/*
+ * Process a Version 1 sync. connection
+ */
+static inline int ip_vs_proc_sync_conn(struct net *net, __u8 *p, __u8 *msg_end)
+{
+ struct ip_vs_sync_conn_options opt;
+ union ip_vs_sync_conn *s;
+ struct ip_vs_protocol *pp;
+ struct ip_vs_conn_param param;
+ __u32 flags;
+ unsigned int af, state, pe_data_len=0, pe_name_len=0;
+ __u8 *pe_data=NULL, *pe_name=NULL;
+ __u32 opt_flags=0;
+ int retc=0;
+
+ s = (union ip_vs_sync_conn *) p;
+
+ if (s->v6.type & STYPE_F_INET6) {
+#ifdef CONFIG_IP_VS_IPV6
+ af = AF_INET6;
+ p += sizeof(struct ip_vs_sync_v6);
+#else
+ IP_VS_DBG(3,"BACKUP, IPv6 msg received, and IPVS is not compiled for IPv6\n");
+ retc = 10;
+ goto out;
+#endif
+ } else if (!s->v4.type) {
+ af = AF_INET;
+ p += sizeof(struct ip_vs_sync_v4);
+ } else {
+ return -10;
+ }
+ if (p > msg_end)
+ return -20;
+
+ /* Process optional params check Type & Len. */
+ while (p < msg_end) {
+ int ptype;
+ int plen;
+
+ if (p+2 > msg_end)
+ return -30;
+ ptype = *(p++);
+ plen = *(p++);
+
+ if (!plen || ((p + plen) > msg_end))
+ return -40;
+ /* Handle seq option p = param data */
+ switch (ptype & ~IPVS_OPT_F_PARAM) {
+ case IPVS_OPT_SEQ_DATA:
+ if (ip_vs_proc_seqopt(p, plen, &opt_flags, &opt))
+ return -50;
+ break;
+
+ case IPVS_OPT_PE_DATA:
+ if (ip_vs_proc_str(p, plen, &pe_data_len, &pe_data,
+ IP_VS_PEDATA_MAXLEN, &opt_flags,
+ IPVS_OPT_F_PE_DATA))
+ return -60;
+ break;
+
+ case IPVS_OPT_PE_NAME:
+ if (ip_vs_proc_str(p, plen,&pe_name_len, &pe_name,
+ IP_VS_PENAME_MAXLEN, &opt_flags,
+ IPVS_OPT_F_PE_NAME))
+ return -70;
+ break;
+
+ default:
+ /* Param data mandatory ? */
+ if (!(ptype & IPVS_OPT_F_PARAM)) {
+ IP_VS_DBG(3, "BACKUP, Unknown mandatory param %d found\n",
+ ptype & ~IPVS_OPT_F_PARAM);
+ retc = 20;
+ goto out;
}
- if (!(flags & IP_VS_CONN_F_TEMPLATE))
- cp = ip_vs_conn_in_get(&param);
- else
- cp = ip_vs_ct_in_get(&param);
}
- if (!cp) {
- /*
- * Find the appropriate destination for the connection.
- * If it is not found the connection will remain unbound
- * but still handled.
- */
- dest = ip_vs_find_dest(AF_INET,
- (union nf_inet_addr *)&s->daddr,
- s->dport,
- (union nf_inet_addr *)&s->vaddr,
- s->vport,
- s->protocol);
- /* Set the approprite ativity flag */
- if (s->protocol == IPPROTO_TCP) {
- if (state != IP_VS_TCP_S_ESTABLISHED)
- flags |= IP_VS_CONN_F_INACTIVE;
- else
- flags &= ~IP_VS_CONN_F_INACTIVE;
- } else if (s->protocol == IPPROTO_SCTP) {
- if (state != IP_VS_SCTP_S_ESTABLISHED)
- flags |= IP_VS_CONN_F_INACTIVE;
- else
- flags &= ~IP_VS_CONN_F_INACTIVE;
+ p += plen; /* Next option */
+ }
+
+ /* Get flags and Mask off unsupported */
+ flags = ntohl(s->v4.flags) & IP_VS_CONN_F_BACKUP_MASK;
+ flags |= IP_VS_CONN_F_SYNC;
+ state = ntohs(s->v4.state);
+
+ if (!(flags & IP_VS_CONN_F_TEMPLATE)) {
+ pp = ip_vs_proto_get(s->v4.protocol);
+ if (!pp) {
+ IP_VS_DBG(3,"BACKUP, Unsupported protocol %u\n",
+ s->v4.protocol);
+ retc = 30;
+ goto out;
+ }
+ if (state >= pp->num_states) {
+ IP_VS_DBG(3, "BACKUP, Invalid %s state %u\n",
+ pp->name, state);
+ retc = 40;
+ goto out;
+ }
+ } else {
+ /* protocol in templates is not used for state/timeout */
+ if (state > 0) {
+ IP_VS_DBG(3, "BACKUP, Invalid template state %u\n",
+ state);
+ state = 0;
+ }
+ }
+ if (ip_vs_conn_fill_param_sync(net, af, s, &param, pe_data,
+ pe_data_len, pe_name, pe_name_len)) {
+ retc = 50;
+ goto out;
+ }
+ /* If only IPv4, just silent skip IPv6 */
+ if (af == AF_INET)
+ ip_vs_proc_conn(net, &param, flags, state, s->v4.protocol, af,
+ (union nf_inet_addr *)&s->v4.daddr, s->v4.dport,
+ ntohl(s->v4.timeout), ntohl(s->v4.fwmark),
+ (opt_flags & IPVS_OPT_F_SEQ_DATA ? &opt : NULL)
+ );
+#ifdef CONFIG_IP_VS_IPV6
+ else
+ ip_vs_proc_conn(net, &param, flags, state, s->v6.protocol, af,
+ (union nf_inet_addr *)&s->v6.daddr, s->v6.dport,
+ ntohl(s->v6.timeout), ntohl(s->v6.fwmark),
+ (opt_flags & IPVS_OPT_F_SEQ_DATA ? &opt : NULL)
+ );
+#endif
+ return 0;
+ /* Error exit */
+out:
+ IP_VS_DBG(2, "BACKUP, Single msg dropped err:%d\n", retc);
+ return retc;
+
+}
+/*
+ * Process received multicast message and create the corresponding
+ * ip_vs_conn entries.
+ * Handles Version 0 & 1
+ */
+static void ip_vs_process_message(struct net *net, __u8 *buffer,
+ const size_t buflen)
+{
+ struct netns_ipvs *ipvs = net_ipvs(net);
+ struct ip_vs_sync_mesg *m2 = (struct ip_vs_sync_mesg *)buffer;
+ __u8 *p, *msg_end;
+ int i, nr_conns;
+
+ if (buflen < sizeof(struct ip_vs_sync_mesg_v0)) {
+ IP_VS_DBG(2, "BACKUP, message header too short\n");
+ return;
+ }
+ /* Convert size back to host byte order */
+ m2->size = ntohs(m2->size);
+
+ if (buflen != m2->size) {
+ IP_VS_DBG(2, "BACKUP, bogus message size\n");
+ return;
+ }
+ /* SyncID sanity check */
+ if (ipvs->backup_syncid != 0 && m2->syncid != ipvs->backup_syncid) {
+ IP_VS_DBG(7, "BACKUP, Ignoring syncid = %d\n", m2->syncid);
+ return;
+ }
+ /* Handle version 1 message */
+ if ((m2->version == SYNC_PROTO_VER) && (m2->reserved == 0)
+ && (m2->spare == 0)) {
+
+ msg_end = buffer + sizeof(struct ip_vs_sync_mesg);
+ nr_conns = m2->nr_conns;
+
+ for (i=0; i<nr_conns; i++) {
+ union ip_vs_sync_conn *s;
+ unsigned size;
+ int retc;
+
+ p = msg_end;
+ if (p + sizeof(s->v4) > buffer+buflen) {
+ IP_VS_ERR_RL("BACKUP, Dropping buffer, to small\n");
+ return;
}
- cp = ip_vs_conn_new(&param,
- (union nf_inet_addr *)&s->daddr,
- s->dport, flags, dest);
- if (dest)
- atomic_dec(&dest->refcnt);
- if (!cp) {
- pr_err("ip_vs_conn_new failed\n");
+ s = (union ip_vs_sync_conn *)p;
+ size = ntohs(s->v4.ver_size) & SVER_MASK;
+ msg_end = p + size;
+ /* Basic sanity checks */
+ if (msg_end > buffer+buflen) {
+ IP_VS_ERR_RL("BACKUP, Dropping buffer, msg > buffer\n");
return;
}
- } else if (!cp->dest) {
- dest = ip_vs_try_bind_dest(cp);
- if (dest)
- atomic_dec(&dest->refcnt);
- } else if ((cp->dest) && (cp->protocol == IPPROTO_TCP) &&
- (cp->state != state)) {
- /* update active/inactive flag for the connection */
- dest = cp->dest;
- if (!(cp->flags & IP_VS_CONN_F_INACTIVE) &&
- (state != IP_VS_TCP_S_ESTABLISHED)) {
- atomic_dec(&dest->activeconns);
- atomic_inc(&dest->inactconns);
- cp->flags |= IP_VS_CONN_F_INACTIVE;
- } else if ((cp->flags & IP_VS_CONN_F_INACTIVE) &&
- (state == IP_VS_TCP_S_ESTABLISHED)) {
- atomic_inc(&dest->activeconns);
- atomic_dec(&dest->inactconns);
- cp->flags &= ~IP_VS_CONN_F_INACTIVE;
+ if (ntohs(s->v4.ver_size) >> SVER_SHIFT) {
+ IP_VS_ERR_RL("BACKUP, Dropping buffer, Unknown version %d\n",
+ ntohs(s->v4.ver_size) >> SVER_SHIFT);
+ return;
}
- } else if ((cp->dest) && (cp->protocol == IPPROTO_SCTP) &&
- (cp->state != state)) {
- dest = cp->dest;
- if (!(cp->flags & IP_VS_CONN_F_INACTIVE) &&
- (state != IP_VS_SCTP_S_ESTABLISHED)) {
- atomic_dec(&dest->activeconns);
- atomic_inc(&dest->inactconns);
- cp->flags &= ~IP_VS_CONN_F_INACTIVE;
+ /* Process a single sync_conn */
+ retc = ip_vs_proc_sync_conn(net, p, msg_end);
+ if (retc < 0) {
+ IP_VS_ERR_RL("BACKUP, Dropping buffer, Err: %d in decoding\n",
+ retc);
+ return;
}
+ /* Make sure we have 32 bit alignment */
+ msg_end = p + ((size + 3) & ~3);
}
-
- if (opt)
- memcpy(&cp->in_seq, opt, sizeof(*opt));
- atomic_set(&cp->in_pkts, sysctl_ip_vs_sync_threshold[0]);
- cp->state = state;
- cp->old_state = cp->state;
- /*
- * We can not recover the right timeout for templates
- * in all cases, we can not find the right fwmark
- * virtual service. If needed, we can do it for
- * non-fwmark persistent services.
- */
- if (!(flags & IP_VS_CONN_F_TEMPLATE) && pp->timeout_table)
- cp->timeout = pp->timeout_table[state];
- else
- cp->timeout = (3*60*HZ);
- ip_vs_conn_put(cp);
+ } else {
+ /* Old type of message */
+ ip_vs_process_message_v0(net, buffer, buflen);
+ return;
}
}
@@ -511,8 +1181,10 @@ static int set_mcast_if(struct sock *sk, char *ifname)
{
struct net_device *dev;
struct inet_sock *inet = inet_sk(sk);
+ struct net *net = sock_net(sk);
- if ((dev = __dev_get_by_name(&init_net, ifname)) == NULL)
+ dev = __dev_get_by_name(net, ifname);
+ if (!dev)
return -ENODEV;
if (sk->sk_bound_dev_if && dev->ifindex != sk->sk_bound_dev_if)
@@ -531,30 +1203,33 @@ static int set_mcast_if(struct sock *sk, char *ifname)
* Set the maximum length of sync message according to the
* specified interface's MTU.
*/
-static int set_sync_mesg_maxlen(int sync_state)
+static int set_sync_mesg_maxlen(struct net *net, int sync_state)
{
+ struct netns_ipvs *ipvs = net_ipvs(net);
struct net_device *dev;
int num;
if (sync_state == IP_VS_STATE_MASTER) {
- if ((dev = __dev_get_by_name(&init_net, ip_vs_master_mcast_ifn)) == NULL)
+ dev = __dev_get_by_name(net, ipvs->master_mcast_ifn);
+ if (!dev)
return -ENODEV;
num = (dev->mtu - sizeof(struct iphdr) -
sizeof(struct udphdr) -
SYNC_MESG_HEADER_LEN - 20) / SIMPLE_CONN_SIZE;
- sync_send_mesg_maxlen = SYNC_MESG_HEADER_LEN +
+ ipvs->send_mesg_maxlen = SYNC_MESG_HEADER_LEN +
SIMPLE_CONN_SIZE * min(num, MAX_CONNS_PER_SYNCBUFF);
IP_VS_DBG(7, "setting the maximum length of sync sending "
- "message %d.\n", sync_send_mesg_maxlen);
+ "message %d.\n", ipvs->send_mesg_maxlen);
} else if (sync_state == IP_VS_STATE_BACKUP) {
- if ((dev = __dev_get_by_name(&init_net, ip_vs_backup_mcast_ifn)) == NULL)
+ dev = __dev_get_by_name(net, ipvs->backup_mcast_ifn);
+ if (!dev)
return -ENODEV;
- sync_recv_mesg_maxlen = dev->mtu -
+ ipvs->recv_mesg_maxlen = dev->mtu -
sizeof(struct iphdr) - sizeof(struct udphdr);
IP_VS_DBG(7, "setting the maximum length of sync receiving "
- "message %d.\n", sync_recv_mesg_maxlen);
+ "message %d.\n", ipvs->recv_mesg_maxlen);
}
return 0;
@@ -569,6 +1244,7 @@ static int set_sync_mesg_maxlen(int sync_state)
static int
join_mcast_group(struct sock *sk, struct in_addr *addr, char *ifname)
{
+ struct net *net = sock_net(sk);
struct ip_mreqn mreq;
struct net_device *dev;
int ret;
@@ -576,7 +1252,8 @@ join_mcast_group(struct sock *sk, struct in_addr *addr, char *ifname)
memset(&mreq, 0, sizeof(mreq));
memcpy(&mreq.imr_multiaddr, addr, sizeof(struct in_addr));
- if ((dev = __dev_get_by_name(&init_net, ifname)) == NULL)
+ dev = __dev_get_by_name(net, ifname);
+ if (!dev)
return -ENODEV;
if (sk->sk_bound_dev_if && dev->ifindex != sk->sk_bound_dev_if)
return -EINVAL;
@@ -593,11 +1270,13 @@ join_mcast_group(struct sock *sk, struct in_addr *addr, char *ifname)
static int bind_mcastif_addr(struct socket *sock, char *ifname)
{
+ struct net *net = sock_net(sock->sk);
struct net_device *dev;
__be32 addr;
struct sockaddr_in sin;
- if ((dev = __dev_get_by_name(&init_net, ifname)) == NULL)
+ dev = __dev_get_by_name(net, ifname);
+ if (!dev)
return -ENODEV;
addr = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
@@ -619,19 +1298,20 @@ static int bind_mcastif_addr(struct socket *sock, char *ifname)
/*
* Set up sending multicast socket over UDP
*/
-static struct socket * make_send_sock(void)
+static struct socket *make_send_sock(struct net *net)
{
+ struct netns_ipvs *ipvs = net_ipvs(net);
struct socket *sock;
int result;
/* First create a socket */
- result = sock_create_kern(PF_INET, SOCK_DGRAM, IPPROTO_UDP, &sock);
+ result = __sock_create(net, PF_INET, SOCK_DGRAM, IPPROTO_UDP, &sock, 1);
if (result < 0) {
pr_err("Error during creation of socket; terminating\n");
return ERR_PTR(result);
}
- result = set_mcast_if(sock->sk, ip_vs_master_mcast_ifn);
+ result = set_mcast_if(sock->sk, ipvs->master_mcast_ifn);
if (result < 0) {
pr_err("Error setting outbound mcast interface\n");
goto error;
@@ -640,7 +1320,7 @@ static struct socket * make_send_sock(void)
set_mcast_loop(sock->sk, 0);
set_mcast_ttl(sock->sk, 1);
- result = bind_mcastif_addr(sock, ip_vs_master_mcast_ifn);
+ result = bind_mcastif_addr(sock, ipvs->master_mcast_ifn);
if (result < 0) {
pr_err("Error binding address of the mcast interface\n");
goto error;
@@ -664,13 +1344,14 @@ static struct socket * make_send_sock(void)
/*
* Set up receiving multicast socket over UDP
*/
-static struct socket * make_receive_sock(void)
+static struct socket *make_receive_sock(struct net *net)
{
+ struct netns_ipvs *ipvs = net_ipvs(net);
struct socket *sock;
int result;
/* First create a socket */
- result = sock_create_kern(PF_INET, SOCK_DGRAM, IPPROTO_UDP, &sock);
+ result = __sock_create(net, PF_INET, SOCK_DGRAM, IPPROTO_UDP, &sock, 1);
if (result < 0) {
pr_err("Error during creation of socket; terminating\n");
return ERR_PTR(result);
@@ -689,7 +1370,7 @@ static struct socket * make_receive_sock(void)
/* join the multicast group */
result = join_mcast_group(sock->sk,
(struct in_addr *) &mcast_addr.sin_addr,
- ip_vs_backup_mcast_ifn);
+ ipvs->backup_mcast_ifn);
if (result < 0) {
pr_err("Error joining to the multicast group\n");
goto error;
@@ -760,20 +1441,21 @@ ip_vs_receive(struct socket *sock, char *buffer, const size_t buflen)
static int sync_thread_master(void *data)
{
struct ip_vs_sync_thread_data *tinfo = data;
+ struct netns_ipvs *ipvs = net_ipvs(tinfo->net);
struct ip_vs_sync_buff *sb;
pr_info("sync thread started: state = MASTER, mcast_ifn = %s, "
"syncid = %d\n",
- ip_vs_master_mcast_ifn, ip_vs_master_syncid);
+ ipvs->master_mcast_ifn, ipvs->master_syncid);
while (!kthread_should_stop()) {
- while ((sb = sb_dequeue())) {
+ while ((sb = sb_dequeue(ipvs))) {
ip_vs_send_sync_msg(tinfo->sock, sb->mesg);
ip_vs_sync_buff_release(sb);
}
- /* check if entries stay in curr_sb for 2 seconds */
- sb = get_curr_sync_buff(2 * HZ);
+ /* check if entries stay in ipvs->sync_buff for 2 seconds */
+ sb = get_curr_sync_buff(ipvs, 2 * HZ);
if (sb) {
ip_vs_send_sync_msg(tinfo->sock, sb->mesg);
ip_vs_sync_buff_release(sb);
@@ -783,14 +1465,13 @@ static int sync_thread_master(void *data)
}
/* clean up the sync_buff queue */
- while ((sb=sb_dequeue())) {
+ while ((sb = sb_dequeue(ipvs)))
ip_vs_sync_buff_release(sb);
- }
/* clean up the current sync_buff */
- if ((sb = get_curr_sync_buff(0))) {
+ sb = get_curr_sync_buff(ipvs, 0);
+ if (sb)
ip_vs_sync_buff_release(sb);
- }
/* release the sending multicast socket */
sock_release(tinfo->sock);
@@ -803,11 +1484,12 @@ static int sync_thread_master(void *data)
static int sync_thread_backup(void *data)
{
struct ip_vs_sync_thread_data *tinfo = data;
+ struct netns_ipvs *ipvs = net_ipvs(tinfo->net);
int len;
pr_info("sync thread started: state = BACKUP, mcast_ifn = %s, "
"syncid = %d\n",
- ip_vs_backup_mcast_ifn, ip_vs_backup_syncid);
+ ipvs->backup_mcast_ifn, ipvs->backup_syncid);
while (!kthread_should_stop()) {
wait_event_interruptible(*sk_sleep(tinfo->sock->sk),
@@ -817,7 +1499,7 @@ static int sync_thread_backup(void *data)
/* do we have data now? */
while (!skb_queue_empty(&(tinfo->sock->sk->sk_receive_queue))) {
len = ip_vs_receive(tinfo->sock, tinfo->buf,
- sync_recv_mesg_maxlen);
+ ipvs->recv_mesg_maxlen);
if (len <= 0) {
pr_err("receiving message error\n");
break;
@@ -826,7 +1508,7 @@ static int sync_thread_backup(void *data)
/* disable bottom half, because it accesses the data
shared by softirq while getting/creating conns */
local_bh_disable();
- ip_vs_process_message(tinfo->buf, len);
+ ip_vs_process_message(tinfo->net, tinfo->buf, len);
local_bh_enable();
}
}
@@ -840,41 +1522,42 @@ static int sync_thread_backup(void *data)
}
-int start_sync_thread(int state, char *mcast_ifn, __u8 syncid)
+int start_sync_thread(struct net *net, int state, char *mcast_ifn, __u8 syncid)
{
struct ip_vs_sync_thread_data *tinfo;
struct task_struct **realtask, *task;
struct socket *sock;
+ struct netns_ipvs *ipvs = net_ipvs(net);
char *name, *buf = NULL;
int (*threadfn)(void *data);
int result = -ENOMEM;
IP_VS_DBG(7, "%s(): pid %d\n", __func__, task_pid_nr(current));
IP_VS_DBG(7, "Each ip_vs_sync_conn entry needs %Zd bytes\n",
- sizeof(struct ip_vs_sync_conn));
+ sizeof(struct ip_vs_sync_conn_v0));
if (state == IP_VS_STATE_MASTER) {
- if (sync_master_thread)
+ if (ipvs->master_thread)
return -EEXIST;
- strlcpy(ip_vs_master_mcast_ifn, mcast_ifn,
- sizeof(ip_vs_master_mcast_ifn));
- ip_vs_master_syncid = syncid;
- realtask = &sync_master_thread;
- name = "ipvs_syncmaster";
+ strlcpy(ipvs->master_mcast_ifn, mcast_ifn,
+ sizeof(ipvs->master_mcast_ifn));
+ ipvs->master_syncid = syncid;
+ realtask = &ipvs->master_thread;
+ name = "ipvs_master:%d";
threadfn = sync_thread_master;
- sock = make_send_sock();
+ sock = make_send_sock(net);
} else if (state == IP_VS_STATE_BACKUP) {
- if (sync_backup_thread)
+ if (ipvs->backup_thread)
return -EEXIST;
- strlcpy(ip_vs_backup_mcast_ifn, mcast_ifn,
- sizeof(ip_vs_backup_mcast_ifn));
- ip_vs_backup_syncid = syncid;
- realtask = &sync_backup_thread;
- name = "ipvs_syncbackup";
+ strlcpy(ipvs->backup_mcast_ifn, mcast_ifn,
+ sizeof(ipvs->backup_mcast_ifn));
+ ipvs->backup_syncid = syncid;
+ realtask = &ipvs->backup_thread;
+ name = "ipvs_backup:%d";
threadfn = sync_thread_backup;
- sock = make_receive_sock();
+ sock = make_receive_sock(net);
} else {
return -EINVAL;
}
@@ -884,9 +1567,9 @@ int start_sync_thread(int state, char *mcast_ifn, __u8 syncid)
goto out;
}
- set_sync_mesg_maxlen(state);
+ set_sync_mesg_maxlen(net, state);
if (state == IP_VS_STATE_BACKUP) {
- buf = kmalloc(sync_recv_mesg_maxlen, GFP_KERNEL);
+ buf = kmalloc(ipvs->recv_mesg_maxlen, GFP_KERNEL);
if (!buf)
goto outsocket;
}
@@ -895,10 +1578,11 @@ int start_sync_thread(int state, char *mcast_ifn, __u8 syncid)
if (!tinfo)
goto outbuf;
+ tinfo->net = net;
tinfo->sock = sock;
tinfo->buf = buf;
- task = kthread_run(threadfn, tinfo, name);
+ task = kthread_run(threadfn, tinfo, name, ipvs->gen);
if (IS_ERR(task)) {
result = PTR_ERR(task);
goto outtinfo;
@@ -906,7 +1590,7 @@ int start_sync_thread(int state, char *mcast_ifn, __u8 syncid)
/* mark as active */
*realtask = task;
- ip_vs_sync_state |= state;
+ ipvs->sync_state |= state;
/* increase the module use count */
ip_vs_use_count_inc();
@@ -924,16 +1608,18 @@ out:
}
-int stop_sync_thread(int state)
+int stop_sync_thread(struct net *net, int state)
{
+ struct netns_ipvs *ipvs = net_ipvs(net);
+
IP_VS_DBG(7, "%s(): pid %d\n", __func__, task_pid_nr(current));
if (state == IP_VS_STATE_MASTER) {
- if (!sync_master_thread)
+ if (!ipvs->master_thread)
return -ESRCH;
pr_info("stopping master sync thread %d ...\n",
- task_pid_nr(sync_master_thread));
+ task_pid_nr(ipvs->master_thread));
/*
* The lock synchronizes with sb_queue_tail(), so that we don't
@@ -941,21 +1627,21 @@ int stop_sync_thread(int state)
* progress of stopping the master sync daemon.
*/
- spin_lock_bh(&ip_vs_sync_lock);
- ip_vs_sync_state &= ~IP_VS_STATE_MASTER;
- spin_unlock_bh(&ip_vs_sync_lock);
- kthread_stop(sync_master_thread);
- sync_master_thread = NULL;
+ spin_lock_bh(&ipvs->sync_lock);
+ ipvs->sync_state &= ~IP_VS_STATE_MASTER;
+ spin_unlock_bh(&ipvs->sync_lock);
+ kthread_stop(ipvs->master_thread);
+ ipvs->master_thread = NULL;
} else if (state == IP_VS_STATE_BACKUP) {
- if (!sync_backup_thread)
+ if (!ipvs->backup_thread)
return -ESRCH;
pr_info("stopping backup sync thread %d ...\n",
- task_pid_nr(sync_backup_thread));
+ task_pid_nr(ipvs->backup_thread));
- ip_vs_sync_state &= ~IP_VS_STATE_BACKUP;
- kthread_stop(sync_backup_thread);
- sync_backup_thread = NULL;
+ ipvs->sync_state &= ~IP_VS_STATE_BACKUP;
+ kthread_stop(ipvs->backup_thread);
+ ipvs->backup_thread = NULL;
} else {
return -EINVAL;
}
@@ -965,3 +1651,42 @@ int stop_sync_thread(int state)
return 0;
}
+
+/*
+ * Initialize data struct for each netns
+ */
+static int __net_init __ip_vs_sync_init(struct net *net)
+{
+ struct netns_ipvs *ipvs = net_ipvs(net);
+
+ INIT_LIST_HEAD(&ipvs->sync_queue);
+ spin_lock_init(&ipvs->sync_lock);
+ spin_lock_init(&ipvs->sync_buff_lock);
+
+ ipvs->sync_mcast_addr.sin_family = AF_INET;
+ ipvs->sync_mcast_addr.sin_port = cpu_to_be16(IP_VS_SYNC_PORT);
+ ipvs->sync_mcast_addr.sin_addr.s_addr = cpu_to_be32(IP_VS_SYNC_GROUP);
+ return 0;
+}
+
+static void __ip_vs_sync_cleanup(struct net *net)
+{
+ stop_sync_thread(net, IP_VS_STATE_MASTER);
+ stop_sync_thread(net, IP_VS_STATE_BACKUP);
+}
+
+static struct pernet_operations ipvs_sync_ops = {
+ .init = __ip_vs_sync_init,
+ .exit = __ip_vs_sync_cleanup,
+};
+
+
+int __init ip_vs_sync_init(void)
+{
+ return register_pernet_subsys(&ipvs_sync_ops);
+}
+
+void ip_vs_sync_cleanup(void)
+{
+ unregister_pernet_subsys(&ipvs_sync_ops);
+}
diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c
index 5325a3fbe4a..1f2a4e35fb1 100644
--- a/net/netfilter/ipvs/ip_vs_xmit.c
+++ b/net/netfilter/ipvs/ip_vs_xmit.c
@@ -175,7 +175,6 @@ __ip_vs_reroute_locally(struct sk_buff *skb)
.fl4_tos = RT_TOS(iph->tos),
.mark = skb->mark,
};
- struct rtable *rt;
if (ip_route_output_key(net, &rt, &fl))
return 0;
@@ -390,7 +389,8 @@ ip_vs_bypass_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
/* MTU checking */
mtu = dst_mtu(&rt->dst);
- if ((skb->len > mtu) && (iph->frag_off & htons(IP_DF))) {
+ if ((skb->len > mtu) && (iph->frag_off & htons(IP_DF)) &&
+ !skb_is_gso(skb)) {
ip_rt_put(rt);
icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu));
IP_VS_DBG_RL("%s(): frag needed\n", __func__);
@@ -443,7 +443,7 @@ ip_vs_bypass_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
/* MTU checking */
mtu = dst_mtu(&rt->dst);
- if (skb->len > mtu) {
+ if (skb->len > mtu && !skb_is_gso(skb)) {
if (!skb->dev) {
struct net *net = dev_net(skb_dst(skb)->dev);
@@ -543,7 +543,8 @@ ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
/* MTU checking */
mtu = dst_mtu(&rt->dst);
- if ((skb->len > mtu) && (iph->frag_off & htons(IP_DF))) {
+ if ((skb->len > mtu) && (iph->frag_off & htons(IP_DF)) &&
+ !skb_is_gso(skb)) {
icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu));
IP_VS_DBG_RL_PKT(0, AF_INET, pp, skb, 0,
"ip_vs_nat_xmit(): frag needed for");
@@ -658,7 +659,7 @@ ip_vs_nat_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
/* MTU checking */
mtu = dst_mtu(&rt->dst);
- if (skb->len > mtu) {
+ if (skb->len > mtu && !skb_is_gso(skb)) {
if (!skb->dev) {
struct net *net = dev_net(skb_dst(skb)->dev);
@@ -773,8 +774,8 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
df |= (old_iph->frag_off & htons(IP_DF));
- if ((old_iph->frag_off & htons(IP_DF))
- && mtu < ntohs(old_iph->tot_len)) {
+ if ((old_iph->frag_off & htons(IP_DF) &&
+ mtu < ntohs(old_iph->tot_len) && !skb_is_gso(skb))) {
icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu));
IP_VS_DBG_RL("%s(): frag needed\n", __func__);
goto tx_error_put;
@@ -886,7 +887,8 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
if (skb_dst(skb))
skb_dst(skb)->ops->update_pmtu(skb_dst(skb), mtu);
- if (mtu < ntohs(old_iph->payload_len) + sizeof(struct ipv6hdr)) {
+ if (mtu < ntohs(old_iph->payload_len) + sizeof(struct ipv6hdr) &&
+ !skb_is_gso(skb)) {
if (!skb->dev) {
struct net *net = dev_net(skb_dst(skb)->dev);
@@ -991,7 +993,8 @@ ip_vs_dr_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
/* MTU checking */
mtu = dst_mtu(&rt->dst);
- if ((iph->frag_off & htons(IP_DF)) && skb->len > mtu) {
+ if ((iph->frag_off & htons(IP_DF)) && skb->len > mtu &&
+ !skb_is_gso(skb)) {
icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu));
ip_rt_put(rt);
IP_VS_DBG_RL("%s(): frag needed\n", __func__);
@@ -1158,7 +1161,8 @@ ip_vs_icmp_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
/* MTU checking */
mtu = dst_mtu(&rt->dst);
- if ((skb->len > mtu) && (ip_hdr(skb)->frag_off & htons(IP_DF))) {
+ if ((skb->len > mtu) && (ip_hdr(skb)->frag_off & htons(IP_DF)) &&
+ !skb_is_gso(skb)) {
icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
IP_VS_DBG_RL("%s(): frag needed\n", __func__);
goto tx_error_put;
@@ -1272,7 +1276,7 @@ ip_vs_icmp_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
/* MTU checking */
mtu = dst_mtu(&rt->dst);
- if (skb->len > mtu) {
+ if (skb->len > mtu && !skb_is_gso(skb)) {
if (!skb->dev) {
struct net *net = dev_net(skb_dst(skb)->dev);
diff --git a/net/netfilter/nf_conntrack_broadcast.c b/net/netfilter/nf_conntrack_broadcast.c
new file mode 100644
index 00000000000..4e99cca6161
--- /dev/null
+++ b/net/netfilter/nf_conntrack_broadcast.c
@@ -0,0 +1,82 @@
+/*
+ * broadcast connection tracking helper
+ *
+ * (c) 2005 Patrick McHardy <kaber@trash.net>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/module.h>
+#include <linux/ip.h>
+#include <net/route.h>
+#include <linux/inetdevice.h>
+#include <linux/skbuff.h>
+
+#include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_conntrack_helper.h>
+#include <net/netfilter/nf_conntrack_expect.h>
+
+int nf_conntrack_broadcast_help(struct sk_buff *skb,
+ unsigned int protoff,
+ struct nf_conn *ct,
+ enum ip_conntrack_info ctinfo,
+ unsigned int timeout)
+{
+ struct nf_conntrack_expect *exp;
+ struct iphdr *iph = ip_hdr(skb);
+ struct rtable *rt = skb_rtable(skb);
+ struct in_device *in_dev;
+ struct nf_conn_help *help = nfct_help(ct);
+ __be32 mask = 0;
+
+ /* we're only interested in locally generated packets */
+ if (skb->sk == NULL)
+ goto out;
+ if (rt == NULL || !(rt->rt_flags & RTCF_BROADCAST))
+ goto out;
+ if (CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL)
+ goto out;
+
+ rcu_read_lock();
+ in_dev = __in_dev_get_rcu(rt->dst.dev);
+ if (in_dev != NULL) {
+ for_primary_ifa(in_dev) {
+ if (ifa->ifa_broadcast == iph->daddr) {
+ mask = ifa->ifa_mask;
+ break;
+ }
+ } endfor_ifa(in_dev);
+ }
+ rcu_read_unlock();
+
+ if (mask == 0)
+ goto out;
+
+ exp = nf_ct_expect_alloc(ct);
+ if (exp == NULL)
+ goto out;
+
+ exp->tuple = ct->tuplehash[IP_CT_DIR_REPLY].tuple;
+ exp->tuple.src.u.udp.port = help->helper->tuple.src.u.udp.port;
+
+ exp->mask.src.u3.ip = mask;
+ exp->mask.src.u.udp.port = htons(0xFFFF);
+
+ exp->expectfn = NULL;
+ exp->flags = NF_CT_EXPECT_PERMANENT;
+ exp->class = NF_CT_EXPECT_CLASS_DEFAULT;
+ exp->helper = NULL;
+
+ nf_ct_expect_related(exp);
+ nf_ct_expect_put(exp);
+
+ nf_ct_refresh(ct, skb, timeout * HZ);
+out:
+ return NF_ACCEPT;
+}
+EXPORT_SYMBOL_GPL(nf_conntrack_broadcast_help);
+
+MODULE_LICENSE("GPL");
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index e61511929c6..2f454efa1a8 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -43,6 +43,7 @@
#include <net/netfilter/nf_conntrack_acct.h>
#include <net/netfilter/nf_conntrack_ecache.h>
#include <net/netfilter/nf_conntrack_zones.h>
+#include <net/netfilter/nf_conntrack_timestamp.h>
#include <net/netfilter/nf_nat.h>
#include <net/netfilter/nf_nat_core.h>
@@ -282,6 +283,11 @@ EXPORT_SYMBOL_GPL(nf_ct_insert_dying_list);
static void death_by_timeout(unsigned long ul_conntrack)
{
struct nf_conn *ct = (void *)ul_conntrack;
+ struct nf_conn_tstamp *tstamp;
+
+ tstamp = nf_conn_tstamp_find(ct);
+ if (tstamp && tstamp->stop == 0)
+ tstamp->stop = ktime_to_ns(ktime_get_real());
if (!test_bit(IPS_DYING_BIT, &ct->status) &&
unlikely(nf_conntrack_event(IPCT_DESTROY, ct) < 0)) {
@@ -419,6 +425,7 @@ __nf_conntrack_confirm(struct sk_buff *skb)
struct nf_conntrack_tuple_hash *h;
struct nf_conn *ct;
struct nf_conn_help *help;
+ struct nf_conn_tstamp *tstamp;
struct hlist_nulls_node *n;
enum ip_conntrack_info ctinfo;
struct net *net;
@@ -486,8 +493,16 @@ __nf_conntrack_confirm(struct sk_buff *skb)
ct->timeout.expires += jiffies;
add_timer(&ct->timeout);
atomic_inc(&ct->ct_general.use);
- set_bit(IPS_CONFIRMED_BIT, &ct->status);
+ ct->status |= IPS_CONFIRMED;
+
+ /* set conntrack timestamp, if enabled. */
+ tstamp = nf_conn_tstamp_find(ct);
+ if (tstamp) {
+ if (skb->tstamp.tv64 == 0)
+ __net_timestamp((struct sk_buff *)skb);
+ tstamp->start = ktime_to_ns(skb->tstamp);
+ }
/* Since the lookup is lockless, hash insertion must be done after
* starting the timer and setting the CONFIRMED bit. The RCU barriers
* guarantee that no other CPU can find the conntrack before the above
@@ -655,7 +670,8 @@ __nf_conntrack_alloc(struct net *net, u16 zone,
* and ct->tuplehash[IP_CT_DIR_REPLY].hnnode.next unchanged.
*/
memset(&ct->tuplehash[IP_CT_DIR_MAX], 0,
- sizeof(*ct) - offsetof(struct nf_conn, tuplehash[IP_CT_DIR_MAX]));
+ offsetof(struct nf_conn, proto) -
+ offsetof(struct nf_conn, tuplehash[IP_CT_DIR_MAX]));
spin_lock_init(&ct->lock);
ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple = *orig;
ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode.pprev = NULL;
@@ -745,6 +761,7 @@ init_conntrack(struct net *net, struct nf_conn *tmpl,
}
nf_ct_acct_ext_add(ct, GFP_ATOMIC);
+ nf_ct_tstamp_ext_add(ct, GFP_ATOMIC);
ecache = tmpl ? nf_ct_ecache_find(tmpl) : NULL;
nf_ct_ecache_ext_add(ct, ecache ? ecache->ctmask : 0,
@@ -942,8 +959,15 @@ nf_conntrack_in(struct net *net, u_int8_t pf, unsigned int hooknum,
if (set_reply && !test_and_set_bit(IPS_SEEN_REPLY_BIT, &ct->status))
nf_conntrack_event_cache(IPCT_REPLY, ct);
out:
- if (tmpl)
- nf_ct_put(tmpl);
+ if (tmpl) {
+ /* Special case: we have to repeat this hook, assign the
+ * template again to this packet. We assume that this packet
+ * has no conntrack assigned. This is used by nf_ct_tcp. */
+ if (ret == NF_REPEAT)
+ skb->nfct = (struct nf_conntrack *)tmpl;
+ else
+ nf_ct_put(tmpl);
+ }
return ret;
}
@@ -1185,6 +1209,11 @@ struct __nf_ct_flush_report {
static int kill_report(struct nf_conn *i, void *data)
{
struct __nf_ct_flush_report *fr = (struct __nf_ct_flush_report *)data;
+ struct nf_conn_tstamp *tstamp;
+
+ tstamp = nf_conn_tstamp_find(i);
+ if (tstamp && tstamp->stop == 0)
+ tstamp->stop = ktime_to_ns(ktime_get_real());
/* If we fail to deliver the event, death_by_timeout() will retry */
if (nf_conntrack_event_report(IPCT_DESTROY, i,
@@ -1201,9 +1230,9 @@ static int kill_all(struct nf_conn *i, void *data)
return 1;
}
-void nf_ct_free_hashtable(void *hash, int vmalloced, unsigned int size)
+void nf_ct_free_hashtable(void *hash, unsigned int size)
{
- if (vmalloced)
+ if (is_vmalloc_addr(hash))
vfree(hash);
else
free_pages((unsigned long)hash,
@@ -1270,8 +1299,7 @@ static void nf_conntrack_cleanup_net(struct net *net)
goto i_see_dead_people;
}
- nf_ct_free_hashtable(net->ct.hash, net->ct.hash_vmalloc,
- net->ct.htable_size);
+ nf_ct_free_hashtable(net->ct.hash, net->ct.htable_size);
nf_conntrack_ecache_fini(net);
nf_conntrack_acct_fini(net);
nf_conntrack_expect_fini(net);
@@ -1300,21 +1328,18 @@ void nf_conntrack_cleanup(struct net *net)
}
}
-void *nf_ct_alloc_hashtable(unsigned int *sizep, int *vmalloced, int nulls)
+void *nf_ct_alloc_hashtable(unsigned int *sizep, int nulls)
{
struct hlist_nulls_head *hash;
unsigned int nr_slots, i;
size_t sz;
- *vmalloced = 0;
-
BUILD_BUG_ON(sizeof(struct hlist_nulls_head) != sizeof(struct hlist_head));
nr_slots = *sizep = roundup(*sizep, PAGE_SIZE / sizeof(struct hlist_nulls_head));
sz = nr_slots * sizeof(struct hlist_nulls_head);
hash = (void *)__get_free_pages(GFP_KERNEL | __GFP_NOWARN | __GFP_ZERO,
get_order(sz));
if (!hash) {
- *vmalloced = 1;
printk(KERN_WARNING "nf_conntrack: falling back to vmalloc.\n");
hash = __vmalloc(sz, GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO,
PAGE_KERNEL);
@@ -1330,7 +1355,7 @@ EXPORT_SYMBOL_GPL(nf_ct_alloc_hashtable);
int nf_conntrack_set_hashsize(const char *val, struct kernel_param *kp)
{
- int i, bucket, vmalloced, old_vmalloced;
+ int i, bucket;
unsigned int hashsize, old_size;
struct hlist_nulls_head *hash, *old_hash;
struct nf_conntrack_tuple_hash *h;
@@ -1347,7 +1372,7 @@ int nf_conntrack_set_hashsize(const char *val, struct kernel_param *kp)
if (!hashsize)
return -EINVAL;
- hash = nf_ct_alloc_hashtable(&hashsize, &vmalloced, 1);
+ hash = nf_ct_alloc_hashtable(&hashsize, 1);
if (!hash)
return -ENOMEM;
@@ -1369,15 +1394,13 @@ int nf_conntrack_set_hashsize(const char *val, struct kernel_param *kp)
}
}
old_size = init_net.ct.htable_size;
- old_vmalloced = init_net.ct.hash_vmalloc;
old_hash = init_net.ct.hash;
init_net.ct.htable_size = nf_conntrack_htable_size = hashsize;
- init_net.ct.hash_vmalloc = vmalloced;
init_net.ct.hash = hash;
spin_unlock_bh(&nf_conntrack_lock);
- nf_ct_free_hashtable(old_hash, old_vmalloced, old_size);
+ nf_ct_free_hashtable(old_hash, old_size);
return 0;
}
EXPORT_SYMBOL_GPL(nf_conntrack_set_hashsize);
@@ -1490,8 +1513,7 @@ static int nf_conntrack_init_net(struct net *net)
}
net->ct.htable_size = nf_conntrack_htable_size;
- net->ct.hash = nf_ct_alloc_hashtable(&net->ct.htable_size,
- &net->ct.hash_vmalloc, 1);
+ net->ct.hash = nf_ct_alloc_hashtable(&net->ct.htable_size, 1);
if (!net->ct.hash) {
ret = -ENOMEM;
printk(KERN_ERR "Unable to create nf_conntrack_hash\n");
@@ -1503,6 +1525,9 @@ static int nf_conntrack_init_net(struct net *net)
ret = nf_conntrack_acct_init(net);
if (ret < 0)
goto err_acct;
+ ret = nf_conntrack_tstamp_init(net);
+ if (ret < 0)
+ goto err_tstamp;
ret = nf_conntrack_ecache_init(net);
if (ret < 0)
goto err_ecache;
@@ -1510,12 +1535,13 @@ static int nf_conntrack_init_net(struct net *net)
return 0;
err_ecache:
+ nf_conntrack_tstamp_fini(net);
+err_tstamp:
nf_conntrack_acct_fini(net);
err_acct:
nf_conntrack_expect_fini(net);
err_expect:
- nf_ct_free_hashtable(net->ct.hash, net->ct.hash_vmalloc,
- net->ct.htable_size);
+ nf_ct_free_hashtable(net->ct.hash, net->ct.htable_size);
err_hash:
kmem_cache_destroy(net->ct.nf_conntrack_cachep);
err_cache:
diff --git a/net/netfilter/nf_conntrack_ecache.c b/net/netfilter/nf_conntrack_ecache.c
index 5702de35e2b..63a1b915a7e 100644
--- a/net/netfilter/nf_conntrack_ecache.c
+++ b/net/netfilter/nf_conntrack_ecache.c
@@ -63,6 +63,9 @@ void nf_ct_deliver_cached_events(struct nf_conn *ct)
* this does not harm and it happens very rarely. */
unsigned long missed = e->missed;
+ if (!((events | missed) & e->ctmask))
+ goto out_unlock;
+
ret = notify->fcn(events | missed, &item);
if (unlikely(ret < 0 || missed)) {
spin_lock_bh(&ct->lock);
diff --git a/net/netfilter/nf_conntrack_expect.c b/net/netfilter/nf_conntrack_expect.c
index a20fb0bd1ef..cd1e8e0970f 100644
--- a/net/netfilter/nf_conntrack_expect.c
+++ b/net/netfilter/nf_conntrack_expect.c
@@ -319,7 +319,8 @@ static void nf_ct_expect_insert(struct nf_conntrack_expect *exp)
const struct nf_conntrack_expect_policy *p;
unsigned int h = nf_ct_expect_dst_hash(&exp->tuple);
- atomic_inc(&exp->use);
+ /* two references : one for hash insert, one for the timer */
+ atomic_add(2, &exp->use);
if (master_help) {
hlist_add_head(&exp->lnode, &master_help->expectations);
@@ -333,12 +334,14 @@ static void nf_ct_expect_insert(struct nf_conntrack_expect *exp)
setup_timer(&exp->timeout, nf_ct_expectation_timed_out,
(unsigned long)exp);
if (master_help) {
- p = &master_help->helper->expect_policy[exp->class];
+ p = &rcu_dereference_protected(
+ master_help->helper,
+ lockdep_is_held(&nf_conntrack_lock)
+ )->expect_policy[exp->class];
exp->timeout.expires = jiffies + p->timeout * HZ;
}
add_timer(&exp->timeout);
- atomic_inc(&exp->use);
NF_CT_STAT_INC(net, expect_create);
}
@@ -369,7 +372,10 @@ static inline int refresh_timer(struct nf_conntrack_expect *i)
if (!del_timer(&i->timeout))
return 0;
- p = &master_help->helper->expect_policy[i->class];
+ p = &rcu_dereference_protected(
+ master_help->helper,
+ lockdep_is_held(&nf_conntrack_lock)
+ )->expect_policy[i->class];
i->timeout.expires = jiffies + p->timeout * HZ;
add_timer(&i->timeout);
return 1;
@@ -407,7 +413,10 @@ static inline int __nf_ct_expect_check(struct nf_conntrack_expect *expect)
}
/* Will be over limit? */
if (master_help) {
- p = &master_help->helper->expect_policy[expect->class];
+ p = &rcu_dereference_protected(
+ master_help->helper,
+ lockdep_is_held(&nf_conntrack_lock)
+ )->expect_policy[expect->class];
if (p->max_expected &&
master_help->expecting[expect->class] >= p->max_expected) {
evict_oldest_expect(master, expect);
@@ -478,7 +487,7 @@ static struct hlist_node *ct_expect_get_first(struct seq_file *seq)
struct hlist_node *n;
for (st->bucket = 0; st->bucket < nf_ct_expect_hsize; st->bucket++) {
- n = rcu_dereference(net->ct.expect_hash[st->bucket].first);
+ n = rcu_dereference(hlist_first_rcu(&net->ct.expect_hash[st->bucket]));
if (n)
return n;
}
@@ -491,11 +500,11 @@ static struct hlist_node *ct_expect_get_next(struct seq_file *seq,
struct net *net = seq_file_net(seq);
struct ct_expect_iter_state *st = seq->private;
- head = rcu_dereference(head->next);
+ head = rcu_dereference(hlist_next_rcu(head));
while (head == NULL) {
if (++st->bucket >= nf_ct_expect_hsize)
return NULL;
- head = rcu_dereference(net->ct.expect_hash[st->bucket].first);
+ head = rcu_dereference(hlist_first_rcu(&net->ct.expect_hash[st->bucket]));
}
return head;
}
@@ -630,8 +639,7 @@ int nf_conntrack_expect_init(struct net *net)
}
net->ct.expect_count = 0;
- net->ct.expect_hash = nf_ct_alloc_hashtable(&nf_ct_expect_hsize,
- &net->ct.expect_vmalloc, 0);
+ net->ct.expect_hash = nf_ct_alloc_hashtable(&nf_ct_expect_hsize, 0);
if (net->ct.expect_hash == NULL)
goto err1;
@@ -653,8 +661,7 @@ err3:
if (net_eq(net, &init_net))
kmem_cache_destroy(nf_ct_expect_cachep);
err2:
- nf_ct_free_hashtable(net->ct.expect_hash, net->ct.expect_vmalloc,
- nf_ct_expect_hsize);
+ nf_ct_free_hashtable(net->ct.expect_hash, nf_ct_expect_hsize);
err1:
return err;
}
@@ -666,6 +673,5 @@ void nf_conntrack_expect_fini(struct net *net)
rcu_barrier(); /* Wait for call_rcu() before destroy */
kmem_cache_destroy(nf_ct_expect_cachep);
}
- nf_ct_free_hashtable(net->ct.expect_hash, net->ct.expect_vmalloc,
- nf_ct_expect_hsize);
+ nf_ct_free_hashtable(net->ct.expect_hash, nf_ct_expect_hsize);
}
diff --git a/net/netfilter/nf_conntrack_extend.c b/net/netfilter/nf_conntrack_extend.c
index bd82450c193..80a23ed62bb 100644
--- a/net/netfilter/nf_conntrack_extend.c
+++ b/net/netfilter/nf_conntrack_extend.c
@@ -140,15 +140,16 @@ static void update_alloc_size(struct nf_ct_ext_type *type)
/* This assumes that extended areas in conntrack for the types
whose NF_CT_EXT_F_PREALLOC bit set are allocated in order */
for (i = min; i <= max; i++) {
- t1 = nf_ct_ext_types[i];
+ t1 = rcu_dereference_protected(nf_ct_ext_types[i],
+ lockdep_is_held(&nf_ct_ext_type_mutex));
if (!t1)
continue;
- t1->alloc_size = sizeof(struct nf_ct_ext)
- + ALIGN(sizeof(struct nf_ct_ext), t1->align)
- + t1->len;
+ t1->alloc_size = ALIGN(sizeof(struct nf_ct_ext), t1->align) +
+ t1->len;
for (j = 0; j < NF_CT_EXT_NUM; j++) {
- t2 = nf_ct_ext_types[j];
+ t2 = rcu_dereference_protected(nf_ct_ext_types[j],
+ lockdep_is_held(&nf_ct_ext_type_mutex));
if (t2 == NULL || t2 == t1 ||
(t2->flags & NF_CT_EXT_F_PREALLOC) == 0)
continue;
diff --git a/net/netfilter/nf_conntrack_helper.c b/net/netfilter/nf_conntrack_helper.c
index 59e1a4cd4e8..1bdfea35795 100644
--- a/net/netfilter/nf_conntrack_helper.c
+++ b/net/netfilter/nf_conntrack_helper.c
@@ -33,7 +33,6 @@ static DEFINE_MUTEX(nf_ct_helper_mutex);
static struct hlist_head *nf_ct_helper_hash __read_mostly;
static unsigned int nf_ct_helper_hsize __read_mostly;
static unsigned int nf_ct_helper_count __read_mostly;
-static int nf_ct_helper_vmalloc;
/* Stupid hash, but collision free for the default registrations of the
@@ -158,7 +157,10 @@ static inline int unhelp(struct nf_conntrack_tuple_hash *i,
struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(i);
struct nf_conn_help *help = nfct_help(ct);
- if (help && help->helper == me) {
+ if (help && rcu_dereference_protected(
+ help->helper,
+ lockdep_is_held(&nf_conntrack_lock)
+ ) == me) {
nf_conntrack_event(IPCT_HELPER, ct);
rcu_assign_pointer(help->helper, NULL);
}
@@ -210,7 +212,10 @@ static void __nf_conntrack_helper_unregister(struct nf_conntrack_helper *me,
hlist_for_each_entry_safe(exp, n, next,
&net->ct.expect_hash[i], hnode) {
struct nf_conn_help *help = nfct_help(exp->master);
- if ((help->helper == me || exp->helper == me) &&
+ if ((rcu_dereference_protected(
+ help->helper,
+ lockdep_is_held(&nf_conntrack_lock)
+ ) == me || exp->helper == me) &&
del_timer(&exp->timeout)) {
nf_ct_unlink_expect(exp);
nf_ct_expect_put(exp);
@@ -261,8 +266,7 @@ int nf_conntrack_helper_init(void)
int err;
nf_ct_helper_hsize = 1; /* gets rounded up to use one page */
- nf_ct_helper_hash = nf_ct_alloc_hashtable(&nf_ct_helper_hsize,
- &nf_ct_helper_vmalloc, 0);
+ nf_ct_helper_hash = nf_ct_alloc_hashtable(&nf_ct_helper_hsize, 0);
if (!nf_ct_helper_hash)
return -ENOMEM;
@@ -273,14 +277,12 @@ int nf_conntrack_helper_init(void)
return 0;
err1:
- nf_ct_free_hashtable(nf_ct_helper_hash, nf_ct_helper_vmalloc,
- nf_ct_helper_hsize);
+ nf_ct_free_hashtable(nf_ct_helper_hash, nf_ct_helper_hsize);
return err;
}
void nf_conntrack_helper_fini(void)
{
nf_ct_extend_unregister(&helper_extend);
- nf_ct_free_hashtable(nf_ct_helper_hash, nf_ct_helper_vmalloc,
- nf_ct_helper_hsize);
+ nf_ct_free_hashtable(nf_ct_helper_hash, nf_ct_helper_hsize);
}
diff --git a/net/netfilter/nf_conntrack_netbios_ns.c b/net/netfilter/nf_conntrack_netbios_ns.c
index aadde018a07..4c8f30a3d6d 100644
--- a/net/netfilter/nf_conntrack_netbios_ns.c
+++ b/net/netfilter/nf_conntrack_netbios_ns.c
@@ -18,14 +18,7 @@
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/init.h>
-#include <linux/skbuff.h>
-#include <linux/netdevice.h>
-#include <linux/inetdevice.h>
-#include <linux/if_addr.h>
#include <linux/in.h>
-#include <linux/ip.h>
-#include <linux/netfilter.h>
-#include <net/route.h>
#include <net/netfilter/nf_conntrack.h>
#include <net/netfilter/nf_conntrack_helper.h>
@@ -40,75 +33,26 @@ MODULE_ALIAS("ip_conntrack_netbios_ns");
MODULE_ALIAS_NFCT_HELPER("netbios_ns");
static unsigned int timeout __read_mostly = 3;
-module_param(timeout, uint, 0400);
+module_param(timeout, uint, S_IRUSR);
MODULE_PARM_DESC(timeout, "timeout for master connection/replies in seconds");
-static int help(struct sk_buff *skb, unsigned int protoff,
- struct nf_conn *ct, enum ip_conntrack_info ctinfo)
-{
- struct nf_conntrack_expect *exp;
- struct iphdr *iph = ip_hdr(skb);
- struct rtable *rt = skb_rtable(skb);
- struct in_device *in_dev;
- __be32 mask = 0;
-
- /* we're only interested in locally generated packets */
- if (skb->sk == NULL)
- goto out;
- if (rt == NULL || !(rt->rt_flags & RTCF_BROADCAST))
- goto out;
- if (CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL)
- goto out;
-
- rcu_read_lock();
- in_dev = __in_dev_get_rcu(rt->dst.dev);
- if (in_dev != NULL) {
- for_primary_ifa(in_dev) {
- if (ifa->ifa_broadcast == iph->daddr) {
- mask = ifa->ifa_mask;
- break;
- }
- } endfor_ifa(in_dev);
- }
- rcu_read_unlock();
-
- if (mask == 0)
- goto out;
-
- exp = nf_ct_expect_alloc(ct);
- if (exp == NULL)
- goto out;
-
- exp->tuple = ct->tuplehash[IP_CT_DIR_REPLY].tuple;
- exp->tuple.src.u.udp.port = htons(NMBD_PORT);
-
- exp->mask.src.u3.ip = mask;
- exp->mask.src.u.udp.port = htons(0xFFFF);
-
- exp->expectfn = NULL;
- exp->flags = NF_CT_EXPECT_PERMANENT;
- exp->class = NF_CT_EXPECT_CLASS_DEFAULT;
- exp->helper = NULL;
-
- nf_ct_expect_related(exp);
- nf_ct_expect_put(exp);
-
- nf_ct_refresh(ct, skb, timeout * HZ);
-out:
- return NF_ACCEPT;
-}
-
static struct nf_conntrack_expect_policy exp_policy = {
.max_expected = 1,
};
+static int netbios_ns_help(struct sk_buff *skb, unsigned int protoff,
+ struct nf_conn *ct, enum ip_conntrack_info ctinfo)
+{
+ return nf_conntrack_broadcast_help(skb, protoff, ct, ctinfo, timeout);
+}
+
static struct nf_conntrack_helper helper __read_mostly = {
.name = "netbios-ns",
- .tuple.src.l3num = AF_INET,
+ .tuple.src.l3num = NFPROTO_IPV4,
.tuple.src.u.udp.port = cpu_to_be16(NMBD_PORT),
.tuple.dst.protonum = IPPROTO_UDP,
.me = THIS_MODULE,
- .help = help,
+ .help = netbios_ns_help,
.expect_policy = &exp_policy,
};
diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c
index 2b7eef37875..30bf8a167fc 100644
--- a/net/netfilter/nf_conntrack_netlink.c
+++ b/net/netfilter/nf_conntrack_netlink.c
@@ -42,6 +42,7 @@
#include <net/netfilter/nf_conntrack_tuple.h>
#include <net/netfilter/nf_conntrack_acct.h>
#include <net/netfilter/nf_conntrack_zones.h>
+#include <net/netfilter/nf_conntrack_timestamp.h>
#ifdef CONFIG_NF_NAT_NEEDED
#include <net/netfilter/nf_nat_core.h>
#include <net/netfilter/nf_nat_protocol.h>
@@ -230,6 +231,33 @@ nla_put_failure:
return -1;
}
+static int
+ctnetlink_dump_timestamp(struct sk_buff *skb, const struct nf_conn *ct)
+{
+ struct nlattr *nest_count;
+ const struct nf_conn_tstamp *tstamp;
+
+ tstamp = nf_conn_tstamp_find(ct);
+ if (!tstamp)
+ return 0;
+
+ nest_count = nla_nest_start(skb, CTA_TIMESTAMP | NLA_F_NESTED);
+ if (!nest_count)
+ goto nla_put_failure;
+
+ NLA_PUT_BE64(skb, CTA_TIMESTAMP_START, cpu_to_be64(tstamp->start));
+ if (tstamp->stop != 0) {
+ NLA_PUT_BE64(skb, CTA_TIMESTAMP_STOP,
+ cpu_to_be64(tstamp->stop));
+ }
+ nla_nest_end(skb, nest_count);
+
+ return 0;
+
+nla_put_failure:
+ return -1;
+}
+
#ifdef CONFIG_NF_CONNTRACK_MARK
static inline int
ctnetlink_dump_mark(struct sk_buff *skb, const struct nf_conn *ct)
@@ -404,6 +432,7 @@ ctnetlink_fill_info(struct sk_buff *skb, u32 pid, u32 seq,
ctnetlink_dump_timeout(skb, ct) < 0 ||
ctnetlink_dump_counters(skb, ct, IP_CT_DIR_ORIGINAL) < 0 ||
ctnetlink_dump_counters(skb, ct, IP_CT_DIR_REPLY) < 0 ||
+ ctnetlink_dump_timestamp(skb, ct) < 0 ||
ctnetlink_dump_protoinfo(skb, ct) < 0 ||
ctnetlink_dump_helpinfo(skb, ct) < 0 ||
ctnetlink_dump_mark(skb, ct) < 0 ||
@@ -471,6 +500,18 @@ ctnetlink_secctx_size(const struct nf_conn *ct)
}
static inline size_t
+ctnetlink_timestamp_size(const struct nf_conn *ct)
+{
+#ifdef CONFIG_NF_CONNTRACK_TIMESTAMP
+ if (!nf_ct_ext_exist(ct, NF_CT_EXT_TSTAMP))
+ return 0;
+ return nla_total_size(0) + 2 * nla_total_size(sizeof(uint64_t));
+#else
+ return 0;
+#endif
+}
+
+static inline size_t
ctnetlink_nlmsg_size(const struct nf_conn *ct)
{
return NLMSG_ALIGN(sizeof(struct nfgenmsg))
@@ -481,6 +522,7 @@ ctnetlink_nlmsg_size(const struct nf_conn *ct)
+ nla_total_size(sizeof(u_int32_t)) /* CTA_ID */
+ nla_total_size(sizeof(u_int32_t)) /* CTA_STATUS */
+ ctnetlink_counters_size(ct)
+ + ctnetlink_timestamp_size(ct)
+ nla_total_size(sizeof(u_int32_t)) /* CTA_TIMEOUT */
+ nla_total_size(0) /* CTA_PROTOINFO */
+ nla_total_size(0) /* CTA_HELP */
@@ -571,7 +613,8 @@ ctnetlink_conntrack_event(unsigned int events, struct nf_ct_event *item)
if (events & (1 << IPCT_DESTROY)) {
if (ctnetlink_dump_counters(skb, ct, IP_CT_DIR_ORIGINAL) < 0 ||
- ctnetlink_dump_counters(skb, ct, IP_CT_DIR_REPLY) < 0)
+ ctnetlink_dump_counters(skb, ct, IP_CT_DIR_REPLY) < 0 ||
+ ctnetlink_dump_timestamp(skb, ct) < 0)
goto nla_put_failure;
} else {
if (ctnetlink_dump_timeout(skb, ct) < 0)
@@ -667,6 +710,7 @@ restart:
if (ctnetlink_fill_info(skb, NETLINK_CB(cb->skb).pid,
cb->nlh->nlmsg_seq,
IPCTNL_MSG_CT_NEW, ct) < 0) {
+ nf_conntrack_get(&ct->ct_general);
cb->args[1] = (unsigned long)ct;
goto out;
}
@@ -760,7 +804,7 @@ static const struct nla_policy tuple_nla_policy[CTA_TUPLE_MAX+1] = {
static int
ctnetlink_parse_tuple(const struct nlattr * const cda[],
struct nf_conntrack_tuple *tuple,
- enum ctattr_tuple type, u_int8_t l3num)
+ enum ctattr_type type, u_int8_t l3num)
{
struct nlattr *tb[CTA_TUPLE_MAX+1];
int err;
@@ -924,7 +968,7 @@ ctnetlink_get_conntrack(struct sock *ctnl, struct sk_buff *skb,
u16 zone;
int err;
- if ((nlh->nlmsg_flags & NLM_F_DUMP) == NLM_F_DUMP)
+ if (nlh->nlmsg_flags & NLM_F_DUMP)
return netlink_dump_start(ctnl, skb, nlh, ctnetlink_dump_table,
ctnetlink_done);
@@ -1357,6 +1401,7 @@ ctnetlink_create_conntrack(struct net *net, u16 zone,
}
nf_ct_acct_ext_add(ct, GFP_ATOMIC);
+ nf_ct_tstamp_ext_add(ct, GFP_ATOMIC);
nf_ct_ecache_ext_add(ct, 0, 0, GFP_ATOMIC);
/* we must add conntrack extensions before confirmation. */
ct->status |= IPS_CONFIRMED;
@@ -1375,6 +1420,7 @@ ctnetlink_create_conntrack(struct net *net, u16 zone,
}
#endif
+ memset(&ct->proto, 0, sizeof(ct->proto));
if (cda[CTA_PROTOINFO]) {
err = ctnetlink_change_protoinfo(ct, cda);
if (err < 0)
@@ -1787,7 +1833,7 @@ ctnetlink_get_expect(struct sock *ctnl, struct sk_buff *skb,
u16 zone;
int err;
- if ((nlh->nlmsg_flags & NLM_F_DUMP) == NLM_F_DUMP) {
+ if (nlh->nlmsg_flags & NLM_F_DUMP) {
return netlink_dump_start(ctnl, skb, nlh,
ctnetlink_exp_dump_table,
ctnetlink_exp_done);
diff --git a/net/netfilter/nf_conntrack_proto.c b/net/netfilter/nf_conntrack_proto.c
index dc7bb74110d..5701c8dd783 100644
--- a/net/netfilter/nf_conntrack_proto.c
+++ b/net/netfilter/nf_conntrack_proto.c
@@ -166,6 +166,7 @@ static void nf_ct_l3proto_unregister_sysctl(struct nf_conntrack_l3proto *l3proto
int nf_conntrack_l3proto_register(struct nf_conntrack_l3proto *proto)
{
int ret = 0;
+ struct nf_conntrack_l3proto *old;
if (proto->l3proto >= AF_MAX)
return -EBUSY;
@@ -174,7 +175,9 @@ int nf_conntrack_l3proto_register(struct nf_conntrack_l3proto *proto)
return -EINVAL;
mutex_lock(&nf_ct_proto_mutex);
- if (nf_ct_l3protos[proto->l3proto] != &nf_conntrack_l3proto_generic) {
+ old = rcu_dereference_protected(nf_ct_l3protos[proto->l3proto],
+ lockdep_is_held(&nf_ct_proto_mutex));
+ if (old != &nf_conntrack_l3proto_generic) {
ret = -EBUSY;
goto out_unlock;
}
@@ -201,7 +204,9 @@ void nf_conntrack_l3proto_unregister(struct nf_conntrack_l3proto *proto)
BUG_ON(proto->l3proto >= AF_MAX);
mutex_lock(&nf_ct_proto_mutex);
- BUG_ON(nf_ct_l3protos[proto->l3proto] != proto);
+ BUG_ON(rcu_dereference_protected(nf_ct_l3protos[proto->l3proto],
+ lockdep_is_held(&nf_ct_proto_mutex)
+ ) != proto);
rcu_assign_pointer(nf_ct_l3protos[proto->l3proto],
&nf_conntrack_l3proto_generic);
nf_ct_l3proto_unregister_sysctl(proto);
@@ -279,7 +284,7 @@ int nf_conntrack_l4proto_register(struct nf_conntrack_l4proto *l4proto)
mutex_lock(&nf_ct_proto_mutex);
if (!nf_ct_protos[l4proto->l3proto]) {
/* l3proto may be loaded latter. */
- struct nf_conntrack_l4proto **proto_array;
+ struct nf_conntrack_l4proto __rcu **proto_array;
int i;
proto_array = kmalloc(MAX_NF_CT_PROTO *
@@ -291,7 +296,7 @@ int nf_conntrack_l4proto_register(struct nf_conntrack_l4proto *l4proto)
}
for (i = 0; i < MAX_NF_CT_PROTO; i++)
- proto_array[i] = &nf_conntrack_l4proto_generic;
+ RCU_INIT_POINTER(proto_array[i], &nf_conntrack_l4proto_generic);
/* Before making proto_array visible to lockless readers,
* we must make sure its content is committed to memory.
@@ -299,8 +304,10 @@ int nf_conntrack_l4proto_register(struct nf_conntrack_l4proto *l4proto)
smp_wmb();
nf_ct_protos[l4proto->l3proto] = proto_array;
- } else if (nf_ct_protos[l4proto->l3proto][l4proto->l4proto] !=
- &nf_conntrack_l4proto_generic) {
+ } else if (rcu_dereference_protected(
+ nf_ct_protos[l4proto->l3proto][l4proto->l4proto],
+ lockdep_is_held(&nf_ct_proto_mutex)
+ ) != &nf_conntrack_l4proto_generic) {
ret = -EBUSY;
goto out_unlock;
}
@@ -331,7 +338,10 @@ void nf_conntrack_l4proto_unregister(struct nf_conntrack_l4proto *l4proto)
BUG_ON(l4proto->l3proto >= PF_MAX);
mutex_lock(&nf_ct_proto_mutex);
- BUG_ON(nf_ct_protos[l4proto->l3proto][l4proto->l4proto] != l4proto);
+ BUG_ON(rcu_dereference_protected(
+ nf_ct_protos[l4proto->l3proto][l4proto->l4proto],
+ lockdep_is_held(&nf_ct_proto_mutex)
+ ) != l4proto);
rcu_assign_pointer(nf_ct_protos[l4proto->l3proto][l4proto->l4proto],
&nf_conntrack_l4proto_generic);
nf_ct_l4proto_unregister_sysctl(l4proto);
diff --git a/net/netfilter/nf_conntrack_proto_dccp.c b/net/netfilter/nf_conntrack_proto_dccp.c
index 5292560d6d4..9ae57c57c50 100644
--- a/net/netfilter/nf_conntrack_proto_dccp.c
+++ b/net/netfilter/nf_conntrack_proto_dccp.c
@@ -452,6 +452,9 @@ static bool dccp_new(struct nf_conn *ct, const struct sk_buff *skb,
ct->proto.dccp.role[IP_CT_DIR_ORIGINAL] = CT_DCCP_ROLE_CLIENT;
ct->proto.dccp.role[IP_CT_DIR_REPLY] = CT_DCCP_ROLE_SERVER;
ct->proto.dccp.state = CT_DCCP_NONE;
+ ct->proto.dccp.last_pkt = DCCP_PKT_REQUEST;
+ ct->proto.dccp.last_dir = IP_CT_DIR_ORIGINAL;
+ ct->proto.dccp.handshake_seq = 0;
return true;
out_invalid:
diff --git a/net/netfilter/nf_conntrack_proto_sctp.c b/net/netfilter/nf_conntrack_proto_sctp.c
index c6049c2d5ea..6f4ee70f460 100644
--- a/net/netfilter/nf_conntrack_proto_sctp.c
+++ b/net/netfilter/nf_conntrack_proto_sctp.c
@@ -413,6 +413,7 @@ static bool sctp_new(struct nf_conn *ct, const struct sk_buff *skb,
test_bit(SCTP_CID_COOKIE_ACK, map))
return false;
+ memset(&ct->proto.sctp, 0, sizeof(ct->proto.sctp));
new_state = SCTP_CONNTRACK_MAX;
for_each_sctp_chunk (skb, sch, _sch, offset, dataoff, count) {
/* Don't need lock here: this conntrack not in circulation yet */
diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c
index 3fb2b73b24d..6f38d0e2ea4 100644
--- a/net/netfilter/nf_conntrack_proto_tcp.c
+++ b/net/netfilter/nf_conntrack_proto_tcp.c
@@ -1066,9 +1066,7 @@ static bool tcp_new(struct nf_conn *ct, const struct sk_buff *skb,
BUG_ON(th == NULL);
/* Don't need lock here: this conntrack not in circulation yet */
- new_state
- = tcp_conntracks[0][get_conntrack_index(th)]
- [TCP_CONNTRACK_NONE];
+ new_state = tcp_conntracks[0][get_conntrack_index(th)][TCP_CONNTRACK_NONE];
/* Invalid: delete conntrack */
if (new_state >= TCP_CONNTRACK_MAX) {
@@ -1077,6 +1075,7 @@ static bool tcp_new(struct nf_conn *ct, const struct sk_buff *skb,
}
if (new_state == TCP_CONNTRACK_SYN_SENT) {
+ memset(&ct->proto.tcp, 0, sizeof(ct->proto.tcp));
/* SYN packet */
ct->proto.tcp.seen[0].td_end =
segment_seq_plus_len(ntohl(th->seq), skb->len,
@@ -1088,11 +1087,11 @@ static bool tcp_new(struct nf_conn *ct, const struct sk_buff *skb,
ct->proto.tcp.seen[0].td_end;
tcp_options(skb, dataoff, th, &ct->proto.tcp.seen[0]);
- ct->proto.tcp.seen[1].flags = 0;
} else if (nf_ct_tcp_loose == 0) {
/* Don't try to pick up connections. */
return false;
} else {
+ memset(&ct->proto.tcp, 0, sizeof(ct->proto.tcp));
/*
* We are in the middle of a connection,
* its history is lost for us.
@@ -1107,7 +1106,6 @@ static bool tcp_new(struct nf_conn *ct, const struct sk_buff *skb,
ct->proto.tcp.seen[0].td_maxend =
ct->proto.tcp.seen[0].td_end +
ct->proto.tcp.seen[0].td_maxwin;
- ct->proto.tcp.seen[0].td_scale = 0;
/* We assume SACK and liberal window checking to handle
* window scaling */
@@ -1116,13 +1114,7 @@ static bool tcp_new(struct nf_conn *ct, const struct sk_buff *skb,
IP_CT_TCP_FLAG_BE_LIBERAL;
}
- ct->proto.tcp.seen[1].td_end = 0;
- ct->proto.tcp.seen[1].td_maxend = 0;
- ct->proto.tcp.seen[1].td_maxwin = 0;
- ct->proto.tcp.seen[1].td_scale = 0;
-
/* tcp_packet will set them */
- ct->proto.tcp.state = TCP_CONNTRACK_NONE;
ct->proto.tcp.last_index = TCP_NONE_SET;
pr_debug("tcp_new: sender end=%u maxend=%u maxwin=%u scale=%i "
diff --git a/net/netfilter/nf_conntrack_snmp.c b/net/netfilter/nf_conntrack_snmp.c
new file mode 100644
index 00000000000..6e545e26289
--- /dev/null
+++ b/net/netfilter/nf_conntrack_snmp.c
@@ -0,0 +1,77 @@
+/*
+ * SNMP service broadcast connection tracking helper
+ *
+ * (c) 2011 Jiri Olsa <jolsa@redhat.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/in.h>
+
+#include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_conntrack_helper.h>
+#include <net/netfilter/nf_conntrack_expect.h>
+
+#define SNMP_PORT 161
+
+MODULE_AUTHOR("Jiri Olsa <jolsa@redhat.com>");
+MODULE_DESCRIPTION("SNMP service broadcast connection tracking helper");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_NFCT_HELPER("snmp");
+
+static unsigned int timeout __read_mostly = 30;
+module_param(timeout, uint, S_IRUSR);
+MODULE_PARM_DESC(timeout, "timeout for master connection/replies in seconds");
+
+int (*nf_nat_snmp_hook)(struct sk_buff *skb,
+ unsigned int protoff,
+ struct nf_conn *ct,
+ enum ip_conntrack_info ctinfo);
+EXPORT_SYMBOL_GPL(nf_nat_snmp_hook);
+
+static int snmp_conntrack_help(struct sk_buff *skb, unsigned int protoff,
+ struct nf_conn *ct, enum ip_conntrack_info ctinfo)
+{
+ typeof(nf_nat_snmp_hook) nf_nat_snmp;
+
+ nf_conntrack_broadcast_help(skb, protoff, ct, ctinfo, timeout);
+
+ nf_nat_snmp = rcu_dereference(nf_nat_snmp_hook);
+ if (nf_nat_snmp && ct->status & IPS_NAT_MASK)
+ return nf_nat_snmp(skb, protoff, ct, ctinfo);
+
+ return NF_ACCEPT;
+}
+
+static struct nf_conntrack_expect_policy exp_policy = {
+ .max_expected = 1,
+};
+
+static struct nf_conntrack_helper helper __read_mostly = {
+ .name = "snmp",
+ .tuple.src.l3num = NFPROTO_IPV4,
+ .tuple.src.u.udp.port = cpu_to_be16(SNMP_PORT),
+ .tuple.dst.protonum = IPPROTO_UDP,
+ .me = THIS_MODULE,
+ .help = snmp_conntrack_help,
+ .expect_policy = &exp_policy,
+};
+
+static int __init nf_conntrack_snmp_init(void)
+{
+ exp_policy.timeout = timeout;
+ return nf_conntrack_helper_register(&helper);
+}
+
+static void __exit nf_conntrack_snmp_fini(void)
+{
+ nf_conntrack_helper_unregister(&helper);
+}
+
+module_init(nf_conntrack_snmp_init);
+module_exit(nf_conntrack_snmp_fini);
diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c
index b4d7f0f24b2..0ae14282588 100644
--- a/net/netfilter/nf_conntrack_standalone.c
+++ b/net/netfilter/nf_conntrack_standalone.c
@@ -29,6 +29,8 @@
#include <net/netfilter/nf_conntrack_helper.h>
#include <net/netfilter/nf_conntrack_acct.h>
#include <net/netfilter/nf_conntrack_zones.h>
+#include <net/netfilter/nf_conntrack_timestamp.h>
+#include <linux/rculist_nulls.h>
MODULE_LICENSE("GPL");
@@ -45,6 +47,7 @@ EXPORT_SYMBOL_GPL(print_tuple);
struct ct_iter_state {
struct seq_net_private p;
unsigned int bucket;
+ u_int64_t time_now;
};
static struct hlist_nulls_node *ct_get_first(struct seq_file *seq)
@@ -56,7 +59,7 @@ static struct hlist_nulls_node *ct_get_first(struct seq_file *seq)
for (st->bucket = 0;
st->bucket < net->ct.htable_size;
st->bucket++) {
- n = rcu_dereference(net->ct.hash[st->bucket].first);
+ n = rcu_dereference(hlist_nulls_first_rcu(&net->ct.hash[st->bucket]));
if (!is_a_nulls(n))
return n;
}
@@ -69,13 +72,15 @@ static struct hlist_nulls_node *ct_get_next(struct seq_file *seq,
struct net *net = seq_file_net(seq);
struct ct_iter_state *st = seq->private;
- head = rcu_dereference(head->next);
+ head = rcu_dereference(hlist_nulls_next_rcu(head));
while (is_a_nulls(head)) {
if (likely(get_nulls_value(head) == st->bucket)) {
if (++st->bucket >= net->ct.htable_size)
return NULL;
}
- head = rcu_dereference(net->ct.hash[st->bucket].first);
+ head = rcu_dereference(
+ hlist_nulls_first_rcu(
+ &net->ct.hash[st->bucket]));
}
return head;
}
@@ -93,6 +98,9 @@ static struct hlist_nulls_node *ct_get_idx(struct seq_file *seq, loff_t pos)
static void *ct_seq_start(struct seq_file *seq, loff_t *pos)
__acquires(RCU)
{
+ struct ct_iter_state *st = seq->private;
+
+ st->time_now = ktime_to_ns(ktime_get_real());
rcu_read_lock();
return ct_get_idx(seq, *pos);
}
@@ -132,6 +140,34 @@ static inline int ct_show_secctx(struct seq_file *s, const struct nf_conn *ct)
}
#endif
+#ifdef CONFIG_NF_CONNTRACK_TIMESTAMP
+static int ct_show_delta_time(struct seq_file *s, const struct nf_conn *ct)
+{
+ struct ct_iter_state *st = s->private;
+ struct nf_conn_tstamp *tstamp;
+ s64 delta_time;
+
+ tstamp = nf_conn_tstamp_find(ct);
+ if (tstamp) {
+ delta_time = st->time_now - tstamp->start;
+ if (delta_time > 0)
+ delta_time = div_s64(delta_time, NSEC_PER_SEC);
+ else
+ delta_time = 0;
+
+ return seq_printf(s, "delta-time=%llu ",
+ (unsigned long long)delta_time);
+ }
+ return 0;
+}
+#else
+static inline int
+ct_show_delta_time(struct seq_file *s, const struct nf_conn *ct)
+{
+ return 0;
+}
+#endif
+
/* return 0 on success, 1 in case of error */
static int ct_seq_show(struct seq_file *s, void *v)
{
@@ -200,6 +236,9 @@ static int ct_seq_show(struct seq_file *s, void *v)
goto release;
#endif
+ if (ct_show_delta_time(s, ct))
+ goto release;
+
if (seq_printf(s, "use=%u\n", atomic_read(&ct->ct_general.use)))
goto release;
diff --git a/net/netfilter/nf_conntrack_timestamp.c b/net/netfilter/nf_conntrack_timestamp.c
new file mode 100644
index 00000000000..af7dd31af0a
--- /dev/null
+++ b/net/netfilter/nf_conntrack_timestamp.c
@@ -0,0 +1,120 @@
+/*
+ * (C) 2010 Pablo Neira Ayuso <pablo@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation (or any later at your option).
+ */
+
+#include <linux/netfilter.h>
+#include <linux/slab.h>
+#include <linux/kernel.h>
+#include <linux/moduleparam.h>
+
+#include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_conntrack_extend.h>
+#include <net/netfilter/nf_conntrack_timestamp.h>
+
+static int nf_ct_tstamp __read_mostly;
+
+module_param_named(tstamp, nf_ct_tstamp, bool, 0644);
+MODULE_PARM_DESC(tstamp, "Enable connection tracking flow timestamping.");
+
+#ifdef CONFIG_SYSCTL
+static struct ctl_table tstamp_sysctl_table[] = {
+ {
+ .procname = "nf_conntrack_timestamp",
+ .data = &init_net.ct.sysctl_tstamp,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {}
+};
+#endif /* CONFIG_SYSCTL */
+
+static struct nf_ct_ext_type tstamp_extend __read_mostly = {
+ .len = sizeof(struct nf_conn_tstamp),
+ .align = __alignof__(struct nf_conn_tstamp),
+ .id = NF_CT_EXT_TSTAMP,
+};
+
+#ifdef CONFIG_SYSCTL
+static int nf_conntrack_tstamp_init_sysctl(struct net *net)
+{
+ struct ctl_table *table;
+
+ table = kmemdup(tstamp_sysctl_table, sizeof(tstamp_sysctl_table),
+ GFP_KERNEL);
+ if (!table)
+ goto out;
+
+ table[0].data = &net->ct.sysctl_tstamp;
+
+ net->ct.tstamp_sysctl_header = register_net_sysctl_table(net,
+ nf_net_netfilter_sysctl_path, table);
+ if (!net->ct.tstamp_sysctl_header) {
+ printk(KERN_ERR "nf_ct_tstamp: can't register to sysctl.\n");
+ goto out_register;
+ }
+ return 0;
+
+out_register:
+ kfree(table);
+out:
+ return -ENOMEM;
+}
+
+static void nf_conntrack_tstamp_fini_sysctl(struct net *net)
+{
+ struct ctl_table *table;
+
+ table = net->ct.tstamp_sysctl_header->ctl_table_arg;
+ unregister_net_sysctl_table(net->ct.tstamp_sysctl_header);
+ kfree(table);
+}
+#else
+static int nf_conntrack_tstamp_init_sysctl(struct net *net)
+{
+ return 0;
+}
+
+static void nf_conntrack_tstamp_fini_sysctl(struct net *net)
+{
+}
+#endif
+
+int nf_conntrack_tstamp_init(struct net *net)
+{
+ int ret;
+
+ net->ct.sysctl_tstamp = nf_ct_tstamp;
+
+ if (net_eq(net, &init_net)) {
+ ret = nf_ct_extend_register(&tstamp_extend);
+ if (ret < 0) {
+ printk(KERN_ERR "nf_ct_tstamp: Unable to register "
+ "extension\n");
+ goto out_extend_register;
+ }
+ }
+
+ ret = nf_conntrack_tstamp_init_sysctl(net);
+ if (ret < 0)
+ goto out_sysctl;
+
+ return 0;
+
+out_sysctl:
+ if (net_eq(net, &init_net))
+ nf_ct_extend_unregister(&tstamp_extend);
+out_extend_register:
+ return ret;
+}
+
+void nf_conntrack_tstamp_fini(struct net *net)
+{
+ nf_conntrack_tstamp_fini_sysctl(net);
+ if (net_eq(net, &init_net))
+ nf_ct_extend_unregister(&tstamp_extend);
+}
diff --git a/net/netfilter/nf_log.c b/net/netfilter/nf_log.c
index b07393eab88..20c775cff2a 100644
--- a/net/netfilter/nf_log.c
+++ b/net/netfilter/nf_log.c
@@ -161,7 +161,8 @@ static int seq_show(struct seq_file *s, void *v)
struct nf_logger *t;
int ret;
- logger = nf_loggers[*pos];
+ logger = rcu_dereference_protected(nf_loggers[*pos],
+ lockdep_is_held(&nf_log_mutex));
if (!logger)
ret = seq_printf(s, "%2lld NONE (", *pos);
@@ -249,7 +250,8 @@ static int nf_log_proc_dostring(ctl_table *table, int write,
mutex_unlock(&nf_log_mutex);
} else {
mutex_lock(&nf_log_mutex);
- logger = nf_loggers[tindex];
+ logger = rcu_dereference_protected(nf_loggers[tindex],
+ lockdep_is_held(&nf_log_mutex));
if (!logger)
table->data = "NONE";
else
diff --git a/net/netfilter/nf_queue.c b/net/netfilter/nf_queue.c
index 74aebed5bd2..5ab22e2bbd7 100644
--- a/net/netfilter/nf_queue.c
+++ b/net/netfilter/nf_queue.c
@@ -27,14 +27,17 @@ static DEFINE_MUTEX(queue_handler_mutex);
int nf_register_queue_handler(u_int8_t pf, const struct nf_queue_handler *qh)
{
int ret;
+ const struct nf_queue_handler *old;
if (pf >= ARRAY_SIZE(queue_handler))
return -EINVAL;
mutex_lock(&queue_handler_mutex);
- if (queue_handler[pf] == qh)
+ old = rcu_dereference_protected(queue_handler[pf],
+ lockdep_is_held(&queue_handler_mutex));
+ if (old == qh)
ret = -EEXIST;
- else if (queue_handler[pf])
+ else if (old)
ret = -EBUSY;
else {
rcu_assign_pointer(queue_handler[pf], qh);
@@ -49,11 +52,15 @@ EXPORT_SYMBOL(nf_register_queue_handler);
/* The caller must flush their queue before this */
int nf_unregister_queue_handler(u_int8_t pf, const struct nf_queue_handler *qh)
{
+ const struct nf_queue_handler *old;
+
if (pf >= ARRAY_SIZE(queue_handler))
return -EINVAL;
mutex_lock(&queue_handler_mutex);
- if (queue_handler[pf] && queue_handler[pf] != qh) {
+ old = rcu_dereference_protected(queue_handler[pf],
+ lockdep_is_held(&queue_handler_mutex));
+ if (old && old != qh) {
mutex_unlock(&queue_handler_mutex);
return -EINVAL;
}
@@ -73,7 +80,10 @@ void nf_unregister_queue_handlers(const struct nf_queue_handler *qh)
mutex_lock(&queue_handler_mutex);
for (pf = 0; pf < ARRAY_SIZE(queue_handler); pf++) {
- if (queue_handler[pf] == qh)
+ if (rcu_dereference_protected(
+ queue_handler[pf],
+ lockdep_is_held(&queue_handler_mutex)
+ ) == qh)
rcu_assign_pointer(queue_handler[pf], NULL);
}
mutex_unlock(&queue_handler_mutex);
@@ -115,7 +125,7 @@ static int __nf_queue(struct sk_buff *skb,
int (*okfn)(struct sk_buff *),
unsigned int queuenum)
{
- int status;
+ int status = -ENOENT;
struct nf_queue_entry *entry = NULL;
#ifdef CONFIG_BRIDGE_NETFILTER
struct net_device *physindev;
@@ -128,16 +138,20 @@ static int __nf_queue(struct sk_buff *skb,
rcu_read_lock();
qh = rcu_dereference(queue_handler[pf]);
- if (!qh)
+ if (!qh) {
+ status = -ESRCH;
goto err_unlock;
+ }
afinfo = nf_get_afinfo(pf);
if (!afinfo)
goto err_unlock;
entry = kmalloc(sizeof(*entry) + afinfo->route_key_size, GFP_ATOMIC);
- if (!entry)
+ if (!entry) {
+ status = -ENOMEM;
goto err_unlock;
+ }
*entry = (struct nf_queue_entry) {
.skb = skb,
@@ -151,11 +165,9 @@ static int __nf_queue(struct sk_buff *skb,
/* If it's going away, ignore hook. */
if (!try_module_get(entry->elem->owner)) {
- rcu_read_unlock();
- kfree(entry);
- return 0;
+ status = -ECANCELED;
+ goto err_unlock;
}
-
/* Bump dev refs so they don't vanish while packet is out */
if (indev)
dev_hold(indev);
@@ -182,14 +194,13 @@ static int __nf_queue(struct sk_buff *skb,
goto err;
}
- return 1;
+ return 0;
err_unlock:
rcu_read_unlock();
err:
- kfree_skb(skb);
kfree(entry);
- return 1;
+ return status;
}
int nf_queue(struct sk_buff *skb,
@@ -201,6 +212,8 @@ int nf_queue(struct sk_buff *skb,
unsigned int queuenum)
{
struct sk_buff *segs;
+ int err;
+ unsigned int queued;
if (!skb_is_gso(skb))
return __nf_queue(skb, elem, pf, hook, indev, outdev, okfn,
@@ -216,20 +229,35 @@ int nf_queue(struct sk_buff *skb,
}
segs = skb_gso_segment(skb, 0);
- kfree_skb(skb);
+ /* Does not use PTR_ERR to limit the number of error codes that can be
+ * returned by nf_queue. For instance, callers rely on -ECANCELED to mean
+ * 'ignore this hook'.
+ */
if (IS_ERR(segs))
- return 1;
+ return -EINVAL;
+ queued = 0;
+ err = 0;
do {
struct sk_buff *nskb = segs->next;
segs->next = NULL;
- if (!__nf_queue(segs, elem, pf, hook, indev, outdev, okfn,
- queuenum))
+ if (err == 0)
+ err = __nf_queue(segs, elem, pf, hook, indev,
+ outdev, okfn, queuenum);
+ if (err == 0)
+ queued++;
+ else
kfree_skb(segs);
segs = nskb;
} while (segs);
- return 1;
+
+ /* also free orig skb if only some segments were queued */
+ if (unlikely(err && queued))
+ err = 0;
+ if (err == 0)
+ kfree_skb(skb);
+ return err;
}
void nf_reinject(struct nf_queue_entry *entry, unsigned int verdict)
@@ -237,6 +265,7 @@ void nf_reinject(struct nf_queue_entry *entry, unsigned int verdict)
struct sk_buff *skb = entry->skb;
struct list_head *elem = &entry->elem->list;
const struct nf_afinfo *afinfo;
+ int err;
rcu_read_lock();
@@ -270,10 +299,17 @@ void nf_reinject(struct nf_queue_entry *entry, unsigned int verdict)
local_bh_enable();
break;
case NF_QUEUE:
- if (!__nf_queue(skb, elem, entry->pf, entry->hook,
- entry->indev, entry->outdev, entry->okfn,
- verdict >> NF_VERDICT_BITS))
- goto next_hook;
+ err = __nf_queue(skb, elem, entry->pf, entry->hook,
+ entry->indev, entry->outdev, entry->okfn,
+ verdict >> NF_VERDICT_QBITS);
+ if (err < 0) {
+ if (err == -ECANCELED)
+ goto next_hook;
+ if (err == -ESRCH &&
+ (verdict & NF_VERDICT_FLAG_QUEUE_BYPASS))
+ goto next_hook;
+ kfree_skb(skb);
+ }
break;
case NF_STOLEN:
default:
diff --git a/net/netfilter/nf_tproxy_core.c b/net/netfilter/nf_tproxy_core.c
index 4d87befb04c..474d621cbc2 100644
--- a/net/netfilter/nf_tproxy_core.c
+++ b/net/netfilter/nf_tproxy_core.c
@@ -28,26 +28,23 @@ nf_tproxy_destructor(struct sk_buff *skb)
skb->destructor = NULL;
if (sk)
- nf_tproxy_put_sock(sk);
+ sock_put(sk);
}
/* consumes sk */
-int
+void
nf_tproxy_assign_sock(struct sk_buff *skb, struct sock *sk)
{
- bool transparent = (sk->sk_state == TCP_TIME_WAIT) ?
- inet_twsk(sk)->tw_transparent :
- inet_sk(sk)->transparent;
-
- if (transparent) {
- skb_orphan(skb);
- skb->sk = sk;
- skb->destructor = nf_tproxy_destructor;
- return 1;
- } else
- nf_tproxy_put_sock(sk);
-
- return 0;
+ /* assigning tw sockets complicates things; most
+ * skb->sk->X checks would have to test sk->sk_state first */
+ if (sk->sk_state == TCP_TIME_WAIT) {
+ inet_twsk_put(inet_twsk(sk));
+ return;
+ }
+
+ skb_orphan(skb);
+ skb->sk = sk;
+ skb->destructor = nf_tproxy_destructor;
}
EXPORT_SYMBOL_GPL(nf_tproxy_assign_sock);
diff --git a/net/netfilter/nfnetlink_log.c b/net/netfilter/nfnetlink_log.c
index 6a1572b0ab4..91592da504b 100644
--- a/net/netfilter/nfnetlink_log.c
+++ b/net/netfilter/nfnetlink_log.c
@@ -874,19 +874,19 @@ static struct hlist_node *get_first(struct iter_state *st)
for (st->bucket = 0; st->bucket < INSTANCE_BUCKETS; st->bucket++) {
if (!hlist_empty(&instance_table[st->bucket]))
- return rcu_dereference_bh(instance_table[st->bucket].first);
+ return rcu_dereference_bh(hlist_first_rcu(&instance_table[st->bucket]));
}
return NULL;
}
static struct hlist_node *get_next(struct iter_state *st, struct hlist_node *h)
{
- h = rcu_dereference_bh(h->next);
+ h = rcu_dereference_bh(hlist_next_rcu(h));
while (!h) {
if (++st->bucket >= INSTANCE_BUCKETS)
return NULL;
- h = rcu_dereference_bh(instance_table[st->bucket].first);
+ h = rcu_dereference_bh(hlist_first_rcu(&instance_table[st->bucket]));
}
return h;
}
diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c
index 68e67d19724..b83123f12b4 100644
--- a/net/netfilter/nfnetlink_queue.c
+++ b/net/netfilter/nfnetlink_queue.c
@@ -387,25 +387,31 @@ nfqnl_enqueue_packet(struct nf_queue_entry *entry, unsigned int queuenum)
{
struct sk_buff *nskb;
struct nfqnl_instance *queue;
- int err;
+ int err = -ENOBUFS;
/* rcu_read_lock()ed by nf_hook_slow() */
queue = instance_lookup(queuenum);
- if (!queue)
+ if (!queue) {
+ err = -ESRCH;
goto err_out;
+ }
- if (queue->copy_mode == NFQNL_COPY_NONE)
+ if (queue->copy_mode == NFQNL_COPY_NONE) {
+ err = -EINVAL;
goto err_out;
+ }
nskb = nfqnl_build_packet_message(queue, entry);
- if (nskb == NULL)
+ if (nskb == NULL) {
+ err = -ENOMEM;
goto err_out;
-
+ }
spin_lock_bh(&queue->lock);
- if (!queue->peer_pid)
+ if (!queue->peer_pid) {
+ err = -EINVAL;
goto err_out_free_nskb;
-
+ }
if (queue->queue_total >= queue->queue_maxlen) {
queue->queue_dropped++;
if (net_ratelimit())
@@ -432,7 +438,7 @@ err_out_free_nskb:
err_out_unlock:
spin_unlock_bh(&queue->lock);
err_out:
- return -1;
+ return err;
}
static int
diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c
index c9423763107..0a77d2ff215 100644
--- a/net/netfilter/x_tables.c
+++ b/net/netfilter/x_tables.c
@@ -23,6 +23,7 @@
#include <linux/mutex.h>
#include <linux/mm.h>
#include <linux/slab.h>
+#include <linux/audit.h>
#include <net/net_namespace.h>
#include <linux/netfilter/x_tables.h>
@@ -38,9 +39,8 @@ MODULE_DESCRIPTION("{ip,ip6,arp,eb}_tables backend module");
#define SMP_ALIGN(x) (((x) + SMP_CACHE_BYTES-1) & ~(SMP_CACHE_BYTES-1))
struct compat_delta {
- struct compat_delta *next;
- unsigned int offset;
- int delta;
+ unsigned int offset; /* offset in kernel */
+ int delta; /* delta in 32bit user land */
};
struct xt_af {
@@ -49,7 +49,9 @@ struct xt_af {
struct list_head target;
#ifdef CONFIG_COMPAT
struct mutex compat_mutex;
- struct compat_delta *compat_offsets;
+ struct compat_delta *compat_tab;
+ unsigned int number; /* number of slots in compat_tab[] */
+ unsigned int cur; /* number of used slots in compat_tab[] */
#endif
};
@@ -414,54 +416,67 @@ int xt_check_match(struct xt_mtchk_param *par,
EXPORT_SYMBOL_GPL(xt_check_match);
#ifdef CONFIG_COMPAT
-int xt_compat_add_offset(u_int8_t af, unsigned int offset, short delta)
+int xt_compat_add_offset(u_int8_t af, unsigned int offset, int delta)
{
- struct compat_delta *tmp;
+ struct xt_af *xp = &xt[af];
- tmp = kmalloc(sizeof(struct compat_delta), GFP_KERNEL);
- if (!tmp)
- return -ENOMEM;
+ if (!xp->compat_tab) {
+ if (!xp->number)
+ return -EINVAL;
+ xp->compat_tab = vmalloc(sizeof(struct compat_delta) * xp->number);
+ if (!xp->compat_tab)
+ return -ENOMEM;
+ xp->cur = 0;
+ }
- tmp->offset = offset;
- tmp->delta = delta;
+ if (xp->cur >= xp->number)
+ return -EINVAL;
- if (xt[af].compat_offsets) {
- tmp->next = xt[af].compat_offsets->next;
- xt[af].compat_offsets->next = tmp;
- } else {
- xt[af].compat_offsets = tmp;
- tmp->next = NULL;
- }
+ if (xp->cur)
+ delta += xp->compat_tab[xp->cur - 1].delta;
+ xp->compat_tab[xp->cur].offset = offset;
+ xp->compat_tab[xp->cur].delta = delta;
+ xp->cur++;
return 0;
}
EXPORT_SYMBOL_GPL(xt_compat_add_offset);
void xt_compat_flush_offsets(u_int8_t af)
{
- struct compat_delta *tmp, *next;
-
- if (xt[af].compat_offsets) {
- for (tmp = xt[af].compat_offsets; tmp; tmp = next) {
- next = tmp->next;
- kfree(tmp);
- }
- xt[af].compat_offsets = NULL;
+ if (xt[af].compat_tab) {
+ vfree(xt[af].compat_tab);
+ xt[af].compat_tab = NULL;
+ xt[af].number = 0;
}
}
EXPORT_SYMBOL_GPL(xt_compat_flush_offsets);
int xt_compat_calc_jump(u_int8_t af, unsigned int offset)
{
- struct compat_delta *tmp;
- int delta;
-
- for (tmp = xt[af].compat_offsets, delta = 0; tmp; tmp = tmp->next)
- if (tmp->offset < offset)
- delta += tmp->delta;
- return delta;
+ struct compat_delta *tmp = xt[af].compat_tab;
+ int mid, left = 0, right = xt[af].cur - 1;
+
+ while (left <= right) {
+ mid = (left + right) >> 1;
+ if (offset > tmp[mid].offset)
+ left = mid + 1;
+ else if (offset < tmp[mid].offset)
+ right = mid - 1;
+ else
+ return mid ? tmp[mid - 1].delta : 0;
+ }
+ WARN_ON_ONCE(1);
+ return 0;
}
EXPORT_SYMBOL_GPL(xt_compat_calc_jump);
+void xt_compat_init_offsets(u_int8_t af, unsigned int number)
+{
+ xt[af].number = number;
+ xt[af].cur = 0;
+}
+EXPORT_SYMBOL(xt_compat_init_offsets);
+
int xt_compat_match_offset(const struct xt_match *match)
{
u_int16_t csize = match->compatsize ? : match->matchsize;
@@ -820,6 +835,21 @@ xt_replace_table(struct xt_table *table,
*/
local_bh_enable();
+#ifdef CONFIG_AUDIT
+ if (audit_enabled) {
+ struct audit_buffer *ab;
+
+ ab = audit_log_start(current->audit_context, GFP_KERNEL,
+ AUDIT_NETFILTER_CFG);
+ if (ab) {
+ audit_log_format(ab, "table=%s family=%u entries=%u",
+ table->name, table->af,
+ private->number);
+ audit_log_end(ab);
+ }
+ }
+#endif
+
return private;
}
EXPORT_SYMBOL_GPL(xt_replace_table);
@@ -1338,7 +1368,7 @@ static int __init xt_init(void)
mutex_init(&xt[i].mutex);
#ifdef CONFIG_COMPAT
mutex_init(&xt[i].compat_mutex);
- xt[i].compat_offsets = NULL;
+ xt[i].compat_tab = NULL;
#endif
INIT_LIST_HEAD(&xt[i].target);
INIT_LIST_HEAD(&xt[i].match);
diff --git a/net/netfilter/xt_AUDIT.c b/net/netfilter/xt_AUDIT.c
new file mode 100644
index 00000000000..81802d27346
--- /dev/null
+++ b/net/netfilter/xt_AUDIT.c
@@ -0,0 +1,204 @@
+/*
+ * Creates audit record for dropped/accepted packets
+ *
+ * (C) 2010-2011 Thomas Graf <tgraf@redhat.com>
+ * (C) 2010-2011 Red Hat, Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+*/
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/audit.h>
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/tcp.h>
+#include <linux/udp.h>
+#include <linux/if_arp.h>
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter/xt_AUDIT.h>
+#include <net/ipv6.h>
+#include <net/ip.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Thomas Graf <tgraf@redhat.com>");
+MODULE_DESCRIPTION("Xtables: creates audit records for dropped/accepted packets");
+MODULE_ALIAS("ipt_AUDIT");
+MODULE_ALIAS("ip6t_AUDIT");
+MODULE_ALIAS("ebt_AUDIT");
+MODULE_ALIAS("arpt_AUDIT");
+
+static void audit_proto(struct audit_buffer *ab, struct sk_buff *skb,
+ unsigned int proto, unsigned int offset)
+{
+ switch (proto) {
+ case IPPROTO_TCP:
+ case IPPROTO_UDP:
+ case IPPROTO_UDPLITE: {
+ const __be16 *pptr;
+ __be16 _ports[2];
+
+ pptr = skb_header_pointer(skb, offset, sizeof(_ports), _ports);
+ if (pptr == NULL) {
+ audit_log_format(ab, " truncated=1");
+ return;
+ }
+
+ audit_log_format(ab, " sport=%hu dport=%hu",
+ ntohs(pptr[0]), ntohs(pptr[1]));
+ }
+ break;
+
+ case IPPROTO_ICMP:
+ case IPPROTO_ICMPV6: {
+ const u8 *iptr;
+ u8 _ih[2];
+
+ iptr = skb_header_pointer(skb, offset, sizeof(_ih), &_ih);
+ if (iptr == NULL) {
+ audit_log_format(ab, " truncated=1");
+ return;
+ }
+
+ audit_log_format(ab, " icmptype=%hhu icmpcode=%hhu",
+ iptr[0], iptr[1]);
+
+ }
+ break;
+ }
+}
+
+static void audit_ip4(struct audit_buffer *ab, struct sk_buff *skb)
+{
+ struct iphdr _iph;
+ const struct iphdr *ih;
+
+ ih = skb_header_pointer(skb, 0, sizeof(_iph), &_iph);
+ if (!ih) {
+ audit_log_format(ab, " truncated=1");
+ return;
+ }
+
+ audit_log_format(ab, " saddr=%pI4 daddr=%pI4 ipid=%hu proto=%hhu",
+ &ih->saddr, &ih->daddr, ntohs(ih->id), ih->protocol);
+
+ if (ntohs(ih->frag_off) & IP_OFFSET) {
+ audit_log_format(ab, " frag=1");
+ return;
+ }
+
+ audit_proto(ab, skb, ih->protocol, ih->ihl * 4);
+}
+
+static void audit_ip6(struct audit_buffer *ab, struct sk_buff *skb)
+{
+ struct ipv6hdr _ip6h;
+ const struct ipv6hdr *ih;
+ u8 nexthdr;
+ int offset;
+
+ ih = skb_header_pointer(skb, skb_network_offset(skb), sizeof(_ip6h), &_ip6h);
+ if (!ih) {
+ audit_log_format(ab, " truncated=1");
+ return;
+ }
+
+ nexthdr = ih->nexthdr;
+ offset = ipv6_skip_exthdr(skb, skb_network_offset(skb) + sizeof(_ip6h),
+ &nexthdr);
+
+ audit_log_format(ab, " saddr=%pI6c daddr=%pI6c proto=%hhu",
+ &ih->saddr, &ih->daddr, nexthdr);
+
+ if (offset)
+ audit_proto(ab, skb, nexthdr, offset);
+}
+
+static unsigned int
+audit_tg(struct sk_buff *skb, const struct xt_action_param *par)
+{
+ const struct xt_audit_info *info = par->targinfo;
+ struct audit_buffer *ab;
+
+ ab = audit_log_start(NULL, GFP_ATOMIC, AUDIT_NETFILTER_PKT);
+ if (ab == NULL)
+ goto errout;
+
+ audit_log_format(ab, "action=%hhu hook=%u len=%u inif=%s outif=%s",
+ info->type, par->hooknum, skb->len,
+ par->in ? par->in->name : "?",
+ par->out ? par->out->name : "?");
+
+ if (skb->mark)
+ audit_log_format(ab, " mark=%#x", skb->mark);
+
+ if (skb->dev && skb->dev->type == ARPHRD_ETHER) {
+ audit_log_format(ab, " smac=%pM dmac=%pM macproto=0x%04x",
+ eth_hdr(skb)->h_source, eth_hdr(skb)->h_dest,
+ ntohs(eth_hdr(skb)->h_proto));
+
+ if (par->family == NFPROTO_BRIDGE) {
+ switch (eth_hdr(skb)->h_proto) {
+ case __constant_htons(ETH_P_IP):
+ audit_ip4(ab, skb);
+ break;
+
+ case __constant_htons(ETH_P_IPV6):
+ audit_ip6(ab, skb);
+ break;
+ }
+ }
+ }
+
+ switch (par->family) {
+ case NFPROTO_IPV4:
+ audit_ip4(ab, skb);
+ break;
+
+ case NFPROTO_IPV6:
+ audit_ip6(ab, skb);
+ break;
+ }
+
+ audit_log_end(ab);
+
+errout:
+ return XT_CONTINUE;
+}
+
+static int audit_tg_check(const struct xt_tgchk_param *par)
+{
+ const struct xt_audit_info *info = par->targinfo;
+
+ if (info->type > XT_AUDIT_TYPE_MAX) {
+ pr_info("Audit type out of range (valid range: 0..%hhu)\n",
+ XT_AUDIT_TYPE_MAX);
+ return -ERANGE;
+ }
+
+ return 0;
+}
+
+static struct xt_target audit_tg_reg __read_mostly = {
+ .name = "AUDIT",
+ .family = NFPROTO_UNSPEC,
+ .target = audit_tg,
+ .targetsize = sizeof(struct xt_audit_info),
+ .checkentry = audit_tg_check,
+ .me = THIS_MODULE,
+};
+
+static int __init audit_tg_init(void)
+{
+ return xt_register_target(&audit_tg_reg);
+}
+
+static void __exit audit_tg_exit(void)
+{
+ xt_unregister_target(&audit_tg_reg);
+}
+
+module_init(audit_tg_init);
+module_exit(audit_tg_exit);
diff --git a/net/netfilter/xt_CLASSIFY.c b/net/netfilter/xt_CLASSIFY.c
index c2c0e4abeb9..af9c4dadf81 100644
--- a/net/netfilter/xt_CLASSIFY.c
+++ b/net/netfilter/xt_CLASSIFY.c
@@ -19,12 +19,14 @@
#include <linux/netfilter_ipv6.h>
#include <linux/netfilter/x_tables.h>
#include <linux/netfilter/xt_CLASSIFY.h>
+#include <linux/netfilter_arp.h>
MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("Xtables: Qdisc classification");
MODULE_ALIAS("ipt_CLASSIFY");
MODULE_ALIAS("ip6t_CLASSIFY");
+MODULE_ALIAS("arpt_CLASSIFY");
static unsigned int
classify_tg(struct sk_buff *skb, const struct xt_action_param *par)
@@ -35,26 +37,36 @@ classify_tg(struct sk_buff *skb, const struct xt_action_param *par)
return XT_CONTINUE;
}
-static struct xt_target classify_tg_reg __read_mostly = {
- .name = "CLASSIFY",
- .revision = 0,
- .family = NFPROTO_UNSPEC,
- .table = "mangle",
- .hooks = (1 << NF_INET_LOCAL_OUT) | (1 << NF_INET_FORWARD) |
- (1 << NF_INET_POST_ROUTING),
- .target = classify_tg,
- .targetsize = sizeof(struct xt_classify_target_info),
- .me = THIS_MODULE,
+static struct xt_target classify_tg_reg[] __read_mostly = {
+ {
+ .name = "CLASSIFY",
+ .revision = 0,
+ .family = NFPROTO_UNSPEC,
+ .hooks = (1 << NF_INET_LOCAL_OUT) | (1 << NF_INET_FORWARD) |
+ (1 << NF_INET_POST_ROUTING),
+ .target = classify_tg,
+ .targetsize = sizeof(struct xt_classify_target_info),
+ .me = THIS_MODULE,
+ },
+ {
+ .name = "CLASSIFY",
+ .revision = 0,
+ .family = NFPROTO_ARP,
+ .hooks = (1 << NF_ARP_OUT) | (1 << NF_ARP_FORWARD),
+ .target = classify_tg,
+ .targetsize = sizeof(struct xt_classify_target_info),
+ .me = THIS_MODULE,
+ },
};
static int __init classify_tg_init(void)
{
- return xt_register_target(&classify_tg_reg);
+ return xt_register_targets(classify_tg_reg, ARRAY_SIZE(classify_tg_reg));
}
static void __exit classify_tg_exit(void)
{
- xt_unregister_target(&classify_tg_reg);
+ xt_unregister_targets(classify_tg_reg, ARRAY_SIZE(classify_tg_reg));
}
module_init(classify_tg_init);
diff --git a/net/netfilter/xt_IDLETIMER.c b/net/netfilter/xt_IDLETIMER.c
index be1f22e1354..3bdd443aaf1 100644
--- a/net/netfilter/xt_IDLETIMER.c
+++ b/net/netfilter/xt_IDLETIMER.c
@@ -313,3 +313,5 @@ MODULE_AUTHOR("Timo Teras <ext-timo.teras@nokia.com>");
MODULE_AUTHOR("Luciano Coelho <luciano.coelho@nokia.com>");
MODULE_DESCRIPTION("Xtables: idle time monitor");
MODULE_LICENSE("GPL v2");
+MODULE_ALIAS("ipt_IDLETIMER");
+MODULE_ALIAS("ip6t_IDLETIMER");
diff --git a/net/netfilter/xt_LED.c b/net/netfilter/xt_LED.c
index a4140509eea..993de2ba89d 100644
--- a/net/netfilter/xt_LED.c
+++ b/net/netfilter/xt_LED.c
@@ -31,6 +31,8 @@
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Adam Nielsen <a.nielsen@shikadi.net>");
MODULE_DESCRIPTION("Xtables: trigger LED devices on packet match");
+MODULE_ALIAS("ipt_LED");
+MODULE_ALIAS("ip6t_LED");
static LIST_HEAD(xt_led_triggers);
static DEFINE_MUTEX(xt_led_mutex);
diff --git a/net/netfilter/xt_NFQUEUE.c b/net/netfilter/xt_NFQUEUE.c
index 039cce1bde3..d4f4b5d66b2 100644
--- a/net/netfilter/xt_NFQUEUE.c
+++ b/net/netfilter/xt_NFQUEUE.c
@@ -72,18 +72,31 @@ nfqueue_tg_v1(struct sk_buff *skb, const struct xt_action_param *par)
if (info->queues_total > 1) {
if (par->family == NFPROTO_IPV4)
- queue = hash_v4(skb) % info->queues_total + queue;
+ queue = (((u64) hash_v4(skb) * info->queues_total) >>
+ 32) + queue;
#if defined(CONFIG_IP6_NF_IPTABLES) || defined(CONFIG_IP6_NF_IPTABLES_MODULE)
else if (par->family == NFPROTO_IPV6)
- queue = hash_v6(skb) % info->queues_total + queue;
+ queue = (((u64) hash_v6(skb) * info->queues_total) >>
+ 32) + queue;
#endif
}
return NF_QUEUE_NR(queue);
}
-static int nfqueue_tg_v1_check(const struct xt_tgchk_param *par)
+static unsigned int
+nfqueue_tg_v2(struct sk_buff *skb, const struct xt_action_param *par)
{
- const struct xt_NFQ_info_v1 *info = par->targinfo;
+ const struct xt_NFQ_info_v2 *info = par->targinfo;
+ unsigned int ret = nfqueue_tg_v1(skb, par);
+
+ if (info->bypass)
+ ret |= NF_VERDICT_FLAG_QUEUE_BYPASS;
+ return ret;
+}
+
+static int nfqueue_tg_check(const struct xt_tgchk_param *par)
+{
+ const struct xt_NFQ_info_v2 *info = par->targinfo;
u32 maxid;
if (unlikely(!rnd_inited)) {
@@ -100,6 +113,8 @@ static int nfqueue_tg_v1_check(const struct xt_tgchk_param *par)
info->queues_total, maxid);
return -ERANGE;
}
+ if (par->target->revision == 2 && info->bypass > 1)
+ return -EINVAL;
return 0;
}
@@ -115,11 +130,20 @@ static struct xt_target nfqueue_tg_reg[] __read_mostly = {
.name = "NFQUEUE",
.revision = 1,
.family = NFPROTO_UNSPEC,
- .checkentry = nfqueue_tg_v1_check,
+ .checkentry = nfqueue_tg_check,
.target = nfqueue_tg_v1,
.targetsize = sizeof(struct xt_NFQ_info_v1),
.me = THIS_MODULE,
},
+ {
+ .name = "NFQUEUE",
+ .revision = 2,
+ .family = NFPROTO_UNSPEC,
+ .checkentry = nfqueue_tg_check,
+ .target = nfqueue_tg_v2,
+ .targetsize = sizeof(struct xt_NFQ_info_v2),
+ .me = THIS_MODULE,
+ },
};
static int __init nfqueue_tg_init(void)
diff --git a/net/netfilter/xt_TPROXY.c b/net/netfilter/xt_TPROXY.c
index 640678f47a2..dcfd57eb9d0 100644
--- a/net/netfilter/xt_TPROXY.c
+++ b/net/netfilter/xt_TPROXY.c
@@ -33,6 +33,20 @@
#include <net/netfilter/nf_tproxy_core.h>
#include <linux/netfilter/xt_TPROXY.h>
+static bool tproxy_sk_is_transparent(struct sock *sk)
+{
+ if (sk->sk_state != TCP_TIME_WAIT) {
+ if (inet_sk(sk)->transparent)
+ return true;
+ sock_put(sk);
+ } else {
+ if (inet_twsk(sk)->tw_transparent)
+ return true;
+ inet_twsk_put(inet_twsk(sk));
+ }
+ return false;
+}
+
static inline __be32
tproxy_laddr4(struct sk_buff *skb, __be32 user_laddr, __be32 daddr)
{
@@ -141,7 +155,7 @@ tproxy_tg4(struct sk_buff *skb, __be32 laddr, __be16 lport,
skb->dev, NFT_LOOKUP_LISTENER);
/* NOTE: assign_sock consumes our sk reference */
- if (sk && nf_tproxy_assign_sock(skb, sk)) {
+ if (sk && tproxy_sk_is_transparent(sk)) {
/* This should be in a separate target, but we don't do multiple
targets on the same rule yet */
skb->mark = (skb->mark & ~mark_mask) ^ mark_value;
@@ -149,6 +163,8 @@ tproxy_tg4(struct sk_buff *skb, __be32 laddr, __be16 lport,
pr_debug("redirecting: proto %hhu %pI4:%hu -> %pI4:%hu, mark: %x\n",
iph->protocol, &iph->daddr, ntohs(hp->dest),
&laddr, ntohs(lport), skb->mark);
+
+ nf_tproxy_assign_sock(skb, sk);
return NF_ACCEPT;
}
@@ -306,7 +322,7 @@ tproxy_tg6_v1(struct sk_buff *skb, const struct xt_action_param *par)
par->in, NFT_LOOKUP_LISTENER);
/* NOTE: assign_sock consumes our sk reference */
- if (sk && nf_tproxy_assign_sock(skb, sk)) {
+ if (sk && tproxy_sk_is_transparent(sk)) {
/* This should be in a separate target, but we don't do multiple
targets on the same rule yet */
skb->mark = (skb->mark & ~tgi->mark_mask) ^ tgi->mark_value;
@@ -314,6 +330,8 @@ tproxy_tg6_v1(struct sk_buff *skb, const struct xt_action_param *par)
pr_debug("redirecting: proto %hhu %pI6:%hu -> %pI6:%hu, mark: %x\n",
tproto, &iph->saddr, ntohs(hp->source),
laddr, ntohs(lport), skb->mark);
+
+ nf_tproxy_assign_sock(skb, sk);
return NF_ACCEPT;
}
diff --git a/net/netfilter/xt_connlimit.c b/net/netfilter/xt_connlimit.c
index 5c5b6b921b8..e029c480740 100644
--- a/net/netfilter/xt_connlimit.c
+++ b/net/netfilter/xt_connlimit.c
@@ -185,18 +185,24 @@ connlimit_mt(const struct sk_buff *skb, struct xt_action_param *par)
int connections;
ct = nf_ct_get(skb, &ctinfo);
- if (ct != NULL)
- tuple_ptr = &ct->tuplehash[0].tuple;
- else if (!nf_ct_get_tuplepr(skb, skb_network_offset(skb),
- par->family, &tuple))
+ if (ct != NULL) {
+ if (info->flags & XT_CONNLIMIT_DADDR)
+ tuple_ptr = &ct->tuplehash[IP_CT_DIR_REPLY].tuple;
+ else
+ tuple_ptr = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple;
+ } else if (!nf_ct_get_tuplepr(skb, skb_network_offset(skb),
+ par->family, &tuple)) {
goto hotdrop;
+ }
if (par->family == NFPROTO_IPV6) {
const struct ipv6hdr *iph = ipv6_hdr(skb);
- memcpy(&addr.ip6, &iph->saddr, sizeof(iph->saddr));
+ memcpy(&addr.ip6, (info->flags & XT_CONNLIMIT_DADDR) ?
+ &iph->daddr : &iph->saddr, sizeof(addr.ip6));
} else {
const struct iphdr *iph = ip_hdr(skb);
- addr.ip = iph->saddr;
+ addr.ip = (info->flags & XT_CONNLIMIT_DADDR) ?
+ iph->daddr : iph->saddr;
}
spin_lock_bh(&info->data->lock);
@@ -204,13 +210,12 @@ connlimit_mt(const struct sk_buff *skb, struct xt_action_param *par)
&info->mask, par->family);
spin_unlock_bh(&info->data->lock);
- if (connections < 0) {
+ if (connections < 0)
/* kmalloc failed, drop it entirely */
- par->hotdrop = true;
- return false;
- }
+ goto hotdrop;
- return (connections > info->limit) ^ info->inverse;
+ return (connections > info->limit) ^
+ !!(info->flags & XT_CONNLIMIT_INVERT);
hotdrop:
par->hotdrop = true;
@@ -268,25 +273,38 @@ static void connlimit_mt_destroy(const struct xt_mtdtor_param *par)
kfree(info->data);
}
-static struct xt_match connlimit_mt_reg __read_mostly = {
- .name = "connlimit",
- .revision = 0,
- .family = NFPROTO_UNSPEC,
- .checkentry = connlimit_mt_check,
- .match = connlimit_mt,
- .matchsize = sizeof(struct xt_connlimit_info),
- .destroy = connlimit_mt_destroy,
- .me = THIS_MODULE,
+static struct xt_match connlimit_mt_reg[] __read_mostly = {
+ {
+ .name = "connlimit",
+ .revision = 0,
+ .family = NFPROTO_UNSPEC,
+ .checkentry = connlimit_mt_check,
+ .match = connlimit_mt,
+ .matchsize = sizeof(struct xt_connlimit_info),
+ .destroy = connlimit_mt_destroy,
+ .me = THIS_MODULE,
+ },
+ {
+ .name = "connlimit",
+ .revision = 1,
+ .family = NFPROTO_UNSPEC,
+ .checkentry = connlimit_mt_check,
+ .match = connlimit_mt,
+ .matchsize = sizeof(struct xt_connlimit_info),
+ .destroy = connlimit_mt_destroy,
+ .me = THIS_MODULE,
+ },
};
static int __init connlimit_mt_init(void)
{
- return xt_register_match(&connlimit_mt_reg);
+ return xt_register_matches(connlimit_mt_reg,
+ ARRAY_SIZE(connlimit_mt_reg));
}
static void __exit connlimit_mt_exit(void)
{
- xt_unregister_match(&connlimit_mt_reg);
+ xt_unregister_matches(connlimit_mt_reg, ARRAY_SIZE(connlimit_mt_reg));
}
module_init(connlimit_mt_init);
diff --git a/net/netfilter/xt_conntrack.c b/net/netfilter/xt_conntrack.c
index e536710ad91..4ef1b63ad73 100644
--- a/net/netfilter/xt_conntrack.c
+++ b/net/netfilter/xt_conntrack.c
@@ -112,6 +112,54 @@ ct_proto_port_check(const struct xt_conntrack_mtinfo2 *info,
return true;
}
+static inline bool
+port_match(u16 min, u16 max, u16 port, bool invert)
+{
+ return (port >= min && port <= max) ^ invert;
+}
+
+static inline bool
+ct_proto_port_check_v3(const struct xt_conntrack_mtinfo3 *info,
+ const struct nf_conn *ct)
+{
+ const struct nf_conntrack_tuple *tuple;
+
+ tuple = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple;
+ if ((info->match_flags & XT_CONNTRACK_PROTO) &&
+ (nf_ct_protonum(ct) == info->l4proto) ^
+ !(info->invert_flags & XT_CONNTRACK_PROTO))
+ return false;
+
+ /* Shortcut to match all recognized protocols by using ->src.all. */
+ if ((info->match_flags & XT_CONNTRACK_ORIGSRC_PORT) &&
+ !port_match(info->origsrc_port, info->origsrc_port_high,
+ ntohs(tuple->src.u.all),
+ info->invert_flags & XT_CONNTRACK_ORIGSRC_PORT))
+ return false;
+
+ if ((info->match_flags & XT_CONNTRACK_ORIGDST_PORT) &&
+ !port_match(info->origdst_port, info->origdst_port_high,
+ ntohs(tuple->dst.u.all),
+ info->invert_flags & XT_CONNTRACK_ORIGDST_PORT))
+ return false;
+
+ tuple = &ct->tuplehash[IP_CT_DIR_REPLY].tuple;
+
+ if ((info->match_flags & XT_CONNTRACK_REPLSRC_PORT) &&
+ !port_match(info->replsrc_port, info->replsrc_port_high,
+ ntohs(tuple->src.u.all),
+ info->invert_flags & XT_CONNTRACK_REPLSRC_PORT))
+ return false;
+
+ if ((info->match_flags & XT_CONNTRACK_REPLDST_PORT) &&
+ !port_match(info->repldst_port, info->repldst_port_high,
+ ntohs(tuple->dst.u.all),
+ info->invert_flags & XT_CONNTRACK_REPLDST_PORT))
+ return false;
+
+ return true;
+}
+
static bool
conntrack_mt(const struct sk_buff *skb, struct xt_action_param *par,
u16 state_mask, u16 status_mask)
@@ -170,8 +218,13 @@ conntrack_mt(const struct sk_buff *skb, struct xt_action_param *par,
!(info->invert_flags & XT_CONNTRACK_REPLDST))
return false;
- if (!ct_proto_port_check(info, ct))
- return false;
+ if (par->match->revision != 3) {
+ if (!ct_proto_port_check(info, ct))
+ return false;
+ } else {
+ if (!ct_proto_port_check_v3(par->matchinfo, ct))
+ return false;
+ }
if ((info->match_flags & XT_CONNTRACK_STATUS) &&
(!!(status_mask & ct->status) ^
@@ -207,6 +260,14 @@ conntrack_mt_v2(const struct sk_buff *skb, struct xt_action_param *par)
return conntrack_mt(skb, par, info->state_mask, info->status_mask);
}
+static bool
+conntrack_mt_v3(const struct sk_buff *skb, struct xt_action_param *par)
+{
+ const struct xt_conntrack_mtinfo3 *info = par->matchinfo;
+
+ return conntrack_mt(skb, par, info->state_mask, info->status_mask);
+}
+
static int conntrack_mt_check(const struct xt_mtchk_param *par)
{
int ret;
@@ -244,6 +305,16 @@ static struct xt_match conntrack_mt_reg[] __read_mostly = {
.destroy = conntrack_mt_destroy,
.me = THIS_MODULE,
},
+ {
+ .name = "conntrack",
+ .revision = 3,
+ .family = NFPROTO_UNSPEC,
+ .matchsize = sizeof(struct xt_conntrack_mtinfo3),
+ .match = conntrack_mt_v3,
+ .checkentry = conntrack_mt_check,
+ .destroy = conntrack_mt_destroy,
+ .me = THIS_MODULE,
+ },
};
static int __init conntrack_mt_init(void)
diff --git a/net/netfilter/xt_cpu.c b/net/netfilter/xt_cpu.c
index b39db8a5cba..c7a2e5466bc 100644
--- a/net/netfilter/xt_cpu.c
+++ b/net/netfilter/xt_cpu.c
@@ -22,6 +22,8 @@
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Eric Dumazet <eric.dumazet@gmail.com>");
MODULE_DESCRIPTION("Xtables: CPU match");
+MODULE_ALIAS("ipt_cpu");
+MODULE_ALIAS("ip6t_cpu");
static int cpu_mt_check(const struct xt_mtchk_param *par)
{
diff --git a/net/netfilter/xt_devgroup.c b/net/netfilter/xt_devgroup.c
new file mode 100644
index 00000000000..d9202cdd25c
--- /dev/null
+++ b/net/netfilter/xt_devgroup.c
@@ -0,0 +1,82 @@
+/*
+ * Copyright (c) 2011 Patrick McHardy <kaber@trash.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/netdevice.h>
+
+#include <linux/netfilter/xt_devgroup.h>
+#include <linux/netfilter/x_tables.h>
+
+MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("Xtables: Device group match");
+MODULE_ALIAS("ipt_devgroup");
+MODULE_ALIAS("ip6t_devgroup");
+
+static bool devgroup_mt(const struct sk_buff *skb, struct xt_action_param *par)
+{
+ const struct xt_devgroup_info *info = par->matchinfo;
+
+ if (info->flags & XT_DEVGROUP_MATCH_SRC &&
+ (((info->src_group ^ par->in->group) & info->src_mask ? 1 : 0) ^
+ ((info->flags & XT_DEVGROUP_INVERT_SRC) ? 1 : 0)))
+ return false;
+
+ if (info->flags & XT_DEVGROUP_MATCH_DST &&
+ (((info->dst_group ^ par->out->group) & info->dst_mask ? 1 : 0) ^
+ ((info->flags & XT_DEVGROUP_INVERT_DST) ? 1 : 0)))
+ return false;
+
+ return true;
+}
+
+static int devgroup_mt_checkentry(const struct xt_mtchk_param *par)
+{
+ const struct xt_devgroup_info *info = par->matchinfo;
+
+ if (info->flags & ~(XT_DEVGROUP_MATCH_SRC | XT_DEVGROUP_INVERT_SRC |
+ XT_DEVGROUP_MATCH_DST | XT_DEVGROUP_INVERT_DST))
+ return -EINVAL;
+
+ if (info->flags & XT_DEVGROUP_MATCH_SRC &&
+ par->hook_mask & ~((1 << NF_INET_PRE_ROUTING) |
+ (1 << NF_INET_LOCAL_IN) |
+ (1 << NF_INET_FORWARD)))
+ return -EINVAL;
+
+ if (info->flags & XT_DEVGROUP_MATCH_DST &&
+ par->hook_mask & ~((1 << NF_INET_FORWARD) |
+ (1 << NF_INET_LOCAL_OUT) |
+ (1 << NF_INET_POST_ROUTING)))
+ return -EINVAL;
+
+ return 0;
+}
+
+static struct xt_match devgroup_mt_reg __read_mostly = {
+ .name = "devgroup",
+ .match = devgroup_mt,
+ .checkentry = devgroup_mt_checkentry,
+ .matchsize = sizeof(struct xt_devgroup_info),
+ .family = NFPROTO_UNSPEC,
+ .me = THIS_MODULE
+};
+
+static int __init devgroup_mt_init(void)
+{
+ return xt_register_match(&devgroup_mt_reg);
+}
+
+static void __exit devgroup_mt_exit(void)
+{
+ xt_unregister_match(&devgroup_mt_reg);
+}
+
+module_init(devgroup_mt_init);
+module_exit(devgroup_mt_exit);
diff --git a/net/netfilter/xt_iprange.c b/net/netfilter/xt_iprange.c
index 88f7c3511c7..b46626cddd9 100644
--- a/net/netfilter/xt_iprange.c
+++ b/net/netfilter/xt_iprange.c
@@ -31,7 +31,7 @@ iprange_mt4(const struct sk_buff *skb, struct xt_action_param *par)
pr_debug("src IP %pI4 NOT in range %s%pI4-%pI4\n",
&iph->saddr,
(info->flags & IPRANGE_SRC_INV) ? "(INV) " : "",
- &info->src_max.ip,
+ &info->src_min.ip,
&info->src_max.ip);
return false;
}
@@ -53,15 +53,13 @@ iprange_mt4(const struct sk_buff *skb, struct xt_action_param *par)
}
static inline int
-iprange_ipv6_sub(const struct in6_addr *a, const struct in6_addr *b)
+iprange_ipv6_lt(const struct in6_addr *a, const struct in6_addr *b)
{
unsigned int i;
- int r;
for (i = 0; i < 4; ++i) {
- r = ntohl(a->s6_addr32[i]) - ntohl(b->s6_addr32[i]);
- if (r != 0)
- return r;
+ if (a->s6_addr32[i] != b->s6_addr32[i])
+ return ntohl(a->s6_addr32[i]) < ntohl(b->s6_addr32[i]);
}
return 0;
@@ -75,18 +73,30 @@ iprange_mt6(const struct sk_buff *skb, struct xt_action_param *par)
bool m;
if (info->flags & IPRANGE_SRC) {
- m = iprange_ipv6_sub(&iph->saddr, &info->src_min.in6) < 0;
- m |= iprange_ipv6_sub(&iph->saddr, &info->src_max.in6) > 0;
+ m = iprange_ipv6_lt(&iph->saddr, &info->src_min.in6);
+ m |= iprange_ipv6_lt(&info->src_max.in6, &iph->saddr);
m ^= !!(info->flags & IPRANGE_SRC_INV);
- if (m)
+ if (m) {
+ pr_debug("src IP %pI6 NOT in range %s%pI6-%pI6\n",
+ &iph->saddr,
+ (info->flags & IPRANGE_SRC_INV) ? "(INV) " : "",
+ &info->src_min.in6,
+ &info->src_max.in6);
return false;
+ }
}
if (info->flags & IPRANGE_DST) {
- m = iprange_ipv6_sub(&iph->daddr, &info->dst_min.in6) < 0;
- m |= iprange_ipv6_sub(&iph->daddr, &info->dst_max.in6) > 0;
+ m = iprange_ipv6_lt(&iph->daddr, &info->dst_min.in6);
+ m |= iprange_ipv6_lt(&info->dst_max.in6, &iph->daddr);
m ^= !!(info->flags & IPRANGE_DST_INV);
- if (m)
+ if (m) {
+ pr_debug("dst IP %pI6 NOT in range %s%pI6-%pI6\n",
+ &iph->daddr,
+ (info->flags & IPRANGE_DST_INV) ? "(INV) " : "",
+ &info->dst_min.in6,
+ &info->dst_max.in6);
return false;
+ }
}
return true;
}
diff --git a/net/netfilter/xt_ipvs.c b/net/netfilter/xt_ipvs.c
index 9127a3d8aa3..bb10b0717f1 100644
--- a/net/netfilter/xt_ipvs.c
+++ b/net/netfilter/xt_ipvs.c
@@ -85,7 +85,7 @@ ipvs_mt(const struct sk_buff *skb, struct xt_action_param *par)
/*
* Check if the packet belongs to an existing entry
*/
- cp = pp->conn_out_get(family, skb, pp, &iph, iph.len, 1 /* inverse */);
+ cp = pp->conn_out_get(family, skb, &iph, iph.len, 1 /* inverse */);
if (unlikely(cp == NULL)) {
match = false;
goto out;
diff --git a/net/netfilter/xt_set.c b/net/netfilter/xt_set.c
new file mode 100644
index 00000000000..061d48cec13
--- /dev/null
+++ b/net/netfilter/xt_set.c
@@ -0,0 +1,359 @@
+/* Copyright (C) 2000-2002 Joakim Axelsson <gozem@linux.nu>
+ * Patrick Schaaf <bof@bof.de>
+ * Martin Josefsson <gandalf@wlug.westbo.se>
+ * Copyright (C) 2003-2011 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+/* Kernel module which implements the set match and SET target
+ * for netfilter/iptables. */
+
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/version.h>
+
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter/xt_set.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>");
+MODULE_DESCRIPTION("Xtables: IP set match and target module");
+MODULE_ALIAS("xt_SET");
+MODULE_ALIAS("ipt_set");
+MODULE_ALIAS("ip6t_set");
+MODULE_ALIAS("ipt_SET");
+MODULE_ALIAS("ip6t_SET");
+
+static inline int
+match_set(ip_set_id_t index, const struct sk_buff *skb,
+ u8 pf, u8 dim, u8 flags, int inv)
+{
+ if (ip_set_test(index, skb, pf, dim, flags))
+ inv = !inv;
+ return inv;
+}
+
+/* Revision 0 interface: backward compatible with netfilter/iptables */
+
+static bool
+set_match_v0(const struct sk_buff *skb, struct xt_action_param *par)
+{
+ const struct xt_set_info_match_v0 *info = par->matchinfo;
+
+ return match_set(info->match_set.index, skb, par->family,
+ info->match_set.u.compat.dim,
+ info->match_set.u.compat.flags,
+ info->match_set.u.compat.flags & IPSET_INV_MATCH);
+}
+
+static void
+compat_flags(struct xt_set_info_v0 *info)
+{
+ u_int8_t i;
+
+ /* Fill out compatibility data according to enum ip_set_kopt */
+ info->u.compat.dim = IPSET_DIM_ZERO;
+ if (info->u.flags[0] & IPSET_MATCH_INV)
+ info->u.compat.flags |= IPSET_INV_MATCH;
+ for (i = 0; i < IPSET_DIM_MAX-1 && info->u.flags[i]; i++) {
+ info->u.compat.dim++;
+ if (info->u.flags[i] & IPSET_SRC)
+ info->u.compat.flags |= (1<<info->u.compat.dim);
+ }
+}
+
+static int
+set_match_v0_checkentry(const struct xt_mtchk_param *par)
+{
+ struct xt_set_info_match_v0 *info = par->matchinfo;
+ ip_set_id_t index;
+
+ index = ip_set_nfnl_get_byindex(info->match_set.index);
+
+ if (index == IPSET_INVALID_ID) {
+ pr_warning("Cannot find set indentified by id %u to match\n",
+ info->match_set.index);
+ return -ENOENT;
+ }
+ if (info->match_set.u.flags[IPSET_DIM_MAX-1] != 0) {
+ pr_warning("Protocol error: set match dimension "
+ "is over the limit!\n");
+ return -ERANGE;
+ }
+
+ /* Fill out compatibility data */
+ compat_flags(&info->match_set);
+
+ return 0;
+}
+
+static void
+set_match_v0_destroy(const struct xt_mtdtor_param *par)
+{
+ struct xt_set_info_match_v0 *info = par->matchinfo;
+
+ ip_set_nfnl_put(info->match_set.index);
+}
+
+static unsigned int
+set_target_v0(struct sk_buff *skb, const struct xt_action_param *par)
+{
+ const struct xt_set_info_target_v0 *info = par->targinfo;
+
+ if (info->add_set.index != IPSET_INVALID_ID)
+ ip_set_add(info->add_set.index, skb, par->family,
+ info->add_set.u.compat.dim,
+ info->add_set.u.compat.flags);
+ if (info->del_set.index != IPSET_INVALID_ID)
+ ip_set_del(info->del_set.index, skb, par->family,
+ info->del_set.u.compat.dim,
+ info->del_set.u.compat.flags);
+
+ return XT_CONTINUE;
+}
+
+static int
+set_target_v0_checkentry(const struct xt_tgchk_param *par)
+{
+ struct xt_set_info_target_v0 *info = par->targinfo;
+ ip_set_id_t index;
+
+ if (info->add_set.index != IPSET_INVALID_ID) {
+ index = ip_set_nfnl_get_byindex(info->add_set.index);
+ if (index == IPSET_INVALID_ID) {
+ pr_warning("Cannot find add_set index %u as target\n",
+ info->add_set.index);
+ return -ENOENT;
+ }
+ }
+
+ if (info->del_set.index != IPSET_INVALID_ID) {
+ index = ip_set_nfnl_get_byindex(info->del_set.index);
+ if (index == IPSET_INVALID_ID) {
+ pr_warning("Cannot find del_set index %u as target\n",
+ info->del_set.index);
+ return -ENOENT;
+ }
+ }
+ if (info->add_set.u.flags[IPSET_DIM_MAX-1] != 0 ||
+ info->del_set.u.flags[IPSET_DIM_MAX-1] != 0) {
+ pr_warning("Protocol error: SET target dimension "
+ "is over the limit!\n");
+ return -ERANGE;
+ }
+
+ /* Fill out compatibility data */
+ compat_flags(&info->add_set);
+ compat_flags(&info->del_set);
+
+ return 0;
+}
+
+static void
+set_target_v0_destroy(const struct xt_tgdtor_param *par)
+{
+ const struct xt_set_info_target_v0 *info = par->targinfo;
+
+ if (info->add_set.index != IPSET_INVALID_ID)
+ ip_set_nfnl_put(info->add_set.index);
+ if (info->del_set.index != IPSET_INVALID_ID)
+ ip_set_nfnl_put(info->del_set.index);
+}
+
+/* Revision 1: current interface to netfilter/iptables */
+
+static bool
+set_match(const struct sk_buff *skb, struct xt_action_param *par)
+{
+ const struct xt_set_info_match *info = par->matchinfo;
+
+ return match_set(info->match_set.index, skb, par->family,
+ info->match_set.dim,
+ info->match_set.flags,
+ info->match_set.flags & IPSET_INV_MATCH);
+}
+
+static int
+set_match_checkentry(const struct xt_mtchk_param *par)
+{
+ struct xt_set_info_match *info = par->matchinfo;
+ ip_set_id_t index;
+
+ index = ip_set_nfnl_get_byindex(info->match_set.index);
+
+ if (index == IPSET_INVALID_ID) {
+ pr_warning("Cannot find set indentified by id %u to match\n",
+ info->match_set.index);
+ return -ENOENT;
+ }
+ if (info->match_set.dim > IPSET_DIM_MAX) {
+ pr_warning("Protocol error: set match dimension "
+ "is over the limit!\n");
+ return -ERANGE;
+ }
+
+ return 0;
+}
+
+static void
+set_match_destroy(const struct xt_mtdtor_param *par)
+{
+ struct xt_set_info_match *info = par->matchinfo;
+
+ ip_set_nfnl_put(info->match_set.index);
+}
+
+static unsigned int
+set_target(struct sk_buff *skb, const struct xt_action_param *par)
+{
+ const struct xt_set_info_target *info = par->targinfo;
+
+ if (info->add_set.index != IPSET_INVALID_ID)
+ ip_set_add(info->add_set.index,
+ skb, par->family,
+ info->add_set.dim,
+ info->add_set.flags);
+ if (info->del_set.index != IPSET_INVALID_ID)
+ ip_set_del(info->del_set.index,
+ skb, par->family,
+ info->add_set.dim,
+ info->del_set.flags);
+
+ return XT_CONTINUE;
+}
+
+static int
+set_target_checkentry(const struct xt_tgchk_param *par)
+{
+ const struct xt_set_info_target *info = par->targinfo;
+ ip_set_id_t index;
+
+ if (info->add_set.index != IPSET_INVALID_ID) {
+ index = ip_set_nfnl_get_byindex(info->add_set.index);
+ if (index == IPSET_INVALID_ID) {
+ pr_warning("Cannot find add_set index %u as target\n",
+ info->add_set.index);
+ return -ENOENT;
+ }
+ }
+
+ if (info->del_set.index != IPSET_INVALID_ID) {
+ index = ip_set_nfnl_get_byindex(info->del_set.index);
+ if (index == IPSET_INVALID_ID) {
+ pr_warning("Cannot find del_set index %u as target\n",
+ info->del_set.index);
+ return -ENOENT;
+ }
+ }
+ if (info->add_set.dim > IPSET_DIM_MAX ||
+ info->del_set.flags > IPSET_DIM_MAX) {
+ pr_warning("Protocol error: SET target dimension "
+ "is over the limit!\n");
+ return -ERANGE;
+ }
+
+ return 0;
+}
+
+static void
+set_target_destroy(const struct xt_tgdtor_param *par)
+{
+ const struct xt_set_info_target *info = par->targinfo;
+
+ if (info->add_set.index != IPSET_INVALID_ID)
+ ip_set_nfnl_put(info->add_set.index);
+ if (info->del_set.index != IPSET_INVALID_ID)
+ ip_set_nfnl_put(info->del_set.index);
+}
+
+static struct xt_match set_matches[] __read_mostly = {
+ {
+ .name = "set",
+ .family = NFPROTO_IPV4,
+ .revision = 0,
+ .match = set_match_v0,
+ .matchsize = sizeof(struct xt_set_info_match_v0),
+ .checkentry = set_match_v0_checkentry,
+ .destroy = set_match_v0_destroy,
+ .me = THIS_MODULE
+ },
+ {
+ .name = "set",
+ .family = NFPROTO_IPV4,
+ .revision = 1,
+ .match = set_match,
+ .matchsize = sizeof(struct xt_set_info_match),
+ .checkentry = set_match_checkentry,
+ .destroy = set_match_destroy,
+ .me = THIS_MODULE
+ },
+ {
+ .name = "set",
+ .family = NFPROTO_IPV6,
+ .revision = 1,
+ .match = set_match,
+ .matchsize = sizeof(struct xt_set_info_match),
+ .checkentry = set_match_checkentry,
+ .destroy = set_match_destroy,
+ .me = THIS_MODULE
+ },
+};
+
+static struct xt_target set_targets[] __read_mostly = {
+ {
+ .name = "SET",
+ .revision = 0,
+ .family = NFPROTO_IPV4,
+ .target = set_target_v0,
+ .targetsize = sizeof(struct xt_set_info_target_v0),
+ .checkentry = set_target_v0_checkentry,
+ .destroy = set_target_v0_destroy,
+ .me = THIS_MODULE
+ },
+ {
+ .name = "SET",
+ .revision = 1,
+ .family = NFPROTO_IPV4,
+ .target = set_target,
+ .targetsize = sizeof(struct xt_set_info_target),
+ .checkentry = set_target_checkentry,
+ .destroy = set_target_destroy,
+ .me = THIS_MODULE
+ },
+ {
+ .name = "SET",
+ .revision = 1,
+ .family = NFPROTO_IPV6,
+ .target = set_target,
+ .targetsize = sizeof(struct xt_set_info_target),
+ .checkentry = set_target_checkentry,
+ .destroy = set_target_destroy,
+ .me = THIS_MODULE
+ },
+};
+
+static int __init xt_set_init(void)
+{
+ int ret = xt_register_matches(set_matches, ARRAY_SIZE(set_matches));
+
+ if (!ret) {
+ ret = xt_register_targets(set_targets,
+ ARRAY_SIZE(set_targets));
+ if (ret)
+ xt_unregister_matches(set_matches,
+ ARRAY_SIZE(set_matches));
+ }
+ return ret;
+}
+
+static void __exit xt_set_fini(void)
+{
+ xt_unregister_matches(set_matches, ARRAY_SIZE(set_matches));
+ xt_unregister_targets(set_targets, ARRAY_SIZE(set_targets));
+}
+
+module_init(xt_set_init);
+module_exit(xt_set_fini);
diff --git a/net/netfilter/xt_socket.c b/net/netfilter/xt_socket.c
index 00d6ae83830..9cc46356b57 100644
--- a/net/netfilter/xt_socket.c
+++ b/net/netfilter/xt_socket.c
@@ -35,6 +35,15 @@
#include <net/netfilter/nf_conntrack.h>
#endif
+static void
+xt_socket_put_sk(struct sock *sk)
+{
+ if (sk->sk_state == TCP_TIME_WAIT)
+ inet_twsk_put(inet_twsk(sk));
+ else
+ sock_put(sk);
+}
+
static int
extract_icmp4_fields(const struct sk_buff *skb,
u8 *protocol,
@@ -164,7 +173,7 @@ socket_match(const struct sk_buff *skb, struct xt_action_param *par,
(sk->sk_state == TCP_TIME_WAIT &&
inet_twsk(sk)->tw_transparent));
- nf_tproxy_put_sock(sk);
+ xt_socket_put_sk(sk);
if (wildcard || !transparent)
sk = NULL;
@@ -298,7 +307,7 @@ socket_mt6_v1(const struct sk_buff *skb, struct xt_action_param *par)
(sk->sk_state == TCP_TIME_WAIT &&
inet_twsk(sk)->tw_transparent));
- nf_tproxy_put_sock(sk);
+ xt_socket_put_sk(sk);
if (wildcard || !transparent)
sk = NULL;
diff --git a/net/netlink/genetlink.c b/net/netlink/genetlink.c
index f83cb370292..1781d99145e 100644
--- a/net/netlink/genetlink.c
+++ b/net/netlink/genetlink.c
@@ -519,7 +519,7 @@ static int genl_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
security_netlink_recv(skb, CAP_NET_ADMIN))
return -EPERM;
- if ((nlh->nlmsg_flags & NLM_F_DUMP) == NLM_F_DUMP) {
+ if (nlh->nlmsg_flags & NLM_F_DUMP) {
if (ops->dumpit == NULL)
return -EOPNOTSUPP;
diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index 91cb1d71f01..5efef5b5879 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -164,7 +164,6 @@ struct packet_mreq_max {
static int packet_set_ring(struct sock *sk, struct tpacket_req *req,
int closing, int tx_ring);
-#define PGV_FROM_VMALLOC 1
struct pgv {
char *buffer;
};
@@ -466,7 +465,7 @@ retry:
*/
err = -EMSGSIZE;
- if (len > dev->mtu + dev->hard_header_len)
+ if (len > dev->mtu + dev->hard_header_len + VLAN_HLEN)
goto out_unlock;
if (!skb) {
@@ -497,6 +496,19 @@ retry:
goto retry;
}
+ if (len > (dev->mtu + dev->hard_header_len)) {
+ /* Earlier code assumed this would be a VLAN pkt,
+ * double-check this now that we have the actual
+ * packet in hand.
+ */
+ struct ethhdr *ehdr;
+ skb_reset_mac_header(skb);
+ ehdr = eth_hdr(skb);
+ if (ehdr->h_proto != htons(ETH_P_8021Q)) {
+ err = -EMSGSIZE;
+ goto out_unlock;
+ }
+ }
skb->protocol = proto;
skb->dev = dev;
@@ -523,11 +535,11 @@ static inline unsigned int run_filter(const struct sk_buff *skb,
{
struct sk_filter *filter;
- rcu_read_lock_bh();
- filter = rcu_dereference_bh(sk->sk_filter);
+ rcu_read_lock();
+ filter = rcu_dereference(sk->sk_filter);
if (filter != NULL)
res = sk_run_filter(skb, filter->insns);
- rcu_read_unlock_bh();
+ rcu_read_unlock();
return res;
}
@@ -1200,7 +1212,7 @@ static int packet_snd(struct socket *sock,
}
err = -EMSGSIZE;
- if (!gso_type && (len > dev->mtu+reserve))
+ if (!gso_type && (len > dev->mtu + reserve + VLAN_HLEN))
goto out_unlock;
err = -ENOBUFS;
@@ -1225,6 +1237,20 @@ static int packet_snd(struct socket *sock,
if (err < 0)
goto out_free;
+ if (!gso_type && (len > dev->mtu + reserve)) {
+ /* Earlier code assumed this would be a VLAN pkt,
+ * double-check this now that we have the actual
+ * packet in hand.
+ */
+ struct ethhdr *ehdr;
+ skb_reset_mac_header(skb);
+ ehdr = eth_hdr(skb);
+ if (ehdr->h_proto != htons(ETH_P_8021Q)) {
+ err = -EMSGSIZE;
+ goto out_free;
+ }
+ }
+
skb->protocol = proto;
skb->dev = dev;
skb->priority = sk->sk_priority;
diff --git a/net/rds/rds.h b/net/rds/rds.h
index 9542449c072..da8adac2bf0 100644
--- a/net/rds/rds.h
+++ b/net/rds/rds.h
@@ -50,7 +50,6 @@ rdsdebug(char *fmt, ...)
#define RDS_FRAG_SIZE ((unsigned int)(1 << RDS_FRAG_SHIFT))
#define RDS_CONG_MAP_BYTES (65536 / 8)
-#define RDS_CONG_MAP_LONGS (RDS_CONG_MAP_BYTES / sizeof(unsigned long))
#define RDS_CONG_MAP_PAGES (PAGE_ALIGN(RDS_CONG_MAP_BYTES) / PAGE_SIZE)
#define RDS_CONG_MAP_PAGE_BITS (PAGE_SIZE * 8)
diff --git a/net/rfkill/Kconfig b/net/rfkill/Kconfig
index eaf76587645..7fce6dfd218 100644
--- a/net/rfkill/Kconfig
+++ b/net/rfkill/Kconfig
@@ -18,7 +18,7 @@ config RFKILL_LEDS
default y
config RFKILL_INPUT
- bool "RF switch input support" if EMBEDDED
+ bool "RF switch input support" if EXPERT
depends on RFKILL
depends on INPUT = y || RFKILL = INPUT
- default y if !EMBEDDED
+ default y if !EXPERT
diff --git a/net/rose/af_rose.c b/net/rose/af_rose.c
index d952e7eac18..5ee0c62046a 100644
--- a/net/rose/af_rose.c
+++ b/net/rose/af_rose.c
@@ -803,7 +803,6 @@ static int rose_connect(struct socket *sock, struct sockaddr *uaddr, int addr_le
rose_insert_socket(sk); /* Finish the bind */
}
-rose_try_next_neigh:
rose->dest_addr = addr->srose_addr;
rose->dest_call = addr->srose_call;
rose->rand = ((long)rose & 0xFFFF) + rose->lci;
@@ -865,12 +864,6 @@ rose_try_next_neigh:
}
if (sk->sk_state != TCP_ESTABLISHED) {
- /* Try next neighbour */
- rose->neighbour = rose_get_neigh(&addr->srose_addr, &cause, &diagnostic, 0);
- if (rose->neighbour)
- goto rose_try_next_neigh;
-
- /* No more neighbours */
sock->state = SS_UNCONNECTED;
err = sock_error(sk); /* Always set at this point */
goto out_release;
diff --git a/net/rose/rose_route.c b/net/rose/rose_route.c
index b4fdaac233f..88a77e90e7e 100644
--- a/net/rose/rose_route.c
+++ b/net/rose/rose_route.c
@@ -674,29 +674,34 @@ struct rose_route *rose_route_free_lci(unsigned int lci, struct rose_neigh *neig
* Find a neighbour or a route given a ROSE address.
*/
struct rose_neigh *rose_get_neigh(rose_address *addr, unsigned char *cause,
- unsigned char *diagnostic, int new)
+ unsigned char *diagnostic, int route_frame)
{
struct rose_neigh *res = NULL;
struct rose_node *node;
int failed = 0;
int i;
- if (!new) spin_lock_bh(&rose_node_list_lock);
+ if (!route_frame) spin_lock_bh(&rose_node_list_lock);
for (node = rose_node_list; node != NULL; node = node->next) {
if (rosecmpm(addr, &node->address, node->mask) == 0) {
for (i = 0; i < node->count; i++) {
- if (new) {
- if (node->neighbour[i]->restarted) {
- res = node->neighbour[i];
- goto out;
- }
+ if (node->neighbour[i]->restarted) {
+ res = node->neighbour[i];
+ goto out;
}
- else {
+ }
+ }
+ }
+ if (!route_frame) { /* connect request */
+ for (node = rose_node_list; node != NULL; node = node->next) {
+ if (rosecmpm(addr, &node->address, node->mask) == 0) {
+ for (i = 0; i < node->count; i++) {
if (!rose_ftimer_running(node->neighbour[i])) {
res = node->neighbour[i];
+ failed = 0;
goto out;
- } else
- failed = 1;
+ }
+ failed = 1;
}
}
}
@@ -711,8 +716,7 @@ struct rose_neigh *rose_get_neigh(rose_address *addr, unsigned char *cause,
}
out:
- if (!new) spin_unlock_bh(&rose_node_list_lock);
-
+ if (!route_frame) spin_unlock_bh(&rose_node_list_lock);
return res;
}
diff --git a/net/sched/Kconfig b/net/sched/Kconfig
index f04d4a484d5..8c19b6e3201 100644
--- a/net/sched/Kconfig
+++ b/net/sched/Kconfig
@@ -205,6 +205,29 @@ config NET_SCH_DRR
If unsure, say N.
+config NET_SCH_MQPRIO
+ tristate "Multi-queue priority scheduler (MQPRIO)"
+ help
+ Say Y here if you want to use the Multi-queue Priority scheduler.
+ This scheduler allows QOS to be offloaded on NICs that have support
+ for offloading QOS schedulers.
+
+ To compile this driver as a module, choose M here: the module will
+ be called sch_mqprio.
+
+ If unsure, say N.
+
+config NET_SCH_CHOKE
+ tristate "CHOose and Keep responsive flow scheduler (CHOKE)"
+ help
+ Say Y here if you want to use the CHOKe packet scheduler (CHOose
+ and Keep for responsive flows, CHOose and Kill for unresponsive
+ flows). This is a variation of RED which trys to penalize flows
+ that monopolize the queue.
+
+ To compile this code as a module, choose M here: the
+ module will be called sch_choke.
+
config NET_SCH_INGRESS
tristate "Ingress Qdisc"
depends on NET_CLS_ACT
@@ -243,7 +266,7 @@ config NET_CLS_TCINDEX
config NET_CLS_ROUTE4
tristate "Routing decision (ROUTE)"
- select NET_CLS_ROUTE
+ select IP_ROUTE_CLASSID
select NET_CLS
---help---
If you say Y here, you will be able to classify packets
@@ -252,9 +275,6 @@ config NET_CLS_ROUTE4
To compile this code as a module, choose M here: the
module will be called cls_route.
-config NET_CLS_ROUTE
- bool
-
config NET_CLS_FW
tristate "Netfilter mark (FW)"
select NET_CLS
diff --git a/net/sched/Makefile b/net/sched/Makefile
index 960f5dba630..06c6cdfd194 100644
--- a/net/sched/Makefile
+++ b/net/sched/Makefile
@@ -32,6 +32,9 @@ obj-$(CONFIG_NET_SCH_MULTIQ) += sch_multiq.o
obj-$(CONFIG_NET_SCH_ATM) += sch_atm.o
obj-$(CONFIG_NET_SCH_NETEM) += sch_netem.o
obj-$(CONFIG_NET_SCH_DRR) += sch_drr.o
+obj-$(CONFIG_NET_SCH_MQPRIO) += sch_mqprio.o
+obj-$(CONFIG_NET_SCH_CHOKE) += sch_choke.o
+
obj-$(CONFIG_NET_CLS_U32) += cls_u32.o
obj-$(CONFIG_NET_CLS_ROUTE4) += cls_route.o
obj-$(CONFIG_NET_CLS_FW) += cls_fw.o
diff --git a/net/sched/act_api.c b/net/sched/act_api.c
index 23b25f89e7e..15873e14cb5 100644
--- a/net/sched/act_api.c
+++ b/net/sched/act_api.c
@@ -78,7 +78,7 @@ static int tcf_dump_walker(struct sk_buff *skb, struct netlink_callback *cb,
struct tc_action *a, struct tcf_hashinfo *hinfo)
{
struct tcf_common *p;
- int err = 0, index = -1,i = 0, s_i = 0, n_i = 0;
+ int err = 0, index = -1, i = 0, s_i = 0, n_i = 0;
struct nlattr *nest;
read_lock_bh(hinfo->lock);
@@ -126,7 +126,7 @@ static int tcf_del_walker(struct sk_buff *skb, struct tc_action *a,
{
struct tcf_common *p, *s_p;
struct nlattr *nest;
- int i= 0, n_i = 0;
+ int i = 0, n_i = 0;
nest = nla_nest_start(skb, a->order);
if (nest == NULL)
@@ -138,7 +138,7 @@ static int tcf_del_walker(struct sk_buff *skb, struct tc_action *a,
while (p != NULL) {
s_p = p->tcfc_next;
if (ACT_P_DELETED == tcf_hash_release(p, 0, hinfo))
- module_put(a->ops->owner);
+ module_put(a->ops->owner);
n_i++;
p = s_p;
}
@@ -447,7 +447,8 @@ tcf_action_dump_1(struct sk_buff *skb, struct tc_action *a, int bind, int ref)
nest = nla_nest_start(skb, TCA_OPTIONS);
if (nest == NULL)
goto nla_put_failure;
- if ((err = tcf_action_dump_old(skb, a, bind, ref)) > 0) {
+ err = tcf_action_dump_old(skb, a, bind, ref);
+ if (err > 0) {
nla_nest_end(skb, nest);
return err;
}
@@ -491,7 +492,7 @@ struct tc_action *tcf_action_init_1(struct nlattr *nla, struct nlattr *est,
struct tc_action *a;
struct tc_action_ops *a_o;
char act_name[IFNAMSIZ];
- struct nlattr *tb[TCA_ACT_MAX+1];
+ struct nlattr *tb[TCA_ACT_MAX + 1];
struct nlattr *kind;
int err;
@@ -549,9 +550,9 @@ struct tc_action *tcf_action_init_1(struct nlattr *nla, struct nlattr *est,
goto err_free;
/* module count goes up only when brand new policy is created
- if it exists and is only bound to in a_o->init() then
- ACT_P_CREATED is not returned (a zero is).
- */
+ * if it exists and is only bound to in a_o->init() then
+ * ACT_P_CREATED is not returned (a zero is).
+ */
if (err != ACT_P_CREATED)
module_put(a_o->owner);
a->ops = a_o;
@@ -569,7 +570,7 @@ err_out:
struct tc_action *tcf_action_init(struct nlattr *nla, struct nlattr *est,
char *name, int ovr, int bind)
{
- struct nlattr *tb[TCA_ACT_MAX_PRIO+1];
+ struct nlattr *tb[TCA_ACT_MAX_PRIO + 1];
struct tc_action *head = NULL, *act, *act_prev = NULL;
int err;
int i;
@@ -697,7 +698,7 @@ act_get_notify(struct net *net, u32 pid, struct nlmsghdr *n,
static struct tc_action *
tcf_action_get_1(struct nlattr *nla, struct nlmsghdr *n, u32 pid)
{
- struct nlattr *tb[TCA_ACT_MAX+1];
+ struct nlattr *tb[TCA_ACT_MAX + 1];
struct tc_action *a;
int index;
int err;
@@ -770,7 +771,7 @@ static int tca_action_flush(struct net *net, struct nlattr *nla,
struct tcamsg *t;
struct netlink_callback dcb;
struct nlattr *nest;
- struct nlattr *tb[TCA_ACT_MAX+1];
+ struct nlattr *tb[TCA_ACT_MAX + 1];
struct nlattr *kind;
struct tc_action *a = create_a(0);
int err = -ENOMEM;
@@ -821,7 +822,8 @@ static int tca_action_flush(struct net *net, struct nlattr *nla,
nlh->nlmsg_flags |= NLM_F_ROOT;
module_put(a->ops->owner);
kfree(a);
- err = rtnetlink_send(skb, net, pid, RTNLGRP_TC, n->nlmsg_flags&NLM_F_ECHO);
+ err = rtnetlink_send(skb, net, pid, RTNLGRP_TC,
+ n->nlmsg_flags & NLM_F_ECHO);
if (err > 0)
return 0;
@@ -842,14 +844,14 @@ tca_action_gd(struct net *net, struct nlattr *nla, struct nlmsghdr *n,
u32 pid, int event)
{
int i, ret;
- struct nlattr *tb[TCA_ACT_MAX_PRIO+1];
+ struct nlattr *tb[TCA_ACT_MAX_PRIO + 1];
struct tc_action *head = NULL, *act, *act_prev = NULL;
ret = nla_parse_nested(tb, TCA_ACT_MAX_PRIO, nla, NULL);
if (ret < 0)
return ret;
- if (event == RTM_DELACTION && n->nlmsg_flags&NLM_F_ROOT) {
+ if (event == RTM_DELACTION && n->nlmsg_flags & NLM_F_ROOT) {
if (tb[1] != NULL)
return tca_action_flush(net, tb[1], n, pid);
else
@@ -892,7 +894,7 @@ tca_action_gd(struct net *net, struct nlattr *nla, struct nlmsghdr *n,
/* now do the delete */
tcf_action_destroy(head, 0);
ret = rtnetlink_send(skb, net, pid, RTNLGRP_TC,
- n->nlmsg_flags&NLM_F_ECHO);
+ n->nlmsg_flags & NLM_F_ECHO);
if (ret > 0)
return 0;
return ret;
@@ -936,7 +938,7 @@ static int tcf_add_notify(struct net *net, struct tc_action *a,
nlh->nlmsg_len = skb_tail_pointer(skb) - b;
NETLINK_CB(skb).dst_group = RTNLGRP_TC;
- err = rtnetlink_send(skb, net, pid, RTNLGRP_TC, flags&NLM_F_ECHO);
+ err = rtnetlink_send(skb, net, pid, RTNLGRP_TC, flags & NLM_F_ECHO);
if (err > 0)
err = 0;
return err;
@@ -967,7 +969,7 @@ tcf_action_add(struct net *net, struct nlattr *nla, struct nlmsghdr *n,
/* dump then free all the actions after update; inserted policy
* stays intact
- * */
+ */
ret = tcf_add_notify(net, act, pid, seq, RTM_NEWACTION, n->nlmsg_flags);
for (a = act; a; a = act) {
act = a->next;
@@ -993,8 +995,7 @@ static int tc_ctl_action(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
return -EINVAL;
}
- /* n->nlmsg_flags&NLM_F_CREATE
- * */
+ /* n->nlmsg_flags & NLM_F_CREATE */
switch (n->nlmsg_type) {
case RTM_NEWACTION:
/* we are going to assume all other flags
@@ -1003,7 +1004,7 @@ static int tc_ctl_action(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
* but since we want avoid ambiguity (eg when flags
* is zero) then just set this
*/
- if (n->nlmsg_flags&NLM_F_REPLACE)
+ if (n->nlmsg_flags & NLM_F_REPLACE)
ovr = 1;
replay:
ret = tcf_action_add(net, tca[TCA_ACT_TAB], n, pid, ovr);
@@ -1028,7 +1029,7 @@ replay:
static struct nlattr *
find_dump_kind(const struct nlmsghdr *n)
{
- struct nlattr *tb1, *tb2[TCA_ACT_MAX+1];
+ struct nlattr *tb1, *tb2[TCA_ACT_MAX + 1];
struct nlattr *tb[TCA_ACT_MAX_PRIO + 1];
struct nlattr *nla[TCAA_MAX + 1];
struct nlattr *kind;
@@ -1071,9 +1072,8 @@ tc_dump_action(struct sk_buff *skb, struct netlink_callback *cb)
}
a_o = tc_lookup_action(kind);
- if (a_o == NULL) {
+ if (a_o == NULL)
return 0;
- }
memset(&a, 0, sizeof(struct tc_action));
a.ops = a_o;
diff --git a/net/sched/act_csum.c b/net/sched/act_csum.c
index 83ddfc07e45..6cdf9abe475 100644
--- a/net/sched/act_csum.c
+++ b/net/sched/act_csum.c
@@ -63,7 +63,7 @@ static int tcf_csum_init(struct nlattr *nla, struct nlattr *est,
if (nla == NULL)
return -EINVAL;
- err = nla_parse_nested(tb, TCA_CSUM_MAX, nla,csum_policy);
+ err = nla_parse_nested(tb, TCA_CSUM_MAX, nla, csum_policy);
if (err < 0)
return err;
diff --git a/net/sched/act_gact.c b/net/sched/act_gact.c
index c2ed90a4c0b..2b4ab4b05ce 100644
--- a/net/sched/act_gact.c
+++ b/net/sched/act_gact.c
@@ -50,7 +50,7 @@ static int gact_determ(struct tcf_gact *gact)
}
typedef int (*g_rand)(struct tcf_gact *gact);
-static g_rand gact_rand[MAX_RAND]= { NULL, gact_net_rand, gact_determ };
+static g_rand gact_rand[MAX_RAND] = { NULL, gact_net_rand, gact_determ };
#endif /* CONFIG_GACT_PROB */
static const struct nla_policy gact_policy[TCA_GACT_MAX + 1] = {
@@ -89,7 +89,7 @@ static int tcf_gact_init(struct nlattr *nla, struct nlattr *est,
pc = tcf_hash_create(parm->index, est, a, sizeof(*gact),
bind, &gact_idx_gen, &gact_hash_info);
if (IS_ERR(pc))
- return PTR_ERR(pc);
+ return PTR_ERR(pc);
ret = ACT_P_CREATED;
} else {
if (!ovr) {
@@ -205,9 +205,9 @@ MODULE_LICENSE("GPL");
static int __init gact_init_module(void)
{
#ifdef CONFIG_GACT_PROB
- printk(KERN_INFO "GACT probability on\n");
+ pr_info("GACT probability on\n");
#else
- printk(KERN_INFO "GACT probability NOT on\n");
+ pr_info("GACT probability NOT on\n");
#endif
return tcf_register_action(&act_gact_ops);
}
diff --git a/net/sched/act_ipt.c b/net/sched/act_ipt.c
index c2a7c20e81c..9fc211a1b20 100644
--- a/net/sched/act_ipt.c
+++ b/net/sched/act_ipt.c
@@ -138,7 +138,7 @@ static int tcf_ipt_init(struct nlattr *nla, struct nlattr *est,
pc = tcf_hash_create(index, est, a, sizeof(*ipt), bind,
&ipt_idx_gen, &ipt_hash_info);
if (IS_ERR(pc))
- return PTR_ERR(pc);
+ return PTR_ERR(pc);
ret = ACT_P_CREATED;
} else {
if (!ovr) {
@@ -162,7 +162,8 @@ static int tcf_ipt_init(struct nlattr *nla, struct nlattr *est,
if (unlikely(!t))
goto err2;
- if ((err = ipt_init_target(t, tname, hook)) < 0)
+ err = ipt_init_target(t, tname, hook);
+ if (err < 0)
goto err3;
spin_lock_bh(&ipt->tcf_lock);
@@ -212,8 +213,9 @@ static int tcf_ipt(struct sk_buff *skb, struct tc_action *a,
bstats_update(&ipt->tcf_bstats, skb);
/* yes, we have to worry about both in and out dev
- worry later - danger - this API seems to have changed
- from earlier kernels */
+ * worry later - danger - this API seems to have changed
+ * from earlier kernels
+ */
par.in = skb->dev;
par.out = NULL;
par.hooknum = ipt->tcfi_hook;
@@ -253,9 +255,9 @@ static int tcf_ipt_dump(struct sk_buff *skb, struct tc_action *a, int bind, int
struct tc_cnt c;
/* for simple targets kernel size == user size
- ** user name = target name
- ** for foolproof you need to not assume this
- */
+ * user name = target name
+ * for foolproof you need to not assume this
+ */
t = kmemdup(ipt->tcfi_t, ipt->tcfi_t->u.user.target_size, GFP_ATOMIC);
if (unlikely(!t))
diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c
index d765067e99d..961386e2f2c 100644
--- a/net/sched/act_mirred.c
+++ b/net/sched/act_mirred.c
@@ -41,13 +41,13 @@ static struct tcf_hashinfo mirred_hash_info = {
.lock = &mirred_lock,
};
-static inline int tcf_mirred_release(struct tcf_mirred *m, int bind)
+static int tcf_mirred_release(struct tcf_mirred *m, int bind)
{
if (m) {
if (bind)
m->tcf_bindcnt--;
m->tcf_refcnt--;
- if(!m->tcf_bindcnt && m->tcf_refcnt <= 0) {
+ if (!m->tcf_bindcnt && m->tcf_refcnt <= 0) {
list_del(&m->tcfm_list);
if (m->tcfm_dev)
dev_put(m->tcfm_dev);
diff --git a/net/sched/act_nat.c b/net/sched/act_nat.c
index 178a4bd7b7c..762b027650a 100644
--- a/net/sched/act_nat.c
+++ b/net/sched/act_nat.c
@@ -69,7 +69,7 @@ static int tcf_nat_init(struct nlattr *nla, struct nlattr *est,
pc = tcf_hash_create(parm->index, est, a, sizeof(*p), bind,
&nat_idx_gen, &nat_hash_info);
if (IS_ERR(pc))
- return PTR_ERR(pc);
+ return PTR_ERR(pc);
p = to_tcf_nat(pc);
ret = ACT_P_CREATED;
} else {
diff --git a/net/sched/act_pedit.c b/net/sched/act_pedit.c
index 445bef716f7..50c7c06c019 100644
--- a/net/sched/act_pedit.c
+++ b/net/sched/act_pedit.c
@@ -70,7 +70,7 @@ static int tcf_pedit_init(struct nlattr *nla, struct nlattr *est,
pc = tcf_hash_create(parm->index, est, a, sizeof(*p), bind,
&pedit_idx_gen, &pedit_hash_info);
if (IS_ERR(pc))
- return PTR_ERR(pc);
+ return PTR_ERR(pc);
p = to_pedit(pc);
keys = kmalloc(ksize, GFP_KERNEL);
if (keys == NULL) {
@@ -127,11 +127,9 @@ static int tcf_pedit(struct sk_buff *skb, struct tc_action *a,
int i, munged = 0;
unsigned int off;
- if (skb_cloned(skb)) {
- if (pskb_expand_head(skb, 0, 0, GFP_ATOMIC)) {
- return p->tcf_action;
- }
- }
+ if (skb_cloned(skb) &&
+ pskb_expand_head(skb, 0, 0, GFP_ATOMIC))
+ return p->tcf_action;
off = skb_network_offset(skb);
diff --git a/net/sched/act_police.c b/net/sched/act_police.c
index e2f08b1e2e5..8a1630774fd 100644
--- a/net/sched/act_police.c
+++ b/net/sched/act_police.c
@@ -22,8 +22,8 @@
#include <net/act_api.h>
#include <net/netlink.h>
-#define L2T(p,L) qdisc_l2t((p)->tcfp_R_tab, L)
-#define L2T_P(p,L) qdisc_l2t((p)->tcfp_P_tab, L)
+#define L2T(p, L) qdisc_l2t((p)->tcfp_R_tab, L)
+#define L2T_P(p, L) qdisc_l2t((p)->tcfp_P_tab, L)
#define POL_TAB_MASK 15
static struct tcf_common *tcf_police_ht[POL_TAB_MASK + 1];
@@ -37,8 +37,7 @@ static struct tcf_hashinfo police_hash_info = {
};
/* old policer structure from before tc actions */
-struct tc_police_compat
-{
+struct tc_police_compat {
u32 index;
int action;
u32 limit;
@@ -139,7 +138,7 @@ static const struct nla_policy police_policy[TCA_POLICE_MAX + 1] = {
static int tcf_act_police_locate(struct nlattr *nla, struct nlattr *est,
struct tc_action *a, int ovr, int bind)
{
- unsigned h;
+ unsigned int h;
int ret = 0, err;
struct nlattr *tb[TCA_POLICE_MAX + 1];
struct tc_police *parm;
diff --git a/net/sched/act_simple.c b/net/sched/act_simple.c
index 7287cff7af3..a34a22de60b 100644
--- a/net/sched/act_simple.c
+++ b/net/sched/act_simple.c
@@ -47,7 +47,7 @@ static int tcf_simp(struct sk_buff *skb, struct tc_action *a, struct tcf_result
/* print policy string followed by _ then packet count
* Example if this was the 3rd packet and the string was "hello"
* then it would look like "hello_3" (without quotes)
- **/
+ */
pr_info("simple: %s_%d\n",
(char *)d->tcfd_defdata, d->tcf_bstats.packets);
spin_unlock(&d->tcf_lock);
@@ -125,7 +125,7 @@ static int tcf_simp_init(struct nlattr *nla, struct nlattr *est,
pc = tcf_hash_create(parm->index, est, a, sizeof(*d), bind,
&simp_idx_gen, &simp_hash_info);
if (IS_ERR(pc))
- return PTR_ERR(pc);
+ return PTR_ERR(pc);
d = to_defact(pc);
ret = alloc_defdata(d, defdata);
@@ -149,7 +149,7 @@ static int tcf_simp_init(struct nlattr *nla, struct nlattr *est,
return ret;
}
-static inline int tcf_simp_cleanup(struct tc_action *a, int bind)
+static int tcf_simp_cleanup(struct tc_action *a, int bind)
{
struct tcf_defact *d = a->priv;
@@ -158,8 +158,8 @@ static inline int tcf_simp_cleanup(struct tc_action *a, int bind)
return 0;
}
-static inline int tcf_simp_dump(struct sk_buff *skb, struct tc_action *a,
- int bind, int ref)
+static int tcf_simp_dump(struct sk_buff *skb, struct tc_action *a,
+ int bind, int ref)
{
unsigned char *b = skb_tail_pointer(skb);
struct tcf_defact *d = a->priv;
diff --git a/net/sched/act_skbedit.c b/net/sched/act_skbedit.c
index 836f5fee9e5..5f6f0c7c390 100644
--- a/net/sched/act_skbedit.c
+++ b/net/sched/act_skbedit.c
@@ -113,7 +113,7 @@ static int tcf_skbedit_init(struct nlattr *nla, struct nlattr *est,
pc = tcf_hash_create(parm->index, est, a, sizeof(*d), bind,
&skbedit_idx_gen, &skbedit_hash_info);
if (IS_ERR(pc))
- return PTR_ERR(pc);
+ return PTR_ERR(pc);
d = to_skbedit(pc);
ret = ACT_P_CREATED;
@@ -144,7 +144,7 @@ static int tcf_skbedit_init(struct nlattr *nla, struct nlattr *est,
return ret;
}
-static inline int tcf_skbedit_cleanup(struct tc_action *a, int bind)
+static int tcf_skbedit_cleanup(struct tc_action *a, int bind)
{
struct tcf_skbedit *d = a->priv;
@@ -153,8 +153,8 @@ static inline int tcf_skbedit_cleanup(struct tc_action *a, int bind)
return 0;
}
-static inline int tcf_skbedit_dump(struct sk_buff *skb, struct tc_action *a,
- int bind, int ref)
+static int tcf_skbedit_dump(struct sk_buff *skb, struct tc_action *a,
+ int bind, int ref)
{
unsigned char *b = skb_tail_pointer(skb);
struct tcf_skbedit *d = a->priv;
diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c
index 5fd0c28ef79..bb2c523f815 100644
--- a/net/sched/cls_api.c
+++ b/net/sched/cls_api.c
@@ -85,7 +85,7 @@ int unregister_tcf_proto_ops(struct tcf_proto_ops *ops)
int rc = -ENOENT;
write_lock(&cls_mod_lock);
- for (tp = &tcf_proto_base; (t=*tp) != NULL; tp = &t->next)
+ for (tp = &tcf_proto_base; (t = *tp) != NULL; tp = &t->next)
if (t == ops)
break;
@@ -111,7 +111,7 @@ static inline u32 tcf_auto_prio(struct tcf_proto *tp)
u32 first = TC_H_MAKE(0xC0000000U, 0U);
if (tp)
- first = tp->prio-1;
+ first = tp->prio - 1;
return first;
}
@@ -149,7 +149,8 @@ replay:
if (prio == 0) {
/* If no priority is given, user wants we allocated it. */
- if (n->nlmsg_type != RTM_NEWTFILTER || !(n->nlmsg_flags&NLM_F_CREATE))
+ if (n->nlmsg_type != RTM_NEWTFILTER ||
+ !(n->nlmsg_flags & NLM_F_CREATE))
return -ENOENT;
prio = TC_H_MAKE(0x80000000U, 0U);
}
@@ -176,7 +177,8 @@ replay:
}
/* Is it classful? */
- if ((cops = q->ops->cl_ops) == NULL)
+ cops = q->ops->cl_ops;
+ if (!cops)
return -EINVAL;
if (cops->tcf_chain == NULL)
@@ -196,10 +198,11 @@ replay:
goto errout;
/* Check the chain for existence of proto-tcf with this priority */
- for (back = chain; (tp=*back) != NULL; back = &tp->next) {
+ for (back = chain; (tp = *back) != NULL; back = &tp->next) {
if (tp->prio >= prio) {
if (tp->prio == prio) {
- if (!nprio || (tp->protocol != protocol && protocol))
+ if (!nprio ||
+ (tp->protocol != protocol && protocol))
goto errout;
} else
tp = NULL;
@@ -216,7 +219,8 @@ replay:
goto errout;
err = -ENOENT;
- if (n->nlmsg_type != RTM_NEWTFILTER || !(n->nlmsg_flags&NLM_F_CREATE))
+ if (n->nlmsg_type != RTM_NEWTFILTER ||
+ !(n->nlmsg_flags & NLM_F_CREATE))
goto errout;
@@ -420,7 +424,8 @@ static int tc_dump_tfilter(struct sk_buff *skb, struct netlink_callback *cb)
if (cb->nlh->nlmsg_len < NLMSG_LENGTH(sizeof(*tcm)))
return skb->len;
- if ((dev = __dev_get_by_index(net, tcm->tcm_ifindex)) == NULL)
+ dev = __dev_get_by_index(net, tcm->tcm_ifindex);
+ if (!dev)
return skb->len;
if (!tcm->tcm_parent)
@@ -429,7 +434,8 @@ static int tc_dump_tfilter(struct sk_buff *skb, struct netlink_callback *cb)
q = qdisc_lookup(dev, TC_H_MAJ(tcm->tcm_parent));
if (!q)
goto out;
- if ((cops = q->ops->cl_ops) == NULL)
+ cops = q->ops->cl_ops;
+ if (!cops)
goto errout;
if (cops->tcf_chain == NULL)
goto errout;
@@ -444,8 +450,9 @@ static int tc_dump_tfilter(struct sk_buff *skb, struct netlink_callback *cb)
s_t = cb->args[0];
- for (tp=*chain, t=0; tp; tp = tp->next, t++) {
- if (t < s_t) continue;
+ for (tp = *chain, t = 0; tp; tp = tp->next, t++) {
+ if (t < s_t)
+ continue;
if (TC_H_MAJ(tcm->tcm_info) &&
TC_H_MAJ(tcm->tcm_info) != tp->prio)
continue;
@@ -468,10 +475,10 @@ static int tc_dump_tfilter(struct sk_buff *skb, struct netlink_callback *cb)
arg.skb = skb;
arg.cb = cb;
arg.w.stop = 0;
- arg.w.skip = cb->args[1]-1;
+ arg.w.skip = cb->args[1] - 1;
arg.w.count = 0;
tp->ops->walk(tp, &arg.w);
- cb->args[1] = arg.w.count+1;
+ cb->args[1] = arg.w.count + 1;
if (arg.w.stop)
break;
}
diff --git a/net/sched/cls_basic.c b/net/sched/cls_basic.c
index f23d9155b1e..8be8872dd57 100644
--- a/net/sched/cls_basic.c
+++ b/net/sched/cls_basic.c
@@ -21,14 +21,12 @@
#include <net/act_api.h>
#include <net/pkt_cls.h>
-struct basic_head
-{
+struct basic_head {
u32 hgenerator;
struct list_head flist;
};
-struct basic_filter
-{
+struct basic_filter {
u32 handle;
struct tcf_exts exts;
struct tcf_ematch_tree ematches;
@@ -92,8 +90,7 @@ static int basic_init(struct tcf_proto *tp)
return 0;
}
-static inline void basic_delete_filter(struct tcf_proto *tp,
- struct basic_filter *f)
+static void basic_delete_filter(struct tcf_proto *tp, struct basic_filter *f)
{
tcf_unbind_filter(tp, &f->res);
tcf_exts_destroy(tp, &f->exts);
@@ -135,9 +132,9 @@ static const struct nla_policy basic_policy[TCA_BASIC_MAX + 1] = {
[TCA_BASIC_EMATCHES] = { .type = NLA_NESTED },
};
-static inline int basic_set_parms(struct tcf_proto *tp, struct basic_filter *f,
- unsigned long base, struct nlattr **tb,
- struct nlattr *est)
+static int basic_set_parms(struct tcf_proto *tp, struct basic_filter *f,
+ unsigned long base, struct nlattr **tb,
+ struct nlattr *est)
{
int err = -EINVAL;
struct tcf_exts e;
@@ -203,7 +200,7 @@ static int basic_change(struct tcf_proto *tp, unsigned long base, u32 handle,
} while (--i > 0 && basic_get(tp, head->hgenerator));
if (i <= 0) {
- printk(KERN_ERR "Insufficient number of handles\n");
+ pr_err("Insufficient number of handles\n");
goto errout;
}
diff --git a/net/sched/cls_cgroup.c b/net/sched/cls_cgroup.c
index d49c40fb7e0..32a335194ca 100644
--- a/net/sched/cls_cgroup.c
+++ b/net/sched/cls_cgroup.c
@@ -56,7 +56,8 @@ static struct cgroup_subsys_state *cgrp_create(struct cgroup_subsys *ss,
{
struct cgroup_cls_state *cs;
- if (!(cs = kzalloc(sizeof(*cs), GFP_KERNEL)))
+ cs = kzalloc(sizeof(*cs), GFP_KERNEL);
+ if (!cs)
return ERR_PTR(-ENOMEM);
if (cgrp->parent)
@@ -94,8 +95,7 @@ static int cgrp_populate(struct cgroup_subsys *ss, struct cgroup *cgrp)
return cgroup_add_files(cgrp, ss, ss_files, ARRAY_SIZE(ss_files));
}
-struct cls_cgroup_head
-{
+struct cls_cgroup_head {
u32 handle;
struct tcf_exts exts;
struct tcf_ematch_tree ematches;
@@ -166,7 +166,7 @@ static int cls_cgroup_change(struct tcf_proto *tp, unsigned long base,
u32 handle, struct nlattr **tca,
unsigned long *arg)
{
- struct nlattr *tb[TCA_CGROUP_MAX+1];
+ struct nlattr *tb[TCA_CGROUP_MAX + 1];
struct cls_cgroup_head *head = tp->root;
struct tcf_ematch_tree t;
struct tcf_exts e;
diff --git a/net/sched/cls_flow.c b/net/sched/cls_flow.c
index 5b271a18bc3..8ec01391d98 100644
--- a/net/sched/cls_flow.c
+++ b/net/sched/cls_flow.c
@@ -121,7 +121,7 @@ static u32 flow_get_proto_src(struct sk_buff *skb)
if (!pskb_network_may_pull(skb, sizeof(*iph)))
break;
iph = ip_hdr(skb);
- if (iph->frag_off & htons(IP_MF|IP_OFFSET))
+ if (iph->frag_off & htons(IP_MF | IP_OFFSET))
break;
poff = proto_ports_offset(iph->protocol);
if (poff >= 0 &&
@@ -163,7 +163,7 @@ static u32 flow_get_proto_dst(struct sk_buff *skb)
if (!pskb_network_may_pull(skb, sizeof(*iph)))
break;
iph = ip_hdr(skb);
- if (iph->frag_off & htons(IP_MF|IP_OFFSET))
+ if (iph->frag_off & htons(IP_MF | IP_OFFSET))
break;
poff = proto_ports_offset(iph->protocol);
if (poff >= 0 &&
@@ -276,7 +276,7 @@ fallback:
static u32 flow_get_rtclassid(const struct sk_buff *skb)
{
-#ifdef CONFIG_NET_CLS_ROUTE
+#ifdef CONFIG_IP_ROUTE_CLASSID
if (skb_dst(skb))
return skb_dst(skb)->tclassid;
#endif
diff --git a/net/sched/cls_fw.c b/net/sched/cls_fw.c
index 93b0a7b6f9b..26e7bc4ffb7 100644
--- a/net/sched/cls_fw.c
+++ b/net/sched/cls_fw.c
@@ -31,14 +31,12 @@
#define HTSIZE (PAGE_SIZE/sizeof(struct fw_filter *))
-struct fw_head
-{
+struct fw_head {
struct fw_filter *ht[HTSIZE];
u32 mask;
};
-struct fw_filter
-{
+struct fw_filter {
struct fw_filter *next;
u32 id;
struct tcf_result res;
@@ -53,7 +51,7 @@ static const struct tcf_ext_map fw_ext_map = {
.police = TCA_FW_POLICE
};
-static __inline__ int fw_hash(u32 handle)
+static inline int fw_hash(u32 handle)
{
if (HTSIZE == 4096)
return ((handle >> 24) & 0xFFF) ^
@@ -82,14 +80,14 @@ static __inline__ int fw_hash(u32 handle)
static int fw_classify(struct sk_buff *skb, struct tcf_proto *tp,
struct tcf_result *res)
{
- struct fw_head *head = (struct fw_head*)tp->root;
+ struct fw_head *head = (struct fw_head *)tp->root;
struct fw_filter *f;
int r;
u32 id = skb->mark;
if (head != NULL) {
id &= head->mask;
- for (f=head->ht[fw_hash(id)]; f; f=f->next) {
+ for (f = head->ht[fw_hash(id)]; f; f = f->next) {
if (f->id == id) {
*res = f->res;
#ifdef CONFIG_NET_CLS_IND
@@ -105,7 +103,8 @@ static int fw_classify(struct sk_buff *skb, struct tcf_proto *tp,
}
} else {
/* old method */
- if (id && (TC_H_MAJ(id) == 0 || !(TC_H_MAJ(id^tp->q->handle)))) {
+ if (id && (TC_H_MAJ(id) == 0 ||
+ !(TC_H_MAJ(id ^ tp->q->handle)))) {
res->classid = id;
res->class = 0;
return 0;
@@ -117,13 +116,13 @@ static int fw_classify(struct sk_buff *skb, struct tcf_proto *tp,
static unsigned long fw_get(struct tcf_proto *tp, u32 handle)
{
- struct fw_head *head = (struct fw_head*)tp->root;
+ struct fw_head *head = (struct fw_head *)tp->root;
struct fw_filter *f;
if (head == NULL)
return 0;
- for (f=head->ht[fw_hash(handle)]; f; f=f->next) {
+ for (f = head->ht[fw_hash(handle)]; f; f = f->next) {
if (f->id == handle)
return (unsigned long)f;
}
@@ -139,8 +138,7 @@ static int fw_init(struct tcf_proto *tp)
return 0;
}
-static inline void
-fw_delete_filter(struct tcf_proto *tp, struct fw_filter *f)
+static void fw_delete_filter(struct tcf_proto *tp, struct fw_filter *f)
{
tcf_unbind_filter(tp, &f->res);
tcf_exts_destroy(tp, &f->exts);
@@ -156,8 +154,8 @@ static void fw_destroy(struct tcf_proto *tp)
if (head == NULL)
return;
- for (h=0; h<HTSIZE; h++) {
- while ((f=head->ht[h]) != NULL) {
+ for (h = 0; h < HTSIZE; h++) {
+ while ((f = head->ht[h]) != NULL) {
head->ht[h] = f->next;
fw_delete_filter(tp, f);
}
@@ -167,14 +165,14 @@ static void fw_destroy(struct tcf_proto *tp)
static int fw_delete(struct tcf_proto *tp, unsigned long arg)
{
- struct fw_head *head = (struct fw_head*)tp->root;
- struct fw_filter *f = (struct fw_filter*)arg;
+ struct fw_head *head = (struct fw_head *)tp->root;
+ struct fw_filter *f = (struct fw_filter *)arg;
struct fw_filter **fp;
if (head == NULL || f == NULL)
goto out;
- for (fp=&head->ht[fw_hash(f->id)]; *fp; fp = &(*fp)->next) {
+ for (fp = &head->ht[fw_hash(f->id)]; *fp; fp = &(*fp)->next) {
if (*fp == f) {
tcf_tree_lock(tp);
*fp = f->next;
@@ -240,7 +238,7 @@ static int fw_change(struct tcf_proto *tp, unsigned long base,
struct nlattr **tca,
unsigned long *arg)
{
- struct fw_head *head = (struct fw_head*)tp->root;
+ struct fw_head *head = (struct fw_head *)tp->root;
struct fw_filter *f = (struct fw_filter *) *arg;
struct nlattr *opt = tca[TCA_OPTIONS];
struct nlattr *tb[TCA_FW_MAX + 1];
@@ -302,7 +300,7 @@ errout:
static void fw_walk(struct tcf_proto *tp, struct tcf_walker *arg)
{
- struct fw_head *head = (struct fw_head*)tp->root;
+ struct fw_head *head = (struct fw_head *)tp->root;
int h;
if (head == NULL)
@@ -332,7 +330,7 @@ static int fw_dump(struct tcf_proto *tp, unsigned long fh,
struct sk_buff *skb, struct tcmsg *t)
{
struct fw_head *head = (struct fw_head *)tp->root;
- struct fw_filter *f = (struct fw_filter*)fh;
+ struct fw_filter *f = (struct fw_filter *)fh;
unsigned char *b = skb_tail_pointer(skb);
struct nlattr *nest;
diff --git a/net/sched/cls_route.c b/net/sched/cls_route.c
index 694dcd85dec..d580cdfca09 100644
--- a/net/sched/cls_route.c
+++ b/net/sched/cls_route.c
@@ -23,34 +23,30 @@
#include <net/pkt_cls.h>
/*
- 1. For now we assume that route tags < 256.
- It allows to use direct table lookups, instead of hash tables.
- 2. For now we assume that "from TAG" and "fromdev DEV" statements
- are mutually exclusive.
- 3. "to TAG from ANY" has higher priority, than "to ANY from XXX"
+ * 1. For now we assume that route tags < 256.
+ * It allows to use direct table lookups, instead of hash tables.
+ * 2. For now we assume that "from TAG" and "fromdev DEV" statements
+ * are mutually exclusive.
+ * 3. "to TAG from ANY" has higher priority, than "to ANY from XXX"
*/
-struct route4_fastmap
-{
+struct route4_fastmap {
struct route4_filter *filter;
u32 id;
int iif;
};
-struct route4_head
-{
+struct route4_head {
struct route4_fastmap fastmap[16];
- struct route4_bucket *table[256+1];
+ struct route4_bucket *table[256 + 1];
};
-struct route4_bucket
-{
+struct route4_bucket {
/* 16 FROM buckets + 16 IIF buckets + 1 wildcard bucket */
- struct route4_filter *ht[16+16+1];
+ struct route4_filter *ht[16 + 16 + 1];
};
-struct route4_filter
-{
+struct route4_filter {
struct route4_filter *next;
u32 id;
int iif;
@@ -61,20 +57,20 @@ struct route4_filter
struct route4_bucket *bkt;
};
-#define ROUTE4_FAILURE ((struct route4_filter*)(-1L))
+#define ROUTE4_FAILURE ((struct route4_filter *)(-1L))
static const struct tcf_ext_map route_ext_map = {
.police = TCA_ROUTE4_POLICE,
.action = TCA_ROUTE4_ACT
};
-static __inline__ int route4_fastmap_hash(u32 id, int iif)
+static inline int route4_fastmap_hash(u32 id, int iif)
{
- return id&0xF;
+ return id & 0xF;
}
-static inline
-void route4_reset_fastmap(struct Qdisc *q, struct route4_head *head, u32 id)
+static void
+route4_reset_fastmap(struct Qdisc *q, struct route4_head *head, u32 id)
{
spinlock_t *root_lock = qdisc_root_sleeping_lock(q);
@@ -83,32 +79,33 @@ void route4_reset_fastmap(struct Qdisc *q, struct route4_head *head, u32 id)
spin_unlock_bh(root_lock);
}
-static inline void
+static void
route4_set_fastmap(struct route4_head *head, u32 id, int iif,
struct route4_filter *f)
{
int h = route4_fastmap_hash(id, iif);
+
head->fastmap[h].id = id;
head->fastmap[h].iif = iif;
head->fastmap[h].filter = f;
}
-static __inline__ int route4_hash_to(u32 id)
+static inline int route4_hash_to(u32 id)
{
- return id&0xFF;
+ return id & 0xFF;
}
-static __inline__ int route4_hash_from(u32 id)
+static inline int route4_hash_from(u32 id)
{
- return (id>>16)&0xF;
+ return (id >> 16) & 0xF;
}
-static __inline__ int route4_hash_iif(int iif)
+static inline int route4_hash_iif(int iif)
{
- return 16 + ((iif>>16)&0xF);
+ return 16 + ((iif >> 16) & 0xF);
}
-static __inline__ int route4_hash_wild(void)
+static inline int route4_hash_wild(void)
{
return 32;
}
@@ -131,21 +128,22 @@ static __inline__ int route4_hash_wild(void)
static int route4_classify(struct sk_buff *skb, struct tcf_proto *tp,
struct tcf_result *res)
{
- struct route4_head *head = (struct route4_head*)tp->root;
+ struct route4_head *head = (struct route4_head *)tp->root;
struct dst_entry *dst;
struct route4_bucket *b;
struct route4_filter *f;
u32 id, h;
int iif, dont_cache = 0;
- if ((dst = skb_dst(skb)) == NULL)
+ dst = skb_dst(skb);
+ if (!dst)
goto failure;
id = dst->tclassid;
if (head == NULL)
goto old_method;
- iif = ((struct rtable*)dst)->fl.iif;
+ iif = ((struct rtable *)dst)->fl.iif;
h = route4_fastmap_hash(id, iif);
if (id == head->fastmap[h].id &&
@@ -161,7 +159,8 @@ static int route4_classify(struct sk_buff *skb, struct tcf_proto *tp,
h = route4_hash_to(id);
restart:
- if ((b = head->table[h]) != NULL) {
+ b = head->table[h];
+ if (b) {
for (f = b->ht[route4_hash_from(id)]; f; f = f->next)
if (f->id == id)
ROUTE4_APPLY_RESULT();
@@ -197,8 +196,9 @@ old_method:
static inline u32 to_hash(u32 id)
{
- u32 h = id&0xFF;
- if (id&0x8000)
+ u32 h = id & 0xFF;
+
+ if (id & 0x8000)
h += 256;
return h;
}
@@ -211,17 +211,17 @@ static inline u32 from_hash(u32 id)
if (!(id & 0x8000)) {
if (id > 255)
return 256;
- return id&0xF;
+ return id & 0xF;
}
- return 16 + (id&0xF);
+ return 16 + (id & 0xF);
}
static unsigned long route4_get(struct tcf_proto *tp, u32 handle)
{
- struct route4_head *head = (struct route4_head*)tp->root;
+ struct route4_head *head = (struct route4_head *)tp->root;
struct route4_bucket *b;
struct route4_filter *f;
- unsigned h1, h2;
+ unsigned int h1, h2;
if (!head)
return 0;
@@ -230,11 +230,12 @@ static unsigned long route4_get(struct tcf_proto *tp, u32 handle)
if (h1 > 256)
return 0;
- h2 = from_hash(handle>>16);
+ h2 = from_hash(handle >> 16);
if (h2 > 32)
return 0;
- if ((b = head->table[h1]) != NULL) {
+ b = head->table[h1];
+ if (b) {
for (f = b->ht[h2]; f; f = f->next)
if (f->handle == handle)
return (unsigned long)f;
@@ -251,7 +252,7 @@ static int route4_init(struct tcf_proto *tp)
return 0;
}
-static inline void
+static void
route4_delete_filter(struct tcf_proto *tp, struct route4_filter *f)
{
tcf_unbind_filter(tp, &f->res);
@@ -267,11 +268,12 @@ static void route4_destroy(struct tcf_proto *tp)
if (head == NULL)
return;
- for (h1=0; h1<=256; h1++) {
+ for (h1 = 0; h1 <= 256; h1++) {
struct route4_bucket *b;
- if ((b = head->table[h1]) != NULL) {
- for (h2=0; h2<=32; h2++) {
+ b = head->table[h1];
+ if (b) {
+ for (h2 = 0; h2 <= 32; h2++) {
struct route4_filter *f;
while ((f = b->ht[h2]) != NULL) {
@@ -287,9 +289,9 @@ static void route4_destroy(struct tcf_proto *tp)
static int route4_delete(struct tcf_proto *tp, unsigned long arg)
{
- struct route4_head *head = (struct route4_head*)tp->root;
- struct route4_filter **fp, *f = (struct route4_filter*)arg;
- unsigned h = 0;
+ struct route4_head *head = (struct route4_head *)tp->root;
+ struct route4_filter **fp, *f = (struct route4_filter *)arg;
+ unsigned int h = 0;
struct route4_bucket *b;
int i;
@@ -299,7 +301,7 @@ static int route4_delete(struct tcf_proto *tp, unsigned long arg)
h = f->handle;
b = f->bkt;
- for (fp = &b->ht[from_hash(h>>16)]; *fp; fp = &(*fp)->next) {
+ for (fp = &b->ht[from_hash(h >> 16)]; *fp; fp = &(*fp)->next) {
if (*fp == f) {
tcf_tree_lock(tp);
*fp = f->next;
@@ -310,7 +312,7 @@ static int route4_delete(struct tcf_proto *tp, unsigned long arg)
/* Strip tree */
- for (i=0; i<=32; i++)
+ for (i = 0; i <= 32; i++)
if (b->ht[i])
return 0;
@@ -380,7 +382,8 @@ static int route4_set_parms(struct tcf_proto *tp, unsigned long base,
}
h1 = to_hash(nhandle);
- if ((b = head->table[h1]) == NULL) {
+ b = head->table[h1];
+ if (!b) {
err = -ENOBUFS;
b = kzalloc(sizeof(struct route4_bucket), GFP_KERNEL);
if (b == NULL)
@@ -391,6 +394,7 @@ static int route4_set_parms(struct tcf_proto *tp, unsigned long base,
tcf_tree_unlock(tp);
} else {
unsigned int h2 = from_hash(nhandle >> 16);
+
err = -EEXIST;
for (fp = b->ht[h2]; fp; fp = fp->next)
if (fp->handle == f->handle)
@@ -444,7 +448,8 @@ static int route4_change(struct tcf_proto *tp, unsigned long base,
if (err < 0)
return err;
- if ((f = (struct route4_filter*)*arg) != NULL) {
+ f = (struct route4_filter *)*arg;
+ if (f) {
if (f->handle != handle && handle)
return -EINVAL;
@@ -481,7 +486,7 @@ static int route4_change(struct tcf_proto *tp, unsigned long base,
reinsert:
h = from_hash(f->handle >> 16);
- for (fp = &f->bkt->ht[h]; (f1=*fp) != NULL; fp = &f1->next)
+ for (fp = &f->bkt->ht[h]; (f1 = *fp) != NULL; fp = &f1->next)
if (f->handle < f1->handle)
break;
@@ -492,7 +497,8 @@ reinsert:
if (old_handle && f->handle != old_handle) {
th = to_hash(old_handle);
h = from_hash(old_handle >> 16);
- if ((b = head->table[th]) != NULL) {
+ b = head->table[th];
+ if (b) {
for (fp = &b->ht[h]; *fp; fp = &(*fp)->next) {
if (*fp == f) {
*fp = f->next;
@@ -515,7 +521,7 @@ errout:
static void route4_walk(struct tcf_proto *tp, struct tcf_walker *arg)
{
struct route4_head *head = tp->root;
- unsigned h, h1;
+ unsigned int h, h1;
if (head == NULL)
arg->stop = 1;
@@ -549,7 +555,7 @@ static void route4_walk(struct tcf_proto *tp, struct tcf_walker *arg)
static int route4_dump(struct tcf_proto *tp, unsigned long fh,
struct sk_buff *skb, struct tcmsg *t)
{
- struct route4_filter *f = (struct route4_filter*)fh;
+ struct route4_filter *f = (struct route4_filter *)fh;
unsigned char *b = skb_tail_pointer(skb);
struct nlattr *nest;
u32 id;
@@ -563,15 +569,15 @@ static int route4_dump(struct tcf_proto *tp, unsigned long fh,
if (nest == NULL)
goto nla_put_failure;
- if (!(f->handle&0x8000)) {
- id = f->id&0xFF;
+ if (!(f->handle & 0x8000)) {
+ id = f->id & 0xFF;
NLA_PUT_U32(skb, TCA_ROUTE4_TO, id);
}
- if (f->handle&0x80000000) {
- if ((f->handle>>16) != 0xFFFF)
+ if (f->handle & 0x80000000) {
+ if ((f->handle >> 16) != 0xFFFF)
NLA_PUT_U32(skb, TCA_ROUTE4_IIF, f->iif);
} else {
- id = f->id>>16;
+ id = f->id >> 16;
NLA_PUT_U32(skb, TCA_ROUTE4_FROM, id);
}
if (f->res.classid)
diff --git a/net/sched/cls_rsvp.h b/net/sched/cls_rsvp.h
index 425a1790b04..402c44b241a 100644
--- a/net/sched/cls_rsvp.h
+++ b/net/sched/cls_rsvp.h
@@ -66,28 +66,25 @@
powerful classification engine. */
-struct rsvp_head
-{
+struct rsvp_head {
u32 tmap[256/32];
u32 hgenerator;
u8 tgenerator;
struct rsvp_session *ht[256];
};
-struct rsvp_session
-{
+struct rsvp_session {
struct rsvp_session *next;
__be32 dst[RSVP_DST_LEN];
struct tc_rsvp_gpi dpi;
u8 protocol;
u8 tunnelid;
/* 16 (src,sport) hash slots, and one wildcard source slot */
- struct rsvp_filter *ht[16+1];
+ struct rsvp_filter *ht[16 + 1];
};
-struct rsvp_filter
-{
+struct rsvp_filter {
struct rsvp_filter *next;
__be32 src[RSVP_DST_LEN];
struct tc_rsvp_gpi spi;
@@ -100,17 +97,19 @@ struct rsvp_filter
struct rsvp_session *sess;
};
-static __inline__ unsigned hash_dst(__be32 *dst, u8 protocol, u8 tunnelid)
+static inline unsigned int hash_dst(__be32 *dst, u8 protocol, u8 tunnelid)
{
- unsigned h = (__force __u32)dst[RSVP_DST_LEN-1];
+ unsigned int h = (__force __u32)dst[RSVP_DST_LEN - 1];
+
h ^= h>>16;
h ^= h>>8;
return (h ^ protocol ^ tunnelid) & 0xFF;
}
-static __inline__ unsigned hash_src(__be32 *src)
+static inline unsigned int hash_src(__be32 *src)
{
- unsigned h = (__force __u32)src[RSVP_DST_LEN-1];
+ unsigned int h = (__force __u32)src[RSVP_DST_LEN-1];
+
h ^= h>>16;
h ^= h>>8;
h ^= h>>4;
@@ -134,10 +133,10 @@ static struct tcf_ext_map rsvp_ext_map = {
static int rsvp_classify(struct sk_buff *skb, struct tcf_proto *tp,
struct tcf_result *res)
{
- struct rsvp_session **sht = ((struct rsvp_head*)tp->root)->ht;
+ struct rsvp_session **sht = ((struct rsvp_head *)tp->root)->ht;
struct rsvp_session *s;
struct rsvp_filter *f;
- unsigned h1, h2;
+ unsigned int h1, h2;
__be32 *dst, *src;
u8 protocol;
u8 tunnelid = 0;
@@ -162,13 +161,13 @@ restart:
src = &nhptr->saddr.s6_addr32[0];
dst = &nhptr->daddr.s6_addr32[0];
protocol = nhptr->nexthdr;
- xprt = ((u8*)nhptr) + sizeof(struct ipv6hdr);
+ xprt = ((u8 *)nhptr) + sizeof(struct ipv6hdr);
#else
src = &nhptr->saddr;
dst = &nhptr->daddr;
protocol = nhptr->protocol;
- xprt = ((u8*)nhptr) + (nhptr->ihl<<2);
- if (nhptr->frag_off & htons(IP_MF|IP_OFFSET))
+ xprt = ((u8 *)nhptr) + (nhptr->ihl<<2);
+ if (nhptr->frag_off & htons(IP_MF | IP_OFFSET))
return -1;
#endif
@@ -176,10 +175,10 @@ restart:
h2 = hash_src(src);
for (s = sht[h1]; s; s = s->next) {
- if (dst[RSVP_DST_LEN-1] == s->dst[RSVP_DST_LEN-1] &&
+ if (dst[RSVP_DST_LEN-1] == s->dst[RSVP_DST_LEN - 1] &&
protocol == s->protocol &&
!(s->dpi.mask &
- (*(u32*)(xprt+s->dpi.offset)^s->dpi.key)) &&
+ (*(u32 *)(xprt + s->dpi.offset) ^ s->dpi.key)) &&
#if RSVP_DST_LEN == 4
dst[0] == s->dst[0] &&
dst[1] == s->dst[1] &&
@@ -188,8 +187,8 @@ restart:
tunnelid == s->tunnelid) {
for (f = s->ht[h2]; f; f = f->next) {
- if (src[RSVP_DST_LEN-1] == f->src[RSVP_DST_LEN-1] &&
- !(f->spi.mask & (*(u32*)(xprt+f->spi.offset)^f->spi.key))
+ if (src[RSVP_DST_LEN-1] == f->src[RSVP_DST_LEN - 1] &&
+ !(f->spi.mask & (*(u32 *)(xprt + f->spi.offset) ^ f->spi.key))
#if RSVP_DST_LEN == 4
&&
src[0] == f->src[0] &&
@@ -205,7 +204,7 @@ matched:
return 0;
tunnelid = f->res.classid;
- nhptr = (void*)(xprt + f->tunnelhdr - sizeof(*nhptr));
+ nhptr = (void *)(xprt + f->tunnelhdr - sizeof(*nhptr));
goto restart;
}
}
@@ -224,11 +223,11 @@ matched:
static unsigned long rsvp_get(struct tcf_proto *tp, u32 handle)
{
- struct rsvp_session **sht = ((struct rsvp_head*)tp->root)->ht;
+ struct rsvp_session **sht = ((struct rsvp_head *)tp->root)->ht;
struct rsvp_session *s;
struct rsvp_filter *f;
- unsigned h1 = handle&0xFF;
- unsigned h2 = (handle>>8)&0xFF;
+ unsigned int h1 = handle & 0xFF;
+ unsigned int h2 = (handle >> 8) & 0xFF;
if (h2 > 16)
return 0;
@@ -258,7 +257,7 @@ static int rsvp_init(struct tcf_proto *tp)
return -ENOBUFS;
}
-static inline void
+static void
rsvp_delete_filter(struct tcf_proto *tp, struct rsvp_filter *f)
{
tcf_unbind_filter(tp, &f->res);
@@ -277,13 +276,13 @@ static void rsvp_destroy(struct tcf_proto *tp)
sht = data->ht;
- for (h1=0; h1<256; h1++) {
+ for (h1 = 0; h1 < 256; h1++) {
struct rsvp_session *s;
while ((s = sht[h1]) != NULL) {
sht[h1] = s->next;
- for (h2=0; h2<=16; h2++) {
+ for (h2 = 0; h2 <= 16; h2++) {
struct rsvp_filter *f;
while ((f = s->ht[h2]) != NULL) {
@@ -299,13 +298,13 @@ static void rsvp_destroy(struct tcf_proto *tp)
static int rsvp_delete(struct tcf_proto *tp, unsigned long arg)
{
- struct rsvp_filter **fp, *f = (struct rsvp_filter*)arg;
- unsigned h = f->handle;
+ struct rsvp_filter **fp, *f = (struct rsvp_filter *)arg;
+ unsigned int h = f->handle;
struct rsvp_session **sp;
struct rsvp_session *s = f->sess;
int i;
- for (fp = &s->ht[(h>>8)&0xFF]; *fp; fp = &(*fp)->next) {
+ for (fp = &s->ht[(h >> 8) & 0xFF]; *fp; fp = &(*fp)->next) {
if (*fp == f) {
tcf_tree_lock(tp);
*fp = f->next;
@@ -314,12 +313,12 @@ static int rsvp_delete(struct tcf_proto *tp, unsigned long arg)
/* Strip tree */
- for (i=0; i<=16; i++)
+ for (i = 0; i <= 16; i++)
if (s->ht[i])
return 0;
/* OK, session has no flows */
- for (sp = &((struct rsvp_head*)tp->root)->ht[h&0xFF];
+ for (sp = &((struct rsvp_head *)tp->root)->ht[h & 0xFF];
*sp; sp = &(*sp)->next) {
if (*sp == s) {
tcf_tree_lock(tp);
@@ -337,13 +336,14 @@ static int rsvp_delete(struct tcf_proto *tp, unsigned long arg)
return 0;
}
-static unsigned gen_handle(struct tcf_proto *tp, unsigned salt)
+static unsigned int gen_handle(struct tcf_proto *tp, unsigned salt)
{
struct rsvp_head *data = tp->root;
int i = 0xFFFF;
while (i-- > 0) {
u32 h;
+
if ((data->hgenerator += 0x10000) == 0)
data->hgenerator = 0x10000;
h = data->hgenerator|salt;
@@ -355,10 +355,10 @@ static unsigned gen_handle(struct tcf_proto *tp, unsigned salt)
static int tunnel_bts(struct rsvp_head *data)
{
- int n = data->tgenerator>>5;
- u32 b = 1<<(data->tgenerator&0x1F);
+ int n = data->tgenerator >> 5;
+ u32 b = 1 << (data->tgenerator & 0x1F);
- if (data->tmap[n]&b)
+ if (data->tmap[n] & b)
return 0;
data->tmap[n] |= b;
return 1;
@@ -372,10 +372,10 @@ static void tunnel_recycle(struct rsvp_head *data)
memset(tmap, 0, sizeof(tmap));
- for (h1=0; h1<256; h1++) {
+ for (h1 = 0; h1 < 256; h1++) {
struct rsvp_session *s;
for (s = sht[h1]; s; s = s->next) {
- for (h2=0; h2<=16; h2++) {
+ for (h2 = 0; h2 <= 16; h2++) {
struct rsvp_filter *f;
for (f = s->ht[h2]; f; f = f->next) {
@@ -395,8 +395,8 @@ static u32 gen_tunnel(struct rsvp_head *data)
{
int i, k;
- for (k=0; k<2; k++) {
- for (i=255; i>0; i--) {
+ for (k = 0; k < 2; k++) {
+ for (i = 255; i > 0; i--) {
if (++data->tgenerator == 0)
data->tgenerator = 1;
if (tunnel_bts(data))
@@ -428,7 +428,7 @@ static int rsvp_change(struct tcf_proto *tp, unsigned long base,
struct nlattr *opt = tca[TCA_OPTIONS-1];
struct nlattr *tb[TCA_RSVP_MAX + 1];
struct tcf_exts e;
- unsigned h1, h2;
+ unsigned int h1, h2;
__be32 *dst;
int err;
@@ -443,7 +443,8 @@ static int rsvp_change(struct tcf_proto *tp, unsigned long base,
if (err < 0)
return err;
- if ((f = (struct rsvp_filter*)*arg) != NULL) {
+ f = (struct rsvp_filter *)*arg;
+ if (f) {
/* Node exists: adjust only classid */
if (f->handle != handle && handle)
@@ -500,7 +501,7 @@ static int rsvp_change(struct tcf_proto *tp, unsigned long base,
goto errout;
}
- for (sp = &data->ht[h1]; (s=*sp) != NULL; sp = &s->next) {
+ for (sp = &data->ht[h1]; (s = *sp) != NULL; sp = &s->next) {
if (dst[RSVP_DST_LEN-1] == s->dst[RSVP_DST_LEN-1] &&
pinfo && pinfo->protocol == s->protocol &&
memcmp(&pinfo->dpi, &s->dpi, sizeof(s->dpi)) == 0 &&
@@ -523,7 +524,7 @@ insert:
tcf_exts_change(tp, &f->exts, &e);
for (fp = &s->ht[h2]; *fp; fp = &(*fp)->next)
- if (((*fp)->spi.mask&f->spi.mask) != f->spi.mask)
+ if (((*fp)->spi.mask & f->spi.mask) != f->spi.mask)
break;
f->next = *fp;
wmb();
@@ -567,7 +568,7 @@ errout2:
static void rsvp_walk(struct tcf_proto *tp, struct tcf_walker *arg)
{
struct rsvp_head *head = tp->root;
- unsigned h, h1;
+ unsigned int h, h1;
if (arg->stop)
return;
@@ -598,7 +599,7 @@ static void rsvp_walk(struct tcf_proto *tp, struct tcf_walker *arg)
static int rsvp_dump(struct tcf_proto *tp, unsigned long fh,
struct sk_buff *skb, struct tcmsg *t)
{
- struct rsvp_filter *f = (struct rsvp_filter*)fh;
+ struct rsvp_filter *f = (struct rsvp_filter *)fh;
struct rsvp_session *s;
unsigned char *b = skb_tail_pointer(skb);
struct nlattr *nest;
@@ -624,7 +625,7 @@ static int rsvp_dump(struct tcf_proto *tp, unsigned long fh,
NLA_PUT(skb, TCA_RSVP_PINFO, sizeof(pinfo), &pinfo);
if (f->res.classid)
NLA_PUT_U32(skb, TCA_RSVP_CLASSID, f->res.classid);
- if (((f->handle>>8)&0xFF) != 16)
+ if (((f->handle >> 8) & 0xFF) != 16)
NLA_PUT(skb, TCA_RSVP_SRC, sizeof(f->src), f->src);
if (tcf_exts_dump(skb, &f->exts, &rsvp_ext_map) < 0)
diff --git a/net/sched/cls_tcindex.c b/net/sched/cls_tcindex.c
index 20ef330bb91..36667fa6423 100644
--- a/net/sched/cls_tcindex.c
+++ b/net/sched/cls_tcindex.c
@@ -249,7 +249,7 @@ tcindex_set_parms(struct tcf_proto *tp, unsigned long base, u32 handle,
* of the hashing index is below the threshold.
*/
if ((cp.mask >> cp.shift) < PERFECT_HASH_THRESHOLD)
- cp.hash = (cp.mask >> cp.shift)+1;
+ cp.hash = (cp.mask >> cp.shift) + 1;
else
cp.hash = DEFAULT_HASH_SIZE;
}
diff --git a/net/sched/cls_u32.c b/net/sched/cls_u32.c
index b0c2a82178a..966920c14e7 100644
--- a/net/sched/cls_u32.c
+++ b/net/sched/cls_u32.c
@@ -42,8 +42,7 @@
#include <net/act_api.h>
#include <net/pkt_cls.h>
-struct tc_u_knode
-{
+struct tc_u_knode {
struct tc_u_knode *next;
u32 handle;
struct tc_u_hnode *ht_up;
@@ -63,19 +62,17 @@ struct tc_u_knode
struct tc_u32_sel sel;
};
-struct tc_u_hnode
-{
+struct tc_u_hnode {
struct tc_u_hnode *next;
u32 handle;
u32 prio;
struct tc_u_common *tp_c;
int refcnt;
- unsigned divisor;
+ unsigned int divisor;
struct tc_u_knode *ht[1];
};
-struct tc_u_common
-{
+struct tc_u_common {
struct tc_u_hnode *hlist;
struct Qdisc *q;
int refcnt;
@@ -87,9 +84,11 @@ static const struct tcf_ext_map u32_ext_map = {
.police = TCA_U32_POLICE
};
-static __inline__ unsigned u32_hash_fold(__be32 key, struct tc_u32_sel *sel, u8 fshift)
+static inline unsigned int u32_hash_fold(__be32 key,
+ const struct tc_u32_sel *sel,
+ u8 fshift)
{
- unsigned h = ntohl(key & sel->hmask)>>fshift;
+ unsigned int h = ntohl(key & sel->hmask) >> fshift;
return h;
}
@@ -101,7 +100,7 @@ static int u32_classify(struct sk_buff *skb, struct tcf_proto *tp, struct tcf_re
unsigned int off;
} stack[TC_U32_MAXDEPTH];
- struct tc_u_hnode *ht = (struct tc_u_hnode*)tp->root;
+ struct tc_u_hnode *ht = (struct tc_u_hnode *)tp->root;
unsigned int off = skb_network_offset(skb);
struct tc_u_knode *n;
int sdepth = 0;
@@ -120,7 +119,7 @@ next_knode:
struct tc_u32_key *key = n->sel.keys;
#ifdef CONFIG_CLS_U32_PERF
- n->pf->rcnt +=1;
+ n->pf->rcnt += 1;
j = 0;
#endif
@@ -133,7 +132,7 @@ next_knode:
}
#endif
- for (i = n->sel.nkeys; i>0; i--, key++) {
+ for (i = n->sel.nkeys; i > 0; i--, key++) {
int toff = off + key->off + (off2 & key->offmask);
__be32 *data, _data;
@@ -148,13 +147,13 @@ next_knode:
goto next_knode;
}
#ifdef CONFIG_CLS_U32_PERF
- n->pf->kcnts[j] +=1;
+ n->pf->kcnts[j] += 1;
j++;
#endif
}
if (n->ht_down == NULL) {
check_terminal:
- if (n->sel.flags&TC_U32_TERMINAL) {
+ if (n->sel.flags & TC_U32_TERMINAL) {
*res = n->res;
#ifdef CONFIG_NET_CLS_IND
@@ -164,7 +163,7 @@ check_terminal:
}
#endif
#ifdef CONFIG_CLS_U32_PERF
- n->pf->rhit +=1;
+ n->pf->rhit += 1;
#endif
r = tcf_exts_exec(skb, &n->exts, res);
if (r < 0) {
@@ -197,10 +196,10 @@ check_terminal:
sel = ht->divisor & u32_hash_fold(*data, &n->sel,
n->fshift);
}
- if (!(n->sel.flags&(TC_U32_VAROFFSET|TC_U32_OFFSET|TC_U32_EAT)))
+ if (!(n->sel.flags & (TC_U32_VAROFFSET | TC_U32_OFFSET | TC_U32_EAT)))
goto next_ht;
- if (n->sel.flags&(TC_U32_OFFSET|TC_U32_VAROFFSET)) {
+ if (n->sel.flags & (TC_U32_OFFSET | TC_U32_VAROFFSET)) {
off2 = n->sel.off + 3;
if (n->sel.flags & TC_U32_VAROFFSET) {
__be16 *data, _data;
@@ -215,7 +214,7 @@ check_terminal:
}
off2 &= ~3;
}
- if (n->sel.flags&TC_U32_EAT) {
+ if (n->sel.flags & TC_U32_EAT) {
off += off2;
off2 = 0;
}
@@ -236,11 +235,11 @@ out:
deadloop:
if (net_ratelimit())
- printk(KERN_WARNING "cls_u32: dead loop\n");
+ pr_warning("cls_u32: dead loop\n");
return -1;
}
-static __inline__ struct tc_u_hnode *
+static struct tc_u_hnode *
u32_lookup_ht(struct tc_u_common *tp_c, u32 handle)
{
struct tc_u_hnode *ht;
@@ -252,10 +251,10 @@ u32_lookup_ht(struct tc_u_common *tp_c, u32 handle)
return ht;
}
-static __inline__ struct tc_u_knode *
+static struct tc_u_knode *
u32_lookup_key(struct tc_u_hnode *ht, u32 handle)
{
- unsigned sel;
+ unsigned int sel;
struct tc_u_knode *n = NULL;
sel = TC_U32_HASH(handle);
@@ -300,7 +299,7 @@ static u32 gen_new_htid(struct tc_u_common *tp_c)
do {
if (++tp_c->hgenerator == 0x7FF)
tp_c->hgenerator = 1;
- } while (--i>0 && u32_lookup_ht(tp_c, (tp_c->hgenerator|0x800)<<20));
+ } while (--i > 0 && u32_lookup_ht(tp_c, (tp_c->hgenerator|0x800)<<20));
return i > 0 ? (tp_c->hgenerator|0x800)<<20 : 0;
}
@@ -378,9 +377,9 @@ static int u32_delete_key(struct tcf_proto *tp, struct tc_u_knode* key)
static void u32_clear_hnode(struct tcf_proto *tp, struct tc_u_hnode *ht)
{
struct tc_u_knode *n;
- unsigned h;
+ unsigned int h;
- for (h=0; h<=ht->divisor; h++) {
+ for (h = 0; h <= ht->divisor; h++) {
while ((n = ht->ht[h]) != NULL) {
ht->ht[h] = n->next;
@@ -446,13 +445,13 @@ static void u32_destroy(struct tcf_proto *tp)
static int u32_delete(struct tcf_proto *tp, unsigned long arg)
{
- struct tc_u_hnode *ht = (struct tc_u_hnode*)arg;
+ struct tc_u_hnode *ht = (struct tc_u_hnode *)arg;
if (ht == NULL)
return 0;
if (TC_U32_KEY(ht->handle))
- return u32_delete_key(tp, (struct tc_u_knode*)ht);
+ return u32_delete_key(tp, (struct tc_u_knode *)ht);
if (tp->root == ht)
return -EINVAL;
@@ -470,14 +469,14 @@ static int u32_delete(struct tcf_proto *tp, unsigned long arg)
static u32 gen_new_kid(struct tc_u_hnode *ht, u32 handle)
{
struct tc_u_knode *n;
- unsigned i = 0x7FF;
+ unsigned int i = 0x7FF;
- for (n=ht->ht[TC_U32_HASH(handle)]; n; n = n->next)
+ for (n = ht->ht[TC_U32_HASH(handle)]; n; n = n->next)
if (i < TC_U32_NODE(n->handle))
i = TC_U32_NODE(n->handle);
i++;
- return handle|(i>0xFFF ? 0xFFF : i);
+ return handle | (i > 0xFFF ? 0xFFF : i);
}
static const struct nla_policy u32_policy[TCA_U32_MAX + 1] = {
@@ -566,7 +565,8 @@ static int u32_change(struct tcf_proto *tp, unsigned long base, u32 handle,
if (err < 0)
return err;
- if ((n = (struct tc_u_knode*)*arg) != NULL) {
+ n = (struct tc_u_knode *)*arg;
+ if (n) {
if (TC_U32_KEY(n->handle) == 0)
return -EINVAL;
@@ -574,7 +574,7 @@ static int u32_change(struct tcf_proto *tp, unsigned long base, u32 handle,
}
if (tb[TCA_U32_DIVISOR]) {
- unsigned divisor = nla_get_u32(tb[TCA_U32_DIVISOR]);
+ unsigned int divisor = nla_get_u32(tb[TCA_U32_DIVISOR]);
if (--divisor > 0x100)
return -EINVAL;
@@ -585,7 +585,7 @@ static int u32_change(struct tcf_proto *tp, unsigned long base, u32 handle,
if (handle == 0)
return -ENOMEM;
}
- ht = kzalloc(sizeof(*ht) + divisor*sizeof(void*), GFP_KERNEL);
+ ht = kzalloc(sizeof(*ht) + divisor*sizeof(void *), GFP_KERNEL);
if (ht == NULL)
return -ENOBUFS;
ht->tp_c = tp_c;
@@ -683,7 +683,7 @@ static void u32_walk(struct tcf_proto *tp, struct tcf_walker *arg)
struct tc_u_common *tp_c = tp->data;
struct tc_u_hnode *ht;
struct tc_u_knode *n;
- unsigned h;
+ unsigned int h;
if (arg->stop)
return;
@@ -717,7 +717,7 @@ static void u32_walk(struct tcf_proto *tp, struct tcf_walker *arg)
static int u32_dump(struct tcf_proto *tp, unsigned long fh,
struct sk_buff *skb, struct tcmsg *t)
{
- struct tc_u_knode *n = (struct tc_u_knode*)fh;
+ struct tc_u_knode *n = (struct tc_u_knode *)fh;
struct nlattr *nest;
if (n == NULL)
@@ -730,8 +730,9 @@ static int u32_dump(struct tcf_proto *tp, unsigned long fh,
goto nla_put_failure;
if (TC_U32_KEY(n->handle) == 0) {
- struct tc_u_hnode *ht = (struct tc_u_hnode*)fh;
- u32 divisor = ht->divisor+1;
+ struct tc_u_hnode *ht = (struct tc_u_hnode *)fh;
+ u32 divisor = ht->divisor + 1;
+
NLA_PUT_U32(skb, TCA_U32_DIVISOR, divisor);
} else {
NLA_PUT(skb, TCA_U32_SEL,
@@ -755,7 +756,7 @@ static int u32_dump(struct tcf_proto *tp, unsigned long fh,
goto nla_put_failure;
#ifdef CONFIG_NET_CLS_IND
- if(strlen(n->indev))
+ if (strlen(n->indev))
NLA_PUT_STRING(skb, TCA_U32_INDEV, n->indev);
#endif
#ifdef CONFIG_CLS_U32_PERF
diff --git a/net/sched/em_cmp.c b/net/sched/em_cmp.c
index bc450397487..1c8360a2752 100644
--- a/net/sched/em_cmp.c
+++ b/net/sched/em_cmp.c
@@ -33,40 +33,41 @@ static int em_cmp_match(struct sk_buff *skb, struct tcf_ematch *em,
return 0;
switch (cmp->align) {
- case TCF_EM_ALIGN_U8:
- val = *ptr;
- break;
+ case TCF_EM_ALIGN_U8:
+ val = *ptr;
+ break;
- case TCF_EM_ALIGN_U16:
- val = get_unaligned_be16(ptr);
+ case TCF_EM_ALIGN_U16:
+ val = get_unaligned_be16(ptr);
- if (cmp_needs_transformation(cmp))
- val = be16_to_cpu(val);
- break;
+ if (cmp_needs_transformation(cmp))
+ val = be16_to_cpu(val);
+ break;
- case TCF_EM_ALIGN_U32:
- /* Worth checking boundries? The branching seems
- * to get worse. Visit again. */
- val = get_unaligned_be32(ptr);
+ case TCF_EM_ALIGN_U32:
+ /* Worth checking boundries? The branching seems
+ * to get worse. Visit again.
+ */
+ val = get_unaligned_be32(ptr);
- if (cmp_needs_transformation(cmp))
- val = be32_to_cpu(val);
- break;
+ if (cmp_needs_transformation(cmp))
+ val = be32_to_cpu(val);
+ break;
- default:
- return 0;
+ default:
+ return 0;
}
if (cmp->mask)
val &= cmp->mask;
switch (cmp->opnd) {
- case TCF_EM_OPND_EQ:
- return val == cmp->val;
- case TCF_EM_OPND_LT:
- return val < cmp->val;
- case TCF_EM_OPND_GT:
- return val > cmp->val;
+ case TCF_EM_OPND_EQ:
+ return val == cmp->val;
+ case TCF_EM_OPND_LT:
+ return val < cmp->val;
+ case TCF_EM_OPND_GT:
+ return val > cmp->val;
}
return 0;
diff --git a/net/sched/em_meta.c b/net/sched/em_meta.c
index 34da5e29ea1..a889d099320 100644
--- a/net/sched/em_meta.c
+++ b/net/sched/em_meta.c
@@ -73,21 +73,18 @@
#include <net/pkt_cls.h>
#include <net/sock.h>
-struct meta_obj
-{
+struct meta_obj {
unsigned long value;
unsigned int len;
};
-struct meta_value
-{
+struct meta_value {
struct tcf_meta_val hdr;
unsigned long val;
unsigned int len;
};
-struct meta_match
-{
+struct meta_match {
struct meta_value lvalue;
struct meta_value rvalue;
};
@@ -255,7 +252,7 @@ META_COLLECTOR(int_rtclassid)
if (unlikely(skb_dst(skb) == NULL))
*err = -1;
else
-#ifdef CONFIG_NET_CLS_ROUTE
+#ifdef CONFIG_IP_ROUTE_CLASSID
dst->value = skb_dst(skb)->tclassid;
#else
dst->value = 0;
@@ -483,8 +480,7 @@ META_COLLECTOR(int_sk_write_pend)
* Meta value collectors assignment table
**************************************************************************/
-struct meta_ops
-{
+struct meta_ops {
void (*get)(struct sk_buff *, struct tcf_pkt_info *,
struct meta_value *, struct meta_obj *, int *);
};
@@ -494,7 +490,7 @@ struct meta_ops
/* Meta value operations table listing all meta value collectors and
* assigns them to a type and meta id. */
-static struct meta_ops __meta_ops[TCF_META_TYPE_MAX+1][TCF_META_ID_MAX+1] = {
+static struct meta_ops __meta_ops[TCF_META_TYPE_MAX + 1][TCF_META_ID_MAX + 1] = {
[TCF_META_TYPE_VAR] = {
[META_ID(DEV)] = META_FUNC(var_dev),
[META_ID(SK_BOUND_IF)] = META_FUNC(var_sk_bound_if),
@@ -550,7 +546,7 @@ static struct meta_ops __meta_ops[TCF_META_TYPE_MAX+1][TCF_META_ID_MAX+1] = {
}
};
-static inline struct meta_ops * meta_ops(struct meta_value *val)
+static inline struct meta_ops *meta_ops(struct meta_value *val)
{
return &__meta_ops[meta_type(val)][meta_id(val)];
}
@@ -649,9 +645,8 @@ static int meta_int_dump(struct sk_buff *skb, struct meta_value *v, int tlv)
{
if (v->len == sizeof(unsigned long))
NLA_PUT(skb, tlv, sizeof(unsigned long), &v->val);
- else if (v->len == sizeof(u32)) {
+ else if (v->len == sizeof(u32))
NLA_PUT_U32(skb, tlv, v->val);
- }
return 0;
@@ -663,8 +658,7 @@ nla_put_failure:
* Type specific operations table
**************************************************************************/
-struct meta_type_ops
-{
+struct meta_type_ops {
void (*destroy)(struct meta_value *);
int (*compare)(struct meta_obj *, struct meta_obj *);
int (*change)(struct meta_value *, struct nlattr *);
@@ -672,7 +666,7 @@ struct meta_type_ops
int (*dump)(struct sk_buff *, struct meta_value *, int);
};
-static struct meta_type_ops __meta_type_ops[TCF_META_TYPE_MAX+1] = {
+static struct meta_type_ops __meta_type_ops[TCF_META_TYPE_MAX + 1] = {
[TCF_META_TYPE_VAR] = {
.destroy = meta_var_destroy,
.compare = meta_var_compare,
@@ -688,7 +682,7 @@ static struct meta_type_ops __meta_type_ops[TCF_META_TYPE_MAX+1] = {
}
};
-static inline struct meta_type_ops * meta_type_ops(struct meta_value *v)
+static inline struct meta_type_ops *meta_type_ops(struct meta_value *v)
{
return &__meta_type_ops[meta_type(v)];
}
@@ -713,7 +707,7 @@ static int meta_get(struct sk_buff *skb, struct tcf_pkt_info *info,
return err;
if (meta_type_ops(v)->apply_extras)
- meta_type_ops(v)->apply_extras(v, dst);
+ meta_type_ops(v)->apply_extras(v, dst);
return 0;
}
@@ -732,12 +726,12 @@ static int em_meta_match(struct sk_buff *skb, struct tcf_ematch *m,
r = meta_type_ops(&meta->lvalue)->compare(&l_value, &r_value);
switch (meta->lvalue.hdr.op) {
- case TCF_EM_OPND_EQ:
- return !r;
- case TCF_EM_OPND_LT:
- return r < 0;
- case TCF_EM_OPND_GT:
- return r > 0;
+ case TCF_EM_OPND_EQ:
+ return !r;
+ case TCF_EM_OPND_LT:
+ return r < 0;
+ case TCF_EM_OPND_GT:
+ return r > 0;
}
return 0;
@@ -771,7 +765,7 @@ static inline int meta_change_data(struct meta_value *dst, struct nlattr *nla)
static inline int meta_is_supported(struct meta_value *val)
{
- return (!meta_id(val) || meta_ops(val)->get);
+ return !meta_id(val) || meta_ops(val)->get;
}
static const struct nla_policy meta_policy[TCA_EM_META_MAX + 1] = {
diff --git a/net/sched/em_nbyte.c b/net/sched/em_nbyte.c
index 1a4176aee6e..a3bed07a008 100644
--- a/net/sched/em_nbyte.c
+++ b/net/sched/em_nbyte.c
@@ -18,8 +18,7 @@
#include <linux/tc_ematch/tc_em_nbyte.h>
#include <net/pkt_cls.h>
-struct nbyte_data
-{
+struct nbyte_data {
struct tcf_em_nbyte hdr;
char pattern[0];
};
diff --git a/net/sched/em_text.c b/net/sched/em_text.c
index ea8f566e720..15d353d2e4b 100644
--- a/net/sched/em_text.c
+++ b/net/sched/em_text.c
@@ -19,8 +19,7 @@
#include <linux/tc_ematch/tc_em_text.h>
#include <net/pkt_cls.h>
-struct text_match
-{
+struct text_match {
u16 from_offset;
u16 to_offset;
u8 from_layer;
diff --git a/net/sched/em_u32.c b/net/sched/em_u32.c
index 953f1479f7d..797bdb88c01 100644
--- a/net/sched/em_u32.c
+++ b/net/sched/em_u32.c
@@ -35,7 +35,7 @@ static int em_u32_match(struct sk_buff *skb, struct tcf_ematch *em,
if (!tcf_valid_offset(skb, ptr, sizeof(u32)))
return 0;
- return !(((*(__be32*) ptr) ^ key->val) & key->mask);
+ return !(((*(__be32 *) ptr) ^ key->val) & key->mask);
}
static struct tcf_ematch_ops em_u32_ops = {
diff --git a/net/sched/ematch.c b/net/sched/ematch.c
index 5e37da961f8..88d93eb9250 100644
--- a/net/sched/ematch.c
+++ b/net/sched/ematch.c
@@ -93,7 +93,7 @@
static LIST_HEAD(ematch_ops);
static DEFINE_RWLOCK(ematch_mod_lock);
-static inline struct tcf_ematch_ops * tcf_em_lookup(u16 kind)
+static struct tcf_ematch_ops *tcf_em_lookup(u16 kind)
{
struct tcf_ematch_ops *e = NULL;
@@ -163,8 +163,8 @@ void tcf_em_unregister(struct tcf_ematch_ops *ops)
}
EXPORT_SYMBOL(tcf_em_unregister);
-static inline struct tcf_ematch * tcf_em_get_match(struct tcf_ematch_tree *tree,
- int index)
+static inline struct tcf_ematch *tcf_em_get_match(struct tcf_ematch_tree *tree,
+ int index)
{
return &tree->matches[index];
}
@@ -184,7 +184,8 @@ static int tcf_em_validate(struct tcf_proto *tp,
if (em_hdr->kind == TCF_EM_CONTAINER) {
/* Special ematch called "container", carries an index
- * referencing an external ematch sequence. */
+ * referencing an external ematch sequence.
+ */
u32 ref;
if (data_len < sizeof(ref))
@@ -195,7 +196,8 @@ static int tcf_em_validate(struct tcf_proto *tp,
goto errout;
/* We do not allow backward jumps to avoid loops and jumps
- * to our own position are of course illegal. */
+ * to our own position are of course illegal.
+ */
if (ref <= idx)
goto errout;
@@ -208,7 +210,8 @@ static int tcf_em_validate(struct tcf_proto *tp,
* which automatically releases the reference again, therefore
* the module MUST not be given back under any circumstances
* here. Be aware, the destroy function assumes that the
- * module is held if the ops field is non zero. */
+ * module is held if the ops field is non zero.
+ */
em->ops = tcf_em_lookup(em_hdr->kind);
if (em->ops == NULL) {
@@ -221,7 +224,8 @@ static int tcf_em_validate(struct tcf_proto *tp,
if (em->ops) {
/* We dropped the RTNL mutex in order to
* perform the module load. Tell the caller
- * to replay the request. */
+ * to replay the request.
+ */
module_put(em->ops->owner);
err = -EAGAIN;
}
@@ -230,7 +234,8 @@ static int tcf_em_validate(struct tcf_proto *tp,
}
/* ematch module provides expected length of data, so we
- * can do a basic sanity check. */
+ * can do a basic sanity check.
+ */
if (em->ops->datalen && data_len < em->ops->datalen)
goto errout;
@@ -246,7 +251,8 @@ static int tcf_em_validate(struct tcf_proto *tp,
* TCF_EM_SIMPLE may be specified stating that the
* data only consists of a u32 integer and the module
* does not expected a memory reference but rather
- * the value carried. */
+ * the value carried.
+ */
if (em_hdr->flags & TCF_EM_SIMPLE) {
if (data_len < sizeof(u32))
goto errout;
@@ -334,7 +340,8 @@ int tcf_em_tree_validate(struct tcf_proto *tp, struct nlattr *nla,
* The array of rt attributes is parsed in the order as they are
* provided, their type must be incremental from 1 to n. Even
* if it does not serve any real purpose, a failure of sticking
- * to this policy will result in parsing failure. */
+ * to this policy will result in parsing failure.
+ */
for (idx = 0; nla_ok(rt_match, list_len); idx++) {
err = -EINVAL;
@@ -359,7 +366,8 @@ int tcf_em_tree_validate(struct tcf_proto *tp, struct nlattr *nla,
/* Check if the number of matches provided by userspace actually
* complies with the array of matches. The number was used for
* the validation of references and a mismatch could lead to
- * undefined references during the matching process. */
+ * undefined references during the matching process.
+ */
if (idx != tree_hdr->nmatches) {
err = -EINVAL;
goto errout_abort;
@@ -449,7 +457,7 @@ int tcf_em_tree_dump(struct sk_buff *skb, struct tcf_ematch_tree *tree, int tlv)
.flags = em->flags
};
- NLA_PUT(skb, i+1, sizeof(em_hdr), &em_hdr);
+ NLA_PUT(skb, i + 1, sizeof(em_hdr), &em_hdr);
if (em->ops && em->ops->dump) {
if (em->ops->dump(skb, em) < 0)
@@ -478,6 +486,7 @@ static inline int tcf_em_match(struct sk_buff *skb, struct tcf_ematch *em,
struct tcf_pkt_info *info)
{
int r = em->ops->match(skb, em, info);
+
return tcf_em_is_inverted(em) ? !r : r;
}
@@ -527,8 +536,8 @@ pop_stack:
stack_overflow:
if (net_ratelimit())
- printk(KERN_WARNING "tc ematch: local stack overflow,"
- " increase NET_EMATCH_STACK\n");
+ pr_warning("tc ematch: local stack overflow,"
+ " increase NET_EMATCH_STACK\n");
return -1;
}
EXPORT_SYMBOL(__tcf_em_tree_match);
diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c
index b22ca2d1ceb..15074157940 100644
--- a/net/sched/sch_api.c
+++ b/net/sched/sch_api.c
@@ -187,7 +187,7 @@ int unregister_qdisc(struct Qdisc_ops *qops)
int err = -ENOENT;
write_lock(&qdisc_mod_lock);
- for (qp = &qdisc_base; (q=*qp)!=NULL; qp = &q->next)
+ for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
if (q == qops)
break;
if (q) {
@@ -321,7 +321,9 @@ void qdisc_put_rtab(struct qdisc_rate_table *tab)
if (!tab || --tab->refcnt)
return;
- for (rtabp = &qdisc_rtab_list; (rtab=*rtabp) != NULL; rtabp = &rtab->next) {
+ for (rtabp = &qdisc_rtab_list;
+ (rtab = *rtabp) != NULL;
+ rtabp = &rtab->next) {
if (rtab == tab) {
*rtabp = rtab->next;
kfree(rtab);
@@ -396,6 +398,11 @@ static struct qdisc_size_table *qdisc_get_stab(struct nlattr *opt)
return stab;
}
+static void stab_kfree_rcu(struct rcu_head *head)
+{
+ kfree(container_of(head, struct qdisc_size_table, rcu));
+}
+
void qdisc_put_stab(struct qdisc_size_table *tab)
{
if (!tab)
@@ -405,7 +412,7 @@ void qdisc_put_stab(struct qdisc_size_table *tab)
if (--tab->refcnt == 0) {
list_del(&tab->list);
- kfree(tab);
+ call_rcu_bh(&tab->rcu, stab_kfree_rcu);
}
spin_unlock(&qdisc_stab_lock);
@@ -428,7 +435,7 @@ nla_put_failure:
return -1;
}
-void qdisc_calculate_pkt_len(struct sk_buff *skb, struct qdisc_size_table *stab)
+void __qdisc_calculate_pkt_len(struct sk_buff *skb, const struct qdisc_size_table *stab)
{
int pkt_len, slot;
@@ -454,14 +461,13 @@ out:
pkt_len = 1;
qdisc_skb_cb(skb)->pkt_len = pkt_len;
}
-EXPORT_SYMBOL(qdisc_calculate_pkt_len);
+EXPORT_SYMBOL(__qdisc_calculate_pkt_len);
void qdisc_warn_nonwc(char *txt, struct Qdisc *qdisc)
{
if (!(qdisc->flags & TCQ_F_WARN_NONWC)) {
- printk(KERN_WARNING
- "%s: %s qdisc %X: is non-work-conserving?\n",
- txt, qdisc->ops->id, qdisc->handle >> 16);
+ pr_warn("%s: %s qdisc %X: is non-work-conserving?\n",
+ txt, qdisc->ops->id, qdisc->handle >> 16);
qdisc->flags |= TCQ_F_WARN_NONWC;
}
}
@@ -472,7 +478,7 @@ static enum hrtimer_restart qdisc_watchdog(struct hrtimer *timer)
struct qdisc_watchdog *wd = container_of(timer, struct qdisc_watchdog,
timer);
- wd->qdisc->flags &= ~TCQ_F_THROTTLED;
+ qdisc_unthrottled(wd->qdisc);
__netif_schedule(qdisc_root(wd->qdisc));
return HRTIMER_NORESTART;
@@ -494,7 +500,7 @@ void qdisc_watchdog_schedule(struct qdisc_watchdog *wd, psched_time_t expires)
&qdisc_root_sleeping(wd->qdisc)->state))
return;
- wd->qdisc->flags |= TCQ_F_THROTTLED;
+ qdisc_throttled(wd->qdisc);
time = ktime_set(0, 0);
time = ktime_add_ns(time, PSCHED_TICKS2NS(expires));
hrtimer_start(&wd->timer, time, HRTIMER_MODE_ABS);
@@ -504,7 +510,7 @@ EXPORT_SYMBOL(qdisc_watchdog_schedule);
void qdisc_watchdog_cancel(struct qdisc_watchdog *wd)
{
hrtimer_cancel(&wd->timer);
- wd->qdisc->flags &= ~TCQ_F_THROTTLED;
+ qdisc_unthrottled(wd->qdisc);
}
EXPORT_SYMBOL(qdisc_watchdog_cancel);
@@ -625,7 +631,7 @@ static u32 qdisc_alloc_handle(struct net_device *dev)
autohandle = TC_H_MAKE(0x80000000U, 0);
} while (qdisc_lookup(dev, autohandle) && --i > 0);
- return i>0 ? autohandle : 0;
+ return i > 0 ? autohandle : 0;
}
void qdisc_tree_decrease_qlen(struct Qdisc *sch, unsigned int n)
@@ -834,7 +840,7 @@ qdisc_create(struct net_device *dev, struct netdev_queue *dev_queue,
err = PTR_ERR(stab);
goto err_out4;
}
- sch->stab = stab;
+ rcu_assign_pointer(sch->stab, stab);
}
if (tca[TCA_RATE]) {
spinlock_t *root_lock;
@@ -874,7 +880,7 @@ err_out4:
* Any broken qdiscs that would require a ops->reset() here?
* The qdisc was never in action so it shouldn't be necessary.
*/
- qdisc_put_stab(sch->stab);
+ qdisc_put_stab(rtnl_dereference(sch->stab));
if (ops->destroy)
ops->destroy(sch);
goto err_out3;
@@ -882,7 +888,7 @@ err_out4:
static int qdisc_change(struct Qdisc *sch, struct nlattr **tca)
{
- struct qdisc_size_table *stab = NULL;
+ struct qdisc_size_table *ostab, *stab = NULL;
int err = 0;
if (tca[TCA_OPTIONS]) {
@@ -899,8 +905,9 @@ static int qdisc_change(struct Qdisc *sch, struct nlattr **tca)
return PTR_ERR(stab);
}
- qdisc_put_stab(sch->stab);
- sch->stab = stab;
+ ostab = rtnl_dereference(sch->stab);
+ rcu_assign_pointer(sch->stab, stab);
+ qdisc_put_stab(ostab);
if (tca[TCA_RATE]) {
/* NB: ignores errors from replace_estimator
@@ -915,9 +922,8 @@ out:
return 0;
}
-struct check_loop_arg
-{
- struct qdisc_walker w;
+struct check_loop_arg {
+ struct qdisc_walker w;
struct Qdisc *p;
int depth;
};
@@ -970,7 +976,8 @@ static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
struct Qdisc *p = NULL;
int err;
- if ((dev = __dev_get_by_index(net, tcm->tcm_ifindex)) == NULL)
+ dev = __dev_get_by_index(net, tcm->tcm_ifindex);
+ if (!dev)
return -ENODEV;
err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL);
@@ -980,12 +987,12 @@ static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
if (clid) {
if (clid != TC_H_ROOT) {
if (TC_H_MAJ(clid) != TC_H_MAJ(TC_H_INGRESS)) {
- if ((p = qdisc_lookup(dev, TC_H_MAJ(clid))) == NULL)
+ p = qdisc_lookup(dev, TC_H_MAJ(clid));
+ if (!p)
return -ENOENT;
q = qdisc_leaf(p, clid);
- } else { /* ingress */
- if (dev_ingress_queue(dev))
- q = dev_ingress_queue(dev)->qdisc_sleeping;
+ } else if (dev_ingress_queue(dev)) {
+ q = dev_ingress_queue(dev)->qdisc_sleeping;
}
} else {
q = dev->qdisc;
@@ -996,7 +1003,8 @@ static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
if (tcm->tcm_handle && q->handle != tcm->tcm_handle)
return -EINVAL;
} else {
- if ((q = qdisc_lookup(dev, tcm->tcm_handle)) == NULL)
+ q = qdisc_lookup(dev, tcm->tcm_handle);
+ if (!q)
return -ENOENT;
}
@@ -1008,7 +1016,8 @@ static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
return -EINVAL;
if (q->handle == 0)
return -ENOENT;
- if ((err = qdisc_graft(dev, p, skb, n, clid, NULL, q)) != 0)
+ err = qdisc_graft(dev, p, skb, n, clid, NULL, q);
+ if (err != 0)
return err;
} else {
qdisc_notify(net, skb, n, clid, NULL, q);
@@ -1017,7 +1026,7 @@ static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
}
/*
- Create/change qdisc.
+ * Create/change qdisc.
*/
static int tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
@@ -1036,7 +1045,8 @@ replay:
clid = tcm->tcm_parent;
q = p = NULL;
- if ((dev = __dev_get_by_index(net, tcm->tcm_ifindex)) == NULL)
+ dev = __dev_get_by_index(net, tcm->tcm_ifindex);
+ if (!dev)
return -ENODEV;
err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL);
@@ -1046,12 +1056,12 @@ replay:
if (clid) {
if (clid != TC_H_ROOT) {
if (clid != TC_H_INGRESS) {
- if ((p = qdisc_lookup(dev, TC_H_MAJ(clid))) == NULL)
+ p = qdisc_lookup(dev, TC_H_MAJ(clid));
+ if (!p)
return -ENOENT;
q = qdisc_leaf(p, clid);
- } else { /* ingress */
- if (dev_ingress_queue_create(dev))
- q = dev_ingress_queue(dev)->qdisc_sleeping;
+ } else if (dev_ingress_queue_create(dev)) {
+ q = dev_ingress_queue(dev)->qdisc_sleeping;
}
} else {
q = dev->qdisc;
@@ -1063,13 +1073,14 @@ replay:
if (!q || !tcm->tcm_handle || q->handle != tcm->tcm_handle) {
if (tcm->tcm_handle) {
- if (q && !(n->nlmsg_flags&NLM_F_REPLACE))
+ if (q && !(n->nlmsg_flags & NLM_F_REPLACE))
return -EEXIST;
if (TC_H_MIN(tcm->tcm_handle))
return -EINVAL;
- if ((q = qdisc_lookup(dev, tcm->tcm_handle)) == NULL)
+ q = qdisc_lookup(dev, tcm->tcm_handle);
+ if (!q)
goto create_n_graft;
- if (n->nlmsg_flags&NLM_F_EXCL)
+ if (n->nlmsg_flags & NLM_F_EXCL)
return -EEXIST;
if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id))
return -EINVAL;
@@ -1079,7 +1090,7 @@ replay:
atomic_inc(&q->refcnt);
goto graft;
} else {
- if (q == NULL)
+ if (!q)
goto create_n_graft;
/* This magic test requires explanation.
@@ -1101,9 +1112,9 @@ replay:
* For now we select create/graft, if
* user gave KIND, which does not match existing.
*/
- if ((n->nlmsg_flags&NLM_F_CREATE) &&
- (n->nlmsg_flags&NLM_F_REPLACE) &&
- ((n->nlmsg_flags&NLM_F_EXCL) ||
+ if ((n->nlmsg_flags & NLM_F_CREATE) &&
+ (n->nlmsg_flags & NLM_F_REPLACE) &&
+ ((n->nlmsg_flags & NLM_F_EXCL) ||
(tca[TCA_KIND] &&
nla_strcmp(tca[TCA_KIND], q->ops->id))))
goto create_n_graft;
@@ -1118,7 +1129,7 @@ replay:
/* Change qdisc parameters */
if (q == NULL)
return -ENOENT;
- if (n->nlmsg_flags&NLM_F_EXCL)
+ if (n->nlmsg_flags & NLM_F_EXCL)
return -EEXIST;
if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id))
return -EINVAL;
@@ -1128,7 +1139,7 @@ replay:
return err;
create_n_graft:
- if (!(n->nlmsg_flags&NLM_F_CREATE))
+ if (!(n->nlmsg_flags & NLM_F_CREATE))
return -ENOENT;
if (clid == TC_H_INGRESS) {
if (dev_ingress_queue(dev))
@@ -1175,6 +1186,7 @@ static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,
struct nlmsghdr *nlh;
unsigned char *b = skb_tail_pointer(skb);
struct gnet_dump d;
+ struct qdisc_size_table *stab;
nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*tcm), flags);
tcm = NLMSG_DATA(nlh);
@@ -1190,7 +1202,8 @@ static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,
goto nla_put_failure;
q->qstats.qlen = q->q.qlen;
- if (q->stab && qdisc_dump_stab(skb, q->stab) < 0)
+ stab = rtnl_dereference(q->stab);
+ if (stab && qdisc_dump_stab(skb, stab) < 0)
goto nla_put_failure;
if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
@@ -1234,16 +1247,19 @@ static int qdisc_notify(struct net *net, struct sk_buff *oskb,
return -ENOBUFS;
if (old && !tc_qdisc_dump_ignore(old)) {
- if (tc_fill_qdisc(skb, old, clid, pid, n->nlmsg_seq, 0, RTM_DELQDISC) < 0)
+ if (tc_fill_qdisc(skb, old, clid, pid, n->nlmsg_seq,
+ 0, RTM_DELQDISC) < 0)
goto err_out;
}
if (new && !tc_qdisc_dump_ignore(new)) {
- if (tc_fill_qdisc(skb, new, clid, pid, n->nlmsg_seq, old ? NLM_F_REPLACE : 0, RTM_NEWQDISC) < 0)
+ if (tc_fill_qdisc(skb, new, clid, pid, n->nlmsg_seq,
+ old ? NLM_F_REPLACE : 0, RTM_NEWQDISC) < 0)
goto err_out;
}
if (skb->len)
- return rtnetlink_send(skb, net, pid, RTNLGRP_TC, n->nlmsg_flags&NLM_F_ECHO);
+ return rtnetlink_send(skb, net, pid, RTNLGRP_TC,
+ n->nlmsg_flags & NLM_F_ECHO);
err_out:
kfree_skb(skb);
@@ -1275,7 +1291,7 @@ static int tc_dump_qdisc_root(struct Qdisc *root, struct sk_buff *skb,
q_idx++;
continue;
}
- if (!tc_qdisc_dump_ignore(q) &&
+ if (!tc_qdisc_dump_ignore(q) &&
tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).pid,
cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWQDISC) <= 0)
goto done;
@@ -1356,7 +1372,8 @@ static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
u32 qid = TC_H_MAJ(clid);
int err;
- if ((dev = __dev_get_by_index(net, tcm->tcm_ifindex)) == NULL)
+ dev = __dev_get_by_index(net, tcm->tcm_ifindex);
+ if (!dev)
return -ENODEV;
err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL);
@@ -1391,9 +1408,9 @@ static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
qid = dev->qdisc->handle;
/* Now qid is genuine qdisc handle consistent
- both with parent and child.
-
- TC_H_MAJ(pid) still may be unspecified, complete it now.
+ * both with parent and child.
+ *
+ * TC_H_MAJ(pid) still may be unspecified, complete it now.
*/
if (pid)
pid = TC_H_MAKE(qid, pid);
@@ -1403,7 +1420,8 @@ static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
}
/* OK. Locate qdisc */
- if ((q = qdisc_lookup(dev, qid)) == NULL)
+ q = qdisc_lookup(dev, qid);
+ if (!q)
return -ENOENT;
/* An check that it supports classes */
@@ -1423,13 +1441,14 @@ static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
if (cl == 0) {
err = -ENOENT;
- if (n->nlmsg_type != RTM_NEWTCLASS || !(n->nlmsg_flags&NLM_F_CREATE))
+ if (n->nlmsg_type != RTM_NEWTCLASS ||
+ !(n->nlmsg_flags & NLM_F_CREATE))
goto out;
} else {
switch (n->nlmsg_type) {
case RTM_NEWTCLASS:
err = -EEXIST;
- if (n->nlmsg_flags&NLM_F_EXCL)
+ if (n->nlmsg_flags & NLM_F_EXCL)
goto out;
break;
case RTM_DELTCLASS:
@@ -1521,14 +1540,14 @@ static int tclass_notify(struct net *net, struct sk_buff *oskb,
return -EINVAL;
}
- return rtnetlink_send(skb, net, pid, RTNLGRP_TC, n->nlmsg_flags&NLM_F_ECHO);
+ return rtnetlink_send(skb, net, pid, RTNLGRP_TC,
+ n->nlmsg_flags & NLM_F_ECHO);
}
-struct qdisc_dump_args
-{
- struct qdisc_walker w;
- struct sk_buff *skb;
- struct netlink_callback *cb;
+struct qdisc_dump_args {
+ struct qdisc_walker w;
+ struct sk_buff *skb;
+ struct netlink_callback *cb;
};
static int qdisc_class_dump(struct Qdisc *q, unsigned long cl, struct qdisc_walker *arg)
@@ -1590,7 +1609,7 @@ static int tc_dump_tclass_root(struct Qdisc *root, struct sk_buff *skb,
static int tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb)
{
- struct tcmsg *tcm = (struct tcmsg*)NLMSG_DATA(cb->nlh);
+ struct tcmsg *tcm = (struct tcmsg *)NLMSG_DATA(cb->nlh);
struct net *net = sock_net(skb->sk);
struct netdev_queue *dev_queue;
struct net_device *dev;
@@ -1598,7 +1617,8 @@ static int tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb)
if (cb->nlh->nlmsg_len < NLMSG_LENGTH(sizeof(*tcm)))
return 0;
- if ((dev = dev_get_by_index(net, tcm->tcm_ifindex)) == NULL)
+ dev = dev_get_by_index(net, tcm->tcm_ifindex);
+ if (!dev)
return 0;
s_t = cb->args[0];
@@ -1621,19 +1641,22 @@ done:
}
/* Main classifier routine: scans classifier chain attached
- to this qdisc, (optionally) tests for protocol and asks
- specific classifiers.
+ * to this qdisc, (optionally) tests for protocol and asks
+ * specific classifiers.
*/
int tc_classify_compat(struct sk_buff *skb, struct tcf_proto *tp,
struct tcf_result *res)
{
__be16 protocol = skb->protocol;
- int err = 0;
+ int err;
for (; tp; tp = tp->next) {
- if ((tp->protocol == protocol ||
- tp->protocol == htons(ETH_P_ALL)) &&
- (err = tp->classify(skb, tp, res)) >= 0) {
+ if (tp->protocol != protocol &&
+ tp->protocol != htons(ETH_P_ALL))
+ continue;
+ err = tp->classify(skb, tp, res);
+
+ if (err >= 0) {
#ifdef CONFIG_NET_CLS_ACT
if (err != TC_ACT_RECLASSIFY && skb->tc_verd)
skb->tc_verd = SET_TC_VERD(skb->tc_verd, 0);
@@ -1664,11 +1687,11 @@ reclassify:
if (verd++ >= MAX_REC_LOOP) {
if (net_ratelimit())
- printk(KERN_NOTICE
- "%s: packet reclassify loop"
+ pr_notice("%s: packet reclassify loop"
" rule prio %u protocol %02x\n",
- tp->q->ops->id,
- tp->prio & 0xffff, ntohs(tp->protocol));
+ tp->q->ops->id,
+ tp->prio & 0xffff,
+ ntohs(tp->protocol));
return TC_ACT_SHOT;
}
skb->tc_verd = SET_TC_VERD(skb->tc_verd, verd);
@@ -1761,7 +1784,7 @@ static int __init pktsched_init(void)
err = register_pernet_subsys(&psched_net_ops);
if (err) {
- printk(KERN_ERR "pktsched_init: "
+ pr_err("pktsched_init: "
"cannot initialize per netns operations\n");
return err;
}
diff --git a/net/sched/sch_atm.c b/net/sched/sch_atm.c
index 943d733409d..3f08158b868 100644
--- a/net/sched/sch_atm.c
+++ b/net/sched/sch_atm.c
@@ -319,7 +319,7 @@ static int atm_tc_delete(struct Qdisc *sch, unsigned long arg)
* creation), and one for the reference held when calling delete.
*/
if (flow->ref < 2) {
- printk(KERN_ERR "atm_tc_delete: flow->ref == %d\n", flow->ref);
+ pr_err("atm_tc_delete: flow->ref == %d\n", flow->ref);
return -EINVAL;
}
if (flow->ref > 2)
@@ -384,12 +384,12 @@ static int atm_tc_enqueue(struct sk_buff *skb, struct Qdisc *sch)
}
}
flow = NULL;
- done:
- ;
+done:
+ ;
}
- if (!flow)
+ if (!flow) {
flow = &p->link;
- else {
+ } else {
if (flow->vcc)
ATM_SKB(skb)->atm_options = flow->vcc->atm_options;
/*@@@ looks good ... but it's not supposed to work :-) */
@@ -576,8 +576,7 @@ static void atm_tc_destroy(struct Qdisc *sch)
list_for_each_entry_safe(flow, tmp, &p->flows, list) {
if (flow->ref > 1)
- printk(KERN_ERR "atm_destroy: %p->ref = %d\n", flow,
- flow->ref);
+ pr_err("atm_destroy: %p->ref = %d\n", flow, flow->ref);
atm_tc_put(sch, (unsigned long)flow);
}
tasklet_kill(&p->task);
@@ -616,9 +615,8 @@ static int atm_tc_dump_class(struct Qdisc *sch, unsigned long cl,
}
if (flow->excess)
NLA_PUT_U32(skb, TCA_ATM_EXCESS, flow->classid);
- else {
+ else
NLA_PUT_U32(skb, TCA_ATM_EXCESS, 0);
- }
nla_nest_end(skb, nest);
return skb->len;
diff --git a/net/sched/sch_cbq.c b/net/sched/sch_cbq.c
index c80d1c210c5..24d94c097b3 100644
--- a/net/sched/sch_cbq.c
+++ b/net/sched/sch_cbq.c
@@ -72,8 +72,7 @@
struct cbq_sched_data;
-struct cbq_class
-{
+struct cbq_class {
struct Qdisc_class_common common;
struct cbq_class *next_alive; /* next class with backlog in this priority band */
@@ -139,19 +138,18 @@ struct cbq_class
int refcnt;
int filters;
- struct cbq_class *defaults[TC_PRIO_MAX+1];
+ struct cbq_class *defaults[TC_PRIO_MAX + 1];
};
-struct cbq_sched_data
-{
+struct cbq_sched_data {
struct Qdisc_class_hash clhash; /* Hash table of all classes */
- int nclasses[TC_CBQ_MAXPRIO+1];
- unsigned quanta[TC_CBQ_MAXPRIO+1];
+ int nclasses[TC_CBQ_MAXPRIO + 1];
+ unsigned int quanta[TC_CBQ_MAXPRIO + 1];
struct cbq_class link;
- unsigned activemask;
- struct cbq_class *active[TC_CBQ_MAXPRIO+1]; /* List of all classes
+ unsigned int activemask;
+ struct cbq_class *active[TC_CBQ_MAXPRIO + 1]; /* List of all classes
with backlog */
#ifdef CONFIG_NET_CLS_ACT
@@ -162,7 +160,7 @@ struct cbq_sched_data
int tx_len;
psched_time_t now; /* Cached timestamp */
psched_time_t now_rt; /* Cached real time */
- unsigned pmask;
+ unsigned int pmask;
struct hrtimer delay_timer;
struct qdisc_watchdog watchdog; /* Watchdog timer,
@@ -175,9 +173,9 @@ struct cbq_sched_data
};
-#define L2T(cl,len) qdisc_l2t((cl)->R_tab,len)
+#define L2T(cl, len) qdisc_l2t((cl)->R_tab, len)
-static __inline__ struct cbq_class *
+static inline struct cbq_class *
cbq_class_lookup(struct cbq_sched_data *q, u32 classid)
{
struct Qdisc_class_common *clc;
@@ -193,25 +191,27 @@ cbq_class_lookup(struct cbq_sched_data *q, u32 classid)
static struct cbq_class *
cbq_reclassify(struct sk_buff *skb, struct cbq_class *this)
{
- struct cbq_class *cl, *new;
+ struct cbq_class *cl;
- for (cl = this->tparent; cl; cl = cl->tparent)
- if ((new = cl->defaults[TC_PRIO_BESTEFFORT]) != NULL && new != this)
- return new;
+ for (cl = this->tparent; cl; cl = cl->tparent) {
+ struct cbq_class *new = cl->defaults[TC_PRIO_BESTEFFORT];
+ if (new != NULL && new != this)
+ return new;
+ }
return NULL;
}
#endif
/* Classify packet. The procedure is pretty complicated, but
- it allows us to combine link sharing and priority scheduling
- transparently.
-
- Namely, you can put link sharing rules (f.e. route based) at root of CBQ,
- so that it resolves to split nodes. Then packets are classified
- by logical priority, or a more specific classifier may be attached
- to the split node.
+ * it allows us to combine link sharing and priority scheduling
+ * transparently.
+ *
+ * Namely, you can put link sharing rules (f.e. route based) at root of CBQ,
+ * so that it resolves to split nodes. Then packets are classified
+ * by logical priority, or a more specific classifier may be attached
+ * to the split node.
*/
static struct cbq_class *
@@ -227,7 +227,7 @@ cbq_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr)
/*
* Step 1. If skb->priority points to one of our classes, use it.
*/
- if (TC_H_MAJ(prio^sch->handle) == 0 &&
+ if (TC_H_MAJ(prio ^ sch->handle) == 0 &&
(cl = cbq_class_lookup(q, prio)) != NULL)
return cl;
@@ -243,10 +243,11 @@ cbq_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr)
(result = tc_classify_compat(skb, head->filter_list, &res)) < 0)
goto fallback;
- if ((cl = (void*)res.class) == NULL) {
+ cl = (void *)res.class;
+ if (!cl) {
if (TC_H_MAJ(res.classid))
cl = cbq_class_lookup(q, res.classid);
- else if ((cl = defmap[res.classid&TC_PRIO_MAX]) == NULL)
+ else if ((cl = defmap[res.classid & TC_PRIO_MAX]) == NULL)
cl = defmap[TC_PRIO_BESTEFFORT];
if (cl == NULL || cl->level >= head->level)
@@ -282,7 +283,7 @@ fallback:
* Step 4. No success...
*/
if (TC_H_MAJ(prio) == 0 &&
- !(cl = head->defaults[prio&TC_PRIO_MAX]) &&
+ !(cl = head->defaults[prio & TC_PRIO_MAX]) &&
!(cl = head->defaults[TC_PRIO_BESTEFFORT]))
return head;
@@ -290,12 +291,12 @@ fallback:
}
/*
- A packet has just been enqueued on the empty class.
- cbq_activate_class adds it to the tail of active class list
- of its priority band.
+ * A packet has just been enqueued on the empty class.
+ * cbq_activate_class adds it to the tail of active class list
+ * of its priority band.
*/
-static __inline__ void cbq_activate_class(struct cbq_class *cl)
+static inline void cbq_activate_class(struct cbq_class *cl)
{
struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
int prio = cl->cpriority;
@@ -314,9 +315,9 @@ static __inline__ void cbq_activate_class(struct cbq_class *cl)
}
/*
- Unlink class from active chain.
- Note that this same procedure is done directly in cbq_dequeue*
- during round-robin procedure.
+ * Unlink class from active chain.
+ * Note that this same procedure is done directly in cbq_dequeue*
+ * during round-robin procedure.
*/
static void cbq_deactivate_class(struct cbq_class *this)
@@ -350,7 +351,7 @@ cbq_mark_toplevel(struct cbq_sched_data *q, struct cbq_class *cl)
{
int toplevel = q->toplevel;
- if (toplevel > cl->level && !(cl->q->flags&TCQ_F_THROTTLED)) {
+ if (toplevel > cl->level && !(qdisc_is_throttled(cl->q))) {
psched_time_t now;
psched_tdiff_t incr;
@@ -363,7 +364,7 @@ cbq_mark_toplevel(struct cbq_sched_data *q, struct cbq_class *cl)
q->toplevel = cl->level;
return;
}
- } while ((cl=cl->borrow) != NULL && toplevel > cl->level);
+ } while ((cl = cl->borrow) != NULL && toplevel > cl->level);
}
}
@@ -390,7 +391,6 @@ cbq_enqueue(struct sk_buff *skb, struct Qdisc *sch)
ret = qdisc_enqueue(skb, cl->q);
if (ret == NET_XMIT_SUCCESS) {
sch->q.qlen++;
- qdisc_bstats_update(sch, skb);
cbq_mark_toplevel(q, cl);
if (!cl->next_alive)
cbq_activate_class(cl);
@@ -418,11 +418,11 @@ static void cbq_ovl_classic(struct cbq_class *cl)
delay += cl->offtime;
/*
- Class goes to sleep, so that it will have no
- chance to work avgidle. Let's forgive it 8)
-
- BTW cbq-2.0 has a crap in this
- place, apparently they forgot to shift it by cl->ewma_log.
+ * Class goes to sleep, so that it will have no
+ * chance to work avgidle. Let's forgive it 8)
+ *
+ * BTW cbq-2.0 has a crap in this
+ * place, apparently they forgot to shift it by cl->ewma_log.
*/
if (cl->avgidle < 0)
delay -= (-cl->avgidle) - ((-cl->avgidle) >> cl->ewma_log);
@@ -439,8 +439,8 @@ static void cbq_ovl_classic(struct cbq_class *cl)
q->wd_expires = delay;
/* Dirty work! We must schedule wakeups based on
- real available rate, rather than leaf rate,
- which may be tiny (even zero).
+ * real available rate, rather than leaf rate,
+ * which may be tiny (even zero).
*/
if (q->toplevel == TC_CBQ_MAXLEVEL) {
struct cbq_class *b;
@@ -460,7 +460,7 @@ static void cbq_ovl_classic(struct cbq_class *cl)
}
/* TC_CBQ_OVL_RCLASSIC: penalize by offtime classes in hierarchy, when
- they go overlimit
+ * they go overlimit
*/
static void cbq_ovl_rclassic(struct cbq_class *cl)
@@ -595,7 +595,7 @@ static enum hrtimer_restart cbq_undelay(struct hrtimer *timer)
struct Qdisc *sch = q->watchdog.qdisc;
psched_time_t now;
psched_tdiff_t delay = 0;
- unsigned pmask;
+ unsigned int pmask;
now = psched_get_time();
@@ -624,7 +624,7 @@ static enum hrtimer_restart cbq_undelay(struct hrtimer *timer)
hrtimer_start(&q->delay_timer, time, HRTIMER_MODE_ABS);
}
- sch->flags &= ~TCQ_F_THROTTLED;
+ qdisc_unthrottled(sch);
__netif_schedule(qdisc_root(sch));
return HRTIMER_NORESTART;
}
@@ -649,7 +649,6 @@ static int cbq_reshape_fail(struct sk_buff *skb, struct Qdisc *child)
ret = qdisc_enqueue(skb, cl->q);
if (ret == NET_XMIT_SUCCESS) {
sch->q.qlen++;
- qdisc_bstats_update(sch, skb);
if (!cl->next_alive)
cbq_activate_class(cl);
return 0;
@@ -665,15 +664,15 @@ static int cbq_reshape_fail(struct sk_buff *skb, struct Qdisc *child)
#endif
/*
- It is mission critical procedure.
-
- We "regenerate" toplevel cutoff, if transmitting class
- has backlog and it is not regulated. It is not part of
- original CBQ description, but looks more reasonable.
- Probably, it is wrong. This question needs further investigation.
-*/
+ * It is mission critical procedure.
+ *
+ * We "regenerate" toplevel cutoff, if transmitting class
+ * has backlog and it is not regulated. It is not part of
+ * original CBQ description, but looks more reasonable.
+ * Probably, it is wrong. This question needs further investigation.
+ */
-static __inline__ void
+static inline void
cbq_update_toplevel(struct cbq_sched_data *q, struct cbq_class *cl,
struct cbq_class *borrowed)
{
@@ -684,7 +683,7 @@ cbq_update_toplevel(struct cbq_sched_data *q, struct cbq_class *cl,
q->toplevel = borrowed->level;
return;
}
- } while ((borrowed=borrowed->borrow) != NULL);
+ } while ((borrowed = borrowed->borrow) != NULL);
}
#if 0
/* It is not necessary now. Uncommenting it
@@ -712,10 +711,10 @@ cbq_update(struct cbq_sched_data *q)
cl->bstats.bytes += len;
/*
- (now - last) is total time between packet right edges.
- (last_pktlen/rate) is "virtual" busy time, so that
-
- idle = (now - last) - last_pktlen/rate
+ * (now - last) is total time between packet right edges.
+ * (last_pktlen/rate) is "virtual" busy time, so that
+ *
+ * idle = (now - last) - last_pktlen/rate
*/
idle = q->now - cl->last;
@@ -725,9 +724,9 @@ cbq_update(struct cbq_sched_data *q)
idle -= L2T(cl, len);
/* true_avgidle := (1-W)*true_avgidle + W*idle,
- where W=2^{-ewma_log}. But cl->avgidle is scaled:
- cl->avgidle == true_avgidle/W,
- hence:
+ * where W=2^{-ewma_log}. But cl->avgidle is scaled:
+ * cl->avgidle == true_avgidle/W,
+ * hence:
*/
avgidle += idle - (avgidle>>cl->ewma_log);
}
@@ -741,22 +740,22 @@ cbq_update(struct cbq_sched_data *q)
cl->avgidle = avgidle;
/* Calculate expected time, when this class
- will be allowed to send.
- It will occur, when:
- (1-W)*true_avgidle + W*delay = 0, i.e.
- idle = (1/W - 1)*(-true_avgidle)
- or
- idle = (1 - W)*(-cl->avgidle);
+ * will be allowed to send.
+ * It will occur, when:
+ * (1-W)*true_avgidle + W*delay = 0, i.e.
+ * idle = (1/W - 1)*(-true_avgidle)
+ * or
+ * idle = (1 - W)*(-cl->avgidle);
*/
idle = (-avgidle) - ((-avgidle) >> cl->ewma_log);
/*
- That is not all.
- To maintain the rate allocated to the class,
- we add to undertime virtual clock,
- necessary to complete transmitted packet.
- (len/phys_bandwidth has been already passed
- to the moment of cbq_update)
+ * That is not all.
+ * To maintain the rate allocated to the class,
+ * we add to undertime virtual clock,
+ * necessary to complete transmitted packet.
+ * (len/phys_bandwidth has been already passed
+ * to the moment of cbq_update)
*/
idle -= L2T(&q->link, len);
@@ -778,7 +777,7 @@ cbq_update(struct cbq_sched_data *q)
cbq_update_toplevel(q, this, q->tx_borrowed);
}
-static __inline__ struct cbq_class *
+static inline struct cbq_class *
cbq_under_limit(struct cbq_class *cl)
{
struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
@@ -794,16 +793,17 @@ cbq_under_limit(struct cbq_class *cl)
do {
/* It is very suspicious place. Now overlimit
- action is generated for not bounded classes
- only if link is completely congested.
- Though it is in agree with ancestor-only paradigm,
- it looks very stupid. Particularly,
- it means that this chunk of code will either
- never be called or result in strong amplification
- of burstiness. Dangerous, silly, and, however,
- no another solution exists.
+ * action is generated for not bounded classes
+ * only if link is completely congested.
+ * Though it is in agree with ancestor-only paradigm,
+ * it looks very stupid. Particularly,
+ * it means that this chunk of code will either
+ * never be called or result in strong amplification
+ * of burstiness. Dangerous, silly, and, however,
+ * no another solution exists.
*/
- if ((cl = cl->borrow) == NULL) {
+ cl = cl->borrow;
+ if (!cl) {
this_cl->qstats.overlimits++;
this_cl->overlimit(this_cl);
return NULL;
@@ -816,7 +816,7 @@ cbq_under_limit(struct cbq_class *cl)
return cl;
}
-static __inline__ struct sk_buff *
+static inline struct sk_buff *
cbq_dequeue_prio(struct Qdisc *sch, int prio)
{
struct cbq_sched_data *q = qdisc_priv(sch);
@@ -840,7 +840,7 @@ cbq_dequeue_prio(struct Qdisc *sch, int prio)
if (cl->deficit <= 0) {
/* Class exhausted its allotment per
- this round. Switch to the next one.
+ * this round. Switch to the next one.
*/
deficit = 1;
cl->deficit += cl->quantum;
@@ -850,8 +850,8 @@ cbq_dequeue_prio(struct Qdisc *sch, int prio)
skb = cl->q->dequeue(cl->q);
/* Class did not give us any skb :-(
- It could occur even if cl->q->q.qlen != 0
- f.e. if cl->q == "tbf"
+ * It could occur even if cl->q->q.qlen != 0
+ * f.e. if cl->q == "tbf"
*/
if (skb == NULL)
goto skip_class;
@@ -880,7 +880,7 @@ cbq_dequeue_prio(struct Qdisc *sch, int prio)
skip_class:
if (cl->q->q.qlen == 0 || prio != cl->cpriority) {
/* Class is empty or penalized.
- Unlink it from active chain.
+ * Unlink it from active chain.
*/
cl_prev->next_alive = cl->next_alive;
cl->next_alive = NULL;
@@ -919,14 +919,14 @@ next_class:
return NULL;
}
-static __inline__ struct sk_buff *
+static inline struct sk_buff *
cbq_dequeue_1(struct Qdisc *sch)
{
struct cbq_sched_data *q = qdisc_priv(sch);
struct sk_buff *skb;
- unsigned activemask;
+ unsigned int activemask;
- activemask = q->activemask&0xFF;
+ activemask = q->activemask & 0xFF;
while (activemask) {
int prio = ffz(~activemask);
activemask &= ~(1<<prio);
@@ -951,11 +951,11 @@ cbq_dequeue(struct Qdisc *sch)
if (q->tx_class) {
psched_tdiff_t incr2;
/* Time integrator. We calculate EOS time
- by adding expected packet transmission time.
- If real time is greater, we warp artificial clock,
- so that:
-
- cbq_time = max(real_time, work);
+ * by adding expected packet transmission time.
+ * If real time is greater, we warp artificial clock,
+ * so that:
+ *
+ * cbq_time = max(real_time, work);
*/
incr2 = L2T(&q->link, q->tx_len);
q->now += incr2;
@@ -971,28 +971,29 @@ cbq_dequeue(struct Qdisc *sch)
skb = cbq_dequeue_1(sch);
if (skb) {
+ qdisc_bstats_update(sch, skb);
sch->q.qlen--;
- sch->flags &= ~TCQ_F_THROTTLED;
+ qdisc_unthrottled(sch);
return skb;
}
/* All the classes are overlimit.
-
- It is possible, if:
-
- 1. Scheduler is empty.
- 2. Toplevel cutoff inhibited borrowing.
- 3. Root class is overlimit.
-
- Reset 2d and 3d conditions and retry.
-
- Note, that NS and cbq-2.0 are buggy, peeking
- an arbitrary class is appropriate for ancestor-only
- sharing, but not for toplevel algorithm.
-
- Our version is better, but slower, because it requires
- two passes, but it is unavoidable with top-level sharing.
- */
+ *
+ * It is possible, if:
+ *
+ * 1. Scheduler is empty.
+ * 2. Toplevel cutoff inhibited borrowing.
+ * 3. Root class is overlimit.
+ *
+ * Reset 2d and 3d conditions and retry.
+ *
+ * Note, that NS and cbq-2.0 are buggy, peeking
+ * an arbitrary class is appropriate for ancestor-only
+ * sharing, but not for toplevel algorithm.
+ *
+ * Our version is better, but slower, because it requires
+ * two passes, but it is unavoidable with top-level sharing.
+ */
if (q->toplevel == TC_CBQ_MAXLEVEL &&
q->link.undertime == PSCHED_PASTPERFECT)
@@ -1003,7 +1004,8 @@ cbq_dequeue(struct Qdisc *sch)
}
/* No packets in scheduler or nobody wants to give them to us :-(
- Sigh... start watchdog timer in the last case. */
+ * Sigh... start watchdog timer in the last case.
+ */
if (sch->q.qlen) {
sch->qstats.overlimits++;
@@ -1025,13 +1027,14 @@ static void cbq_adjust_levels(struct cbq_class *this)
int level = 0;
struct cbq_class *cl;
- if ((cl = this->children) != NULL) {
+ cl = this->children;
+ if (cl) {
do {
if (cl->level > level)
level = cl->level;
} while ((cl = cl->sibling) != this->children);
}
- this->level = level+1;
+ this->level = level + 1;
} while ((this = this->tparent) != NULL);
}
@@ -1047,14 +1050,15 @@ static void cbq_normalize_quanta(struct cbq_sched_data *q, int prio)
for (h = 0; h < q->clhash.hashsize; h++) {
hlist_for_each_entry(cl, n, &q->clhash.hash[h], common.hnode) {
/* BUGGGG... Beware! This expression suffer of
- arithmetic overflows!
+ * arithmetic overflows!
*/
if (cl->priority == prio) {
cl->quantum = (cl->weight*cl->allot*q->nclasses[prio])/
q->quanta[prio];
}
if (cl->quantum <= 0 || cl->quantum>32*qdisc_dev(cl->qdisc)->mtu) {
- printk(KERN_WARNING "CBQ: class %08x has bad quantum==%ld, repaired.\n", cl->common.classid, cl->quantum);
+ pr_warning("CBQ: class %08x has bad quantum==%ld, repaired.\n",
+ cl->common.classid, cl->quantum);
cl->quantum = qdisc_dev(cl->qdisc)->mtu/2 + 1;
}
}
@@ -1065,18 +1069,18 @@ static void cbq_sync_defmap(struct cbq_class *cl)
{
struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
struct cbq_class *split = cl->split;
- unsigned h;
+ unsigned int h;
int i;
if (split == NULL)
return;
- for (i=0; i<=TC_PRIO_MAX; i++) {
- if (split->defaults[i] == cl && !(cl->defmap&(1<<i)))
+ for (i = 0; i <= TC_PRIO_MAX; i++) {
+ if (split->defaults[i] == cl && !(cl->defmap & (1<<i)))
split->defaults[i] = NULL;
}
- for (i=0; i<=TC_PRIO_MAX; i++) {
+ for (i = 0; i <= TC_PRIO_MAX; i++) {
int level = split->level;
if (split->defaults[i])
@@ -1089,7 +1093,7 @@ static void cbq_sync_defmap(struct cbq_class *cl)
hlist_for_each_entry(c, n, &q->clhash.hash[h],
common.hnode) {
if (c->split == split && c->level < level &&
- c->defmap&(1<<i)) {
+ c->defmap & (1<<i)) {
split->defaults[i] = c;
level = c->level;
}
@@ -1103,7 +1107,8 @@ static void cbq_change_defmap(struct cbq_class *cl, u32 splitid, u32 def, u32 ma
struct cbq_class *split = NULL;
if (splitid == 0) {
- if ((split = cl->split) == NULL)
+ split = cl->split;
+ if (!split)
return;
splitid = split->common.classid;
}
@@ -1121,9 +1126,9 @@ static void cbq_change_defmap(struct cbq_class *cl, u32 splitid, u32 def, u32 ma
cl->defmap = 0;
cbq_sync_defmap(cl);
cl->split = split;
- cl->defmap = def&mask;
+ cl->defmap = def & mask;
} else
- cl->defmap = (cl->defmap&~mask)|(def&mask);
+ cl->defmap = (cl->defmap & ~mask) | (def & mask);
cbq_sync_defmap(cl);
}
@@ -1136,7 +1141,7 @@ static void cbq_unlink_class(struct cbq_class *this)
qdisc_class_hash_remove(&q->clhash, &this->common);
if (this->tparent) {
- clp=&this->sibling;
+ clp = &this->sibling;
cl = *clp;
do {
if (cl == this) {
@@ -1175,7 +1180,7 @@ static void cbq_link_class(struct cbq_class *this)
}
}
-static unsigned int cbq_drop(struct Qdisc* sch)
+static unsigned int cbq_drop(struct Qdisc *sch)
{
struct cbq_sched_data *q = qdisc_priv(sch);
struct cbq_class *cl, *cl_head;
@@ -1183,7 +1188,8 @@ static unsigned int cbq_drop(struct Qdisc* sch)
unsigned int len;
for (prio = TC_CBQ_MAXPRIO; prio >= 0; prio--) {
- if ((cl_head = q->active[prio]) == NULL)
+ cl_head = q->active[prio];
+ if (!cl_head)
continue;
cl = cl_head;
@@ -1200,13 +1206,13 @@ static unsigned int cbq_drop(struct Qdisc* sch)
}
static void
-cbq_reset(struct Qdisc* sch)
+cbq_reset(struct Qdisc *sch)
{
struct cbq_sched_data *q = qdisc_priv(sch);
struct cbq_class *cl;
struct hlist_node *n;
int prio;
- unsigned h;
+ unsigned int h;
q->activemask = 0;
q->pmask = 0;
@@ -1238,21 +1244,21 @@ cbq_reset(struct Qdisc* sch)
static int cbq_set_lss(struct cbq_class *cl, struct tc_cbq_lssopt *lss)
{
- if (lss->change&TCF_CBQ_LSS_FLAGS) {
- cl->share = (lss->flags&TCF_CBQ_LSS_ISOLATED) ? NULL : cl->tparent;
- cl->borrow = (lss->flags&TCF_CBQ_LSS_BOUNDED) ? NULL : cl->tparent;
+ if (lss->change & TCF_CBQ_LSS_FLAGS) {
+ cl->share = (lss->flags & TCF_CBQ_LSS_ISOLATED) ? NULL : cl->tparent;
+ cl->borrow = (lss->flags & TCF_CBQ_LSS_BOUNDED) ? NULL : cl->tparent;
}
- if (lss->change&TCF_CBQ_LSS_EWMA)
+ if (lss->change & TCF_CBQ_LSS_EWMA)
cl->ewma_log = lss->ewma_log;
- if (lss->change&TCF_CBQ_LSS_AVPKT)
+ if (lss->change & TCF_CBQ_LSS_AVPKT)
cl->avpkt = lss->avpkt;
- if (lss->change&TCF_CBQ_LSS_MINIDLE)
+ if (lss->change & TCF_CBQ_LSS_MINIDLE)
cl->minidle = -(long)lss->minidle;
- if (lss->change&TCF_CBQ_LSS_MAXIDLE) {
+ if (lss->change & TCF_CBQ_LSS_MAXIDLE) {
cl->maxidle = lss->maxidle;
cl->avgidle = lss->maxidle;
}
- if (lss->change&TCF_CBQ_LSS_OFFTIME)
+ if (lss->change & TCF_CBQ_LSS_OFFTIME)
cl->offtime = lss->offtime;
return 0;
}
@@ -1280,10 +1286,10 @@ static int cbq_set_wrr(struct cbq_class *cl, struct tc_cbq_wrropt *wrr)
if (wrr->weight)
cl->weight = wrr->weight;
if (wrr->priority) {
- cl->priority = wrr->priority-1;
+ cl->priority = wrr->priority - 1;
cl->cpriority = cl->priority;
if (cl->priority >= cl->priority2)
- cl->priority2 = TC_CBQ_MAXPRIO-1;
+ cl->priority2 = TC_CBQ_MAXPRIO - 1;
}
cbq_addprio(q, cl);
@@ -1300,10 +1306,10 @@ static int cbq_set_overlimit(struct cbq_class *cl, struct tc_cbq_ovl *ovl)
cl->overlimit = cbq_ovl_delay;
break;
case TC_CBQ_OVL_LOWPRIO:
- if (ovl->priority2-1 >= TC_CBQ_MAXPRIO ||
- ovl->priority2-1 <= cl->priority)
+ if (ovl->priority2 - 1 >= TC_CBQ_MAXPRIO ||
+ ovl->priority2 - 1 <= cl->priority)
return -EINVAL;
- cl->priority2 = ovl->priority2-1;
+ cl->priority2 = ovl->priority2 - 1;
cl->overlimit = cbq_ovl_lowprio;
break;
case TC_CBQ_OVL_DROP:
@@ -1382,9 +1388,9 @@ static int cbq_init(struct Qdisc *sch, struct nlattr *opt)
if (!q->link.q)
q->link.q = &noop_qdisc;
- q->link.priority = TC_CBQ_MAXPRIO-1;
- q->link.priority2 = TC_CBQ_MAXPRIO-1;
- q->link.cpriority = TC_CBQ_MAXPRIO-1;
+ q->link.priority = TC_CBQ_MAXPRIO - 1;
+ q->link.priority2 = TC_CBQ_MAXPRIO - 1;
+ q->link.cpriority = TC_CBQ_MAXPRIO - 1;
q->link.ovl_strategy = TC_CBQ_OVL_CLASSIC;
q->link.overlimit = cbq_ovl_classic;
q->link.allot = psched_mtu(qdisc_dev(sch));
@@ -1415,7 +1421,7 @@ put_rtab:
return err;
}
-static __inline__ int cbq_dump_rate(struct sk_buff *skb, struct cbq_class *cl)
+static int cbq_dump_rate(struct sk_buff *skb, struct cbq_class *cl)
{
unsigned char *b = skb_tail_pointer(skb);
@@ -1427,7 +1433,7 @@ nla_put_failure:
return -1;
}
-static __inline__ int cbq_dump_lss(struct sk_buff *skb, struct cbq_class *cl)
+static int cbq_dump_lss(struct sk_buff *skb, struct cbq_class *cl)
{
unsigned char *b = skb_tail_pointer(skb);
struct tc_cbq_lssopt opt;
@@ -1452,15 +1458,15 @@ nla_put_failure:
return -1;
}
-static __inline__ int cbq_dump_wrr(struct sk_buff *skb, struct cbq_class *cl)
+static int cbq_dump_wrr(struct sk_buff *skb, struct cbq_class *cl)
{
unsigned char *b = skb_tail_pointer(skb);
struct tc_cbq_wrropt opt;
opt.flags = 0;
opt.allot = cl->allot;
- opt.priority = cl->priority+1;
- opt.cpriority = cl->cpriority+1;
+ opt.priority = cl->priority + 1;
+ opt.cpriority = cl->cpriority + 1;
opt.weight = cl->weight;
NLA_PUT(skb, TCA_CBQ_WRROPT, sizeof(opt), &opt);
return skb->len;
@@ -1470,13 +1476,13 @@ nla_put_failure:
return -1;
}
-static __inline__ int cbq_dump_ovl(struct sk_buff *skb, struct cbq_class *cl)
+static int cbq_dump_ovl(struct sk_buff *skb, struct cbq_class *cl)
{
unsigned char *b = skb_tail_pointer(skb);
struct tc_cbq_ovl opt;
opt.strategy = cl->ovl_strategy;
- opt.priority2 = cl->priority2+1;
+ opt.priority2 = cl->priority2 + 1;
opt.pad = 0;
opt.penalty = cl->penalty;
NLA_PUT(skb, TCA_CBQ_OVL_STRATEGY, sizeof(opt), &opt);
@@ -1487,7 +1493,7 @@ nla_put_failure:
return -1;
}
-static __inline__ int cbq_dump_fopt(struct sk_buff *skb, struct cbq_class *cl)
+static int cbq_dump_fopt(struct sk_buff *skb, struct cbq_class *cl)
{
unsigned char *b = skb_tail_pointer(skb);
struct tc_cbq_fopt opt;
@@ -1506,7 +1512,7 @@ nla_put_failure:
}
#ifdef CONFIG_NET_CLS_ACT
-static __inline__ int cbq_dump_police(struct sk_buff *skb, struct cbq_class *cl)
+static int cbq_dump_police(struct sk_buff *skb, struct cbq_class *cl)
{
unsigned char *b = skb_tail_pointer(skb);
struct tc_cbq_police opt;
@@ -1570,7 +1576,7 @@ static int
cbq_dump_class(struct Qdisc *sch, unsigned long arg,
struct sk_buff *skb, struct tcmsg *tcm)
{
- struct cbq_class *cl = (struct cbq_class*)arg;
+ struct cbq_class *cl = (struct cbq_class *)arg;
struct nlattr *nest;
if (cl->tparent)
@@ -1598,7 +1604,7 @@ cbq_dump_class_stats(struct Qdisc *sch, unsigned long arg,
struct gnet_dump *d)
{
struct cbq_sched_data *q = qdisc_priv(sch);
- struct cbq_class *cl = (struct cbq_class*)arg;
+ struct cbq_class *cl = (struct cbq_class *)arg;
cl->qstats.qlen = cl->q->q.qlen;
cl->xstats.avgidle = cl->avgidle;
@@ -1618,7 +1624,7 @@ cbq_dump_class_stats(struct Qdisc *sch, unsigned long arg,
static int cbq_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
struct Qdisc **old)
{
- struct cbq_class *cl = (struct cbq_class*)arg;
+ struct cbq_class *cl = (struct cbq_class *)arg;
if (new == NULL) {
new = qdisc_create_dflt(sch->dev_queue,
@@ -1641,10 +1647,9 @@ static int cbq_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
return 0;
}
-static struct Qdisc *
-cbq_leaf(struct Qdisc *sch, unsigned long arg)
+static struct Qdisc *cbq_leaf(struct Qdisc *sch, unsigned long arg)
{
- struct cbq_class *cl = (struct cbq_class*)arg;
+ struct cbq_class *cl = (struct cbq_class *)arg;
return cl->q;
}
@@ -1683,13 +1688,12 @@ static void cbq_destroy_class(struct Qdisc *sch, struct cbq_class *cl)
kfree(cl);
}
-static void
-cbq_destroy(struct Qdisc* sch)
+static void cbq_destroy(struct Qdisc *sch)
{
struct cbq_sched_data *q = qdisc_priv(sch);
struct hlist_node *n, *next;
struct cbq_class *cl;
- unsigned h;
+ unsigned int h;
#ifdef CONFIG_NET_CLS_ACT
q->rx_class = NULL;
@@ -1713,7 +1717,7 @@ cbq_destroy(struct Qdisc* sch)
static void cbq_put(struct Qdisc *sch, unsigned long arg)
{
- struct cbq_class *cl = (struct cbq_class*)arg;
+ struct cbq_class *cl = (struct cbq_class *)arg;
if (--cl->refcnt == 0) {
#ifdef CONFIG_NET_CLS_ACT
@@ -1736,7 +1740,7 @@ cbq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, struct nlattr **t
{
int err;
struct cbq_sched_data *q = qdisc_priv(sch);
- struct cbq_class *cl = (struct cbq_class*)*arg;
+ struct cbq_class *cl = (struct cbq_class *)*arg;
struct nlattr *opt = tca[TCA_OPTIONS];
struct nlattr *tb[TCA_CBQ_MAX + 1];
struct cbq_class *parent;
@@ -1828,13 +1832,14 @@ cbq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, struct nlattr **t
if (classid) {
err = -EINVAL;
- if (TC_H_MAJ(classid^sch->handle) || cbq_class_lookup(q, classid))
+ if (TC_H_MAJ(classid ^ sch->handle) ||
+ cbq_class_lookup(q, classid))
goto failure;
} else {
int i;
- classid = TC_H_MAKE(sch->handle,0x8000);
+ classid = TC_H_MAKE(sch->handle, 0x8000);
- for (i=0; i<0x8000; i++) {
+ for (i = 0; i < 0x8000; i++) {
if (++q->hgenerator >= 0x8000)
q->hgenerator = 1;
if (cbq_class_lookup(q, classid|q->hgenerator) == NULL)
@@ -1891,11 +1896,11 @@ cbq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, struct nlattr **t
cl->minidle = -0x7FFFFFFF;
cbq_set_lss(cl, nla_data(tb[TCA_CBQ_LSSOPT]));
cbq_set_wrr(cl, nla_data(tb[TCA_CBQ_WRROPT]));
- if (cl->ewma_log==0)
+ if (cl->ewma_log == 0)
cl->ewma_log = q->link.ewma_log;
- if (cl->maxidle==0)
+ if (cl->maxidle == 0)
cl->maxidle = q->link.maxidle;
- if (cl->avpkt==0)
+ if (cl->avpkt == 0)
cl->avpkt = q->link.avpkt;
cl->overlimit = cbq_ovl_classic;
if (tb[TCA_CBQ_OVL_STRATEGY])
@@ -1921,7 +1926,7 @@ failure:
static int cbq_delete(struct Qdisc *sch, unsigned long arg)
{
struct cbq_sched_data *q = qdisc_priv(sch);
- struct cbq_class *cl = (struct cbq_class*)arg;
+ struct cbq_class *cl = (struct cbq_class *)arg;
unsigned int qlen;
if (cl->filters || cl->children || cl == &q->link)
@@ -1979,7 +1984,7 @@ static unsigned long cbq_bind_filter(struct Qdisc *sch, unsigned long parent,
u32 classid)
{
struct cbq_sched_data *q = qdisc_priv(sch);
- struct cbq_class *p = (struct cbq_class*)parent;
+ struct cbq_class *p = (struct cbq_class *)parent;
struct cbq_class *cl = cbq_class_lookup(q, classid);
if (cl) {
@@ -1993,7 +1998,7 @@ static unsigned long cbq_bind_filter(struct Qdisc *sch, unsigned long parent,
static void cbq_unbind_filter(struct Qdisc *sch, unsigned long arg)
{
- struct cbq_class *cl = (struct cbq_class*)arg;
+ struct cbq_class *cl = (struct cbq_class *)arg;
cl->filters--;
}
@@ -2003,7 +2008,7 @@ static void cbq_walk(struct Qdisc *sch, struct qdisc_walker *arg)
struct cbq_sched_data *q = qdisc_priv(sch);
struct cbq_class *cl;
struct hlist_node *n;
- unsigned h;
+ unsigned int h;
if (arg->stop)
return;
diff --git a/net/sched/sch_choke.c b/net/sched/sch_choke.c
new file mode 100644
index 00000000000..ee1e2090eeb
--- /dev/null
+++ b/net/sched/sch_choke.c
@@ -0,0 +1,677 @@
+/*
+ * net/sched/sch_choke.c CHOKE scheduler
+ *
+ * Copyright (c) 2011 Stephen Hemminger <shemminger@vyatta.com>
+ * Copyright (c) 2011 Eric Dumazet <eric.dumazet@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * version 2 as published by the Free Software Foundation.
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/skbuff.h>
+#include <linux/reciprocal_div.h>
+#include <linux/vmalloc.h>
+#include <net/pkt_sched.h>
+#include <net/inet_ecn.h>
+#include <net/red.h>
+#include <linux/ip.h>
+#include <net/ip.h>
+#include <linux/ipv6.h>
+#include <net/ipv6.h>
+
+/*
+ CHOKe stateless AQM for fair bandwidth allocation
+ =================================================
+
+ CHOKe (CHOose and Keep for responsive flows, CHOose and Kill for
+ unresponsive flows) is a variant of RED that penalizes misbehaving flows but
+ maintains no flow state. The difference from RED is an additional step
+ during the enqueuing process. If average queue size is over the
+ low threshold (qmin), a packet is chosen at random from the queue.
+ If both the new and chosen packet are from the same flow, both
+ are dropped. Unlike RED, CHOKe is not really a "classful" qdisc because it
+ needs to access packets in queue randomly. It has a minimal class
+ interface to allow overriding the builtin flow classifier with
+ filters.
+
+ Source:
+ R. Pan, B. Prabhakar, and K. Psounis, "CHOKe, A Stateless
+ Active Queue Management Scheme for Approximating Fair Bandwidth Allocation",
+ IEEE INFOCOM, 2000.
+
+ A. Tang, J. Wang, S. Low, "Understanding CHOKe: Throughput and Spatial
+ Characteristics", IEEE/ACM Transactions on Networking, 2004
+
+ */
+
+/* Upper bound on size of sk_buff table (packets) */
+#define CHOKE_MAX_QUEUE (128*1024 - 1)
+
+struct choke_sched_data {
+/* Parameters */
+ u32 limit;
+ unsigned char flags;
+
+ struct red_parms parms;
+
+/* Variables */
+ struct tcf_proto *filter_list;
+ struct {
+ u32 prob_drop; /* Early probability drops */
+ u32 prob_mark; /* Early probability marks */
+ u32 forced_drop; /* Forced drops, qavg > max_thresh */
+ u32 forced_mark; /* Forced marks, qavg > max_thresh */
+ u32 pdrop; /* Drops due to queue limits */
+ u32 other; /* Drops due to drop() calls */
+ u32 matched; /* Drops to flow match */
+ } stats;
+
+ unsigned int head;
+ unsigned int tail;
+
+ unsigned int tab_mask; /* size - 1 */
+
+ struct sk_buff **tab;
+};
+
+/* deliver a random number between 0 and N - 1 */
+static u32 random_N(unsigned int N)
+{
+ return reciprocal_divide(random32(), N);
+}
+
+/* number of elements in queue including holes */
+static unsigned int choke_len(const struct choke_sched_data *q)
+{
+ return (q->tail - q->head) & q->tab_mask;
+}
+
+/* Is ECN parameter configured */
+static int use_ecn(const struct choke_sched_data *q)
+{
+ return q->flags & TC_RED_ECN;
+}
+
+/* Should packets over max just be dropped (versus marked) */
+static int use_harddrop(const struct choke_sched_data *q)
+{
+ return q->flags & TC_RED_HARDDROP;
+}
+
+/* Move head pointer forward to skip over holes */
+static void choke_zap_head_holes(struct choke_sched_data *q)
+{
+ do {
+ q->head = (q->head + 1) & q->tab_mask;
+ if (q->head == q->tail)
+ break;
+ } while (q->tab[q->head] == NULL);
+}
+
+/* Move tail pointer backwards to reuse holes */
+static void choke_zap_tail_holes(struct choke_sched_data *q)
+{
+ do {
+ q->tail = (q->tail - 1) & q->tab_mask;
+ if (q->head == q->tail)
+ break;
+ } while (q->tab[q->tail] == NULL);
+}
+
+/* Drop packet from queue array by creating a "hole" */
+static void choke_drop_by_idx(struct Qdisc *sch, unsigned int idx)
+{
+ struct choke_sched_data *q = qdisc_priv(sch);
+ struct sk_buff *skb = q->tab[idx];
+
+ q->tab[idx] = NULL;
+
+ if (idx == q->head)
+ choke_zap_head_holes(q);
+ if (idx == q->tail)
+ choke_zap_tail_holes(q);
+
+ sch->qstats.backlog -= qdisc_pkt_len(skb);
+ qdisc_drop(skb, sch);
+ qdisc_tree_decrease_qlen(sch, 1);
+ --sch->q.qlen;
+}
+
+/*
+ * Compare flow of two packets
+ * Returns true only if source and destination address and port match.
+ * false for special cases
+ */
+static bool choke_match_flow(struct sk_buff *skb1,
+ struct sk_buff *skb2)
+{
+ int off1, off2, poff;
+ const u32 *ports1, *ports2;
+ u8 ip_proto;
+ __u32 hash1;
+
+ if (skb1->protocol != skb2->protocol)
+ return false;
+
+ /* Use hash value as quick check
+ * Assumes that __skb_get_rxhash makes IP header and ports linear
+ */
+ hash1 = skb_get_rxhash(skb1);
+ if (!hash1 || hash1 != skb_get_rxhash(skb2))
+ return false;
+
+ /* Probably match, but be sure to avoid hash collisions */
+ off1 = skb_network_offset(skb1);
+ off2 = skb_network_offset(skb2);
+
+ switch (skb1->protocol) {
+ case __constant_htons(ETH_P_IP): {
+ const struct iphdr *ip1, *ip2;
+
+ ip1 = (const struct iphdr *) (skb1->data + off1);
+ ip2 = (const struct iphdr *) (skb2->data + off2);
+
+ ip_proto = ip1->protocol;
+ if (ip_proto != ip2->protocol ||
+ ip1->saddr != ip2->saddr || ip1->daddr != ip2->daddr)
+ return false;
+
+ if ((ip1->frag_off | ip2->frag_off) & htons(IP_MF | IP_OFFSET))
+ ip_proto = 0;
+ off1 += ip1->ihl * 4;
+ off2 += ip2->ihl * 4;
+ break;
+ }
+
+ case __constant_htons(ETH_P_IPV6): {
+ const struct ipv6hdr *ip1, *ip2;
+
+ ip1 = (const struct ipv6hdr *) (skb1->data + off1);
+ ip2 = (const struct ipv6hdr *) (skb2->data + off2);
+
+ ip_proto = ip1->nexthdr;
+ if (ip_proto != ip2->nexthdr ||
+ ipv6_addr_cmp(&ip1->saddr, &ip2->saddr) ||
+ ipv6_addr_cmp(&ip1->daddr, &ip2->daddr))
+ return false;
+ off1 += 40;
+ off2 += 40;
+ }
+
+ default: /* Maybe compare MAC header here? */
+ return false;
+ }
+
+ poff = proto_ports_offset(ip_proto);
+ if (poff < 0)
+ return true;
+
+ off1 += poff;
+ off2 += poff;
+
+ ports1 = (__force u32 *)(skb1->data + off1);
+ ports2 = (__force u32 *)(skb2->data + off2);
+ return *ports1 == *ports2;
+}
+
+static inline void choke_set_classid(struct sk_buff *skb, u16 classid)
+{
+ *(unsigned int *)(qdisc_skb_cb(skb)->data) = classid;
+}
+
+static u16 choke_get_classid(const struct sk_buff *skb)
+{
+ return *(unsigned int *)(qdisc_skb_cb(skb)->data);
+}
+
+/*
+ * Classify flow using either:
+ * 1. pre-existing classification result in skb
+ * 2. fast internal classification
+ * 3. use TC filter based classification
+ */
+static bool choke_classify(struct sk_buff *skb,
+ struct Qdisc *sch, int *qerr)
+
+{
+ struct choke_sched_data *q = qdisc_priv(sch);
+ struct tcf_result res;
+ int result;
+
+ result = tc_classify(skb, q->filter_list, &res);
+ if (result >= 0) {
+#ifdef CONFIG_NET_CLS_ACT
+ switch (result) {
+ case TC_ACT_STOLEN:
+ case TC_ACT_QUEUED:
+ *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
+ case TC_ACT_SHOT:
+ return false;
+ }
+#endif
+ choke_set_classid(skb, TC_H_MIN(res.classid));
+ return true;
+ }
+
+ return false;
+}
+
+/*
+ * Select a packet at random from queue
+ * HACK: since queue can have holes from previous deletion; retry several
+ * times to find a random skb but then just give up and return the head
+ * Will return NULL if queue is empty (q->head == q->tail)
+ */
+static struct sk_buff *choke_peek_random(const struct choke_sched_data *q,
+ unsigned int *pidx)
+{
+ struct sk_buff *skb;
+ int retrys = 3;
+
+ do {
+ *pidx = (q->head + random_N(choke_len(q))) & q->tab_mask;
+ skb = q->tab[*pidx];
+ if (skb)
+ return skb;
+ } while (--retrys > 0);
+
+ return q->tab[*pidx = q->head];
+}
+
+/*
+ * Compare new packet with random packet in queue
+ * returns true if matched and sets *pidx
+ */
+static bool choke_match_random(const struct choke_sched_data *q,
+ struct sk_buff *nskb,
+ unsigned int *pidx)
+{
+ struct sk_buff *oskb;
+
+ if (q->head == q->tail)
+ return false;
+
+ oskb = choke_peek_random(q, pidx);
+ if (q->filter_list)
+ return choke_get_classid(nskb) == choke_get_classid(oskb);
+
+ return choke_match_flow(oskb, nskb);
+}
+
+static int choke_enqueue(struct sk_buff *skb, struct Qdisc *sch)
+{
+ struct choke_sched_data *q = qdisc_priv(sch);
+ struct red_parms *p = &q->parms;
+ int ret = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
+
+ if (q->filter_list) {
+ /* If using external classifiers, get result and record it. */
+ if (!choke_classify(skb, sch, &ret))
+ goto other_drop; /* Packet was eaten by filter */
+ }
+
+ /* Compute average queue usage (see RED) */
+ p->qavg = red_calc_qavg(p, sch->q.qlen);
+ if (red_is_idling(p))
+ red_end_of_idle_period(p);
+
+ /* Is queue small? */
+ if (p->qavg <= p->qth_min)
+ p->qcount = -1;
+ else {
+ unsigned int idx;
+
+ /* Draw a packet at random from queue and compare flow */
+ if (choke_match_random(q, skb, &idx)) {
+ q->stats.matched++;
+ choke_drop_by_idx(sch, idx);
+ goto congestion_drop;
+ }
+
+ /* Queue is large, always mark/drop */
+ if (p->qavg > p->qth_max) {
+ p->qcount = -1;
+
+ sch->qstats.overlimits++;
+ if (use_harddrop(q) || !use_ecn(q) ||
+ !INET_ECN_set_ce(skb)) {
+ q->stats.forced_drop++;
+ goto congestion_drop;
+ }
+
+ q->stats.forced_mark++;
+ } else if (++p->qcount) {
+ if (red_mark_probability(p, p->qavg)) {
+ p->qcount = 0;
+ p->qR = red_random(p);
+
+ sch->qstats.overlimits++;
+ if (!use_ecn(q) || !INET_ECN_set_ce(skb)) {
+ q->stats.prob_drop++;
+ goto congestion_drop;
+ }
+
+ q->stats.prob_mark++;
+ }
+ } else
+ p->qR = red_random(p);
+ }
+
+ /* Admit new packet */
+ if (sch->q.qlen < q->limit) {
+ q->tab[q->tail] = skb;
+ q->tail = (q->tail + 1) & q->tab_mask;
+ ++sch->q.qlen;
+ sch->qstats.backlog += qdisc_pkt_len(skb);
+ return NET_XMIT_SUCCESS;
+ }
+
+ q->stats.pdrop++;
+ sch->qstats.drops++;
+ kfree_skb(skb);
+ return NET_XMIT_DROP;
+
+ congestion_drop:
+ qdisc_drop(skb, sch);
+ return NET_XMIT_CN;
+
+ other_drop:
+ if (ret & __NET_XMIT_BYPASS)
+ sch->qstats.drops++;
+ kfree_skb(skb);
+ return ret;
+}
+
+static struct sk_buff *choke_dequeue(struct Qdisc *sch)
+{
+ struct choke_sched_data *q = qdisc_priv(sch);
+ struct sk_buff *skb;
+
+ if (q->head == q->tail) {
+ if (!red_is_idling(&q->parms))
+ red_start_of_idle_period(&q->parms);
+ return NULL;
+ }
+
+ skb = q->tab[q->head];
+ q->tab[q->head] = NULL;
+ choke_zap_head_holes(q);
+ --sch->q.qlen;
+ sch->qstats.backlog -= qdisc_pkt_len(skb);
+ qdisc_bstats_update(sch, skb);
+
+ return skb;
+}
+
+static unsigned int choke_drop(struct Qdisc *sch)
+{
+ struct choke_sched_data *q = qdisc_priv(sch);
+ unsigned int len;
+
+ len = qdisc_queue_drop(sch);
+ if (len > 0)
+ q->stats.other++;
+ else {
+ if (!red_is_idling(&q->parms))
+ red_start_of_idle_period(&q->parms);
+ }
+
+ return len;
+}
+
+static void choke_reset(struct Qdisc *sch)
+{
+ struct choke_sched_data *q = qdisc_priv(sch);
+
+ red_restart(&q->parms);
+}
+
+static const struct nla_policy choke_policy[TCA_CHOKE_MAX + 1] = {
+ [TCA_CHOKE_PARMS] = { .len = sizeof(struct tc_red_qopt) },
+ [TCA_CHOKE_STAB] = { .len = RED_STAB_SIZE },
+};
+
+
+static void choke_free(void *addr)
+{
+ if (addr) {
+ if (is_vmalloc_addr(addr))
+ vfree(addr);
+ else
+ kfree(addr);
+ }
+}
+
+static int choke_change(struct Qdisc *sch, struct nlattr *opt)
+{
+ struct choke_sched_data *q = qdisc_priv(sch);
+ struct nlattr *tb[TCA_CHOKE_MAX + 1];
+ const struct tc_red_qopt *ctl;
+ int err;
+ struct sk_buff **old = NULL;
+ unsigned int mask;
+
+ if (opt == NULL)
+ return -EINVAL;
+
+ err = nla_parse_nested(tb, TCA_CHOKE_MAX, opt, choke_policy);
+ if (err < 0)
+ return err;
+
+ if (tb[TCA_CHOKE_PARMS] == NULL ||
+ tb[TCA_CHOKE_STAB] == NULL)
+ return -EINVAL;
+
+ ctl = nla_data(tb[TCA_CHOKE_PARMS]);
+
+ if (ctl->limit > CHOKE_MAX_QUEUE)
+ return -EINVAL;
+
+ mask = roundup_pow_of_two(ctl->limit + 1) - 1;
+ if (mask != q->tab_mask) {
+ struct sk_buff **ntab;
+
+ ntab = kcalloc(mask + 1, sizeof(struct sk_buff *), GFP_KERNEL);
+ if (!ntab)
+ ntab = vzalloc((mask + 1) * sizeof(struct sk_buff *));
+ if (!ntab)
+ return -ENOMEM;
+
+ sch_tree_lock(sch);
+ old = q->tab;
+ if (old) {
+ unsigned int oqlen = sch->q.qlen, tail = 0;
+
+ while (q->head != q->tail) {
+ struct sk_buff *skb = q->tab[q->head];
+
+ q->head = (q->head + 1) & q->tab_mask;
+ if (!skb)
+ continue;
+ if (tail < mask) {
+ ntab[tail++] = skb;
+ continue;
+ }
+ sch->qstats.backlog -= qdisc_pkt_len(skb);
+ --sch->q.qlen;
+ qdisc_drop(skb, sch);
+ }
+ qdisc_tree_decrease_qlen(sch, oqlen - sch->q.qlen);
+ q->head = 0;
+ q->tail = tail;
+ }
+
+ q->tab_mask = mask;
+ q->tab = ntab;
+ } else
+ sch_tree_lock(sch);
+
+ q->flags = ctl->flags;
+ q->limit = ctl->limit;
+
+ red_set_parms(&q->parms, ctl->qth_min, ctl->qth_max, ctl->Wlog,
+ ctl->Plog, ctl->Scell_log,
+ nla_data(tb[TCA_CHOKE_STAB]));
+
+ if (q->head == q->tail)
+ red_end_of_idle_period(&q->parms);
+
+ sch_tree_unlock(sch);
+ choke_free(old);
+ return 0;
+}
+
+static int choke_init(struct Qdisc *sch, struct nlattr *opt)
+{
+ return choke_change(sch, opt);
+}
+
+static int choke_dump(struct Qdisc *sch, struct sk_buff *skb)
+{
+ struct choke_sched_data *q = qdisc_priv(sch);
+ struct nlattr *opts = NULL;
+ struct tc_red_qopt opt = {
+ .limit = q->limit,
+ .flags = q->flags,
+ .qth_min = q->parms.qth_min >> q->parms.Wlog,
+ .qth_max = q->parms.qth_max >> q->parms.Wlog,
+ .Wlog = q->parms.Wlog,
+ .Plog = q->parms.Plog,
+ .Scell_log = q->parms.Scell_log,
+ };
+
+ opts = nla_nest_start(skb, TCA_OPTIONS);
+ if (opts == NULL)
+ goto nla_put_failure;
+
+ NLA_PUT(skb, TCA_CHOKE_PARMS, sizeof(opt), &opt);
+ return nla_nest_end(skb, opts);
+
+nla_put_failure:
+ nla_nest_cancel(skb, opts);
+ return -EMSGSIZE;
+}
+
+static int choke_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
+{
+ struct choke_sched_data *q = qdisc_priv(sch);
+ struct tc_choke_xstats st = {
+ .early = q->stats.prob_drop + q->stats.forced_drop,
+ .marked = q->stats.prob_mark + q->stats.forced_mark,
+ .pdrop = q->stats.pdrop,
+ .other = q->stats.other,
+ .matched = q->stats.matched,
+ };
+
+ return gnet_stats_copy_app(d, &st, sizeof(st));
+}
+
+static void choke_destroy(struct Qdisc *sch)
+{
+ struct choke_sched_data *q = qdisc_priv(sch);
+
+ tcf_destroy_chain(&q->filter_list);
+ choke_free(q->tab);
+}
+
+static struct Qdisc *choke_leaf(struct Qdisc *sch, unsigned long arg)
+{
+ return NULL;
+}
+
+static unsigned long choke_get(struct Qdisc *sch, u32 classid)
+{
+ return 0;
+}
+
+static void choke_put(struct Qdisc *q, unsigned long cl)
+{
+}
+
+static unsigned long choke_bind(struct Qdisc *sch, unsigned long parent,
+ u32 classid)
+{
+ return 0;
+}
+
+static struct tcf_proto **choke_find_tcf(struct Qdisc *sch, unsigned long cl)
+{
+ struct choke_sched_data *q = qdisc_priv(sch);
+
+ if (cl)
+ return NULL;
+ return &q->filter_list;
+}
+
+static int choke_dump_class(struct Qdisc *sch, unsigned long cl,
+ struct sk_buff *skb, struct tcmsg *tcm)
+{
+ tcm->tcm_handle |= TC_H_MIN(cl);
+ return 0;
+}
+
+static void choke_walk(struct Qdisc *sch, struct qdisc_walker *arg)
+{
+ if (!arg->stop) {
+ if (arg->fn(sch, 1, arg) < 0) {
+ arg->stop = 1;
+ return;
+ }
+ arg->count++;
+ }
+}
+
+static const struct Qdisc_class_ops choke_class_ops = {
+ .leaf = choke_leaf,
+ .get = choke_get,
+ .put = choke_put,
+ .tcf_chain = choke_find_tcf,
+ .bind_tcf = choke_bind,
+ .unbind_tcf = choke_put,
+ .dump = choke_dump_class,
+ .walk = choke_walk,
+};
+
+static struct sk_buff *choke_peek_head(struct Qdisc *sch)
+{
+ struct choke_sched_data *q = qdisc_priv(sch);
+
+ return (q->head != q->tail) ? q->tab[q->head] : NULL;
+}
+
+static struct Qdisc_ops choke_qdisc_ops __read_mostly = {
+ .id = "choke",
+ .priv_size = sizeof(struct choke_sched_data),
+
+ .enqueue = choke_enqueue,
+ .dequeue = choke_dequeue,
+ .peek = choke_peek_head,
+ .drop = choke_drop,
+ .init = choke_init,
+ .destroy = choke_destroy,
+ .reset = choke_reset,
+ .change = choke_change,
+ .dump = choke_dump,
+ .dump_stats = choke_dump_stats,
+ .owner = THIS_MODULE,
+};
+
+static int __init choke_module_init(void)
+{
+ return register_qdisc(&choke_qdisc_ops);
+}
+
+static void __exit choke_module_exit(void)
+{
+ unregister_qdisc(&choke_qdisc_ops);
+}
+
+module_init(choke_module_init)
+module_exit(choke_module_exit)
+
+MODULE_LICENSE("GPL");
diff --git a/net/sched/sch_drr.c b/net/sched/sch_drr.c
index de55e642eaf..6b7fe4a84f1 100644
--- a/net/sched/sch_drr.c
+++ b/net/sched/sch_drr.c
@@ -376,7 +376,6 @@ static int drr_enqueue(struct sk_buff *skb, struct Qdisc *sch)
}
bstats_update(&cl->bstats, skb);
- qdisc_bstats_update(sch, skb);
sch->q.qlen++;
return err;
@@ -403,6 +402,7 @@ static struct sk_buff *drr_dequeue(struct Qdisc *sch)
skb = qdisc_dequeue_peeked(cl->qdisc);
if (cl->qdisc->q.qlen == 0)
list_del(&cl->alist);
+ qdisc_bstats_update(sch, skb);
sch->q.qlen--;
return skb;
}
diff --git a/net/sched/sch_dsmark.c b/net/sched/sch_dsmark.c
index 60f4bdd4408..2c790204d04 100644
--- a/net/sched/sch_dsmark.c
+++ b/net/sched/sch_dsmark.c
@@ -137,10 +137,10 @@ static int dsmark_change(struct Qdisc *sch, u32 classid, u32 parent,
mask = nla_get_u8(tb[TCA_DSMARK_MASK]);
if (tb[TCA_DSMARK_VALUE])
- p->value[*arg-1] = nla_get_u8(tb[TCA_DSMARK_VALUE]);
+ p->value[*arg - 1] = nla_get_u8(tb[TCA_DSMARK_VALUE]);
if (tb[TCA_DSMARK_MASK])
- p->mask[*arg-1] = mask;
+ p->mask[*arg - 1] = mask;
err = 0;
@@ -155,8 +155,8 @@ static int dsmark_delete(struct Qdisc *sch, unsigned long arg)
if (!dsmark_valid_index(p, arg))
return -EINVAL;
- p->mask[arg-1] = 0xff;
- p->value[arg-1] = 0;
+ p->mask[arg - 1] = 0xff;
+ p->value[arg - 1] = 0;
return 0;
}
@@ -175,7 +175,7 @@ static void dsmark_walk(struct Qdisc *sch, struct qdisc_walker *walker)
if (p->mask[i] == 0xff && !p->value[i])
goto ignore;
if (walker->count >= walker->skip) {
- if (walker->fn(sch, i+1, walker) < 0) {
+ if (walker->fn(sch, i + 1, walker) < 0) {
walker->stop = 1;
break;
}
@@ -260,7 +260,6 @@ static int dsmark_enqueue(struct sk_buff *skb, struct Qdisc *sch)
return err;
}
- qdisc_bstats_update(sch, skb);
sch->q.qlen++;
return NET_XMIT_SUCCESS;
@@ -283,6 +282,7 @@ static struct sk_buff *dsmark_dequeue(struct Qdisc *sch)
if (skb == NULL)
return NULL;
+ qdisc_bstats_update(sch, skb);
sch->q.qlen--;
index = skb->tc_index & (p->indices - 1);
@@ -304,9 +304,8 @@ static struct sk_buff *dsmark_dequeue(struct Qdisc *sch)
* and don't need yet another qdisc as a bypass.
*/
if (p->mask[index] != 0xff || p->value[index])
- printk(KERN_WARNING
- "dsmark_dequeue: unsupported protocol %d\n",
- ntohs(skb->protocol));
+ pr_warning("dsmark_dequeue: unsupported protocol %d\n",
+ ntohs(skb->protocol));
break;
}
@@ -424,14 +423,14 @@ static int dsmark_dump_class(struct Qdisc *sch, unsigned long cl,
if (!dsmark_valid_index(p, cl))
return -EINVAL;
- tcm->tcm_handle = TC_H_MAKE(TC_H_MAJ(sch->handle), cl-1);
+ tcm->tcm_handle = TC_H_MAKE(TC_H_MAJ(sch->handle), cl - 1);
tcm->tcm_info = p->q->handle;
opts = nla_nest_start(skb, TCA_OPTIONS);
if (opts == NULL)
goto nla_put_failure;
- NLA_PUT_U8(skb, TCA_DSMARK_MASK, p->mask[cl-1]);
- NLA_PUT_U8(skb, TCA_DSMARK_VALUE, p->value[cl-1]);
+ NLA_PUT_U8(skb, TCA_DSMARK_MASK, p->mask[cl - 1]);
+ NLA_PUT_U8(skb, TCA_DSMARK_VALUE, p->value[cl - 1]);
return nla_nest_end(skb, opts);
diff --git a/net/sched/sch_fifo.c b/net/sched/sch_fifo.c
index aa4d6337e43..be33f9ddf9d 100644
--- a/net/sched/sch_fifo.c
+++ b/net/sched/sch_fifo.c
@@ -19,12 +19,11 @@
/* 1 band FIFO pseudo-"scheduler" */
-struct fifo_sched_data
-{
+struct fifo_sched_data {
u32 limit;
};
-static int bfifo_enqueue(struct sk_buff *skb, struct Qdisc* sch)
+static int bfifo_enqueue(struct sk_buff *skb, struct Qdisc *sch)
{
struct fifo_sched_data *q = qdisc_priv(sch);
@@ -34,7 +33,7 @@ static int bfifo_enqueue(struct sk_buff *skb, struct Qdisc* sch)
return qdisc_reshape_fail(skb, sch);
}
-static int pfifo_enqueue(struct sk_buff *skb, struct Qdisc* sch)
+static int pfifo_enqueue(struct sk_buff *skb, struct Qdisc *sch)
{
struct fifo_sched_data *q = qdisc_priv(sch);
@@ -44,19 +43,16 @@ static int pfifo_enqueue(struct sk_buff *skb, struct Qdisc* sch)
return qdisc_reshape_fail(skb, sch);
}
-static int pfifo_tail_enqueue(struct sk_buff *skb, struct Qdisc* sch)
+static int pfifo_tail_enqueue(struct sk_buff *skb, struct Qdisc *sch)
{
- struct sk_buff *skb_head;
struct fifo_sched_data *q = qdisc_priv(sch);
if (likely(skb_queue_len(&sch->q) < q->limit))
return qdisc_enqueue_tail(skb, sch);
/* queue full, remove one skb to fulfill the limit */
- skb_head = qdisc_dequeue_head(sch);
+ __qdisc_queue_drop_head(sch, &sch->q);
sch->qstats.drops++;
- kfree_skb(skb_head);
-
qdisc_enqueue_tail(skb, sch);
return NET_XMIT_CN;
@@ -65,11 +61,13 @@ static int pfifo_tail_enqueue(struct sk_buff *skb, struct Qdisc* sch)
static int fifo_init(struct Qdisc *sch, struct nlattr *opt)
{
struct fifo_sched_data *q = qdisc_priv(sch);
+ bool bypass;
+ bool is_bfifo = sch->ops == &bfifo_qdisc_ops;
if (opt == NULL) {
u32 limit = qdisc_dev(sch)->tx_queue_len ? : 1;
- if (sch->ops == &bfifo_qdisc_ops)
+ if (is_bfifo)
limit *= psched_mtu(qdisc_dev(sch));
q->limit = limit;
@@ -82,6 +80,15 @@ static int fifo_init(struct Qdisc *sch, struct nlattr *opt)
q->limit = ctl->limit;
}
+ if (is_bfifo)
+ bypass = q->limit >= psched_mtu(qdisc_dev(sch));
+ else
+ bypass = q->limit >= 1;
+
+ if (bypass)
+ sch->flags |= TCQ_F_CAN_BYPASS;
+ else
+ sch->flags &= ~TCQ_F_CAN_BYPASS;
return 0;
}
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index 34dc598440a..0da09d50873 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -87,8 +87,8 @@ static inline int handle_dev_cpu_collision(struct sk_buff *skb,
*/
kfree_skb(skb);
if (net_ratelimit())
- printk(KERN_WARNING "Dead loop on netdevice %s, "
- "fix it urgently!\n", dev_queue->dev->name);
+ pr_warning("Dead loop on netdevice %s, fix it urgently!\n",
+ dev_queue->dev->name);
ret = qdisc_qlen(q);
} else {
/*
@@ -137,8 +137,8 @@ int sch_direct_xmit(struct sk_buff *skb, struct Qdisc *q,
} else {
/* Driver returned NETDEV_TX_BUSY - requeue skb */
if (unlikely (ret != NETDEV_TX_BUSY && net_ratelimit()))
- printk(KERN_WARNING "BUG %s code %d qlen %d\n",
- dev->name, ret, q->q.qlen);
+ pr_warning("BUG %s code %d qlen %d\n",
+ dev->name, ret, q->q.qlen);
ret = dev_requeue_skb(skb, q);
}
@@ -412,8 +412,9 @@ static struct Qdisc noqueue_qdisc = {
};
-static const u8 prio2band[TC_PRIO_MAX+1] =
- { 1, 2, 2, 2, 1, 2, 0, 0 , 1, 1, 1, 1, 1, 1, 1, 1 };
+static const u8 prio2band[TC_PRIO_MAX + 1] = {
+ 1, 2, 2, 2, 1, 2, 0, 0 , 1, 1, 1, 1, 1, 1, 1, 1
+};
/* 3-band FIFO queue: old style, but should be a bit faster than
generic prio+fifo combination.
@@ -445,7 +446,7 @@ static inline struct sk_buff_head *band2list(struct pfifo_fast_priv *priv,
return priv->q + band;
}
-static int pfifo_fast_enqueue(struct sk_buff *skb, struct Qdisc* qdisc)
+static int pfifo_fast_enqueue(struct sk_buff *skb, struct Qdisc *qdisc)
{
if (skb_queue_len(&qdisc->q) < qdisc_dev(qdisc)->tx_queue_len) {
int band = prio2band[skb->priority & TC_PRIO_MAX];
@@ -460,7 +461,7 @@ static int pfifo_fast_enqueue(struct sk_buff *skb, struct Qdisc* qdisc)
return qdisc_drop(skb, qdisc);
}
-static struct sk_buff *pfifo_fast_dequeue(struct Qdisc* qdisc)
+static struct sk_buff *pfifo_fast_dequeue(struct Qdisc *qdisc)
{
struct pfifo_fast_priv *priv = qdisc_priv(qdisc);
int band = bitmap2band[priv->bitmap];
@@ -479,7 +480,7 @@ static struct sk_buff *pfifo_fast_dequeue(struct Qdisc* qdisc)
return NULL;
}
-static struct sk_buff *pfifo_fast_peek(struct Qdisc* qdisc)
+static struct sk_buff *pfifo_fast_peek(struct Qdisc *qdisc)
{
struct pfifo_fast_priv *priv = qdisc_priv(qdisc);
int band = bitmap2band[priv->bitmap];
@@ -493,7 +494,7 @@ static struct sk_buff *pfifo_fast_peek(struct Qdisc* qdisc)
return NULL;
}
-static void pfifo_fast_reset(struct Qdisc* qdisc)
+static void pfifo_fast_reset(struct Qdisc *qdisc)
{
int prio;
struct pfifo_fast_priv *priv = qdisc_priv(qdisc);
@@ -510,7 +511,7 @@ static int pfifo_fast_dump(struct Qdisc *qdisc, struct sk_buff *skb)
{
struct tc_prio_qopt opt = { .bands = PFIFO_FAST_BANDS };
- memcpy(&opt.priomap, prio2band, TC_PRIO_MAX+1);
+ memcpy(&opt.priomap, prio2band, TC_PRIO_MAX + 1);
NLA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt);
return skb->len;
@@ -526,6 +527,8 @@ static int pfifo_fast_init(struct Qdisc *qdisc, struct nlattr *opt)
for (prio = 0; prio < PFIFO_FAST_BANDS; prio++)
skb_queue_head_init(band2list(priv, prio));
+ /* Can by-pass the queue discipline */
+ qdisc->flags |= TCQ_F_CAN_BYPASS;
return 0;
}
@@ -540,6 +543,7 @@ struct Qdisc_ops pfifo_fast_ops __read_mostly = {
.dump = pfifo_fast_dump,
.owner = THIS_MODULE,
};
+EXPORT_SYMBOL(pfifo_fast_ops);
struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue,
struct Qdisc_ops *ops)
@@ -630,7 +634,7 @@ void qdisc_destroy(struct Qdisc *qdisc)
#ifdef CONFIG_NET_SCHED
qdisc_list_del(qdisc);
- qdisc_put_stab(qdisc->stab);
+ qdisc_put_stab(rtnl_dereference(qdisc->stab));
#endif
gen_kill_estimator(&qdisc->bstats, &qdisc->rate_est);
if (ops->reset)
@@ -674,25 +678,21 @@ struct Qdisc *dev_graft_qdisc(struct netdev_queue *dev_queue,
return oqdisc;
}
+EXPORT_SYMBOL(dev_graft_qdisc);
static void attach_one_default_qdisc(struct net_device *dev,
struct netdev_queue *dev_queue,
void *_unused)
{
- struct Qdisc *qdisc;
+ struct Qdisc *qdisc = &noqueue_qdisc;
if (dev->tx_queue_len) {
qdisc = qdisc_create_dflt(dev_queue,
&pfifo_fast_ops, TC_H_ROOT);
if (!qdisc) {
- printk(KERN_INFO "%s: activation failed\n", dev->name);
+ netdev_info(dev, "activation failed\n");
return;
}
-
- /* Can by-pass the queue discipline for default qdisc */
- qdisc->flags |= TCQ_F_CAN_BYPASS;
- } else {
- qdisc = &noqueue_qdisc;
}
dev_queue->qdisc_sleeping = qdisc;
}
@@ -761,6 +761,7 @@ void dev_activate(struct net_device *dev)
dev_watchdog_up(dev);
}
}
+EXPORT_SYMBOL(dev_activate);
static void dev_deactivate_queue(struct net_device *dev,
struct netdev_queue *dev_queue,
@@ -840,6 +841,7 @@ void dev_deactivate(struct net_device *dev)
list_add(&dev->unreg_list, &single);
dev_deactivate_many(&single);
}
+EXPORT_SYMBOL(dev_deactivate);
static void dev_init_scheduler_queue(struct net_device *dev,
struct netdev_queue *dev_queue,
diff --git a/net/sched/sch_gred.c b/net/sched/sch_gred.c
index 51dcc2aa5c9..b9493a09a87 100644
--- a/net/sched/sch_gred.c
+++ b/net/sched/sch_gred.c
@@ -32,8 +32,7 @@
struct gred_sched_data;
struct gred_sched;
-struct gred_sched_data
-{
+struct gred_sched_data {
u32 limit; /* HARD maximal queue length */
u32 DP; /* the drop pramaters */
u32 bytesin; /* bytes seen on virtualQ so far*/
@@ -50,8 +49,7 @@ enum {
GRED_RIO_MODE,
};
-struct gred_sched
-{
+struct gred_sched {
struct gred_sched_data *tab[MAX_DPs];
unsigned long flags;
u32 red_flags;
@@ -150,17 +148,18 @@ static inline int gred_use_harddrop(struct gred_sched *t)
return t->red_flags & TC_RED_HARDDROP;
}
-static int gred_enqueue(struct sk_buff *skb, struct Qdisc* sch)
+static int gred_enqueue(struct sk_buff *skb, struct Qdisc *sch)
{
- struct gred_sched_data *q=NULL;
- struct gred_sched *t= qdisc_priv(sch);
+ struct gred_sched_data *q = NULL;
+ struct gred_sched *t = qdisc_priv(sch);
unsigned long qavg = 0;
u16 dp = tc_index_to_dp(skb);
- if (dp >= t->DPs || (q = t->tab[dp]) == NULL) {
+ if (dp >= t->DPs || (q = t->tab[dp]) == NULL) {
dp = t->def;
- if ((q = t->tab[dp]) == NULL) {
+ q = t->tab[dp];
+ if (!q) {
/* Pass through packets not assigned to a DP
* if no default DP has been configured. This
* allows for DP flows to be left untouched.
@@ -183,7 +182,7 @@ static int gred_enqueue(struct sk_buff *skb, struct Qdisc* sch)
for (i = 0; i < t->DPs; i++) {
if (t->tab[i] && t->tab[i]->prio < q->prio &&
!red_is_idling(&t->tab[i]->parms))
- qavg +=t->tab[i]->parms.qavg;
+ qavg += t->tab[i]->parms.qavg;
}
}
@@ -203,28 +202,28 @@ static int gred_enqueue(struct sk_buff *skb, struct Qdisc* sch)
gred_store_wred_set(t, q);
switch (red_action(&q->parms, q->parms.qavg + qavg)) {
- case RED_DONT_MARK:
- break;
-
- case RED_PROB_MARK:
- sch->qstats.overlimits++;
- if (!gred_use_ecn(t) || !INET_ECN_set_ce(skb)) {
- q->stats.prob_drop++;
- goto congestion_drop;
- }
-
- q->stats.prob_mark++;
- break;
-
- case RED_HARD_MARK:
- sch->qstats.overlimits++;
- if (gred_use_harddrop(t) || !gred_use_ecn(t) ||
- !INET_ECN_set_ce(skb)) {
- q->stats.forced_drop++;
- goto congestion_drop;
- }
- q->stats.forced_mark++;
- break;
+ case RED_DONT_MARK:
+ break;
+
+ case RED_PROB_MARK:
+ sch->qstats.overlimits++;
+ if (!gred_use_ecn(t) || !INET_ECN_set_ce(skb)) {
+ q->stats.prob_drop++;
+ goto congestion_drop;
+ }
+
+ q->stats.prob_mark++;
+ break;
+
+ case RED_HARD_MARK:
+ sch->qstats.overlimits++;
+ if (gred_use_harddrop(t) || !gred_use_ecn(t) ||
+ !INET_ECN_set_ce(skb)) {
+ q->stats.forced_drop++;
+ goto congestion_drop;
+ }
+ q->stats.forced_mark++;
+ break;
}
if (q->backlog + qdisc_pkt_len(skb) <= q->limit) {
@@ -241,7 +240,7 @@ congestion_drop:
return NET_XMIT_CN;
}
-static struct sk_buff *gred_dequeue(struct Qdisc* sch)
+static struct sk_buff *gred_dequeue(struct Qdisc *sch)
{
struct sk_buff *skb;
struct gred_sched *t = qdisc_priv(sch);
@@ -254,9 +253,9 @@ static struct sk_buff *gred_dequeue(struct Qdisc* sch)
if (dp >= t->DPs || (q = t->tab[dp]) == NULL) {
if (net_ratelimit())
- printk(KERN_WARNING "GRED: Unable to relocate "
- "VQ 0x%x after dequeue, screwing up "
- "backlog.\n", tc_index_to_dp(skb));
+ pr_warning("GRED: Unable to relocate VQ 0x%x "
+ "after dequeue, screwing up "
+ "backlog.\n", tc_index_to_dp(skb));
} else {
q->backlog -= qdisc_pkt_len(skb);
@@ -273,7 +272,7 @@ static struct sk_buff *gred_dequeue(struct Qdisc* sch)
return NULL;
}
-static unsigned int gred_drop(struct Qdisc* sch)
+static unsigned int gred_drop(struct Qdisc *sch)
{
struct sk_buff *skb;
struct gred_sched *t = qdisc_priv(sch);
@@ -286,9 +285,9 @@ static unsigned int gred_drop(struct Qdisc* sch)
if (dp >= t->DPs || (q = t->tab[dp]) == NULL) {
if (net_ratelimit())
- printk(KERN_WARNING "GRED: Unable to relocate "
- "VQ 0x%x while dropping, screwing up "
- "backlog.\n", tc_index_to_dp(skb));
+ pr_warning("GRED: Unable to relocate VQ 0x%x "
+ "while dropping, screwing up "
+ "backlog.\n", tc_index_to_dp(skb));
} else {
q->backlog -= len;
q->stats.other++;
@@ -308,7 +307,7 @@ static unsigned int gred_drop(struct Qdisc* sch)
}
-static void gred_reset(struct Qdisc* sch)
+static void gred_reset(struct Qdisc *sch)
{
int i;
struct gred_sched *t = qdisc_priv(sch);
@@ -369,8 +368,8 @@ static inline int gred_change_table_def(struct Qdisc *sch, struct nlattr *dps)
for (i = table->DPs; i < MAX_DPs; i++) {
if (table->tab[i]) {
- printk(KERN_WARNING "GRED: Warning: Destroying "
- "shadowed VQ 0x%x\n", i);
+ pr_warning("GRED: Warning: Destroying "
+ "shadowed VQ 0x%x\n", i);
gred_destroy_vq(table->tab[i]);
table->tab[i] = NULL;
}
diff --git a/net/sched/sch_hfsc.c b/net/sched/sch_hfsc.c
index 2e45791d4f6..6488e642565 100644
--- a/net/sched/sch_hfsc.c
+++ b/net/sched/sch_hfsc.c
@@ -81,8 +81,7 @@
* that are expensive on 32-bit architectures.
*/
-struct internal_sc
-{
+struct internal_sc {
u64 sm1; /* scaled slope of the 1st segment */
u64 ism1; /* scaled inverse-slope of the 1st segment */
u64 dx; /* the x-projection of the 1st segment */
@@ -92,8 +91,7 @@ struct internal_sc
};
/* runtime service curve */
-struct runtime_sc
-{
+struct runtime_sc {
u64 x; /* current starting position on x-axis */
u64 y; /* current starting position on y-axis */
u64 sm1; /* scaled slope of the 1st segment */
@@ -104,15 +102,13 @@ struct runtime_sc
u64 ism2; /* scaled inverse-slope of the 2nd segment */
};
-enum hfsc_class_flags
-{
+enum hfsc_class_flags {
HFSC_RSC = 0x1,
HFSC_FSC = 0x2,
HFSC_USC = 0x4
};
-struct hfsc_class
-{
+struct hfsc_class {
struct Qdisc_class_common cl_common;
unsigned int refcnt; /* usage count */
@@ -140,8 +136,8 @@ struct hfsc_class
u64 cl_cumul; /* cumulative work in bytes done by
real-time criteria */
- u64 cl_d; /* deadline*/
- u64 cl_e; /* eligible time */
+ u64 cl_d; /* deadline*/
+ u64 cl_e; /* eligible time */
u64 cl_vt; /* virtual time */
u64 cl_f; /* time when this class will fit for
link-sharing, max(myf, cfmin) */
@@ -176,8 +172,7 @@ struct hfsc_class
unsigned long cl_nactive; /* number of active children */
};
-struct hfsc_sched
-{
+struct hfsc_sched {
u16 defcls; /* default class id */
struct hfsc_class root; /* root class */
struct Qdisc_class_hash clhash; /* class hash */
@@ -693,7 +688,7 @@ init_vf(struct hfsc_class *cl, unsigned int len)
if (go_active) {
n = rb_last(&cl->cl_parent->vt_tree);
if (n != NULL) {
- max_cl = rb_entry(n, struct hfsc_class,vt_node);
+ max_cl = rb_entry(n, struct hfsc_class, vt_node);
/*
* set vt to the average of the min and max
* classes. if the parent's period didn't
@@ -1177,8 +1172,10 @@ hfsc_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr)
return NULL;
}
#endif
- if ((cl = (struct hfsc_class *)res.class) == NULL) {
- if ((cl = hfsc_find_class(res.classid, sch)) == NULL)
+ cl = (struct hfsc_class *)res.class;
+ if (!cl) {
+ cl = hfsc_find_class(res.classid, sch);
+ if (!cl)
break; /* filter selected invalid classid */
if (cl->level >= head->level)
break; /* filter may only point downwards */
@@ -1316,7 +1313,7 @@ hfsc_dump_sc(struct sk_buff *skb, int attr, struct internal_sc *sc)
return -1;
}
-static inline int
+static int
hfsc_dump_curves(struct sk_buff *skb, struct hfsc_class *cl)
{
if ((cl->cl_flags & HFSC_RSC) &&
@@ -1420,7 +1417,8 @@ hfsc_schedule_watchdog(struct Qdisc *sch)
struct hfsc_class *cl;
u64 next_time = 0;
- if ((cl = eltree_get_minel(q)) != NULL)
+ cl = eltree_get_minel(q);
+ if (cl)
next_time = cl->cl_e;
if (q->root.cl_cfmin != 0) {
if (next_time == 0 || next_time > q->root.cl_cfmin)
@@ -1600,7 +1598,6 @@ hfsc_enqueue(struct sk_buff *skb, struct Qdisc *sch)
set_active(cl, qdisc_pkt_len(skb));
bstats_update(&cl->bstats, skb);
- qdisc_bstats_update(sch, skb);
sch->q.qlen++;
return NET_XMIT_SUCCESS;
@@ -1626,7 +1623,8 @@ hfsc_dequeue(struct Qdisc *sch)
* find the class with the minimum deadline among
* the eligible classes.
*/
- if ((cl = eltree_get_mindl(q, cur_time)) != NULL) {
+ cl = eltree_get_mindl(q, cur_time);
+ if (cl) {
realtime = 1;
} else {
/*
@@ -1665,7 +1663,8 @@ hfsc_dequeue(struct Qdisc *sch)
set_passive(cl);
}
- sch->flags &= ~TCQ_F_THROTTLED;
+ qdisc_unthrottled(sch);
+ qdisc_bstats_update(sch, skb);
sch->q.qlen--;
return skb;
diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c
index 984c1b0c683..e1429a85091 100644
--- a/net/sched/sch_htb.c
+++ b/net/sched/sch_htb.c
@@ -99,9 +99,10 @@ struct htb_class {
struct rb_root feed[TC_HTB_NUMPRIO]; /* feed trees */
struct rb_node *ptr[TC_HTB_NUMPRIO]; /* current class ptr */
/* When class changes from state 1->2 and disconnects from
- parent's feed then we lost ptr value and start from the
- first child again. Here we store classid of the
- last valid ptr (used when ptr is NULL). */
+ * parent's feed then we lost ptr value and start from the
+ * first child again. Here we store classid of the
+ * last valid ptr (used when ptr is NULL).
+ */
u32 last_ptr_id[TC_HTB_NUMPRIO];
} inner;
} un;
@@ -185,7 +186,7 @@ static inline struct htb_class *htb_find(u32 handle, struct Qdisc *sch)
* have no valid leaf we try to use MAJOR:default leaf. It still unsuccessfull
* then finish and return direct queue.
*/
-#define HTB_DIRECT (struct htb_class*)-1
+#define HTB_DIRECT ((struct htb_class *)-1L)
static struct htb_class *htb_classify(struct sk_buff *skb, struct Qdisc *sch,
int *qerr)
@@ -197,11 +198,13 @@ static struct htb_class *htb_classify(struct sk_buff *skb, struct Qdisc *sch,
int result;
/* allow to select class by setting skb->priority to valid classid;
- note that nfmark can be used too by attaching filter fw with no
- rules in it */
+ * note that nfmark can be used too by attaching filter fw with no
+ * rules in it
+ */
if (skb->priority == sch->handle)
return HTB_DIRECT; /* X:0 (direct flow) selected */
- if ((cl = htb_find(skb->priority, sch)) != NULL && cl->level == 0)
+ cl = htb_find(skb->priority, sch);
+ if (cl && cl->level == 0)
return cl;
*qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
@@ -216,10 +219,12 @@ static struct htb_class *htb_classify(struct sk_buff *skb, struct Qdisc *sch,
return NULL;
}
#endif
- if ((cl = (void *)res.class) == NULL) {
+ cl = (void *)res.class;
+ if (!cl) {
if (res.classid == sch->handle)
return HTB_DIRECT; /* X:0 (direct flow) */
- if ((cl = htb_find(res.classid, sch)) == NULL)
+ cl = htb_find(res.classid, sch);
+ if (!cl)
break; /* filter selected invalid classid */
}
if (!cl->level)
@@ -378,7 +383,8 @@ static void htb_activate_prios(struct htb_sched *q, struct htb_class *cl)
if (p->un.inner.feed[prio].rb_node)
/* parent already has its feed in use so that
- reset bit in mask as parent is already ok */
+ * reset bit in mask as parent is already ok
+ */
mask &= ~(1 << prio);
htb_add_to_id_tree(p->un.inner.feed + prio, cl, prio);
@@ -413,8 +419,9 @@ static void htb_deactivate_prios(struct htb_sched *q, struct htb_class *cl)
if (p->un.inner.ptr[prio] == cl->node + prio) {
/* we are removing child which is pointed to from
- parent feed - forget the pointer but remember
- classid */
+ * parent feed - forget the pointer but remember
+ * classid
+ */
p->un.inner.last_ptr_id[prio] = cl->common.classid;
p->un.inner.ptr[prio] = NULL;
}
@@ -574,7 +581,6 @@ static int htb_enqueue(struct sk_buff *skb, struct Qdisc *sch)
}
sch->q.qlen++;
- qdisc_bstats_update(sch, skb);
return NET_XMIT_SUCCESS;
}
@@ -664,8 +670,9 @@ static psched_time_t htb_do_events(struct htb_sched *q, int level,
unsigned long start)
{
/* don't run for longer than 2 jiffies; 2 is used instead of
- 1 to simplify things when jiffy is going to be incremented
- too soon */
+ * 1 to simplify things when jiffy is going to be incremented
+ * too soon
+ */
unsigned long stop_at = start + 2;
while (time_before(jiffies, stop_at)) {
struct htb_class *cl;
@@ -688,7 +695,7 @@ static psched_time_t htb_do_events(struct htb_sched *q, int level,
/* too much load - let's continue after a break for scheduling */
if (!(q->warned & HTB_WARN_TOOMANYEVENTS)) {
- printk(KERN_WARNING "htb: too many events!\n");
+ pr_warning("htb: too many events!\n");
q->warned |= HTB_WARN_TOOMANYEVENTS;
}
@@ -696,7 +703,8 @@ static psched_time_t htb_do_events(struct htb_sched *q, int level,
}
/* Returns class->node+prio from id-tree where classe's id is >= id. NULL
- is no such one exists. */
+ * is no such one exists.
+ */
static struct rb_node *htb_id_find_next_upper(int prio, struct rb_node *n,
u32 id)
{
@@ -740,12 +748,14 @@ static struct htb_class *htb_lookup_leaf(struct rb_root *tree, int prio,
for (i = 0; i < 65535; i++) {
if (!*sp->pptr && *sp->pid) {
/* ptr was invalidated but id is valid - try to recover
- the original or next ptr */
+ * the original or next ptr
+ */
*sp->pptr =
htb_id_find_next_upper(prio, sp->root, *sp->pid);
}
*sp->pid = 0; /* ptr is valid now so that remove this hint as it
- can become out of date quickly */
+ * can become out of date quickly
+ */
if (!*sp->pptr) { /* we are at right end; rewind & go up */
*sp->pptr = sp->root;
while ((*sp->pptr)->rb_left)
@@ -773,7 +783,8 @@ static struct htb_class *htb_lookup_leaf(struct rb_root *tree, int prio,
}
/* dequeues packet at given priority and level; call only if
- you are sure that there is active class at prio/level */
+ * you are sure that there is active class at prio/level
+ */
static struct sk_buff *htb_dequeue_tree(struct htb_sched *q, int prio,
int level)
{
@@ -790,9 +801,10 @@ next:
return NULL;
/* class can be empty - it is unlikely but can be true if leaf
- qdisc drops packets in enqueue routine or if someone used
- graft operation on the leaf since last dequeue;
- simply deactivate and skip such class */
+ * qdisc drops packets in enqueue routine or if someone used
+ * graft operation on the leaf since last dequeue;
+ * simply deactivate and skip such class
+ */
if (unlikely(cl->un.leaf.q->q.qlen == 0)) {
struct htb_class *next;
htb_deactivate(q, cl);
@@ -832,7 +844,8 @@ next:
ptr[0]) + prio);
}
/* this used to be after charge_class but this constelation
- gives us slightly better performance */
+ * gives us slightly better performance
+ */
if (!cl->un.leaf.q->q.qlen)
htb_deactivate(q, cl);
htb_charge_class(q, cl, level, skb);
@@ -842,7 +855,7 @@ next:
static struct sk_buff *htb_dequeue(struct Qdisc *sch)
{
- struct sk_buff *skb = NULL;
+ struct sk_buff *skb;
struct htb_sched *q = qdisc_priv(sch);
int level;
psched_time_t next_event;
@@ -851,7 +864,9 @@ static struct sk_buff *htb_dequeue(struct Qdisc *sch)
/* try to dequeue direct packets as high prio (!) to minimize cpu work */
skb = __skb_dequeue(&q->direct_queue);
if (skb != NULL) {
- sch->flags &= ~TCQ_F_THROTTLED;
+ok:
+ qdisc_bstats_update(sch, skb);
+ qdisc_unthrottled(sch);
sch->q.qlen--;
return skb;
}
@@ -882,13 +897,11 @@ static struct sk_buff *htb_dequeue(struct Qdisc *sch)
m = ~q->row_mask[level];
while (m != (int)(-1)) {
int prio = ffz(m);
+
m |= 1 << prio;
skb = htb_dequeue_tree(q, prio, level);
- if (likely(skb != NULL)) {
- sch->q.qlen--;
- sch->flags &= ~TCQ_F_THROTTLED;
- goto fin;
- }
+ if (likely(skb != NULL))
+ goto ok;
}
}
sch->qstats.overlimits++;
@@ -989,13 +1002,12 @@ static int htb_init(struct Qdisc *sch, struct nlattr *opt)
return err;
if (tb[TCA_HTB_INIT] == NULL) {
- printk(KERN_ERR "HTB: hey probably you have bad tc tool ?\n");
+ pr_err("HTB: hey probably you have bad tc tool ?\n");
return -EINVAL;
}
gopt = nla_data(tb[TCA_HTB_INIT]);
if (gopt->version != HTB_VER >> 16) {
- printk(KERN_ERR
- "HTB: need tc/htb version %d (minor is %d), you have %d\n",
+ pr_err("HTB: need tc/htb version %d (minor is %d), you have %d\n",
HTB_VER >> 16, HTB_VER & 0xffff, gopt->version);
return -EINVAL;
}
@@ -1208,9 +1220,10 @@ static void htb_destroy(struct Qdisc *sch)
cancel_work_sync(&q->work);
qdisc_watchdog_cancel(&q->watchdog);
/* This line used to be after htb_destroy_class call below
- and surprisingly it worked in 2.4. But it must precede it
- because filter need its target class alive to be able to call
- unbind_filter on it (without Oops). */
+ * and surprisingly it worked in 2.4. But it must precede it
+ * because filter need its target class alive to be able to call
+ * unbind_filter on it (without Oops).
+ */
tcf_destroy_chain(&q->filter_list);
for (i = 0; i < q->clhash.hashsize; i++) {
@@ -1344,11 +1357,12 @@ static int htb_change_class(struct Qdisc *sch, u32 classid,
/* check maximal depth */
if (parent && parent->parent && parent->parent->level < 2) {
- printk(KERN_ERR "htb: tree is too deep\n");
+ pr_err("htb: tree is too deep\n");
goto failure;
}
err = -ENOBUFS;
- if ((cl = kzalloc(sizeof(*cl), GFP_KERNEL)) == NULL)
+ cl = kzalloc(sizeof(*cl), GFP_KERNEL);
+ if (!cl)
goto failure;
err = gen_new_estimator(&cl->bstats, &cl->rate_est,
@@ -1368,8 +1382,9 @@ static int htb_change_class(struct Qdisc *sch, u32 classid,
RB_CLEAR_NODE(&cl->node[prio]);
/* create leaf qdisc early because it uses kmalloc(GFP_KERNEL)
- so that can't be used inside of sch_tree_lock
- -- thanks to Karlis Peisenieks */
+ * so that can't be used inside of sch_tree_lock
+ * -- thanks to Karlis Peisenieks
+ */
new_q = qdisc_create_dflt(sch->dev_queue,
&pfifo_qdisc_ops, classid);
sch_tree_lock(sch);
@@ -1421,17 +1436,18 @@ static int htb_change_class(struct Qdisc *sch, u32 classid,
}
/* it used to be a nasty bug here, we have to check that node
- is really leaf before changing cl->un.leaf ! */
+ * is really leaf before changing cl->un.leaf !
+ */
if (!cl->level) {
cl->quantum = rtab->rate.rate / q->rate2quantum;
if (!hopt->quantum && cl->quantum < 1000) {
- printk(KERN_WARNING
+ pr_warning(
"HTB: quantum of class %X is small. Consider r2q change.\n",
cl->common.classid);
cl->quantum = 1000;
}
if (!hopt->quantum && cl->quantum > 200000) {
- printk(KERN_WARNING
+ pr_warning(
"HTB: quantum of class %X is big. Consider r2q change.\n",
cl->common.classid);
cl->quantum = 200000;
@@ -1480,13 +1496,13 @@ static unsigned long htb_bind_filter(struct Qdisc *sch, unsigned long parent,
struct htb_class *cl = htb_find(classid, sch);
/*if (cl && !cl->level) return 0;
- The line above used to be there to prevent attaching filters to
- leaves. But at least tc_index filter uses this just to get class
- for other reasons so that we have to allow for it.
- ----
- 19.6.2002 As Werner explained it is ok - bind filter is just
- another way to "lock" the class - unlike "get" this lock can
- be broken by class during destroy IIUC.
+ * The line above used to be there to prevent attaching filters to
+ * leaves. But at least tc_index filter uses this just to get class
+ * for other reasons so that we have to allow for it.
+ * ----
+ * 19.6.2002 As Werner explained it is ok - bind filter is just
+ * another way to "lock" the class - unlike "get" this lock can
+ * be broken by class during destroy IIUC.
*/
if (cl)
cl->filter_cnt++;
diff --git a/net/sched/sch_mq.c b/net/sched/sch_mq.c
index ecc302f4d2a..ec5cbc84896 100644
--- a/net/sched/sch_mq.c
+++ b/net/sched/sch_mq.c
@@ -61,7 +61,6 @@ static int mq_init(struct Qdisc *sch, struct nlattr *opt)
TC_H_MIN(ntx + 1)));
if (qdisc == NULL)
goto err;
- qdisc->flags |= TCQ_F_CAN_BYPASS;
priv->qdiscs[ntx] = qdisc;
}
diff --git a/net/sched/sch_mqprio.c b/net/sched/sch_mqprio.c
new file mode 100644
index 00000000000..ace37f9f1cd
--- /dev/null
+++ b/net/sched/sch_mqprio.c
@@ -0,0 +1,416 @@
+/*
+ * net/sched/sch_mqprio.c
+ *
+ * Copyright (c) 2010 John Fastabend <john.r.fastabend@intel.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * version 2 as published by the Free Software Foundation.
+ */
+
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/errno.h>
+#include <linux/skbuff.h>
+#include <net/netlink.h>
+#include <net/pkt_sched.h>
+#include <net/sch_generic.h>
+
+struct mqprio_sched {
+ struct Qdisc **qdiscs;
+ int hw_owned;
+};
+
+static void mqprio_destroy(struct Qdisc *sch)
+{
+ struct net_device *dev = qdisc_dev(sch);
+ struct mqprio_sched *priv = qdisc_priv(sch);
+ unsigned int ntx;
+
+ if (priv->qdiscs) {
+ for (ntx = 0;
+ ntx < dev->num_tx_queues && priv->qdiscs[ntx];
+ ntx++)
+ qdisc_destroy(priv->qdiscs[ntx]);
+ kfree(priv->qdiscs);
+ }
+
+ if (priv->hw_owned && dev->netdev_ops->ndo_setup_tc)
+ dev->netdev_ops->ndo_setup_tc(dev, 0);
+ else
+ netdev_set_num_tc(dev, 0);
+}
+
+static int mqprio_parse_opt(struct net_device *dev, struct tc_mqprio_qopt *qopt)
+{
+ int i, j;
+
+ /* Verify num_tc is not out of max range */
+ if (qopt->num_tc > TC_MAX_QUEUE)
+ return -EINVAL;
+
+ /* Verify priority mapping uses valid tcs */
+ for (i = 0; i < TC_BITMASK + 1; i++) {
+ if (qopt->prio_tc_map[i] >= qopt->num_tc)
+ return -EINVAL;
+ }
+
+ /* net_device does not support requested operation */
+ if (qopt->hw && !dev->netdev_ops->ndo_setup_tc)
+ return -EINVAL;
+
+ /* if hw owned qcount and qoffset are taken from LLD so
+ * no reason to verify them here
+ */
+ if (qopt->hw)
+ return 0;
+
+ for (i = 0; i < qopt->num_tc; i++) {
+ unsigned int last = qopt->offset[i] + qopt->count[i];
+
+ /* Verify the queue count is in tx range being equal to the
+ * real_num_tx_queues indicates the last queue is in use.
+ */
+ if (qopt->offset[i] >= dev->real_num_tx_queues ||
+ !qopt->count[i] ||
+ last > dev->real_num_tx_queues)
+ return -EINVAL;
+
+ /* Verify that the offset and counts do not overlap */
+ for (j = i + 1; j < qopt->num_tc; j++) {
+ if (last > qopt->offset[j])
+ return -EINVAL;
+ }
+ }
+
+ return 0;
+}
+
+static int mqprio_init(struct Qdisc *sch, struct nlattr *opt)
+{
+ struct net_device *dev = qdisc_dev(sch);
+ struct mqprio_sched *priv = qdisc_priv(sch);
+ struct netdev_queue *dev_queue;
+ struct Qdisc *qdisc;
+ int i, err = -EOPNOTSUPP;
+ struct tc_mqprio_qopt *qopt = NULL;
+
+ BUILD_BUG_ON(TC_MAX_QUEUE != TC_QOPT_MAX_QUEUE);
+ BUILD_BUG_ON(TC_BITMASK != TC_QOPT_BITMASK);
+
+ if (sch->parent != TC_H_ROOT)
+ return -EOPNOTSUPP;
+
+ if (!netif_is_multiqueue(dev))
+ return -EOPNOTSUPP;
+
+ if (nla_len(opt) < sizeof(*qopt))
+ return -EINVAL;
+
+ qopt = nla_data(opt);
+ if (mqprio_parse_opt(dev, qopt))
+ return -EINVAL;
+
+ /* pre-allocate qdisc, attachment can't fail */
+ priv->qdiscs = kcalloc(dev->num_tx_queues, sizeof(priv->qdiscs[0]),
+ GFP_KERNEL);
+ if (priv->qdiscs == NULL) {
+ err = -ENOMEM;
+ goto err;
+ }
+
+ for (i = 0; i < dev->num_tx_queues; i++) {
+ dev_queue = netdev_get_tx_queue(dev, i);
+ qdisc = qdisc_create_dflt(dev_queue, &pfifo_fast_ops,
+ TC_H_MAKE(TC_H_MAJ(sch->handle),
+ TC_H_MIN(i + 1)));
+ if (qdisc == NULL) {
+ err = -ENOMEM;
+ goto err;
+ }
+ priv->qdiscs[i] = qdisc;
+ }
+
+ /* If the mqprio options indicate that hardware should own
+ * the queue mapping then run ndo_setup_tc otherwise use the
+ * supplied and verified mapping
+ */
+ if (qopt->hw) {
+ priv->hw_owned = 1;
+ err = dev->netdev_ops->ndo_setup_tc(dev, qopt->num_tc);
+ if (err)
+ goto err;
+ } else {
+ netdev_set_num_tc(dev, qopt->num_tc);
+ for (i = 0; i < qopt->num_tc; i++)
+ netdev_set_tc_queue(dev, i,
+ qopt->count[i], qopt->offset[i]);
+ }
+
+ /* Always use supplied priority mappings */
+ for (i = 0; i < TC_BITMASK + 1; i++)
+ netdev_set_prio_tc_map(dev, i, qopt->prio_tc_map[i]);
+
+ sch->flags |= TCQ_F_MQROOT;
+ return 0;
+
+err:
+ mqprio_destroy(sch);
+ return err;
+}
+
+static void mqprio_attach(struct Qdisc *sch)
+{
+ struct net_device *dev = qdisc_dev(sch);
+ struct mqprio_sched *priv = qdisc_priv(sch);
+ struct Qdisc *qdisc;
+ unsigned int ntx;
+
+ /* Attach underlying qdisc */
+ for (ntx = 0; ntx < dev->num_tx_queues; ntx++) {
+ qdisc = priv->qdiscs[ntx];
+ qdisc = dev_graft_qdisc(qdisc->dev_queue, qdisc);
+ if (qdisc)
+ qdisc_destroy(qdisc);
+ }
+ kfree(priv->qdiscs);
+ priv->qdiscs = NULL;
+}
+
+static struct netdev_queue *mqprio_queue_get(struct Qdisc *sch,
+ unsigned long cl)
+{
+ struct net_device *dev = qdisc_dev(sch);
+ unsigned long ntx = cl - 1 - netdev_get_num_tc(dev);
+
+ if (ntx >= dev->num_tx_queues)
+ return NULL;
+ return netdev_get_tx_queue(dev, ntx);
+}
+
+static int mqprio_graft(struct Qdisc *sch, unsigned long cl, struct Qdisc *new,
+ struct Qdisc **old)
+{
+ struct net_device *dev = qdisc_dev(sch);
+ struct netdev_queue *dev_queue = mqprio_queue_get(sch, cl);
+
+ if (!dev_queue)
+ return -EINVAL;
+
+ if (dev->flags & IFF_UP)
+ dev_deactivate(dev);
+
+ *old = dev_graft_qdisc(dev_queue, new);
+
+ if (dev->flags & IFF_UP)
+ dev_activate(dev);
+
+ return 0;
+}
+
+static int mqprio_dump(struct Qdisc *sch, struct sk_buff *skb)
+{
+ struct net_device *dev = qdisc_dev(sch);
+ struct mqprio_sched *priv = qdisc_priv(sch);
+ unsigned char *b = skb_tail_pointer(skb);
+ struct tc_mqprio_qopt opt = { 0 };
+ struct Qdisc *qdisc;
+ unsigned int i;
+
+ sch->q.qlen = 0;
+ memset(&sch->bstats, 0, sizeof(sch->bstats));
+ memset(&sch->qstats, 0, sizeof(sch->qstats));
+
+ for (i = 0; i < dev->num_tx_queues; i++) {
+ qdisc = netdev_get_tx_queue(dev, i)->qdisc;
+ spin_lock_bh(qdisc_lock(qdisc));
+ sch->q.qlen += qdisc->q.qlen;
+ sch->bstats.bytes += qdisc->bstats.bytes;
+ sch->bstats.packets += qdisc->bstats.packets;
+ sch->qstats.qlen += qdisc->qstats.qlen;
+ sch->qstats.backlog += qdisc->qstats.backlog;
+ sch->qstats.drops += qdisc->qstats.drops;
+ sch->qstats.requeues += qdisc->qstats.requeues;
+ sch->qstats.overlimits += qdisc->qstats.overlimits;
+ spin_unlock_bh(qdisc_lock(qdisc));
+ }
+
+ opt.num_tc = netdev_get_num_tc(dev);
+ memcpy(opt.prio_tc_map, dev->prio_tc_map, sizeof(opt.prio_tc_map));
+ opt.hw = priv->hw_owned;
+
+ for (i = 0; i < netdev_get_num_tc(dev); i++) {
+ opt.count[i] = dev->tc_to_txq[i].count;
+ opt.offset[i] = dev->tc_to_txq[i].offset;
+ }
+
+ NLA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt);
+
+ return skb->len;
+nla_put_failure:
+ nlmsg_trim(skb, b);
+ return -1;
+}
+
+static struct Qdisc *mqprio_leaf(struct Qdisc *sch, unsigned long cl)
+{
+ struct netdev_queue *dev_queue = mqprio_queue_get(sch, cl);
+
+ if (!dev_queue)
+ return NULL;
+
+ return dev_queue->qdisc_sleeping;
+}
+
+static unsigned long mqprio_get(struct Qdisc *sch, u32 classid)
+{
+ struct net_device *dev = qdisc_dev(sch);
+ unsigned int ntx = TC_H_MIN(classid);
+
+ if (ntx > dev->num_tx_queues + netdev_get_num_tc(dev))
+ return 0;
+ return ntx;
+}
+
+static void mqprio_put(struct Qdisc *sch, unsigned long cl)
+{
+}
+
+static int mqprio_dump_class(struct Qdisc *sch, unsigned long cl,
+ struct sk_buff *skb, struct tcmsg *tcm)
+{
+ struct net_device *dev = qdisc_dev(sch);
+
+ if (cl <= netdev_get_num_tc(dev)) {
+ tcm->tcm_parent = TC_H_ROOT;
+ tcm->tcm_info = 0;
+ } else {
+ int i;
+ struct netdev_queue *dev_queue;
+
+ dev_queue = mqprio_queue_get(sch, cl);
+ tcm->tcm_parent = 0;
+ for (i = 0; i < netdev_get_num_tc(dev); i++) {
+ struct netdev_tc_txq tc = dev->tc_to_txq[i];
+ int q_idx = cl - netdev_get_num_tc(dev);
+
+ if (q_idx > tc.offset &&
+ q_idx <= tc.offset + tc.count) {
+ tcm->tcm_parent =
+ TC_H_MAKE(TC_H_MAJ(sch->handle),
+ TC_H_MIN(i + 1));
+ break;
+ }
+ }
+ tcm->tcm_info = dev_queue->qdisc_sleeping->handle;
+ }
+ tcm->tcm_handle |= TC_H_MIN(cl);
+ return 0;
+}
+
+static int mqprio_dump_class_stats(struct Qdisc *sch, unsigned long cl,
+ struct gnet_dump *d)
+{
+ struct net_device *dev = qdisc_dev(sch);
+
+ if (cl <= netdev_get_num_tc(dev)) {
+ int i;
+ struct Qdisc *qdisc;
+ struct gnet_stats_queue qstats = {0};
+ struct gnet_stats_basic_packed bstats = {0};
+ struct netdev_tc_txq tc = dev->tc_to_txq[cl - 1];
+
+ /* Drop lock here it will be reclaimed before touching
+ * statistics this is required because the d->lock we
+ * hold here is the look on dev_queue->qdisc_sleeping
+ * also acquired below.
+ */
+ spin_unlock_bh(d->lock);
+
+ for (i = tc.offset; i < tc.offset + tc.count; i++) {
+ qdisc = netdev_get_tx_queue(dev, i)->qdisc;
+ spin_lock_bh(qdisc_lock(qdisc));
+ bstats.bytes += qdisc->bstats.bytes;
+ bstats.packets += qdisc->bstats.packets;
+ qstats.qlen += qdisc->qstats.qlen;
+ qstats.backlog += qdisc->qstats.backlog;
+ qstats.drops += qdisc->qstats.drops;
+ qstats.requeues += qdisc->qstats.requeues;
+ qstats.overlimits += qdisc->qstats.overlimits;
+ spin_unlock_bh(qdisc_lock(qdisc));
+ }
+ /* Reclaim root sleeping lock before completing stats */
+ spin_lock_bh(d->lock);
+ if (gnet_stats_copy_basic(d, &bstats) < 0 ||
+ gnet_stats_copy_queue(d, &qstats) < 0)
+ return -1;
+ } else {
+ struct netdev_queue *dev_queue = mqprio_queue_get(sch, cl);
+
+ sch = dev_queue->qdisc_sleeping;
+ sch->qstats.qlen = sch->q.qlen;
+ if (gnet_stats_copy_basic(d, &sch->bstats) < 0 ||
+ gnet_stats_copy_queue(d, &sch->qstats) < 0)
+ return -1;
+ }
+ return 0;
+}
+
+static void mqprio_walk(struct Qdisc *sch, struct qdisc_walker *arg)
+{
+ struct net_device *dev = qdisc_dev(sch);
+ unsigned long ntx;
+
+ if (arg->stop)
+ return;
+
+ /* Walk hierarchy with a virtual class per tc */
+ arg->count = arg->skip;
+ for (ntx = arg->skip;
+ ntx < dev->num_tx_queues + netdev_get_num_tc(dev);
+ ntx++) {
+ if (arg->fn(sch, ntx + 1, arg) < 0) {
+ arg->stop = 1;
+ break;
+ }
+ arg->count++;
+ }
+}
+
+static const struct Qdisc_class_ops mqprio_class_ops = {
+ .graft = mqprio_graft,
+ .leaf = mqprio_leaf,
+ .get = mqprio_get,
+ .put = mqprio_put,
+ .walk = mqprio_walk,
+ .dump = mqprio_dump_class,
+ .dump_stats = mqprio_dump_class_stats,
+};
+
+struct Qdisc_ops mqprio_qdisc_ops __read_mostly = {
+ .cl_ops = &mqprio_class_ops,
+ .id = "mqprio",
+ .priv_size = sizeof(struct mqprio_sched),
+ .init = mqprio_init,
+ .destroy = mqprio_destroy,
+ .attach = mqprio_attach,
+ .dump = mqprio_dump,
+ .owner = THIS_MODULE,
+};
+
+static int __init mqprio_module_init(void)
+{
+ return register_qdisc(&mqprio_qdisc_ops);
+}
+
+static void __exit mqprio_module_exit(void)
+{
+ unregister_qdisc(&mqprio_qdisc_ops);
+}
+
+module_init(mqprio_module_init);
+module_exit(mqprio_module_exit);
+
+MODULE_LICENSE("GPL");
diff --git a/net/sched/sch_multiq.c b/net/sched/sch_multiq.c
index 21f13da2476..edc1950e0e7 100644
--- a/net/sched/sch_multiq.c
+++ b/net/sched/sch_multiq.c
@@ -83,7 +83,6 @@ multiq_enqueue(struct sk_buff *skb, struct Qdisc *sch)
ret = qdisc_enqueue(skb, qdisc);
if (ret == NET_XMIT_SUCCESS) {
- qdisc_bstats_update(sch, skb);
sch->q.qlen++;
return NET_XMIT_SUCCESS;
}
@@ -112,6 +111,7 @@ static struct sk_buff *multiq_dequeue(struct Qdisc *sch)
qdisc = q->queues[q->curband];
skb = qdisc->dequeue(qdisc);
if (skb) {
+ qdisc_bstats_update(sch, skb);
sch->q.qlen--;
return skb;
}
@@ -156,7 +156,7 @@ static unsigned int multiq_drop(struct Qdisc *sch)
unsigned int len;
struct Qdisc *qdisc;
- for (band = q->bands-1; band >= 0; band--) {
+ for (band = q->bands - 1; band >= 0; band--) {
qdisc = q->queues[band];
if (qdisc->ops->drop) {
len = qdisc->ops->drop(qdisc);
@@ -265,7 +265,7 @@ static int multiq_init(struct Qdisc *sch, struct nlattr *opt)
for (i = 0; i < q->max_bands; i++)
q->queues[i] = &noop_qdisc;
- err = multiq_tune(sch,opt);
+ err = multiq_tune(sch, opt);
if (err)
kfree(q->queues);
@@ -346,7 +346,7 @@ static int multiq_dump_class(struct Qdisc *sch, unsigned long cl,
struct multiq_sched_data *q = qdisc_priv(sch);
tcm->tcm_handle |= TC_H_MIN(cl);
- tcm->tcm_info = q->queues[cl-1]->handle;
+ tcm->tcm_info = q->queues[cl - 1]->handle;
return 0;
}
@@ -378,7 +378,7 @@ static void multiq_walk(struct Qdisc *sch, struct qdisc_walker *arg)
arg->count++;
continue;
}
- if (arg->fn(sch, band+1, arg) < 0) {
+ if (arg->fn(sch, band + 1, arg) < 0) {
arg->stop = 1;
break;
}
diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c
index 1c4bce86347..64f0d3293b4 100644
--- a/net/sched/sch_netem.c
+++ b/net/sched/sch_netem.c
@@ -211,8 +211,8 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch)
}
cb = netem_skb_cb(skb);
- if (q->gap == 0 || /* not doing reordering */
- q->counter < q->gap || /* inside last reordering gap */
+ if (q->gap == 0 || /* not doing reordering */
+ q->counter < q->gap || /* inside last reordering gap */
q->reorder < get_crandom(&q->reorder_cor)) {
psched_time_t now;
psched_tdiff_t delay;
@@ -240,7 +240,6 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch)
if (likely(ret == NET_XMIT_SUCCESS)) {
sch->q.qlen++;
- qdisc_bstats_update(sch, skb);
} else if (net_xmit_drop_count(ret)) {
sch->qstats.drops++;
}
@@ -249,7 +248,7 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch)
return ret;
}
-static unsigned int netem_drop(struct Qdisc* sch)
+static unsigned int netem_drop(struct Qdisc *sch)
{
struct netem_sched_data *q = qdisc_priv(sch);
unsigned int len = 0;
@@ -266,7 +265,7 @@ static struct sk_buff *netem_dequeue(struct Qdisc *sch)
struct netem_sched_data *q = qdisc_priv(sch);
struct sk_buff *skb;
- if (sch->flags & TCQ_F_THROTTLED)
+ if (qdisc_is_throttled(sch))
return NULL;
skb = q->qdisc->ops->peek(q->qdisc);
@@ -289,6 +288,7 @@ static struct sk_buff *netem_dequeue(struct Qdisc *sch)
skb->tstamp.tv64 = 0;
#endif
pr_debug("netem_dequeue: return skb=%p\n", skb);
+ qdisc_bstats_update(sch, skb);
sch->q.qlen--;
return skb;
}
@@ -476,7 +476,6 @@ static int tfifo_enqueue(struct sk_buff *nskb, struct Qdisc *sch)
__skb_queue_after(list, skb, nskb);
sch->qstats.backlog += qdisc_pkt_len(nskb);
- qdisc_bstats_update(sch, nskb);
return NET_XMIT_SUCCESS;
}
diff --git a/net/sched/sch_prio.c b/net/sched/sch_prio.c
index 966158d49dd..2a318f2dc3e 100644
--- a/net/sched/sch_prio.c
+++ b/net/sched/sch_prio.c
@@ -22,8 +22,7 @@
#include <net/pkt_sched.h>
-struct prio_sched_data
-{
+struct prio_sched_data {
int bands;
struct tcf_proto *filter_list;
u8 prio2band[TC_PRIO_MAX+1];
@@ -54,7 +53,7 @@ prio_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr)
if (!q->filter_list || err < 0) {
if (TC_H_MAJ(band))
band = 0;
- return q->queues[q->prio2band[band&TC_PRIO_MAX]];
+ return q->queues[q->prio2band[band & TC_PRIO_MAX]];
}
band = res.classid;
}
@@ -84,7 +83,6 @@ prio_enqueue(struct sk_buff *skb, struct Qdisc *sch)
ret = qdisc_enqueue(skb, qdisc);
if (ret == NET_XMIT_SUCCESS) {
- qdisc_bstats_update(sch, skb);
sch->q.qlen++;
return NET_XMIT_SUCCESS;
}
@@ -107,7 +105,7 @@ static struct sk_buff *prio_peek(struct Qdisc *sch)
return NULL;
}
-static struct sk_buff *prio_dequeue(struct Qdisc* sch)
+static struct sk_buff *prio_dequeue(struct Qdisc *sch)
{
struct prio_sched_data *q = qdisc_priv(sch);
int prio;
@@ -116,6 +114,7 @@ static struct sk_buff *prio_dequeue(struct Qdisc* sch)
struct Qdisc *qdisc = q->queues[prio];
struct sk_buff *skb = qdisc->dequeue(qdisc);
if (skb) {
+ qdisc_bstats_update(sch, skb);
sch->q.qlen--;
return skb;
}
@@ -124,7 +123,7 @@ static struct sk_buff *prio_dequeue(struct Qdisc* sch)
}
-static unsigned int prio_drop(struct Qdisc* sch)
+static unsigned int prio_drop(struct Qdisc *sch)
{
struct prio_sched_data *q = qdisc_priv(sch);
int prio;
@@ -143,24 +142,24 @@ static unsigned int prio_drop(struct Qdisc* sch)
static void
-prio_reset(struct Qdisc* sch)
+prio_reset(struct Qdisc *sch)
{
int prio;
struct prio_sched_data *q = qdisc_priv(sch);
- for (prio=0; prio<q->bands; prio++)
+ for (prio = 0; prio < q->bands; prio++)
qdisc_reset(q->queues[prio]);
sch->q.qlen = 0;
}
static void
-prio_destroy(struct Qdisc* sch)
+prio_destroy(struct Qdisc *sch)
{
int prio;
struct prio_sched_data *q = qdisc_priv(sch);
tcf_destroy_chain(&q->filter_list);
- for (prio=0; prio<q->bands; prio++)
+ for (prio = 0; prio < q->bands; prio++)
qdisc_destroy(q->queues[prio]);
}
@@ -177,7 +176,7 @@ static int prio_tune(struct Qdisc *sch, struct nlattr *opt)
if (qopt->bands > TCQ_PRIO_BANDS || qopt->bands < 2)
return -EINVAL;
- for (i=0; i<=TC_PRIO_MAX; i++) {
+ for (i = 0; i <= TC_PRIO_MAX; i++) {
if (qopt->priomap[i] >= qopt->bands)
return -EINVAL;
}
@@ -186,7 +185,7 @@ static int prio_tune(struct Qdisc *sch, struct nlattr *opt)
q->bands = qopt->bands;
memcpy(q->prio2band, qopt->priomap, TC_PRIO_MAX+1);
- for (i=q->bands; i<TCQ_PRIO_BANDS; i++) {
+ for (i = q->bands; i < TCQ_PRIO_BANDS; i++) {
struct Qdisc *child = q->queues[i];
q->queues[i] = &noop_qdisc;
if (child != &noop_qdisc) {
@@ -196,9 +195,10 @@ static int prio_tune(struct Qdisc *sch, struct nlattr *opt)
}
sch_tree_unlock(sch);
- for (i=0; i<q->bands; i++) {
+ for (i = 0; i < q->bands; i++) {
if (q->queues[i] == &noop_qdisc) {
struct Qdisc *child, *old;
+
child = qdisc_create_dflt(sch->dev_queue,
&pfifo_qdisc_ops,
TC_H_MAKE(sch->handle, i + 1));
@@ -224,7 +224,7 @@ static int prio_init(struct Qdisc *sch, struct nlattr *opt)
struct prio_sched_data *q = qdisc_priv(sch);
int i;
- for (i=0; i<TCQ_PRIO_BANDS; i++)
+ for (i = 0; i < TCQ_PRIO_BANDS; i++)
q->queues[i] = &noop_qdisc;
if (opt == NULL) {
@@ -232,7 +232,7 @@ static int prio_init(struct Qdisc *sch, struct nlattr *opt)
} else {
int err;
- if ((err= prio_tune(sch, opt)) != 0)
+ if ((err = prio_tune(sch, opt)) != 0)
return err;
}
return 0;
@@ -245,7 +245,7 @@ static int prio_dump(struct Qdisc *sch, struct sk_buff *skb)
struct tc_prio_qopt opt;
opt.bands = q->bands;
- memcpy(&opt.priomap, q->prio2band, TC_PRIO_MAX+1);
+ memcpy(&opt.priomap, q->prio2band, TC_PRIO_MAX + 1);
NLA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt);
@@ -342,7 +342,7 @@ static void prio_walk(struct Qdisc *sch, struct qdisc_walker *arg)
arg->count++;
continue;
}
- if (arg->fn(sch, prio+1, arg) < 0) {
+ if (arg->fn(sch, prio + 1, arg) < 0) {
arg->stop = 1;
break;
}
@@ -350,7 +350,7 @@ static void prio_walk(struct Qdisc *sch, struct qdisc_walker *arg)
}
}
-static struct tcf_proto ** prio_find_tcf(struct Qdisc *sch, unsigned long cl)
+static struct tcf_proto **prio_find_tcf(struct Qdisc *sch, unsigned long cl)
{
struct prio_sched_data *q = qdisc_priv(sch);
diff --git a/net/sched/sch_red.c b/net/sched/sch_red.c
index a6009c5a2c9..6649463da1b 100644
--- a/net/sched/sch_red.c
+++ b/net/sched/sch_red.c
@@ -36,8 +36,7 @@
if RED works correctly.
*/
-struct red_sched_data
-{
+struct red_sched_data {
u32 limit; /* HARD maximal queue length */
unsigned char flags;
struct red_parms parms;
@@ -55,7 +54,7 @@ static inline int red_use_harddrop(struct red_sched_data *q)
return q->flags & TC_RED_HARDDROP;
}
-static int red_enqueue(struct sk_buff *skb, struct Qdisc* sch)
+static int red_enqueue(struct sk_buff *skb, struct Qdisc *sch)
{
struct red_sched_data *q = qdisc_priv(sch);
struct Qdisc *child = q->qdisc;
@@ -67,34 +66,33 @@ static int red_enqueue(struct sk_buff *skb, struct Qdisc* sch)
red_end_of_idle_period(&q->parms);
switch (red_action(&q->parms, q->parms.qavg)) {
- case RED_DONT_MARK:
- break;
-
- case RED_PROB_MARK:
- sch->qstats.overlimits++;
- if (!red_use_ecn(q) || !INET_ECN_set_ce(skb)) {
- q->stats.prob_drop++;
- goto congestion_drop;
- }
-
- q->stats.prob_mark++;
- break;
-
- case RED_HARD_MARK:
- sch->qstats.overlimits++;
- if (red_use_harddrop(q) || !red_use_ecn(q) ||
- !INET_ECN_set_ce(skb)) {
- q->stats.forced_drop++;
- goto congestion_drop;
- }
-
- q->stats.forced_mark++;
- break;
+ case RED_DONT_MARK:
+ break;
+
+ case RED_PROB_MARK:
+ sch->qstats.overlimits++;
+ if (!red_use_ecn(q) || !INET_ECN_set_ce(skb)) {
+ q->stats.prob_drop++;
+ goto congestion_drop;
+ }
+
+ q->stats.prob_mark++;
+ break;
+
+ case RED_HARD_MARK:
+ sch->qstats.overlimits++;
+ if (red_use_harddrop(q) || !red_use_ecn(q) ||
+ !INET_ECN_set_ce(skb)) {
+ q->stats.forced_drop++;
+ goto congestion_drop;
+ }
+
+ q->stats.forced_mark++;
+ break;
}
ret = qdisc_enqueue(skb, child);
if (likely(ret == NET_XMIT_SUCCESS)) {
- qdisc_bstats_update(sch, skb);
sch->q.qlen++;
} else if (net_xmit_drop_count(ret)) {
q->stats.pdrop++;
@@ -107,22 +105,24 @@ congestion_drop:
return NET_XMIT_CN;
}
-static struct sk_buff * red_dequeue(struct Qdisc* sch)
+static struct sk_buff *red_dequeue(struct Qdisc *sch)
{
struct sk_buff *skb;
struct red_sched_data *q = qdisc_priv(sch);
struct Qdisc *child = q->qdisc;
skb = child->dequeue(child);
- if (skb)
+ if (skb) {
+ qdisc_bstats_update(sch, skb);
sch->q.qlen--;
- else if (!red_is_idling(&q->parms))
- red_start_of_idle_period(&q->parms);
-
+ } else {
+ if (!red_is_idling(&q->parms))
+ red_start_of_idle_period(&q->parms);
+ }
return skb;
}
-static struct sk_buff * red_peek(struct Qdisc* sch)
+static struct sk_buff *red_peek(struct Qdisc *sch)
{
struct red_sched_data *q = qdisc_priv(sch);
struct Qdisc *child = q->qdisc;
@@ -130,7 +130,7 @@ static struct sk_buff * red_peek(struct Qdisc* sch)
return child->ops->peek(child);
}
-static unsigned int red_drop(struct Qdisc* sch)
+static unsigned int red_drop(struct Qdisc *sch)
{
struct red_sched_data *q = qdisc_priv(sch);
struct Qdisc *child = q->qdisc;
@@ -149,7 +149,7 @@ static unsigned int red_drop(struct Qdisc* sch)
return 0;
}
-static void red_reset(struct Qdisc* sch)
+static void red_reset(struct Qdisc *sch)
{
struct red_sched_data *q = qdisc_priv(sch);
@@ -216,7 +216,7 @@ static int red_change(struct Qdisc *sch, struct nlattr *opt)
return 0;
}
-static int red_init(struct Qdisc* sch, struct nlattr *opt)
+static int red_init(struct Qdisc *sch, struct nlattr *opt)
{
struct red_sched_data *q = qdisc_priv(sch);
diff --git a/net/sched/sch_sfq.c b/net/sched/sch_sfq.c
index 239ec53a634..c2e628dfaac 100644
--- a/net/sched/sch_sfq.c
+++ b/net/sched/sch_sfq.c
@@ -21,6 +21,7 @@
#include <linux/skbuff.h>
#include <linux/jhash.h>
#include <linux/slab.h>
+#include <linux/vmalloc.h>
#include <net/ip.h>
#include <net/netlink.h>
#include <net/pkt_sched.h>
@@ -76,7 +77,8 @@
#define SFQ_DEPTH 128 /* max number of packets per flow */
#define SFQ_SLOTS 128 /* max number of flows */
#define SFQ_EMPTY_SLOT 255
-#define SFQ_HASH_DIVISOR 1024
+#define SFQ_DEFAULT_HASH_DIVISOR 1024
+
/* We use 16 bits to store allot, and want to handle packets up to 64K
* Scale allot by 8 (1<<3) so that no overflow occurs.
*/
@@ -92,8 +94,7 @@ typedef unsigned char sfq_index;
* while following values [SFQ_SLOTS ... SFQ_SLOTS + SFQ_DEPTH - 1]
* are 'pointers' to dep[] array
*/
-struct sfq_head
-{
+struct sfq_head {
sfq_index next;
sfq_index prev;
};
@@ -108,13 +109,12 @@ struct sfq_slot {
short allot; /* credit for this slot */
};
-struct sfq_sched_data
-{
+struct sfq_sched_data {
/* Parameters */
int perturb_period;
- unsigned quantum; /* Allotment per round: MUST BE >= MTU */
+ unsigned int quantum; /* Allotment per round: MUST BE >= MTU */
int limit;
-
+ unsigned int divisor; /* number of slots in hash table */
/* Variables */
struct tcf_proto *filter_list;
struct timer_list perturb_timer;
@@ -122,7 +122,7 @@ struct sfq_sched_data
sfq_index cur_depth; /* depth of longest slot */
unsigned short scaled_quantum; /* SFQ_ALLOT_SIZE(quantum) */
struct sfq_slot *tail; /* current slot in round */
- sfq_index ht[SFQ_HASH_DIVISOR]; /* Hash table */
+ sfq_index *ht; /* Hash table (divisor slots) */
struct sfq_slot slots[SFQ_SLOTS];
struct sfq_head dep[SFQ_DEPTH]; /* Linked list of slots, indexed by depth */
};
@@ -137,12 +137,12 @@ static inline struct sfq_head *sfq_dep_head(struct sfq_sched_data *q, sfq_index
return &q->dep[val - SFQ_SLOTS];
}
-static __inline__ unsigned sfq_fold_hash(struct sfq_sched_data *q, u32 h, u32 h1)
+static unsigned int sfq_fold_hash(struct sfq_sched_data *q, u32 h, u32 h1)
{
- return jhash_2words(h, h1, q->perturbation) & (SFQ_HASH_DIVISOR - 1);
+ return jhash_2words(h, h1, q->perturbation) & (q->divisor - 1);
}
-static unsigned sfq_hash(struct sfq_sched_data *q, struct sk_buff *skb)
+static unsigned int sfq_hash(struct sfq_sched_data *q, struct sk_buff *skb)
{
u32 h, h2;
@@ -157,13 +157,13 @@ static unsigned sfq_hash(struct sfq_sched_data *q, struct sk_buff *skb)
iph = ip_hdr(skb);
h = (__force u32)iph->daddr;
h2 = (__force u32)iph->saddr ^ iph->protocol;
- if (iph->frag_off & htons(IP_MF|IP_OFFSET))
+ if (iph->frag_off & htons(IP_MF | IP_OFFSET))
break;
poff = proto_ports_offset(iph->protocol);
if (poff >= 0 &&
pskb_network_may_pull(skb, iph->ihl * 4 + 4 + poff)) {
iph = ip_hdr(skb);
- h2 ^= *(u32*)((void *)iph + iph->ihl * 4 + poff);
+ h2 ^= *(u32 *)((void *)iph + iph->ihl * 4 + poff);
}
break;
}
@@ -181,7 +181,7 @@ static unsigned sfq_hash(struct sfq_sched_data *q, struct sk_buff *skb)
if (poff >= 0 &&
pskb_network_may_pull(skb, sizeof(*iph) + 4 + poff)) {
iph = ipv6_hdr(skb);
- h2 ^= *(u32*)((void *)iph + sizeof(*iph) + poff);
+ h2 ^= *(u32 *)((void *)iph + sizeof(*iph) + poff);
}
break;
}
@@ -203,7 +203,7 @@ static unsigned int sfq_classify(struct sk_buff *skb, struct Qdisc *sch,
if (TC_H_MAJ(skb->priority) == sch->handle &&
TC_H_MIN(skb->priority) > 0 &&
- TC_H_MIN(skb->priority) <= SFQ_HASH_DIVISOR)
+ TC_H_MIN(skb->priority) <= q->divisor)
return TC_H_MIN(skb->priority);
if (!q->filter_list)
@@ -221,7 +221,7 @@ static unsigned int sfq_classify(struct sk_buff *skb, struct Qdisc *sch,
return 0;
}
#endif
- if (TC_H_MIN(res.classid) <= SFQ_HASH_DIVISOR)
+ if (TC_H_MIN(res.classid) <= q->divisor)
return TC_H_MIN(res.classid);
}
return 0;
@@ -402,10 +402,8 @@ sfq_enqueue(struct sk_buff *skb, struct Qdisc *sch)
q->tail = slot;
slot->allot = q->scaled_quantum;
}
- if (++sch->q.qlen <= q->limit) {
- qdisc_bstats_update(sch, skb);
+ if (++sch->q.qlen <= q->limit)
return NET_XMIT_SUCCESS;
- }
sfq_drop(sch);
return NET_XMIT_CN;
@@ -445,6 +443,7 @@ next_slot:
}
skb = slot_dequeue_head(slot);
sfq_dec(q, a);
+ qdisc_bstats_update(sch, skb);
sch->q.qlen--;
sch->qstats.backlog -= qdisc_pkt_len(skb);
@@ -492,13 +491,18 @@ static int sfq_change(struct Qdisc *sch, struct nlattr *opt)
if (opt->nla_len < nla_attr_size(sizeof(*ctl)))
return -EINVAL;
+ if (ctl->divisor &&
+ (!is_power_of_2(ctl->divisor) || ctl->divisor > 65536))
+ return -EINVAL;
+
sch_tree_lock(sch);
q->quantum = ctl->quantum ? : psched_mtu(qdisc_dev(sch));
q->scaled_quantum = SFQ_ALLOT_SIZE(q->quantum);
q->perturb_period = ctl->perturb_period * HZ;
if (ctl->limit)
q->limit = min_t(u32, ctl->limit, SFQ_DEPTH - 1);
-
+ if (ctl->divisor)
+ q->divisor = ctl->divisor;
qlen = sch->q.qlen;
while (sch->q.qlen > q->limit)
sfq_drop(sch);
@@ -516,15 +520,13 @@ static int sfq_change(struct Qdisc *sch, struct nlattr *opt)
static int sfq_init(struct Qdisc *sch, struct nlattr *opt)
{
struct sfq_sched_data *q = qdisc_priv(sch);
+ size_t sz;
int i;
q->perturb_timer.function = sfq_perturbation;
q->perturb_timer.data = (unsigned long)sch;
init_timer_deferrable(&q->perturb_timer);
- for (i = 0; i < SFQ_HASH_DIVISOR; i++)
- q->ht[i] = SFQ_EMPTY_SLOT;
-
for (i = 0; i < SFQ_DEPTH; i++) {
q->dep[i].next = i + SFQ_SLOTS;
q->dep[i].prev = i + SFQ_SLOTS;
@@ -533,6 +535,7 @@ static int sfq_init(struct Qdisc *sch, struct nlattr *opt)
q->limit = SFQ_DEPTH - 1;
q->cur_depth = 0;
q->tail = NULL;
+ q->divisor = SFQ_DEFAULT_HASH_DIVISOR;
if (opt == NULL) {
q->quantum = psched_mtu(qdisc_dev(sch));
q->scaled_quantum = SFQ_ALLOT_SIZE(q->quantum);
@@ -544,10 +547,23 @@ static int sfq_init(struct Qdisc *sch, struct nlattr *opt)
return err;
}
+ sz = sizeof(q->ht[0]) * q->divisor;
+ q->ht = kmalloc(sz, GFP_KERNEL);
+ if (!q->ht && sz > PAGE_SIZE)
+ q->ht = vmalloc(sz);
+ if (!q->ht)
+ return -ENOMEM;
+ for (i = 0; i < q->divisor; i++)
+ q->ht[i] = SFQ_EMPTY_SLOT;
+
for (i = 0; i < SFQ_SLOTS; i++) {
slot_queue_init(&q->slots[i]);
sfq_link(q, i);
}
+ if (q->limit >= 1)
+ sch->flags |= TCQ_F_CAN_BYPASS;
+ else
+ sch->flags &= ~TCQ_F_CAN_BYPASS;
return 0;
}
@@ -558,6 +574,10 @@ static void sfq_destroy(struct Qdisc *sch)
tcf_destroy_chain(&q->filter_list);
q->perturb_period = 0;
del_timer_sync(&q->perturb_timer);
+ if (is_vmalloc_addr(q->ht))
+ vfree(q->ht);
+ else
+ kfree(q->ht);
}
static int sfq_dump(struct Qdisc *sch, struct sk_buff *skb)
@@ -570,7 +590,7 @@ static int sfq_dump(struct Qdisc *sch, struct sk_buff *skb)
opt.perturb_period = q->perturb_period / HZ;
opt.limit = q->limit;
- opt.divisor = SFQ_HASH_DIVISOR;
+ opt.divisor = q->divisor;
opt.flows = q->limit;
NLA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt);
@@ -595,6 +615,8 @@ static unsigned long sfq_get(struct Qdisc *sch, u32 classid)
static unsigned long sfq_bind(struct Qdisc *sch, unsigned long parent,
u32 classid)
{
+ /* we cannot bypass queue discipline anymore */
+ sch->flags &= ~TCQ_F_CAN_BYPASS;
return 0;
}
@@ -648,7 +670,7 @@ static void sfq_walk(struct Qdisc *sch, struct qdisc_walker *arg)
if (arg->stop)
return;
- for (i = 0; i < SFQ_HASH_DIVISOR; i++) {
+ for (i = 0; i < q->divisor; i++) {
if (q->ht[i] == SFQ_EMPTY_SLOT ||
arg->count < arg->skip) {
arg->count++;
diff --git a/net/sched/sch_tbf.c b/net/sched/sch_tbf.c
index 77565e72181..1dcfb5223a8 100644
--- a/net/sched/sch_tbf.c
+++ b/net/sched/sch_tbf.c
@@ -97,8 +97,7 @@
changed the limit is not effective anymore.
*/
-struct tbf_sched_data
-{
+struct tbf_sched_data {
/* Parameters */
u32 limit; /* Maximal length of backlog: bytes */
u32 buffer; /* Token bucket depth/rate: MUST BE >= MTU/B */
@@ -115,10 +114,10 @@ struct tbf_sched_data
struct qdisc_watchdog watchdog; /* Watchdog timer */
};
-#define L2T(q,L) qdisc_l2t((q)->R_tab,L)
-#define L2T_P(q,L) qdisc_l2t((q)->P_tab,L)
+#define L2T(q, L) qdisc_l2t((q)->R_tab, L)
+#define L2T_P(q, L) qdisc_l2t((q)->P_tab, L)
-static int tbf_enqueue(struct sk_buff *skb, struct Qdisc* sch)
+static int tbf_enqueue(struct sk_buff *skb, struct Qdisc *sch)
{
struct tbf_sched_data *q = qdisc_priv(sch);
int ret;
@@ -134,11 +133,10 @@ static int tbf_enqueue(struct sk_buff *skb, struct Qdisc* sch)
}
sch->q.qlen++;
- qdisc_bstats_update(sch, skb);
return NET_XMIT_SUCCESS;
}
-static unsigned int tbf_drop(struct Qdisc* sch)
+static unsigned int tbf_drop(struct Qdisc *sch)
{
struct tbf_sched_data *q = qdisc_priv(sch);
unsigned int len = 0;
@@ -150,7 +148,7 @@ static unsigned int tbf_drop(struct Qdisc* sch)
return len;
}
-static struct sk_buff *tbf_dequeue(struct Qdisc* sch)
+static struct sk_buff *tbf_dequeue(struct Qdisc *sch)
{
struct tbf_sched_data *q = qdisc_priv(sch);
struct sk_buff *skb;
@@ -186,7 +184,8 @@ static struct sk_buff *tbf_dequeue(struct Qdisc* sch)
q->tokens = toks;
q->ptokens = ptoks;
sch->q.qlen--;
- sch->flags &= ~TCQ_F_THROTTLED;
+ qdisc_unthrottled(sch);
+ qdisc_bstats_update(sch, skb);
return skb;
}
@@ -209,7 +208,7 @@ static struct sk_buff *tbf_dequeue(struct Qdisc* sch)
return NULL;
}
-static void tbf_reset(struct Qdisc* sch)
+static void tbf_reset(struct Qdisc *sch)
{
struct tbf_sched_data *q = qdisc_priv(sch);
@@ -227,7 +226,7 @@ static const struct nla_policy tbf_policy[TCA_TBF_MAX + 1] = {
[TCA_TBF_PTAB] = { .type = NLA_BINARY, .len = TC_RTAB_SIZE },
};
-static int tbf_change(struct Qdisc* sch, struct nlattr *opt)
+static int tbf_change(struct Qdisc *sch, struct nlattr *opt)
{
int err;
struct tbf_sched_data *q = qdisc_priv(sch);
@@ -236,7 +235,7 @@ static int tbf_change(struct Qdisc* sch, struct nlattr *opt)
struct qdisc_rate_table *rtab = NULL;
struct qdisc_rate_table *ptab = NULL;
struct Qdisc *child = NULL;
- int max_size,n;
+ int max_size, n;
err = nla_parse_nested(tb, TCA_TBF_PTAB, opt, tbf_policy);
if (err < 0)
@@ -259,15 +258,18 @@ static int tbf_change(struct Qdisc* sch, struct nlattr *opt)
}
for (n = 0; n < 256; n++)
- if (rtab->data[n] > qopt->buffer) break;
- max_size = (n << qopt->rate.cell_log)-1;
+ if (rtab->data[n] > qopt->buffer)
+ break;
+ max_size = (n << qopt->rate.cell_log) - 1;
if (ptab) {
int size;
for (n = 0; n < 256; n++)
- if (ptab->data[n] > qopt->mtu) break;
- size = (n << qopt->peakrate.cell_log)-1;
- if (size < max_size) max_size = size;
+ if (ptab->data[n] > qopt->mtu)
+ break;
+ size = (n << qopt->peakrate.cell_log) - 1;
+ if (size < max_size)
+ max_size = size;
}
if (max_size < 0)
goto done;
@@ -310,7 +312,7 @@ done:
return err;
}
-static int tbf_init(struct Qdisc* sch, struct nlattr *opt)
+static int tbf_init(struct Qdisc *sch, struct nlattr *opt)
{
struct tbf_sched_data *q = qdisc_priv(sch);
@@ -422,8 +424,7 @@ static void tbf_walk(struct Qdisc *sch, struct qdisc_walker *walker)
}
}
-static const struct Qdisc_class_ops tbf_class_ops =
-{
+static const struct Qdisc_class_ops tbf_class_ops = {
.graft = tbf_graft,
.leaf = tbf_leaf,
.get = tbf_get,
diff --git a/net/sched/sch_teql.c b/net/sched/sch_teql.c
index 84ce48eadff..45cd30098e3 100644
--- a/net/sched/sch_teql.c
+++ b/net/sched/sch_teql.c
@@ -53,8 +53,7 @@
which will not break load balancing, though native slave
traffic will have the highest priority. */
-struct teql_master
-{
+struct teql_master {
struct Qdisc_ops qops;
struct net_device *dev;
struct Qdisc *slaves;
@@ -65,29 +64,27 @@ struct teql_master
unsigned long tx_dropped;
};
-struct teql_sched_data
-{
+struct teql_sched_data {
struct Qdisc *next;
struct teql_master *m;
struct neighbour *ncache;
struct sk_buff_head q;
};
-#define NEXT_SLAVE(q) (((struct teql_sched_data*)qdisc_priv(q))->next)
+#define NEXT_SLAVE(q) (((struct teql_sched_data *)qdisc_priv(q))->next)
-#define FMASK (IFF_BROADCAST|IFF_POINTOPOINT)
+#define FMASK (IFF_BROADCAST | IFF_POINTOPOINT)
/* "teql*" qdisc routines */
static int
-teql_enqueue(struct sk_buff *skb, struct Qdisc* sch)
+teql_enqueue(struct sk_buff *skb, struct Qdisc *sch)
{
struct net_device *dev = qdisc_dev(sch);
struct teql_sched_data *q = qdisc_priv(sch);
if (q->q.qlen < dev->tx_queue_len) {
__skb_queue_tail(&q->q, skb);
- qdisc_bstats_update(sch, skb);
return NET_XMIT_SUCCESS;
}
@@ -97,7 +94,7 @@ teql_enqueue(struct sk_buff *skb, struct Qdisc* sch)
}
static struct sk_buff *
-teql_dequeue(struct Qdisc* sch)
+teql_dequeue(struct Qdisc *sch)
{
struct teql_sched_data *dat = qdisc_priv(sch);
struct netdev_queue *dat_queue;
@@ -111,19 +108,21 @@ teql_dequeue(struct Qdisc* sch)
dat->m->slaves = sch;
netif_wake_queue(m);
}
+ } else {
+ qdisc_bstats_update(sch, skb);
}
sch->q.qlen = dat->q.qlen + dat_queue->qdisc->q.qlen;
return skb;
}
static struct sk_buff *
-teql_peek(struct Qdisc* sch)
+teql_peek(struct Qdisc *sch)
{
/* teql is meant to be used as root qdisc */
return NULL;
}
-static __inline__ void
+static inline void
teql_neigh_release(struct neighbour *n)
{
if (n)
@@ -131,7 +130,7 @@ teql_neigh_release(struct neighbour *n)
}
static void
-teql_reset(struct Qdisc* sch)
+teql_reset(struct Qdisc *sch)
{
struct teql_sched_data *dat = qdisc_priv(sch);
@@ -141,13 +140,14 @@ teql_reset(struct Qdisc* sch)
}
static void
-teql_destroy(struct Qdisc* sch)
+teql_destroy(struct Qdisc *sch)
{
struct Qdisc *q, *prev;
struct teql_sched_data *dat = qdisc_priv(sch);
struct teql_master *master = dat->m;
- if ((prev = master->slaves) != NULL) {
+ prev = master->slaves;
+ if (prev) {
do {
q = NEXT_SLAVE(prev);
if (q == sch) {
@@ -179,7 +179,7 @@ teql_destroy(struct Qdisc* sch)
static int teql_qdisc_init(struct Qdisc *sch, struct nlattr *opt)
{
struct net_device *dev = qdisc_dev(sch);
- struct teql_master *m = (struct teql_master*)sch->ops;
+ struct teql_master *m = (struct teql_master *)sch->ops;
struct teql_sched_data *q = qdisc_priv(sch);
if (dev->hard_header_len > m->dev->hard_header_len)
@@ -290,7 +290,8 @@ restart:
nores = 0;
busy = 0;
- if ((q = start) == NULL)
+ q = start;
+ if (!q)
goto drop;
do {
@@ -355,10 +356,10 @@ drop:
static int teql_master_open(struct net_device *dev)
{
- struct Qdisc * q;
+ struct Qdisc *q;
struct teql_master *m = netdev_priv(dev);
int mtu = 0xFFFE;
- unsigned flags = IFF_NOARP|IFF_MULTICAST;
+ unsigned int flags = IFF_NOARP | IFF_MULTICAST;
if (m->slaves == NULL)
return -EUNATCH;
@@ -426,7 +427,7 @@ static int teql_master_mtu(struct net_device *dev, int new_mtu)
do {
if (new_mtu > qdisc_dev(q)->mtu)
return -EINVAL;
- } while ((q=NEXT_SLAVE(q)) != m->slaves);
+ } while ((q = NEXT_SLAVE(q)) != m->slaves);
}
dev->mtu = new_mtu;
diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c
index 2cc46f0962c..b23428f3c0d 100644
--- a/net/sctp/sm_make_chunk.c
+++ b/net/sctp/sm_make_chunk.c
@@ -2029,11 +2029,11 @@ static sctp_ierror_t sctp_process_unk_param(const struct sctp_association *asoc,
*errp = sctp_make_op_error_fixed(asoc, chunk);
if (*errp) {
- sctp_init_cause_fixed(*errp, SCTP_ERROR_UNKNOWN_PARAM,
- WORD_ROUND(ntohs(param.p->length)));
- sctp_addto_chunk_fixed(*errp,
- WORD_ROUND(ntohs(param.p->length)),
- param.v);
+ if (!sctp_init_cause_fixed(*errp, SCTP_ERROR_UNKNOWN_PARAM,
+ WORD_ROUND(ntohs(param.p->length))))
+ sctp_addto_chunk_fixed(*errp,
+ WORD_ROUND(ntohs(param.p->length)),
+ param.v);
} else {
/* If there is no memory for generating the ERROR
* report as specified, an ABORT will be triggered
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index a09b0dd25f5..b53b2ebbb19 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -3428,7 +3428,7 @@ SCTP_STATIC int sctp_setsockopt(struct sock *sk, int level, int optname,
retval = sctp_setsockopt_peer_addr_params(sk, optval, optlen);
break;
- case SCTP_DELAYED_ACK:
+ case SCTP_DELAYED_SACK:
retval = sctp_setsockopt_delayed_ack(sk, optval, optlen);
break;
case SCTP_PARTIAL_DELIVERY_POINT:
@@ -5333,7 +5333,7 @@ SCTP_STATIC int sctp_getsockopt(struct sock *sk, int level, int optname,
retval = sctp_getsockopt_peer_addr_params(sk, len, optval,
optlen);
break;
- case SCTP_DELAYED_ACK:
+ case SCTP_DELAYED_SACK:
retval = sctp_getsockopt_delayed_ack(sk, len, optval,
optlen);
break;
@@ -6102,15 +6102,16 @@ static void __sctp_write_space(struct sctp_association *asoc)
wake_up_interruptible(&asoc->wait);
if (sctp_writeable(sk)) {
- if (sk_sleep(sk) && waitqueue_active(sk_sleep(sk)))
- wake_up_interruptible(sk_sleep(sk));
+ wait_queue_head_t *wq = sk_sleep(sk);
+
+ if (wq && waitqueue_active(wq))
+ wake_up_interruptible(wq);
/* Note that we try to include the Async I/O support
* here by modeling from the current TCP/UDP code.
* We have not tested with it yet.
*/
- if (sock->wq->fasync_list &&
- !(sk->sk_shutdown & SEND_SHUTDOWN))
+ if (!(sk->sk_shutdown & SEND_SHUTDOWN))
sock_wake_async(sock,
SOCK_WAKE_SPACE, POLL_OUT);
}
diff --git a/net/sctp/tsnmap.c b/net/sctp/tsnmap.c
index 747d5412c46..f1e40cebc98 100644
--- a/net/sctp/tsnmap.c
+++ b/net/sctp/tsnmap.c
@@ -344,7 +344,7 @@ __u16 sctp_tsnmap_num_gabs(struct sctp_tsnmap *map,
/* Refresh the gap ack information. */
if (sctp_tsnmap_has_gap(map)) {
- __u16 start, end;
+ __u16 start = 0, end = 0;
sctp_tsnmap_iter_init(map, &iter);
while (sctp_tsnmap_next_gap_ack(map, &iter,
&start,
diff --git a/net/socket.c b/net/socket.c
index ac2219f90d5..9fa1e3b4366 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -240,17 +240,19 @@ static struct kmem_cache *sock_inode_cachep __read_mostly;
static struct inode *sock_alloc_inode(struct super_block *sb)
{
struct socket_alloc *ei;
+ struct socket_wq *wq;
ei = kmem_cache_alloc(sock_inode_cachep, GFP_KERNEL);
if (!ei)
return NULL;
- ei->socket.wq = kmalloc(sizeof(struct socket_wq), GFP_KERNEL);
- if (!ei->socket.wq) {
+ wq = kmalloc(sizeof(*wq), GFP_KERNEL);
+ if (!wq) {
kmem_cache_free(sock_inode_cachep, ei);
return NULL;
}
- init_waitqueue_head(&ei->socket.wq->wait);
- ei->socket.wq->fasync_list = NULL;
+ init_waitqueue_head(&wq->wait);
+ wq->fasync_list = NULL;
+ RCU_INIT_POINTER(ei->socket.wq, wq);
ei->socket.state = SS_UNCONNECTED;
ei->socket.flags = 0;
@@ -273,9 +275,11 @@ static void wq_free_rcu(struct rcu_head *head)
static void sock_destroy_inode(struct inode *inode)
{
struct socket_alloc *ei;
+ struct socket_wq *wq;
ei = container_of(inode, struct socket_alloc, vfs_inode);
- call_rcu(&ei->socket.wq->rcu, wq_free_rcu);
+ wq = rcu_dereference_protected(ei->socket.wq, 1);
+ call_rcu(&wq->rcu, wq_free_rcu);
kmem_cache_free(sock_inode_cachep, ei);
}
@@ -524,7 +528,7 @@ void sock_release(struct socket *sock)
module_put(owner);
}
- if (sock->wq->fasync_list)
+ if (rcu_dereference_protected(sock->wq, 1)->fasync_list)
printk(KERN_ERR "sock_release: fasync list not empty!\n");
percpu_sub(sockets_in_use, 1);
@@ -1108,15 +1112,16 @@ static int sock_fasync(int fd, struct file *filp, int on)
{
struct socket *sock = filp->private_data;
struct sock *sk = sock->sk;
+ struct socket_wq *wq;
if (sk == NULL)
return -EINVAL;
lock_sock(sk);
+ wq = rcu_dereference_protected(sock->wq, sock_owned_by_user(sk));
+ fasync_helper(fd, filp, on, &wq->fasync_list);
- fasync_helper(fd, filp, on, &sock->wq->fasync_list);
-
- if (!sock->wq->fasync_list)
+ if (!wq->fasync_list)
sock_reset_flag(sk, SOCK_FASYNC);
else
sock_set_flag(sk, SOCK_FASYNC);
diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c
index 7bd3bbba471..b7d435c3f19 100644
--- a/net/sunrpc/svcsock.c
+++ b/net/sunrpc/svcsock.c
@@ -420,6 +420,7 @@ static void svc_sock_setbufsize(struct socket *sock, unsigned int snd,
static void svc_udp_data_ready(struct sock *sk, int count)
{
struct svc_sock *svsk = (struct svc_sock *)sk->sk_user_data;
+ wait_queue_head_t *wq = sk_sleep(sk);
if (svsk) {
dprintk("svc: socket %p(inet %p), count=%d, busy=%d\n",
@@ -428,8 +429,8 @@ static void svc_udp_data_ready(struct sock *sk, int count)
set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags);
svc_xprt_enqueue(&svsk->sk_xprt);
}
- if (sk_sleep(sk) && waitqueue_active(sk_sleep(sk)))
- wake_up_interruptible(sk_sleep(sk));
+ if (wq && waitqueue_active(wq))
+ wake_up_interruptible(wq);
}
/*
@@ -438,6 +439,7 @@ static void svc_udp_data_ready(struct sock *sk, int count)
static void svc_write_space(struct sock *sk)
{
struct svc_sock *svsk = (struct svc_sock *)(sk->sk_user_data);
+ wait_queue_head_t *wq = sk_sleep(sk);
if (svsk) {
dprintk("svc: socket %p(inet %p), write_space busy=%d\n",
@@ -445,10 +447,10 @@ static void svc_write_space(struct sock *sk)
svc_xprt_enqueue(&svsk->sk_xprt);
}
- if (sk_sleep(sk) && waitqueue_active(sk_sleep(sk))) {
+ if (wq && waitqueue_active(wq)) {
dprintk("RPC svc_write_space: someone sleeping on %p\n",
svsk);
- wake_up_interruptible(sk_sleep(sk));
+ wake_up_interruptible(wq);
}
}
@@ -739,6 +741,7 @@ static void svc_udp_init(struct svc_sock *svsk, struct svc_serv *serv)
static void svc_tcp_listen_data_ready(struct sock *sk, int count_unused)
{
struct svc_sock *svsk = (struct svc_sock *)sk->sk_user_data;
+ wait_queue_head_t *wq;
dprintk("svc: socket %p TCP (listen) state change %d\n",
sk, sk->sk_state);
@@ -761,8 +764,9 @@ static void svc_tcp_listen_data_ready(struct sock *sk, int count_unused)
printk("svc: socket %p: no user data\n", sk);
}
- if (sk_sleep(sk) && waitqueue_active(sk_sleep(sk)))
- wake_up_interruptible_all(sk_sleep(sk));
+ wq = sk_sleep(sk);
+ if (wq && waitqueue_active(wq))
+ wake_up_interruptible_all(wq);
}
/*
@@ -771,6 +775,7 @@ static void svc_tcp_listen_data_ready(struct sock *sk, int count_unused)
static void svc_tcp_state_change(struct sock *sk)
{
struct svc_sock *svsk = (struct svc_sock *)sk->sk_user_data;
+ wait_queue_head_t *wq = sk_sleep(sk);
dprintk("svc: socket %p TCP (connected) state change %d (svsk %p)\n",
sk, sk->sk_state, sk->sk_user_data);
@@ -781,13 +786,14 @@ static void svc_tcp_state_change(struct sock *sk)
set_bit(XPT_CLOSE, &svsk->sk_xprt.xpt_flags);
svc_xprt_enqueue(&svsk->sk_xprt);
}
- if (sk_sleep(sk) && waitqueue_active(sk_sleep(sk)))
- wake_up_interruptible_all(sk_sleep(sk));
+ if (wq && waitqueue_active(wq))
+ wake_up_interruptible_all(wq);
}
static void svc_tcp_data_ready(struct sock *sk, int count)
{
struct svc_sock *svsk = (struct svc_sock *)sk->sk_user_data;
+ wait_queue_head_t *wq = sk_sleep(sk);
dprintk("svc: socket %p TCP data ready (svsk %p)\n",
sk, sk->sk_user_data);
@@ -795,8 +801,8 @@ static void svc_tcp_data_ready(struct sock *sk, int count)
set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags);
svc_xprt_enqueue(&svsk->sk_xprt);
}
- if (sk_sleep(sk) && waitqueue_active(sk_sleep(sk)))
- wake_up_interruptible(sk_sleep(sk));
+ if (wq && waitqueue_active(wq))
+ wake_up_interruptible(wq);
}
/*
@@ -1531,6 +1537,7 @@ static void svc_sock_detach(struct svc_xprt *xprt)
{
struct svc_sock *svsk = container_of(xprt, struct svc_sock, sk_xprt);
struct sock *sk = svsk->sk_sk;
+ wait_queue_head_t *wq;
dprintk("svc: svc_sock_detach(%p)\n", svsk);
@@ -1539,8 +1546,9 @@ static void svc_sock_detach(struct svc_xprt *xprt)
sk->sk_data_ready = svsk->sk_odata;
sk->sk_write_space = svsk->sk_owspace;
- if (sk_sleep(sk) && waitqueue_active(sk_sleep(sk)))
- wake_up_interruptible(sk_sleep(sk));
+ wq = sk_sleep(sk);
+ if (wq && waitqueue_active(wq))
+ wake_up_interruptible(wq);
}
/*
@@ -1609,9 +1617,7 @@ static struct svc_xprt *svc_bc_create_socket(struct svc_serv *serv,
*/
static void svc_bc_sock_free(struct svc_xprt *xprt)
{
- if (xprt) {
- kfree(xprt->xpt_bc_sid);
+ if (xprt)
kfree(container_of(xprt, struct svc_sock, sk_xprt));
- }
}
#endif /* CONFIG_NFS_V4_1 */
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index dd419d28620..217fb7f34d5 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -1171,7 +1171,7 @@ restart:
newsk->sk_type = sk->sk_type;
init_peercred(newsk);
newu = unix_sk(newsk);
- newsk->sk_wq = &newu->peer_wq;
+ RCU_INIT_POINTER(newsk->sk_wq, &newu->peer_wq);
otheru = unix_sk(other);
/* copy address information from listening to new sock*/
@@ -1475,6 +1475,12 @@ restart:
goto out_free;
}
+ if (sk_filter(other, skb) < 0) {
+ /* Toss the packet but do not return any error to the sender */
+ err = len;
+ goto out_free;
+ }
+
unix_state_lock(other);
err = -EPERM;
if (!unix_may_send(sk, other))
@@ -1978,36 +1984,38 @@ static int unix_shutdown(struct socket *sock, int mode)
mode = (mode+1)&(RCV_SHUTDOWN|SEND_SHUTDOWN);
- if (mode) {
- unix_state_lock(sk);
- sk->sk_shutdown |= mode;
- other = unix_peer(sk);
- if (other)
- sock_hold(other);
- unix_state_unlock(sk);
- sk->sk_state_change(sk);
-
- if (other &&
- (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET)) {
-
- int peer_mode = 0;
-
- if (mode&RCV_SHUTDOWN)
- peer_mode |= SEND_SHUTDOWN;
- if (mode&SEND_SHUTDOWN)
- peer_mode |= RCV_SHUTDOWN;
- unix_state_lock(other);
- other->sk_shutdown |= peer_mode;
- unix_state_unlock(other);
- other->sk_state_change(other);
- if (peer_mode == SHUTDOWN_MASK)
- sk_wake_async(other, SOCK_WAKE_WAITD, POLL_HUP);
- else if (peer_mode & RCV_SHUTDOWN)
- sk_wake_async(other, SOCK_WAKE_WAITD, POLL_IN);
- }
- if (other)
- sock_put(other);
+ if (!mode)
+ return 0;
+
+ unix_state_lock(sk);
+ sk->sk_shutdown |= mode;
+ other = unix_peer(sk);
+ if (other)
+ sock_hold(other);
+ unix_state_unlock(sk);
+ sk->sk_state_change(sk);
+
+ if (other &&
+ (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET)) {
+
+ int peer_mode = 0;
+
+ if (mode&RCV_SHUTDOWN)
+ peer_mode |= SEND_SHUTDOWN;
+ if (mode&SEND_SHUTDOWN)
+ peer_mode |= RCV_SHUTDOWN;
+ unix_state_lock(other);
+ other->sk_shutdown |= peer_mode;
+ unix_state_unlock(other);
+ other->sk_state_change(other);
+ if (peer_mode == SHUTDOWN_MASK)
+ sk_wake_async(other, SOCK_WAKE_WAITD, POLL_HUP);
+ else if (peer_mode & RCV_SHUTDOWN)
+ sk_wake_async(other, SOCK_WAKE_WAITD, POLL_IN);
}
+ if (other)
+ sock_put(other);
+
return 0;
}
diff --git a/net/wanrouter/wanmain.c b/net/wanrouter/wanmain.c
index 74944a2dd43..788a12c1eb5 100644
--- a/net/wanrouter/wanmain.c
+++ b/net/wanrouter/wanmain.c
@@ -59,8 +59,6 @@
#include <asm/uaccess.h> /* copy_to/from_user */
#include <linux/init.h> /* __initfunc et al. */
-#define KMEM_SAFETYZONE 8
-
#define DEV_TO_SLAVE(dev) (*((struct net_device **)netdev_priv(dev)))
/*
diff --git a/net/wireless/Kconfig b/net/wireless/Kconfig
index d0ee29063e5..1f1ef70f34f 100644
--- a/net/wireless/Kconfig
+++ b/net/wireless/Kconfig
@@ -95,7 +95,7 @@ config CFG80211_DEBUGFS
If unsure, say N.
config CFG80211_INTERNAL_REGDB
- bool "use statically compiled regulatory rules database" if EMBEDDED
+ bool "use statically compiled regulatory rules database" if EXPERT
default n
depends on CFG80211
---help---
diff --git a/net/x25/x25_facilities.c b/net/x25/x25_facilities.c
index 55187c8f642..406207515b5 100644
--- a/net/x25/x25_facilities.c
+++ b/net/x25/x25_facilities.c
@@ -27,9 +27,19 @@
#include <net/sock.h>
#include <net/x25.h>
-/*
- * Parse a set of facilities into the facilities structures. Unrecognised
- * facilities are written to the debug log file.
+/**
+ * x25_parse_facilities - Parse facilities from skb into the facilities structs
+ *
+ * @skb: sk_buff to parse
+ * @facilities: Regular facilites, updated as facilities are found
+ * @dte_facs: ITU DTE facilities, updated as DTE facilities are found
+ * @vc_fac_mask: mask is updated with all facilities found
+ *
+ * Return codes:
+ * -1 - Parsing error, caller should drop call and clean up
+ * 0 - Parse OK, this skb has no facilities
+ * >0 - Parse OK, returns the length of the facilities header
+ *
*/
int x25_parse_facilities(struct sk_buff *skb, struct x25_facilities *facilities,
struct x25_dte_facilities *dte_facs, unsigned long *vc_fac_mask)
@@ -62,7 +72,7 @@ int x25_parse_facilities(struct sk_buff *skb, struct x25_facilities *facilities,
switch (*p & X25_FAC_CLASS_MASK) {
case X25_FAC_CLASS_A:
if (len < 2)
- return 0;
+ return -1;
switch (*p) {
case X25_FAC_REVERSE:
if((p[1] & 0x81) == 0x81) {
@@ -107,7 +117,7 @@ int x25_parse_facilities(struct sk_buff *skb, struct x25_facilities *facilities,
break;
case X25_FAC_CLASS_B:
if (len < 3)
- return 0;
+ return -1;
switch (*p) {
case X25_FAC_PACKET_SIZE:
facilities->pacsize_in = p[1];
@@ -130,7 +140,7 @@ int x25_parse_facilities(struct sk_buff *skb, struct x25_facilities *facilities,
break;
case X25_FAC_CLASS_C:
if (len < 4)
- return 0;
+ return -1;
printk(KERN_DEBUG "X.25: unknown facility %02X, "
"values %02X, %02X, %02X\n",
p[0], p[1], p[2], p[3]);
@@ -139,18 +149,18 @@ int x25_parse_facilities(struct sk_buff *skb, struct x25_facilities *facilities,
break;
case X25_FAC_CLASS_D:
if (len < p[1] + 2)
- return 0;
+ return -1;
switch (*p) {
case X25_FAC_CALLING_AE:
if (p[1] > X25_MAX_DTE_FACIL_LEN || p[1] <= 1)
- return 0;
+ return -1;
dte_facs->calling_len = p[2];
memcpy(dte_facs->calling_ae, &p[3], p[1] - 1);
*vc_fac_mask |= X25_MASK_CALLING_AE;
break;
case X25_FAC_CALLED_AE:
if (p[1] > X25_MAX_DTE_FACIL_LEN || p[1] <= 1)
- return 0;
+ return -1;
dte_facs->called_len = p[2];
memcpy(dte_facs->called_ae, &p[3], p[1] - 1);
*vc_fac_mask |= X25_MASK_CALLED_AE;
diff --git a/net/x25/x25_in.c b/net/x25/x25_in.c
index f729f022be6..15de65f0471 100644
--- a/net/x25/x25_in.c
+++ b/net/x25/x25_in.c
@@ -91,10 +91,10 @@ static int x25_state1_machine(struct sock *sk, struct sk_buff *skb, int frametyp
{
struct x25_address source_addr, dest_addr;
int len;
+ struct x25_sock *x25 = x25_sk(sk);
switch (frametype) {
case X25_CALL_ACCEPTED: {
- struct x25_sock *x25 = x25_sk(sk);
x25_stop_timer(sk);
x25->condition = 0x00;
@@ -113,14 +113,16 @@ static int x25_state1_machine(struct sock *sk, struct sk_buff *skb, int frametyp
&dest_addr);
if (len > 0)
skb_pull(skb, len);
+ else if (len < 0)
+ goto out_clear;
len = x25_parse_facilities(skb, &x25->facilities,
&x25->dte_facilities,
&x25->vc_facil_mask);
if (len > 0)
skb_pull(skb, len);
- else
- return -1;
+ else if (len < 0)
+ goto out_clear;
/*
* Copy any Call User Data.
*/
@@ -144,6 +146,12 @@ static int x25_state1_machine(struct sock *sk, struct sk_buff *skb, int frametyp
}
return 0;
+
+out_clear:
+ x25_write_internal(sk, X25_CLEAR_REQUEST);
+ x25->state = X25_STATE_2;
+ x25_start_t23timer(sk);
+ return 0;
}
/*
diff --git a/net/x25/x25_link.c b/net/x25/x25_link.c
index 4cbc942f762..21306928d47 100644
--- a/net/x25/x25_link.c
+++ b/net/x25/x25_link.c
@@ -396,9 +396,12 @@ void __exit x25_link_free(void)
write_lock_bh(&x25_neigh_list_lock);
list_for_each_safe(entry, tmp, &x25_neigh_list) {
+ struct net_device *dev;
+
nb = list_entry(entry, struct x25_neigh, node);
+ dev = nb->dev;
__x25_remove_neigh(nb);
- dev_put(nb->dev);
+ dev_put(dev);
}
write_unlock_bh(&x25_neigh_list_lock);
}
diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
index 8b3ef404c79..7a8e2c77d08 100644
--- a/net/xfrm/xfrm_policy.c
+++ b/net/xfrm/xfrm_policy.c
@@ -1340,10 +1340,13 @@ static inline struct xfrm_dst *xfrm_alloc_dst(struct net *net, int family)
default:
BUG();
}
- xdst = dst_alloc(dst_ops) ?: ERR_PTR(-ENOBUFS);
+ xdst = dst_alloc(dst_ops, 0);
xfrm_policy_put_afinfo(afinfo);
- xdst->flo.ops = &xfrm_bundle_fc_ops;
+ if (likely(xdst))
+ xdst->flo.ops = &xfrm_bundle_fc_ops;
+ else
+ xdst = ERR_PTR(-ENOBUFS);
return xdst;
}
diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c
index d5e1e0b0889..61291965c5f 100644
--- a/net/xfrm/xfrm_user.c
+++ b/net/xfrm/xfrm_user.c
@@ -2189,7 +2189,7 @@ static int xfrm_user_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
if ((type == (XFRM_MSG_GETSA - XFRM_MSG_BASE) ||
type == (XFRM_MSG_GETPOLICY - XFRM_MSG_BASE)) &&
- (nlh->nlmsg_flags & NLM_F_DUMP) == NLM_F_DUMP) {
+ (nlh->nlmsg_flags & NLM_F_DUMP)) {
if (link->dump == NULL)
return -EINVAL;